From 0793a61d4df8daeac6492dbf8d2f3e5713caae5e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 4 Dec 2008 20:12:29 +0100 Subject: [PATCH 0001/2061] performance counters: core code Implement the core kernel bits of Performance Counters subsystem. The Linux Performance Counter subsystem provides an abstraction of performance counter hardware capabilities. It provides per task and per CPU counters, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: int perf_counter_open(u32 hw_event_type, u32 hw_event_period, u32 record_type, pid_t pid, int cpu); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. See more details in Documentation/perf-counters.txt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- drivers/char/sysrq.c | 2 + include/linux/perf_counter.h | 171 +++++++ include/linux/sched.h | 9 + include/linux/syscalls.h | 6 + init/Kconfig | 29 ++ kernel/Makefile | 1 + kernel/fork.c | 1 + kernel/perf_counter.c | 943 +++++++++++++++++++++++++++++++++++ kernel/sched.c | 24 + kernel/sys_ni.c | 3 + 10 files changed, 1189 insertions(+) create mode 100644 include/linux/perf_counter.h create mode 100644 kernel/perf_counter.c diff --git a/drivers/char/sysrq.c b/drivers/char/sysrq.c index ce0d9da52a8..52146c2a8d9 100644 --- a/drivers/char/sysrq.c +++ b/drivers/char/sysrq.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include @@ -244,6 +245,7 @@ static void sysrq_handle_showregs(int key, struct tty_struct *tty) struct pt_regs *regs = get_irq_regs(); if (regs) show_regs(regs); + perf_counter_print_debug(); } static struct sysrq_key_op sysrq_showregs_op = { .handler = sysrq_handle_showregs, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h new file mode 100644 index 00000000000..22c4469abf4 --- /dev/null +++ b/include/linux/perf_counter.h @@ -0,0 +1,171 @@ +/* + * Performance counters: + * + * Copyright(C) 2008, Thomas Gleixner + * Copyright(C) 2008, Red Hat, Inc., Ingo Molnar + * + * Data type definitions, declarations, prototypes. + * + * Started by: Thomas Gleixner and Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ +#ifndef _LINUX_PERF_COUNTER_H +#define _LINUX_PERF_COUNTER_H + +#include + +#include +#include +#include +#include +#include + +struct task_struct; + +/* + * Generalized hardware event types, used by the hw_event_type parameter + * of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + /* + * If this bit is set in the type, then trigger NMI sampling: + */ + PERF_COUNT_NMI = (1 << 30), +}; + +/* + * IRQ-notification data record type: + */ +enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + PERF_RECORD_GROUP, +}; + +/** + * struct hw_perf_counter - performance counter hardware details + */ +struct hw_perf_counter { + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + s32 next_count; + u64 irq_period; +}; + +/* + * Hardcoded buffer length limit for now, for IRQ-fed events: + */ +#define PERF_DATA_BUFLEN 2048 + +/** + * struct perf_data - performance counter IRQ data sampling ... + */ +struct perf_data { + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; +}; + +/** + * struct perf_counter - performance counter kernel representation: + */ +struct perf_counter { + struct list_head list; + int active; +#if BITS_PER_LONG == 64 + atomic64_t count; +#else + atomic_t count32[2]; +#endif + u64 __irq_period; + + struct hw_perf_counter hw; + + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * Protect attach/detach: + */ + struct mutex mutex; + + int oncpu; + int cpu; + + s32 hw_event_type; + enum perf_record_type record_type; + + /* read() / irq related data */ + wait_queue_head_t waitq; + /* optional: for NMIs */ + int wakeup_pending; + struct perf_data *irqdata; + struct perf_data *usrdata; + struct perf_data data[2]; +}; + +/** + * struct perf_counter_context - counter context structure + * + * Used as a container for task counters and CPU counters as well: + */ +struct perf_counter_context { +#ifdef CONFIG_PERF_COUNTERS + /* + * Protect the list of counters: + */ + spinlock_t lock; + struct list_head counters; + int nr_counters; + int nr_active; + struct task_struct *task; +#endif +}; + +/** + * struct perf_counter_cpu_context - per cpu counter context structure + */ +struct perf_cpu_context { + struct perf_counter_context ctx; + struct perf_counter_context *task_ctx; + int active_oncpu; + int max_pertask; +}; + +/* + * Set by architecture code: + */ +extern int perf_max_counters; + +#ifdef CONFIG_PERF_COUNTERS +extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_tick(struct task_struct *task, int cpu); +extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_print_debug(void); +#else +static inline void +perf_counter_task_sched_in(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +static inline void +perf_counter_task_tick(struct task_struct *task, int cpu) { } +static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_print_debug(void) { } +#endif + +#endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index 55e30d11447..4c530278391 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,6 +71,7 @@ struct sched_param { #include #include #include +#include #include #include #include @@ -1326,6 +1327,7 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif + struct perf_counter_context perf_counter_ctx; #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; @@ -2285,6 +2287,13 @@ static inline void inc_syscw(struct task_struct *tsk) #define TASK_SIZE_OF(tsk) TASK_SIZE #endif +/* + * Call the function if the target task is executing on a CPU right now: + */ +extern void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info); + + #ifdef CONFIG_MM_OWNER extern void mm_update_next_owner(struct mm_struct *mm); extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p); diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 04fb47bfb92..6cce728a626 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -624,4 +624,10 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); #endif diff --git a/init/Kconfig b/init/Kconfig index f763762d544..78bede218f1 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -732,6 +732,35 @@ config AIO by some high performance threaded applications. Disabling this option saves about 7k. +config HAVE_PERF_COUNTERS + bool + +menu "Performance Counters" + +config PERF_COUNTERS + bool "Kernel Performance Counters" + depends on HAVE_PERF_COUNTERS + default y + help + Enable kernel support for performance counter hardware. + + Performance counters are special hardware registers available + on most modern CPUs. These registers count the number of certain + types of hw events: such as instructions executed, cachemisses + suffered, or branches mis-predicted - without slowing down the + kernel or applications. These registers can also trigger interrupts + when a threshold number of events have passed - and can thus be + used to profile the code that runs on that CPU. + + The Linux Performance Counter subsystem provides an abstraction of + these hardware capabilities, available via a system call. It + provides per task and per CPU counters, and it provides event + capabilities on top of those. + + Say Y if unsure. + +endmenu + config VM_EVENT_COUNTERS default y bool "Enable VM event counters for /proc/vmstat" if EMBEDDED diff --git a/kernel/Makefile b/kernel/Makefile index 19fad003b19..1f184a1dc40 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -89,6 +89,7 @@ obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ifneq ($(CONFIG_SCHED_NO_NO_OMIT_FRAME_POINTER),y) # According to Alan Modra , the -fno-omit-frame-pointer is diff --git a/kernel/fork.c b/kernel/fork.c index 2a372a0e206..441fadff1fa 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -975,6 +975,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; rt_mutex_init_task(p); + perf_counter_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c new file mode 100644 index 00000000000..20508f05365 --- /dev/null +++ b/kernel/perf_counter.c @@ -0,0 +1,943 @@ +/* + * Performance counter core code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * Each CPU has a list of per CPU counters: + */ +DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); + +int perf_max_counters __read_mostly; +static int perf_reserved_percpu __read_mostly; +static int perf_overcommit __read_mostly = 1; + +/* + * Mutex for (sysadmin-configurable) counter reservations: + */ +static DEFINE_MUTEX(perf_resource_mutex); + +/* + * Architecture provided APIs - weak aliases: + */ + +int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +{ + return -EINVAL; +} + +void __weak hw_perf_counter_enable(struct perf_counter *counter) { } +void __weak hw_perf_counter_disable(struct perf_counter *counter) { } +void __weak hw_perf_counter_read(struct perf_counter *counter) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } + +#if BITS_PER_LONG == 64 + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 64 bit version - no complications. + */ +static inline u64 perf_read_counter_safe(struct perf_counter *counter) +{ + return (u64) atomic64_read(&counter->count); +} + +#else + +/* + * Read the cached counter in counter safe against cross CPU / NMI + * modifications. 32 bit version. + */ +static u64 perf_read_counter_safe(struct perf_counter *counter) +{ + u32 cntl, cnth; + + local_irq_disable(); + do { + cnth = atomic_read(&counter->count32[1]); + cntl = atomic_read(&counter->count32[0]); + } while (cnth != atomic_read(&counter->count32[1])); + + local_irq_enable(); + + return cntl | ((u64) cnth) << 32; +} + +#endif + +/* + * Cross CPU call to remove a performance counter + * + * We disable the counter on the hardware level first. After that we + * remove it from the context list. + */ +static void __perf_remove_from_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + ctx->nr_active--; + cpuctx->active_oncpu--; + counter->task = NULL; + } + ctx->nr_counters--; + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_del_init(&counter->list); + hw_perf_enable_all(); + + if (!ctx->task) { + /* + * Allow more per task counters with respect to the + * reservation: + */ + cpuctx->max_pertask = + min(perf_max_counters - ctx->nr_counters, + perf_max_counters - perf_reserved_percpu); + } + + spin_unlock(&ctx->lock); +} + + +/* + * Remove the counter from a task's (or a CPU's) list of counters. + * + * Must be called with counter->mutex held. + * + * CPU counters are removed with a smp call. For task counters we only + * call when the task is on a CPU. + */ +static void perf_remove_from_context(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Per cpu counters are removed via an smp call and + * the removal is always sucessful. + */ + smp_call_function_single(counter->cpu, + __perf_remove_from_context, + counter, 1); + return; + } + +retry: + task_oncpu_function_call(task, __perf_remove_from_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active we need to retry the smp call. + */ + if (ctx->nr_active && !list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can remove the counter safely, if it the call above did not + * succeed. + */ + if (!list_empty(&counter->list)) { + ctx->nr_counters--; + list_del_init(&counter->list); + counter->task = NULL; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Cross CPU call to install and enable a preformance counter + */ +static void __perf_install_in_context(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + int cpu = smp_processor_id(); + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + spin_lock(&ctx->lock); + + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. NOP for non NMI based counters. + */ + hw_perf_disable_all(); + list_add_tail(&counter->list, &ctx->counters); + hw_perf_enable_all(); + + ctx->nr_counters++; + + if (cpuctx->active_oncpu < perf_max_counters) { + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + + if (!ctx->task && cpuctx->max_pertask) + cpuctx->max_pertask--; + + spin_unlock(&ctx->lock); +} + +/* + * Attach a performance counter to a context + * + * First we add the counter to the list with the hardware enable bit + * in counter->hw_config cleared. + * + * If the counter is attached to a task which is on a CPU we use a smp + * call to enable it in the task context. The task might have been + * scheduled away, but we check this in the smp call again. + */ +static void +perf_install_in_context(struct perf_counter_context *ctx, + struct perf_counter *counter, + int cpu) +{ + struct task_struct *task = ctx->task; + + counter->ctx = ctx; + if (!task) { + /* + * Per cpu counters are installed via an smp call and + * the install is always sucessful. + */ + smp_call_function_single(cpu, __perf_install_in_context, + counter, 1); + return; + } + + counter->task = task; +retry: + task_oncpu_function_call(task, __perf_install_in_context, + counter); + + spin_lock_irq(&ctx->lock); + /* + * If the context is active and the counter has not been added + * we need to retry the smp call. + */ + if (ctx->nr_active && list_empty(&counter->list)) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * The lock prevents that this context is scheduled in so we + * can add the counter safely, if it the call above did not + * succeed. + */ + if (list_empty(&counter->list)) { + list_add_tail(&counter->list, &ctx->counters); + ctx->nr_counters++; + } + spin_unlock_irq(&ctx->lock); +} + +/* + * Called from scheduler to remove the counters of the current task, + * with interrupts disabled. + * + * We stop each counter and update the counter value in counter->count. + * + * This does not protect us against NMI, but hw_perf_counter_disable() + * sets the disabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * not restart the counter. + */ +void perf_counter_task_sched_out(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!cpuctx->task_ctx)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (!ctx->nr_active) + break; + if (counter->active) { + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + ctx->nr_active--; + cpuctx->active_oncpu--; + } + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = NULL; +} + +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but hw_perf_counter_enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counters, list) { + if (ctx->nr_active == cpuctx->max_pertask) + break; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; + ctx->nr_active++; + cpuctx->active_oncpu++; + } + spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; +} + +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Rotate the first entry last: + */ + hw_perf_disable_all(); + list_for_each_entry(counter, &ctx->counters, list) { + list_del(&counter->list); + list_add_tail(&counter->list, &ctx->counters); + break; + } + hw_perf_enable_all(); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *task) +{ + struct perf_counter_context *ctx = &task->perf_counter_ctx; + + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counters); + ctx->nr_counters = 0; + ctx->task = task; +} + +/* + * Cross CPU call to read the hardware counter + */ +static void __hw_perf_counter_read(void *info) +{ + hw_perf_counter_read(info); +} + +static u64 perf_read_counter(struct perf_counter *counter) +{ + /* + * If counter is enabled and currently active on a CPU, update the + * value in the counter structure: + */ + if (counter->active) { + smp_call_function_single(counter->oncpu, + __hw_perf_counter_read, counter, 1); + } + + return perf_read_counter_safe(counter); +} + +/* + * Cross CPU call to switch performance data pointers + */ +static void __perf_switch_irq_data(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + + /* + * If this is a task context, we need to check whether it is + * the current task context of this cpu. If not it has been + * scheduled out before the smp call arrived. + */ + if (ctx->task) { + if (cpuctx->task_ctx != ctx) + return; + spin_lock(&ctx->lock); + } + + /* Change the pointer NMI safe */ + atomic_long_set((atomic_long_t *)&counter->irqdata, + (unsigned long) counter->usrdata); + counter->usrdata = oldirqdata; + + if (ctx->task) + spin_unlock(&ctx->lock); +} + +static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_data *oldirqdata = counter->irqdata; + struct task_struct *task = ctx->task; + + if (!task) { + smp_call_function_single(counter->cpu, + __perf_switch_irq_data, + counter, 1); + return counter->usrdata; + } + +retry: + spin_lock_irq(&ctx->lock); + if (!counter->active) { + counter->irqdata = counter->usrdata; + counter->usrdata = oldirqdata; + spin_unlock_irq(&ctx->lock); + return oldirqdata; + } + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_switch_irq_data, counter); + /* Might have failed, because task was scheduled out */ + if (counter->irqdata == oldirqdata) + goto retry; + + return counter->usrdata; +} + +static void put_context(struct perf_counter_context *ctx) +{ + if (ctx->task) + put_task_struct(ctx->task); +} + +static struct perf_counter_context *find_get_context(pid_t pid, int cpu) +{ + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + struct task_struct *task; + + /* + * If cpu is not a wildcard then this is a percpu counter: + */ + if (cpu != -1) { + /* Must be root to operate on a CPU counter: */ + if (!capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EACCES); + + if (cpu < 0 || cpu > num_possible_cpus()) + return ERR_PTR(-EINVAL); + + /* + * We could be clever and allow to attach a counter to an + * offline CPU and activate it when the CPU comes up, but + * that's for later. + */ + if (!cpu_isset(cpu, cpu_online_map)) + return ERR_PTR(-ENODEV); + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &cpuctx->ctx; + + WARN_ON_ONCE(ctx->task); + return ctx; + } + + rcu_read_lock(); + if (!pid) + task = current; + else + task = find_task_by_vpid(pid); + if (task) + get_task_struct(task); + rcu_read_unlock(); + + if (!task) + return ERR_PTR(-ESRCH); + + ctx = &task->perf_counter_ctx; + ctx->task = task; + + /* Reuse ptrace permission checks for now. */ + if (!ptrace_may_access(task, PTRACE_MODE_READ)) { + put_context(ctx); + return ERR_PTR(-EACCES); + } + + return ctx; +} + +/* + * Called when the last reference to the file is gone. + */ +static int perf_release(struct inode *inode, struct file *file) +{ + struct perf_counter *counter = file->private_data; + struct perf_counter_context *ctx = counter->ctx; + + file->private_data = NULL; + + mutex_lock(&counter->mutex); + + perf_remove_from_context(counter); + put_context(ctx); + + mutex_unlock(&counter->mutex); + + kfree(counter); + + return 0; +} + +/* + * Read the performance counter - simple non blocking version for now + */ +static ssize_t +perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) +{ + u64 cntval; + + if (count != sizeof(cntval)) + return -EINVAL; + + mutex_lock(&counter->mutex); + cntval = perf_read_counter(counter); + mutex_unlock(&counter->mutex); + + return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); +} + +static ssize_t +perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) +{ + if (!usrdata->len) + return 0; + + count = min(count, (size_t)usrdata->len); + if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) + return -EFAULT; + + /* Adjust the counters */ + usrdata->len -= count; + if (!usrdata->len) + usrdata->rd_idx = 0; + else + usrdata->rd_idx += count; + + return count; +} + +static ssize_t +perf_read_irq_data(struct perf_counter *counter, + char __user *buf, + size_t count, + int nonblocking) +{ + struct perf_data *irqdata, *usrdata; + DECLARE_WAITQUEUE(wait, current); + ssize_t res; + + irqdata = counter->irqdata; + usrdata = counter->usrdata; + + if (usrdata->len + irqdata->len >= count) + goto read_pending; + + if (nonblocking) + return -EAGAIN; + + spin_lock_irq(&counter->waitq.lock); + __add_wait_queue(&counter->waitq, &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + if (usrdata->len + irqdata->len >= count) + break; + + if (signal_pending(current)) + break; + + spin_unlock_irq(&counter->waitq.lock); + schedule(); + spin_lock_irq(&counter->waitq.lock); + } + __remove_wait_queue(&counter->waitq, &wait); + __set_current_state(TASK_RUNNING); + spin_unlock_irq(&counter->waitq.lock); + + if (usrdata->len + irqdata->len < count) + return -ERESTARTSYS; +read_pending: + mutex_lock(&counter->mutex); + + /* Drain pending data first: */ + res = perf_copy_usrdata(usrdata, buf, count); + if (res < 0 || res == count) + goto out; + + /* Switch irq buffer: */ + usrdata = perf_switch_irq_data(counter); + if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + if (!res) + res = -EFAULT; + } else { + res = count; + } +out: + mutex_unlock(&counter->mutex); + + return res; +} + +static ssize_t +perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) +{ + struct perf_counter *counter = file->private_data; + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + return perf_read_hw(counter, buf, count); + + case PERF_RECORD_IRQ: + case PERF_RECORD_GROUP: + return perf_read_irq_data(counter, buf, count, + file->f_flags & O_NONBLOCK); + } + return -EINVAL; +} + +static unsigned int perf_poll(struct file *file, poll_table *wait) +{ + struct perf_counter *counter = file->private_data; + unsigned int events = 0; + unsigned long flags; + + poll_wait(file, &counter->waitq, wait); + + spin_lock_irqsave(&counter->waitq.lock, flags); + if (counter->usrdata->len || counter->irqdata->len) + events |= POLLIN; + spin_unlock_irqrestore(&counter->waitq.lock, flags); + + return events; +} + +static const struct file_operations perf_fops = { + .release = perf_release, + .read = perf_read, + .poll = perf_poll, +}; + +/* + * Allocate and initialize a counter structure + */ +static struct perf_counter * +perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +{ + struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + + if (!counter) + return NULL; + + mutex_init(&counter->mutex); + INIT_LIST_HEAD(&counter->list); + init_waitqueue_head(&counter->waitq); + + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->record_type = record_type; + counter->__irq_period = hw_event_period; + counter->wakeup_pending = 0; + + return counter; +} + +/** + * sys_perf_task_open - open a performance counter associate it to a task + * @hw_event_type: event type for monitoring/sampling... + * @pid: target pid + */ +asmlinkage int +sys_perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu) +{ + struct perf_counter_context *ctx; + struct perf_counter *counter; + int ret; + + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ret = -ENOMEM; + counter = perf_counter_alloc(hw_event_period, cpu, record_type); + if (!counter) + goto err_put_context; + + ret = hw_perf_counter_init(counter, hw_event_type); + if (ret) + goto err_free_put_context; + + perf_install_in_context(ctx, counter, cpu); + + ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); + if (ret < 0) + goto err_remove_free_put_context; + + return ret; + +err_remove_free_put_context: + mutex_lock(&counter->mutex); + perf_remove_from_context(counter); + mutex_unlock(&counter->mutex); + +err_free_put_context: + kfree(counter); + +err_put_context: + put_context(ctx); + + return ret; +} + +static void __cpuinit perf_init_cpu(int cpu) +{ + struct perf_cpu_context *ctx; + + ctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_init(&ctx->ctx.lock); + INIT_LIST_HEAD(&ctx->ctx.counters); + + mutex_lock(&perf_resource_mutex); + ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void __perf_exit_cpu(void *info) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = &cpuctx->ctx; + struct perf_counter *counter, *tmp; + + list_for_each_entry_safe(counter, tmp, &ctx->counters, list) + __perf_remove_from_context(counter); + +} +static void perf_exit_cpu(int cpu) +{ + smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); +} +#else +static inline void perf_exit_cpu(int cpu) { } +#endif + +static int __cpuinit +perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) +{ + unsigned int cpu = (long)hcpu; + + switch (action) { + + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + perf_init_cpu(cpu); + break; + + case CPU_DOWN_PREPARE: + case CPU_DOWN_PREPARE_FROZEN: + perf_exit_cpu(cpu); + break; + + default: + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block __cpuinitdata perf_cpu_nb = { + .notifier_call = perf_cpu_notify, +}; + +static int __init perf_counter_init(void) +{ + perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, + (void *)(long)smp_processor_id()); + register_cpu_notifier(&perf_cpu_nb); + + return 0; +} +early_initcall(perf_counter_init); + +static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_reserved_percpu); +} + +static ssize_t +perf_set_reserve_percpu(struct sysdev_class *class, + const char *buf, + size_t count) +{ + struct perf_cpu_context *cpuctx; + unsigned long val; + int err, cpu, mpt; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > perf_max_counters) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_reserved_percpu = val; + for_each_online_cpu(cpu) { + cpuctx = &per_cpu(perf_cpu_context, cpu); + spin_lock_irq(&cpuctx->ctx.lock); + mpt = min(perf_max_counters - cpuctx->ctx.nr_counters, + perf_max_counters - perf_reserved_percpu); + cpuctx->max_pertask = mpt; + spin_unlock_irq(&cpuctx->ctx.lock); + } + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static ssize_t perf_show_overcommit(struct sysdev_class *class, char *buf) +{ + return sprintf(buf, "%d\n", perf_overcommit); +} + +static ssize_t +perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) +{ + unsigned long val; + int err; + + err = strict_strtoul(buf, 10, &val); + if (err) + return err; + if (val > 1) + return -EINVAL; + + mutex_lock(&perf_resource_mutex); + perf_overcommit = val; + mutex_unlock(&perf_resource_mutex); + + return count; +} + +static SYSDEV_CLASS_ATTR( + reserve_percpu, + 0644, + perf_show_reserve_percpu, + perf_set_reserve_percpu + ); + +static SYSDEV_CLASS_ATTR( + overcommit, + 0644, + perf_show_overcommit, + perf_set_overcommit + ); + +static struct attribute *perfclass_attrs[] = { + &attr_reserve_percpu.attr, + &attr_overcommit.attr, + NULL +}; + +static struct attribute_group perfclass_attr_group = { + .attrs = perfclass_attrs, + .name = "perf_counters", +}; + +static int __init perf_counter_sysfs_init(void) +{ + return sysfs_create_group(&cpu_sysdev_class.kset.kobj, + &perfclass_attr_group); +} +device_initcall(perf_counter_sysfs_init); + diff --git a/kernel/sched.c b/kernel/sched.c index b7480fb5c3d..254d56de254 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2212,6 +2212,27 @@ static int sched_balance_self(int cpu, int flag) #endif /* CONFIG_SMP */ +/** + * task_oncpu_function_call - call a function on the cpu on which a task runs + * @p: the task to evaluate + * @func: the function to be called + * @info: the function call argument + * + * Calls the function @func when the task is currently running. This might + * be on the current CPU, which just calls the function directly + */ +void task_oncpu_function_call(struct task_struct *p, + void (*func) (void *info), void *info) +{ + int cpu; + + preempt_disable(); + cpu = task_cpu(p); + if (task_curr(p)) + smp_call_function_single(cpu, func, info, 1); + preempt_enable(); +} + /*** * try_to_wake_up - wake up a thread * @p: the to-be-woken-up thread @@ -2534,6 +2555,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); + perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -2574,6 +2596,7 @@ static void finish_task_switch(struct rq *rq, struct task_struct *prev) */ prev_state = prev->state; finish_arch_switch(prev); + perf_counter_task_sched_in(current, cpu_of(rq)); finish_lock_switch(rq, prev); #ifdef CONFIG_SMP if (current->sched_class->post_schedule) @@ -4296,6 +4319,7 @@ void scheduler_tick(void) rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif + perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index e14a2328170..4be8bbc7577 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -174,3 +174,6 @@ cond_syscall(compat_sys_timerfd_settime); cond_syscall(compat_sys_timerfd_gettime); cond_syscall(sys_eventfd); cond_syscall(sys_eventfd2); + +/* performance counters: */ +cond_syscall(sys_perf_counter_open); From e7bc62b6b3aeaa8849f8383e0cfb7ca6c003adc6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 4 Dec 2008 20:13:45 +0100 Subject: [PATCH 0002/2061] performance counters: documentation Add more documentation about performance counters. Signed-off-by: Ingo Molnar --- Documentation/perf-counters.txt | 104 ++++++++++++++++++++++++++++++++ 1 file changed, 104 insertions(+) create mode 100644 Documentation/perf-counters.txt diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt new file mode 100644 index 00000000000..19033a0bb52 --- /dev/null +++ b/Documentation/perf-counters.txt @@ -0,0 +1,104 @@ + +Performance Counters for Linux +------------------------------ + +Performance counters are special hardware registers available on most modern +CPUs. These registers count the number of certain types of hw events: such +as instructions executed, cachemisses suffered, or branches mis-predicted - +without slowing down the kernel or applications. These registers can also +trigger interrupts when a threshold number of events have passed - and can +thus be used to profile the code that runs on that CPU. + +The Linux Performance Counter subsystem provides an abstraction of these +hardware capabilities. It provides per task and per CPU counters, and +it provides event capabilities on top of those. + +Performance counters are accessed via special file descriptors. +There's one file descriptor per virtual counter used. + +The special file descriptor is opened via the perf_counter_open() +system call: + + int + perf_counter_open(u32 hw_event_type, + u32 hw_event_period, + u32 record_type, + pid_t pid, + int cpu); + +The syscall returns the new fd. The fd can be used via the normal +VFS system calls: read() can be used to read the counter, fcntl() +can be used to set the blocking mode, etc. + +Multiple counters can be kept open at a time, and the counters +can be poll()ed. + +When creating a new counter fd, 'hw_event_type' is one of: + + enum hw_event_types { + PERF_COUNT_CYCLES, + PERF_COUNT_INSTRUCTIONS, + PERF_COUNT_CACHE_REFERENCES, + PERF_COUNT_CACHE_MISSES, + PERF_COUNT_BRANCH_INSTRUCTIONS, + PERF_COUNT_BRANCH_MISSES, + }; + +These are standardized types of events that work uniformly on all CPUs +that implements Performance Counters support under Linux. If a CPU is +not able to count branch-misses, then the system call will return +-EINVAL. + +[ Note: more hw_event_types are supported as well, but they are CPU + specific and are enumerated via /sys on a per CPU basis. Raw hw event + types can be passed in as negative numbers. For example, to count + "External bus cycles while bus lock signal asserted" events on Intel + Core CPUs, pass in a -0x4064 event type value. ] + +The parameter 'hw_event_period' is the number of events before waking up +a read() that is blocked on a counter fd. Zero value means a non-blocking +counter. + +'record_type' is the type of data that a read() will provide for the +counter, and it can be one of: + + enum perf_record_type { + PERF_RECORD_SIMPLE, + PERF_RECORD_IRQ, + }; + +a "simple" counter is one that counts hardware events and allows +them to be read out into a u64 count value. (read() returns 8 on +a successful read of a simple counter.) + +An "irq" counter is one that will also provide an IRQ context information: +the IP of the interrupted context. In this case read() will return +the 8-byte counter value, plus the Instruction Pointer address of the +interrupted context. + +The 'pid' parameter allows the counter to be specific to a task: + + pid == 0: if the pid parameter is zero, the counter is attached to the + current task. + + pid > 0: the counter is attached to a specific task (if the current task + has sufficient privilege to do so) + + pid < 0: all tasks are counted (per cpu counters) + +The 'cpu' parameter allows a counter to be made specific to a full +CPU: + + cpu >= 0: the counter is restricted to a specific CPU + cpu == -1: the counter counts on all CPUs + +Note: the combination of 'pid == -1' and 'cpu == -1' is not valid. + +A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts +events of that task and 'follows' that task to whatever CPU the task +gets schedule to. Per task counters can be created by any user, for +their own tasks. + +A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts +all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. + From 241771ef016b5c0c83cd7a4372a74321c973c1e6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 3 Dec 2008 10:39:53 +0100 Subject: [PATCH 0003/2061] performance counters: x86 support Implement performance counters for x86 Intel CPUs. It's simplified right now: the PERFMON CPU feature is assumed, which is available in Core2 and later Intel CPUs. The design is flexible to be extended to more CPU types as well. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 1 + arch/x86/ia32/ia32entry.S | 3 +- arch/x86/include/asm/hardirq_32.h | 1 + arch/x86/include/asm/hw_irq.h | 2 + arch/x86/include/asm/intel_arch_perfmon.h | 34 +- arch/x86/include/asm/irq_vectors.h | 5 + .../x86/include/asm/mach-default/entry_arch.h | 5 + arch/x86/include/asm/pda.h | 1 + arch/x86/include/asm/thread_info.h | 4 +- arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 3 +- arch/x86/kernel/apic.c | 2 + arch/x86/kernel/cpu/Makefile | 12 +- arch/x86/kernel/cpu/common.c | 2 + arch/x86/kernel/cpu/perf_counter.c | 571 ++++++++++++++++++ arch/x86/kernel/entry_64.S | 5 + arch/x86/kernel/irq.c | 5 + arch/x86/kernel/irqinit_32.c | 3 + arch/x86/kernel/irqinit_64.c | 5 + arch/x86/kernel/signal.c | 7 +- arch/x86/kernel/syscall_table_32.S | 1 + 21 files changed, 652 insertions(+), 21 deletions(-) create mode 100644 arch/x86/kernel/cpu/perf_counter.c diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index d4d4cb7629e..f2fdc186724 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,6 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) + select HAVE_PERF_COUNTERS config X86_IO_APIC def_bool y diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 256b00b6189..3c14ed07dc4 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -823,7 +823,8 @@ ia32_sys_call_table: .quad compat_sys_signalfd4 .quad sys_eventfd2 .quad sys_epoll_create1 - .quad sys_dup3 /* 330 */ + .quad sys_dup3 /* 330 */ .quad sys_pipe2 .quad sys_inotify_init1 + .quad sys_perf_counter_open ia32_syscall_end: diff --git a/arch/x86/include/asm/hardirq_32.h b/arch/x86/include/asm/hardirq_32.h index 5ca135e72f2..b3e475dc933 100644 --- a/arch/x86/include/asm/hardirq_32.h +++ b/arch/x86/include/asm/hardirq_32.h @@ -9,6 +9,7 @@ typedef struct { unsigned long idle_timestamp; unsigned int __nmi_count; /* arch dependent */ unsigned int apic_timer_irqs; /* arch dependent */ + unsigned int apic_perf_irqs; /* arch dependent */ unsigned int irq0_irqs; unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index 8de644b6b95..aa93e53b85e 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,8 @@ /* Interrupt handlers registered during init_IRQ */ extern void apic_timer_interrupt(void); extern void error_interrupt(void); +extern void perf_counter_interrupt(void); + extern void spurious_interrupt(void); extern void thermal_interrupt(void); extern void reschedule_interrupt(void); diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/intel_arch_perfmon.h index fa0fd068bc2..71598a9eab6 100644 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ b/arch/x86/include/asm/intel_arch_perfmon.h @@ -1,22 +1,24 @@ #ifndef _ASM_X86_INTEL_ARCH_PERFMON_H #define _ASM_X86_INTEL_ARCH_PERFMON_H -#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 -#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 +#define MSR_ARCH_PERFMON_PERFCTR0 0xc1 +#define MSR_ARCH_PERFMON_PERFCTR1 0xc2 -#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 -#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 +#define MSR_ARCH_PERFMON_EVENTSEL0 0x186 +#define MSR_ARCH_PERFMON_EVENTSEL1 0x187 -#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) -#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) -#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) -#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +#define ARCH_PERFMON_EVENTSEL0_ENABLE (1 << 22) +#define ARCH_PERFMON_EVENTSEL_INT (1 << 20) +#define ARCH_PERFMON_EVENTSEL_OS (1 << 17) +#define ARCH_PERFMON_EVENTSEL_USR (1 << 16) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL (0x3c) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) -#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX (0) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) +#define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_PRESENT \ - (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + (1 << (ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX)) + +#define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 union cpuid10_eax { struct { @@ -28,4 +30,12 @@ union cpuid10_eax { unsigned int full; }; +#ifdef CONFIG_PERF_COUNTERS +extern void init_hw_perf_counters(void); +extern void perf_counters_lapic_init(int nmi); +#else +static inline void init_hw_perf_counters(void) { } +static inline void perf_counters_lapic_init(int nmi) { } +#endif + #endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 0005adb0f94..b8d277f1252 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -86,6 +86,11 @@ */ #define LOCAL_TIMER_VECTOR 0xef +/* + * Performance monitoring interrupt vector: + */ +#define LOCAL_PERF_VECTOR 0xee + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/mach-default/entry_arch.h b/arch/x86/include/asm/mach-default/entry_arch.h index 6b1add8e31d..ad31e5d90e9 100644 --- a/arch/x86/include/asm/mach-default/entry_arch.h +++ b/arch/x86/include/asm/mach-default/entry_arch.h @@ -25,10 +25,15 @@ BUILD_INTERRUPT(irq_move_cleanup_interrupt,IRQ_MOVE_CLEANUP_VECTOR) * a much simpler SMP time architecture: */ #ifdef CONFIG_X86_LOCAL_APIC + BUILD_INTERRUPT(apic_timer_interrupt,LOCAL_TIMER_VECTOR) BUILD_INTERRUPT(error_interrupt,ERROR_APIC_VECTOR) BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) +#ifdef CONFIG_PERF_COUNTERS +BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +#endif + #ifdef CONFIG_X86_MCE_P4THERMAL BUILD_INTERRUPT(thermal_interrupt,THERMAL_APIC_VECTOR) #endif diff --git a/arch/x86/include/asm/pda.h b/arch/x86/include/asm/pda.h index 2fbfff88df3..90a8d9d4206 100644 --- a/arch/x86/include/asm/pda.h +++ b/arch/x86/include/asm/pda.h @@ -30,6 +30,7 @@ struct x8664_pda { short isidle; struct mm_struct *active_mm; unsigned apic_timer_irqs; + unsigned apic_perf_irqs; unsigned irq0_irqs; unsigned irq_resched_count; unsigned irq_call_count; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index e44d379faad..810bf266d13 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -80,6 +80,7 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ +#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -103,6 +104,7 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) +#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -135,7 +137,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index f2bba78430a..7e47658b0a6 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -338,6 +338,7 @@ #define __NR_dup3 330 #define __NR_pipe2 331 #define __NR_inotify_init1 332 +#define __NR_perf_counter_open 333 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index d2e415e6666..53025feaf88 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -653,7 +653,8 @@ __SYSCALL(__NR_dup3, sys_dup3) __SYSCALL(__NR_pipe2, sys_pipe2) #define __NR_inotify_init1 294 __SYSCALL(__NR_inotify_init1, sys_inotify_init1) - +#define __NR_perf_counter_open 295 +__SYSCALL(__NR_perf_counter_open, sys_perf_counter_open) #ifndef __NO_STUBS #define __ARCH_WANT_OLD_READDIR diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 16f94879b52..8ab8c185867 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,6 +31,7 @@ #include #include +#include #include #include #include @@ -1147,6 +1148,7 @@ void __cpuinit setup_local_APIC(void) apic_write(APIC_ESR, 0); } #endif + perf_counters_lapic_init(0); preempt_disable(); diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile index 82ec6075c05..89e53361fe2 100644 --- a/arch/x86/kernel/cpu/Makefile +++ b/arch/x86/kernel/cpu/Makefile @@ -1,5 +1,5 @@ # -# Makefile for x86-compatible CPU details and quirks +# Makefile for x86-compatible CPU details, features and quirks # obj-y := intel_cacheinfo.o addon_cpuid_features.o @@ -16,11 +16,13 @@ obj-$(CONFIG_CPU_SUP_CENTAUR_64) += centaur_64.o obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_X86_MCE) += mcheck/ -obj-$(CONFIG_MTRR) += mtrr/ -obj-$(CONFIG_CPU_FREQ) += cpufreq/ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o -obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o +obj-$(CONFIG_X86_MCE) += mcheck/ +obj-$(CONFIG_MTRR) += mtrr/ +obj-$(CONFIG_CPU_FREQ) += cpufreq/ + +obj-$(CONFIG_X86_LOCAL_APIC) += perfctr-watchdog.o quiet_cmd_mkcapflags = MKCAP $@ cmd_mkcapflags = $(PERL) $(srctree)/$(src)/mkcapflags.pl $< $@ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b9c9ea0217a..4461011db47 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -750,6 +751,7 @@ void __init identify_boot_cpu(void) #else vgetcpu_set_mode(); #endif + init_hw_perf_counters(); } void __cpuinit identify_secondary_cpu(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c new file mode 100644 index 00000000000..82440cbed0e --- /dev/null +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -0,0 +1,571 @@ +/* + * Performance counter x86 architecture code + * + * Copyright(C) 2008 Thomas Gleixner + * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * + * For licencing details see kernel-base/COPYING + */ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +static bool perf_counters_initialized __read_mostly; + +/* + * Number of (generic) HW counters: + */ +static int nr_hw_counters __read_mostly; +static u32 perf_counter_mask __read_mostly; + +/* No support for fixed function counters yet */ + +#define MAX_HW_COUNTERS 8 + +struct cpu_hw_counters { + struct perf_counter *counters[MAX_HW_COUNTERS]; + unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + int enable_all; +}; + +/* + * Intel PerfMon v3. Used on Core2 and later. + */ +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +const int intel_perfmon_event_map[] = +{ + [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, + [PERF_COUNT_CACHE_MISSES] = 0x412e, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); + +/* + * Setup the hardware configuration for a given hw_event_type + */ +int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +{ + struct hw_perf_counter *hwc = &counter->hw; + + if (unlikely(!perf_counters_initialized)) + return -EINVAL; + + /* + * Count user events, and generate PMC IRQs: + * (keep 'enabled' bit clear for now) + */ + hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + + /* + * If privileged enough, count OS events too, and allow + * NMI events as well: + */ + hwc->nmi = 0; + if (capable(CAP_SYS_ADMIN)) { + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + if (hw_event_type & PERF_COUNT_NMI) + hwc->nmi = 1; + } + + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + + hwc->irq_period = counter->__irq_period; + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + if (!hwc->irq_period) + hwc->irq_period = 0x7FFFFFFF; + + hwc->next_count = -((s32) hwc->irq_period); + + /* + * Negative event types mean raw encoded event+umask values: + */ + if (hw_event_type < 0) { + counter->hw_event_type = -hw_event_type; + counter->hw_event_type &= ~PERF_COUNT_NMI; + } else { + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type >= max_intel_perfmon_events) + return -EINVAL; + /* + * The generic map: + */ + counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + } + hwc->config |= counter->hw_event_type; + counter->wakeup_pending = 0; + + return 0; +} + +static void __hw_perf_enable_all(void) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); +} + +void hw_perf_enable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 1; + __hw_perf_enable_all(); +} + +void hw_perf_disable_all(void) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + cpuc->enable_all = 0; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); +} + +static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + + wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +void hw_perf_counter_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + /* Try to get the previous counter again */ + if (test_and_set_bit(idx, cpuc->used)) { + idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + + perf_counters_lapic_init(hwc->nmi); + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + cpuc->counters[idx] = counter; + counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + __hw_perf_counter_enable(hwc, idx); +} + +#ifdef CONFIG_X86_64 +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} +#else +/* + * Todo: add proper atomic64_t support to 32-bit x86: + */ +static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +static inline u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} +#endif + +static void __hw_perf_save_counter(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + s64 raw = -1; + s64 delta; + int err; + + /* + * Get the raw hw counter value: + */ + err = rdmsrl_safe(hwc->counter_base + idx, &raw); + WARN_ON_ONCE(err); + + /* + * Rebase it to zero (it started counting at -irq_period), + * to see the delta since ->prev_count: + */ + delta = (s64)hwc->irq_period + (s64)(s32)raw; + + atomic64_counter_set(counter, hwc->prev_count + delta); + + /* + * Adjust the ->prev_count offset - if we went beyond + * irq_period of units, then we got an IRQ and the counter + * was set back to -irq_period: + */ + while (delta >= (s64)hwc->irq_period) { + hwc->prev_count += hwc->irq_period; + delta -= (s64)hwc->irq_period; + } + + /* + * Calculate the next raw counter value we'll write into + * the counter at the next sched-in time: + */ + delta -= (s64)hwc->irq_period; + + hwc->next_count = (s32)delta; +} + +void perf_counter_print_debug(void) +{ + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + int cpu, err, idx; + + local_irq_disable(); + + cpu = smp_processor_id(); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); + WARN_ON_ONCE(err); + + printk(KERN_INFO "\n"); + printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); + printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); + printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + + for (idx = 0; idx < nr_hw_counters; idx++) { + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); + + err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); + WARN_ON_ONCE(err); + + next_count = per_cpu(prev_next_count[idx], cpu); + + printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + cpu, idx, pmc_ctrl); + printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + cpu, idx, pmc_count); + printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", + cpu, idx, next_count); + } + local_irq_enable(); +} + +void hw_perf_counter_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + unsigned int idx = hwc->idx; + + counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsr(hwc->config_base + idx, hwc->config, 0); + + clear_bit(idx, cpuc->used); + cpuc->counters[idx] = NULL; + __hw_perf_save_counter(counter, hwc, idx); +} + +void hw_perf_counter_read(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + unsigned long addr = hwc->counter_base + hwc->idx; + s64 offs, val = -1LL; + s32 val32; + int err; + + /* Careful: NMI might modify the counter offset */ + do { + offs = hwc->prev_count; + err = rdmsrl_safe(addr, &val); + WARN_ON_ONCE(err); + } while (offs != hwc->prev_count); + + val32 = (s32) val; + val = (s64)hwc->irq_period + (s64)val32; + atomic64_counter_set(counter, hwc->prev_count + val); +} + +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_save_and_restart(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int idx = hwc->idx; + + wrmsr(hwc->config_base + idx, + hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + + if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_enable(hwc, idx); + } +} + +static void +perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +{ + struct perf_counter_context *ctx = leader->ctx; + struct perf_counter *counter; + int bit; + + list_for_each_entry(counter, &ctx->counters, list) { + if (counter->record_type != PERF_RECORD_SIMPLE || + counter == leader) + continue; + + if (counter->active) { + /* + * When counter was not in the overflow mask, we have to + * read it from hardware. We read it as well, when it + * has not been read yet and clear the bit in the + * status mask. + */ + bit = counter->hw.idx; + if (!test_bit(bit, (unsigned long *) overflown) || + test_bit(bit, (unsigned long *) status)) { + clear_bit(bit, (unsigned long *) status); + perf_save_and_restart(counter); + } + } + perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, atomic64_counter_read(counter)); + } +} + +/* + * This handler is triggered by the local APIC, so the APIC IRQ handling + * rules apply: + */ +static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +{ + int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) { + ack_APIC_irq(); + return; + } + + /* Disable counters globally */ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + ack_APIC_irq(); + + cpuc = &per_cpu(cpu_hw_counters, cpu); + +again: + ack = status; + for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + clear_bit(bit, (unsigned long *) &status); + if (!counter) + continue; + + perf_save_and_restart(counter); + + switch (counter->record_type) { + case PERF_RECORD_SIMPLE: + continue; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + break; + case PERF_RECORD_GROUP: + perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + atomic64_counter_read(counter)); + perf_handle_group(counter, &status, &ack); + break; + } + /* + * From NMI context we cannot call into the scheduler to + * do a task wakeup - but we mark these counters as + * wakeup_pending and initate a wakeup callback: + */ + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else { + wake_up(&counter->waitq); + } + } + + wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + + /* + * Repeat if there is more work to be done: + */ + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (status) + goto again; + + /* + * Do not reenable when global enable is off: + */ + if (cpuc->enable_all) + __hw_perf_enable_all(); +} + +void smp_perf_counter_interrupt(struct pt_regs *regs) +{ + irq_enter(); +#ifdef CONFIG_X86_64 + add_pda(apic_perf_irqs, 1); +#else + per_cpu(irq_stat, smp_processor_id()).apic_perf_irqs++; +#endif + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + __smp_perf_counter_interrupt(regs, 0); + + irq_exit(); +} + +/* + * This handler is triggered by NMI contexts: + */ +void perf_counter_notify(struct pt_regs *regs) +{ + struct cpu_hw_counters *cpuc; + unsigned long flags; + int bit, cpu; + + local_irq_save(flags); + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); + + for_each_bit(bit, cpuc->used, nr_hw_counters) { + struct perf_counter *counter = cpuc->counters[bit]; + + if (!counter) + continue; + + if (counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } + + local_irq_restore(flags); +} + +void __cpuinit perf_counters_lapic_init(int nmi) +{ + u32 apic_val; + + if (!perf_counters_initialized) + return; + /* + * Enable the performance counter vector in the APIC LVT: + */ + apic_val = apic_read(APIC_LVTERR); + + apic_write(APIC_LVTERR, apic_val | APIC_LVT_MASKED); + if (nmi) + apic_write(APIC_LVTPC, APIC_DM_NMI); + else + apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + apic_write(APIC_LVTERR, apic_val); +} + +static int __kprobes +perf_counter_nmi_handler(struct notifier_block *self, + unsigned long cmd, void *__args) +{ + struct die_args *args = __args; + struct pt_regs *regs; + + if (likely(cmd != DIE_NMI_IPI)) + return NOTIFY_DONE; + + regs = args->regs; + + apic_write(APIC_LVTPC, APIC_DM_NMI); + __smp_perf_counter_interrupt(regs, 1); + + return NOTIFY_STOP; +} + +static __read_mostly struct notifier_block perf_counter_nmi_notifier = { + .notifier_call = perf_counter_nmi_handler +}; + +void __init init_hw_perf_counters(void) +{ + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + /* + * Check whether the Architectural PerfMon supports + * Branch Misses Retired Event or not. + */ + cpuid(10, &(eax.full), &ebx, &unused, &unused); + if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) + return; + + printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); + + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + nr_hw_counters = eax.split.num_counters; + if (nr_hw_counters > MAX_HW_COUNTERS) { + nr_hw_counters = MAX_HW_COUNTERS; + WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", + nr_hw_counters, MAX_HW_COUNTERS); + } + perf_counter_mask = (1 << nr_hw_counters) - 1; + perf_max_counters = nr_hw_counters; + + printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + + perf_counters_lapic_init(0); + register_die_notifier(&perf_counter_nmi_notifier); + + perf_counters_initialized = true; +} diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3194636a429..fc013cfde30 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -984,6 +984,11 @@ apicinterrupt ERROR_APIC_VECTOR \ apicinterrupt SPURIOUS_APIC_VECTOR \ spurious_interrupt smp_spurious_interrupt +#ifdef CONFIG_PERF_COUNTERS +apicinterrupt LOCAL_PERF_VECTOR \ + perf_counter_interrupt smp_perf_counter_interrupt +#endif + /* * Exception entry points. */ diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d1d4dc52f64..d92bc71e41a 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -56,6 +56,10 @@ static int show_other_interrupts(struct seq_file *p) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_timer_irqs); seq_printf(p, " Local timer interrupts\n"); + seq_printf(p, "CNT: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); + seq_printf(p, " Performance counter interrupts\n"); #endif #ifdef CONFIG_SMP seq_printf(p, "RES: "); @@ -160,6 +164,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) #ifdef CONFIG_X86_LOCAL_APIC sum += irq_stats(cpu)->apic_timer_irqs; + sum += irq_stats(cpu)->apic_perf_irqs; #endif #ifdef CONFIG_SMP sum += irq_stats(cpu)->irq_resched_count; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 607db63044a..6a33b5e3016 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -160,6 +160,9 @@ void __init native_init_IRQ(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); +# ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +# endif #endif #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 8670b3ce626..91d785c25ad 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -138,6 +138,11 @@ static void __init apic_intr_init(void) /* IPI vectors for APIC spurious and error interrupts */ alloc_intr_gate(SPURIOUS_APIC_VECTOR, spurious_interrupt); alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); + + /* Performance monitoring interrupt: */ +#ifdef CONFIG_PERF_COUNTERS + alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); +#endif } void __init native_init_IRQ(void) diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index b1cc6da6420..dee553c503d 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,7 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ - +#include #include #include #include @@ -891,6 +891,11 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } + if (thread_info_flags & _TIF_PERF_COUNTERS) { + clear_thread_flag(TIF_PERF_COUNTERS); + perf_counter_notify(regs); + } + #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index d44395ff34c..496726ddcea 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -332,3 +332,4 @@ ENTRY(sys_call_table) .long sys_dup3 /* 330 */ .long sys_pipe2 .long sys_inotify_init1 + .long sys_perf_counter_open From 87b9cf4623ad4e5fc009e48c020593dffd5d3793 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 14:20:16 +0100 Subject: [PATCH 0004/2061] x86, perfcounters: read out MSR_CORE_PERF_GLOBAL_STATUS with counters disabled Impact: make perfcounter NMI and IRQ sequence more robust Make __smp_perf_counter_interrupt() a bit more conservative: first disable all counters, then read out the status. Most invocations are because there are real events, so there's no performance impact. Code flow gets a bit simpler as well this way. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 82440cbed0e..615e953208e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -383,18 +383,16 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc; u64 ack, status; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - if (!status) { - ack_APIC_irq(); - return; - } - /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + if (!status) + goto out; + again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { @@ -440,7 +438,7 @@ again: rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (status) goto again; - +out: /* * Do not reenable when global enable is off: */ From 4c59e4676dc95f6f58a2cff5390b2699fa5b5549 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 8 Dec 2008 19:38:33 +0100 Subject: [PATCH 0005/2061] perfcounters: select ANON_INODES The perfcounters subsystem depends on CONFIG_ANON_INODES facilities, so make sure it's selected. Signed-off-by: Ingo Molnar --- init/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/init/Kconfig b/init/Kconfig index 78bede218f1..7d147a36e96 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -741,6 +741,7 @@ config PERF_COUNTERS bool "Kernel Performance Counters" depends on HAVE_PERF_COUNTERS default y + select ANON_INODES help Enable kernel support for performance counter hardware. From 7e2ae34749edf19e76e594b9c4b2cdde1066afc5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 11:40:46 +0100 Subject: [PATCH 0006/2061] perfcounters, x86: simplify disable/enable of counters Impact: fix spurious missed counter wakeups In the case of NMI events, close a race window that can occur if an NMI hits counter code that temporarily disables+enables a counter, and the NMI leaks into the disabled section. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 40 +++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 615e953208e..7d528ffc2d2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -136,14 +136,25 @@ void hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } +static inline void +__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +{ + wrmsr(hwc->config_base + idx, hwc->config, 0); +} + static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) { per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; wrmsr(hwc->counter_base + idx, hwc->next_count, 0); - wrmsr(hwc->config_base + idx, hwc->config, 0); +} + +static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +{ + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } void hw_perf_counter_enable(struct perf_counter *counter) @@ -161,11 +172,11 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __hw_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; - counter->hw.config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + __hw_perf_counter_set_period(hwc, idx); __hw_perf_counter_enable(hwc, idx); } @@ -286,8 +297,7 @@ void hw_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - counter->hw.config &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsr(hwc->config_base + idx, hwc->config, 0); + __hw_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -328,18 +338,24 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } } +/* + * NMI-safe enable method: + */ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; + u64 pmc_ctrl; + int err; - wrmsr(hwc->config_base + idx, - hwc->config & ~ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); + WARN_ON_ONCE(err); - if (hwc->config & ARCH_PERFMON_EVENTSEL0_ENABLE) { - __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_save_counter(counter, hwc, idx); + __hw_perf_counter_set_period(hwc, idx); + + if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) __hw_perf_counter_enable(hwc, idx); - } } static void From 1e12567678054bc1d4c944ecfad17624b3e49345 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:18:18 +0100 Subject: [PATCH 0007/2061] perfcounters, x86: clean up debug code Impact: cleanup Get rid of unused debug code. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 35 ++++++++++-------------------- 1 file changed, 12 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d528ffc2d2..919ec46679b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -214,13 +214,11 @@ static void __hw_perf_save_counter(struct perf_counter *counter, { s64 raw = -1; s64 delta; - int err; /* * Get the raw hw counter value: */ - err = rdmsrl_safe(hwc->counter_base + idx, &raw); - WARN_ON_ONCE(err); + rdmsrl(hwc->counter_base + idx, raw); /* * Rebase it to zero (it started counting at -irq_period), @@ -252,20 +250,18 @@ static void __hw_perf_save_counter(struct perf_counter *counter, void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; - int cpu, err, idx; + int cpu, idx; + + if (!nr_hw_counters) + return; local_irq_disable(); cpu = smp_processor_id(); - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_CTRL, &ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_STATUS, &status); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_CORE_PERF_GLOBAL_OVF_CTRL, &overflow); - WARN_ON_ONCE(err); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); @@ -273,11 +269,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); for (idx = 0; idx < nr_hw_counters; idx++) { - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); - - err = rdmsrl_safe(MSR_ARCH_PERFMON_PERFCTR0 + idx, &pmc_count); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); + rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); next_count = per_cpu(prev_next_count[idx], cpu); @@ -310,13 +303,11 @@ void hw_perf_counter_read(struct perf_counter *counter) unsigned long addr = hwc->counter_base + hwc->idx; s64 offs, val = -1LL; s32 val32; - int err; /* Careful: NMI might modify the counter offset */ do { offs = hwc->prev_count; - err = rdmsrl_safe(addr, &val); - WARN_ON_ONCE(err); + rdmsrl(addr, val); } while (offs != hwc->prev_count); val32 = (s32) val; @@ -346,10 +337,8 @@ static void perf_save_and_restart(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; u64 pmc_ctrl; - int err; - err = rdmsrl_safe(MSR_ARCH_PERFMON_EVENTSEL0 + idx, &pmc_ctrl); - WARN_ON_ONCE(err); + rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); __hw_perf_save_counter(counter, hwc, idx); __hw_perf_counter_set_period(hwc, idx); From 43874d238d5f208854a73c3225ca2a22833eec8b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 9 Dec 2008 12:23:59 +0100 Subject: [PATCH 0008/2061] perfcounters: consolidate global-disable codepaths Impact: cleanup Simplify global disable handling. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 23 ++++++----------------- 1 file changed, 6 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 919ec46679b..6a93d1f04d9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,7 +33,6 @@ static u32 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[MAX_HW_COUNTERS]; unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; - int enable_all; }; /* @@ -115,24 +114,13 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) return 0; } -static void __hw_perf_enable_all(void) +void hw_perf_enable_all(void) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_enable_all(void) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 1; - __hw_perf_enable_all(); -} - void hw_perf_disable_all(void) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - cpuc->enable_all = 0; wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); } @@ -385,8 +373,10 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); + u64 ack, status, saved_global; struct cpu_hw_counters *cpuc; - u64 ack, status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); @@ -445,10 +435,9 @@ again: goto again; out: /* - * Do not reenable when global enable is off: + * Restore - do not reenable when global enable is off: */ - if (cpuc->enable_all) - __hw_perf_enable_all(); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); } void smp_perf_counter_interrupt(struct pt_regs *regs) From 4ac13294e44664bb7edf4daf52edb71e7c6bbe84 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Dec 2008 21:43:39 +0100 Subject: [PATCH 0009/2061] perf counters: protect them against CSTATE transitions Impact: fix rare lost events problem There are CPUs whose performance counters misbehave on CSTATE transitions, so provide a way to just disable/enable them around deep idle methods. (hw_perf_enable_all() is cheap on x86.) Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++++++++-- drivers/acpi/processor_idle.c | 8 ++++++++ include/linux/perf_counter.h | 4 ++++ 3 files changed, 26 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6a93d1f04d9..0a7f3bea2dc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -119,10 +120,21 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_disable_all(void) +void hw_perf_restore_ctrl(u64 ctrl) { - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } +EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); + +u64 hw_perf_disable_all(void) +{ + u64 ctrl; + + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; +} +EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void __hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 5f8d746a9b8..cca804e6f1d 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,8 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -284,6 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1425,8 +1429,11 @@ static inline void acpi_idle_update_bm_rld(struct acpi_processor *pr, */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { + u64 pctrl; + /* Don't trace irqs off for idle */ stop_critical_timings(); + pctrl = hw_perf_disable_all(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1441,6 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } + hw_perf_restore_ctrl(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 22c4469abf4..5031b5614f2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void hw_perf_restore_ctrl(u64 ctrl); +extern u64 hw_perf_disable_all(void); #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -166,6 +168,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void hw_perf_restore_ctrl(u64 ctrl) { } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ From eab656ae04b9d3b83265e3db01c0d2c46b748ef7 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:26:59 +0100 Subject: [PATCH 0010/2061] perf counters: clean up 'raw' type API Impact: cleanup Introduce a separate hw_event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ include/linux/syscalls.h | 8 +++----- kernel/perf_counter.c | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5031b5614f2..daedd7d87c2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -38,6 +38,7 @@ enum hw_event_types { * If this bit is set in the type, then trigger NMI sampling: */ PERF_COUNT_NMI = (1 << 30), + PERF_COUNT_RAW = (1 << 31), }; /* @@ -49,6 +50,12 @@ enum perf_record_type { PERF_RECORD_GROUP, }; +struct perf_counter_event { + u32 hw_event_type; + u32 hw_event_period; + u64 hw_raw_ctrl; +}; + /** * struct hw_perf_counter - performance counter hardware details */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 6cce728a626..3ecd73d03da 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,6 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; +struct perf_counter_event; #include #include @@ -625,9 +626,6 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 20508f05365..96c333a5b0f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -734,26 +734,27 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) * @pid: target pid */ asmlinkage int -sys_perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu) +sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, + pid_t pid, int cpu, int masterfd) { struct perf_counter_context *ctx; + struct perf_counter_event event; struct perf_counter *counter; int ret; + if (copy_from_user(&event, uevent, sizeof(event)) != 0) + return -EFAULT; + ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(hw_event_period, cpu, record_type); + counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, hw_event_type); + ret = hw_perf_counter_init(counter, event.hw_event_type); if (ret) goto err_free_put_context; From dfa7c899b401d7dc5d85aca416aee64ac82812f2 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 8 Dec 2008 19:35:37 +0100 Subject: [PATCH 0011/2061] perf counters: expand use of counter->event Impact: change syscall, cleanup Make use of the new perf_counters event type. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 22 +++++++++++----------- include/linux/perf_counter.h | 4 +--- kernel/perf_counter.c | 10 +++++----- 3 files changed, 17 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0a7f3bea2dc..30e7ebf7827 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,9 +56,10 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) +int hw_perf_counter_init(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -83,7 +84,7 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->__irq_period; + hwc->irq_period = counter->event.hw_event_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -95,21 +96,19 @@ int hw_perf_counter_init(struct perf_counter *counter, s32 hw_event_type) hwc->next_count = -((s32) hwc->irq_period); /* - * Negative event types mean raw encoded event+umask values: + * Raw event type provide the config in the event structure */ - if (hw_event_type < 0) { - counter->hw_event_type = -hw_event_type; - counter->hw_event_type &= ~PERF_COUNT_NMI; + hw_event_type &= ~PERF_COUNT_NMI; + if (hw_event_type == PERF_COUNT_RAW) { + hwc->config |= counter->event.hw_raw_ctrl; } else { - hw_event_type &= ~PERF_COUNT_NMI; if (hw_event_type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - counter->hw_event_type = intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event_type]; } - hwc->config |= counter->hw_event_type; counter->wakeup_pending = 0; return 0; @@ -373,7 +372,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event_type); + perf_store_irq_data(leader, counter->event.hw_event_type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -418,7 +417,8 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, counter->hw_event_type); + perf_store_irq_data(counter, + counter->event.hw_event_type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index daedd7d87c2..1f0017673e7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -96,8 +96,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - u64 __irq_period; - + struct perf_counter_event event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -111,7 +110,6 @@ struct perf_counter { int oncpu; int cpu; - s32 hw_event_type; enum perf_record_type record_type; /* read() / irq related data */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 96c333a5b0f..2557c670a3b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,7 +37,7 @@ static DEFINE_MUTEX(perf_resource_mutex); * Architecture provided APIs - weak aliases: */ -int __weak hw_perf_counter_init(struct perf_counter *counter, u32 hw_event_type) +int __weak hw_perf_counter_init(struct perf_counter *counter) { return -EINVAL; } @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -722,7 +722,7 @@ perf_counter_alloc(u32 hw_event_period, int cpu, u32 record_type) counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->record_type = record_type; - counter->__irq_period = hw_event_period; + counter->event = *event; counter->wakeup_pending = 0; return counter; @@ -750,11 +750,11 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(event.hw_event_period, cpu, record_type); + counter = perf_counter_alloc(&event, cpu, record_type); if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter, event.hw_event_type); + ret = hw_perf_counter_init(counter); if (ret) goto err_free_put_context; From 9f66a3810fe0d4100972db84290f3ae4a4d77025 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 10 Dec 2008 12:33:23 +0100 Subject: [PATCH 0012/2061] perf counters: restructure the API Impact: clean up new API Thorough cleanup of the new perf counters API, we now get clean separation of the various concepts: - introduce perf_counter_hw_event to separate out the event source details - move special type flags into separate attributes: PERF_COUNT_NMI, PERF_COUNT_RAW - extend the type to u64 and reserve it fully to the architecture in the raw type case. And make use of all these changes in the core and x86 perfcounters code. Also change the syscall signature to: asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd); ( Note that group_fd is unused for now - it's reserved for the counter groups abstraction. ) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 29 +++++---- include/linux/perf_counter.h | 98 +++++++++++++++++++----------- include/linux/syscalls.h | 12 ++-- kernel/perf_counter.c | 38 +++++++----- 4 files changed, 106 insertions(+), 71 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 30e7ebf7827..ef1936a871a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -58,8 +58,8 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); */ int hw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; - u32 hw_event_type = counter->event.hw_event_type; if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -77,14 +77,14 @@ int hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 0; if (capable(CAP_SYS_ADMIN)) { hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event_type & PERF_COUNT_NMI) + if (hw_event->nmi) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = counter->event.hw_event_period; + hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, * so we install an artificial 1<<31 period regardless of @@ -93,21 +93,20 @@ int hw_perf_counter_init(struct perf_counter *counter) if (!hwc->irq_period) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -((s32) hwc->irq_period); + hwc->next_count = -(s32)hwc->irq_period; /* * Raw event type provide the config in the event structure */ - hw_event_type &= ~PERF_COUNT_NMI; - if (hw_event_type == PERF_COUNT_RAW) { - hwc->config |= counter->event.hw_raw_ctrl; + if (hw_event->raw) { + hwc->config |= hw_event->type; } else { - if (hw_event_type >= max_intel_perfmon_events) + if (hw_event->type >= max_intel_perfmon_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event_type]; + hwc->config |= intel_perfmon_event_map[hw_event->type]; } counter->wakeup_pending = 0; @@ -354,7 +353,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) int bit; list_for_each_entry(counter, &ctx->counters, list) { - if (counter->record_type != PERF_RECORD_SIMPLE || + if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || counter == leader) continue; @@ -372,7 +371,7 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->event.hw_event_type); + perf_store_irq_data(leader, counter->hw_event.type); perf_store_irq_data(leader, atomic64_counter_read(counter)); } } @@ -410,7 +409,7 @@ again: perf_save_and_restart(counter); - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: continue; case PERF_RECORD_IRQ: @@ -418,7 +417,7 @@ again: break; case PERF_RECORD_GROUP: perf_store_irq_data(counter, - counter->event.hw_event_type); + counter->hw_event.type); perf_store_irq_data(counter, atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1f0017673e7..a2b4852e2d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -24,65 +24,93 @@ struct task_struct; /* - * Generalized hardware event types, used by the hw_event_type parameter - * of the sys_perf_counter_open() syscall: + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: */ enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, /* - * If this bit is set in the type, then trigger NMI sampling: + * Common hardware events, generalized by the kernel: */ - PERF_COUNT_NMI = (1 << 30), - PERF_COUNT_RAW = (1 << 31), + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* * IRQ-notification data record type: */ -enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - PERF_RECORD_GROUP, +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -struct perf_counter_event { - u32 hw_event_type; - u32 hw_event_period; - u64 hw_raw_ctrl; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + u64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; }; +/* + * Kernel-internal data types: + */ + /** - * struct hw_perf_counter - performance counter hardware details + * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - u64 prev_count; - s32 next_count; - u64 irq_period; + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + u64 prev_count; + u64 irq_period; + s32 next_count; }; /* * Hardcoded buffer length limit for now, for IRQ-fed events: */ -#define PERF_DATA_BUFLEN 2048 +#define PERF_DATA_BUFLEN 2048 /** * struct perf_data - performance counter IRQ data sampling ... */ struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; + int len; + int rd_idx; + int overrun; + u8 data[PERF_DATA_BUFLEN]; }; /** @@ -96,7 +124,7 @@ struct perf_counter { #else atomic_t count32[2]; #endif - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; struct perf_counter_context *ctx; @@ -110,8 +138,6 @@ struct perf_counter { int oncpu; int cpu; - enum perf_record_type record_type; - /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 3ecd73d03da..a549678b7c3 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -54,7 +54,7 @@ struct compat_stat; struct compat_timeval; struct robust_list_head; struct getcpu_cache; -struct perf_counter_event; +struct perf_counter_hw_event; #include #include @@ -625,7 +625,11 @@ asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd); + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2557c670a3b..0d323ceda3a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -669,7 +669,7 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->record_type) { + switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: return perf_read_hw(counter, buf, count); @@ -707,7 +707,7 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -718,31 +718,37 @@ perf_counter_alloc(struct perf_counter_event *event, int cpu, u32 record_type) INIT_LIST_HEAD(&counter->list); init_waitqueue_head(&counter->waitq); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; - counter->cpu = cpu; - counter->record_type = record_type; - counter->event = *event; - counter->wakeup_pending = 0; + counter->irqdata = &counter->data[0]; + counter->usrdata = &counter->data[1]; + counter->cpu = cpu; + counter->hw_event = *hw_event; + counter->wakeup_pending = 0; return counter; } /** - * sys_perf_task_open - open a performance counter associate it to a task - * @hw_event_type: event type for monitoring/sampling... + * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * + * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid + * @cpu: target cpu + * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, - pid_t pid, int cpu, int masterfd) +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd) + { struct perf_counter_context *ctx; - struct perf_counter_event event; + struct perf_counter_hw_event hw_event; struct perf_counter *counter; int ret; - if (copy_from_user(&event, uevent, sizeof(event)) != 0) + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; ctx = find_get_context(pid, cpu); @@ -750,7 +756,7 @@ sys_perf_counter_open(struct perf_counter_event __user *uevent, u32 record_type, return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&event, cpu, record_type); + counter = perf_counter_alloc(&hw_event, cpu); if (!counter) goto err_put_context; From 04289bb9891882202d7e961c4c04d2376930e9f9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 08:38:42 +0100 Subject: [PATCH 0013/2061] perf counters: add support for group counters Impact: add group counters This patch adds the "counter groups" abstraction. Groups of counters behave much like normal 'single' counters, with a few semantic and behavioral extensions on top of that. A counter group is created by creating a new counter with the open() syscall's group-leader group_fd file descriptor parameter pointing to another, already existing counter. Groups of counters are scheduled in and out in one atomic group, and they are also roundrobin-scheduled atomically. Counters that are member of a group can also record events with an (atomic) extended timestamp that extends to all members of the group, if the record type is set to PERF_RECORD_GROUP. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 +-- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 282 ++++++++++++++++++++++------- 3 files changed, 236 insertions(+), 82 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ef1936a871a..54b4ad0cce6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -346,18 +346,22 @@ static void perf_save_and_restart(struct perf_counter *counter) } static void -perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) +perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { - struct perf_counter_context *ctx = leader->ctx; - struct perf_counter *counter; + struct perf_counter *counter, *group_leader = sibling->group_leader; int bit; - list_for_each_entry(counter, &ctx->counters, list) { - if (counter->hw_event.record_type != PERF_RECORD_SIMPLE || - counter == leader) - continue; + /* + * Store the counter's own timestamp first: + */ + perf_store_irq_data(sibling, sibling->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - if (counter->active) { + /* + * Then store sibling timestamps (if any): + */ + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + if (!counter->active) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it @@ -371,8 +375,8 @@ perf_handle_group(struct perf_counter *leader, u64 *status, u64 *overflown) perf_save_and_restart(counter); } } - perf_store_irq_data(leader, counter->hw_event.type); - perf_store_irq_data(leader, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, atomic64_counter_read(counter)); } } @@ -416,10 +420,6 @@ again: perf_store_irq_data(counter, instruction_pointer(regs)); break; case PERF_RECORD_GROUP: - perf_store_irq_data(counter, - counter->hw_event.type); - perf_store_irq_data(counter, - atomic64_counter_read(counter)); perf_handle_group(counter, &status, &ack); break; } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a2b4852e2d7..7af7d896546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -117,7 +117,10 @@ struct perf_data { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { - struct list_head list; + struct list_head list_entry; + struct list_head sibling_list; + struct perf_counter *group_leader; + int active; #if BITS_PER_LONG == 64 atomic64_t count; @@ -158,7 +161,8 @@ struct perf_counter_context { * Protect the list of counters: */ spinlock_t lock; - struct list_head counters; + + struct list_head counter_list; int nr_counters; int nr_active; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0d323ceda3a..fa59fe8c02d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include #include @@ -55,7 +56,7 @@ void __weak hw_perf_counter_setup(void) { } * Read the cached counter in counter safe against cross CPU / NMI * modifications. 64 bit version - no complications. */ -static inline u64 perf_read_counter_safe(struct perf_counter *counter) +static inline u64 perf_counter_read_safe(struct perf_counter *counter) { return (u64) atomic64_read(&counter->count); } @@ -66,7 +67,7 @@ static inline u64 perf_read_counter_safe(struct perf_counter *counter) * Read the cached counter in counter safe against cross CPU / NMI * modifications. 32 bit version. */ -static u64 perf_read_counter_safe(struct perf_counter *counter) +static u64 perf_counter_read_safe(struct perf_counter *counter) { u32 cntl, cnth; @@ -83,13 +84,55 @@ static u64 perf_read_counter_safe(struct perf_counter *counter) #endif +static void +list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *group_leader = counter->group_leader; + + /* + * Depending on whether it is a standalone or sibling counter, + * add it straight to the context's counter list, or to the group + * leader's sibling list: + */ + if (counter->group_leader == counter) + list_add_tail(&counter->list_entry, &ctx->counter_list); + else + list_add_tail(&counter->list_entry, &group_leader->sibling_list); +} + +static void +list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) +{ + struct perf_counter *sibling, *tmp; + + list_del_init(&counter->list_entry); + + if (list_empty(&counter->sibling_list)) + return; + + /* + * If this was a group counter with sibling counters then + * upgrade the siblings to singleton counters by adding them + * to the context list directly: + */ + list_for_each_entry_safe(sibling, tmp, + &counter->sibling_list, list_entry) { + + list_del_init(&sibling->list_entry); + list_add_tail(&sibling->list_entry, &ctx->counter_list); + WARN_ON_ONCE(!sibling->group_leader); + WARN_ON_ONCE(sibling->group_leader == sibling); + sibling->group_leader = sibling; + } +} + /* * Cross CPU call to remove a performance counter * * We disable the counter on the hardware level first. After that we * remove it from the context list. */ -static void __perf_remove_from_context(void *info) +static void __perf_counter_remove_from_context(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; @@ -119,7 +162,7 @@ static void __perf_remove_from_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_del_init(&counter->list); + list_del_counter(counter, ctx); hw_perf_enable_all(); if (!ctx->task) { @@ -144,7 +187,7 @@ static void __perf_remove_from_context(void *info) * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. */ -static void perf_remove_from_context(struct perf_counter *counter) +static void perf_counter_remove_from_context(struct perf_counter *counter) { struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; @@ -155,32 +198,32 @@ static void perf_remove_from_context(struct perf_counter *counter) * the removal is always sucessful. */ smp_call_function_single(counter->cpu, - __perf_remove_from_context, + __perf_counter_remove_from_context, counter, 1); return; } retry: - task_oncpu_function_call(task, __perf_remove_from_context, + task_oncpu_function_call(task, __perf_counter_remove_from_context, counter); spin_lock_irq(&ctx->lock); /* * If the context is active we need to retry the smp call. */ - if (ctx->nr_active && !list_empty(&counter->list)) { + if (ctx->nr_active && !list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } /* * The lock prevents that this context is scheduled in so we - * can remove the counter safely, if it the call above did not + * can remove the counter safely, if the call above did not * succeed. */ - if (!list_empty(&counter->list)) { + if (!list_empty(&counter->list_entry)) { ctx->nr_counters--; - list_del_init(&counter->list); + list_del_counter(counter, ctx); counter->task = NULL; } spin_unlock_irq(&ctx->lock); @@ -211,7 +254,7 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ hw_perf_disable_all(); - list_add_tail(&counter->list, &ctx->counters); + list_add_counter(counter, ctx); hw_perf_enable_all(); ctx->nr_counters++; @@ -268,7 +311,7 @@ retry: * If the context is active and the counter has not been added * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list)) { + if (ctx->nr_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -278,13 +321,45 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list)) { - list_add_tail(&counter->list, &ctx->counters); + if (list_empty(&counter->list_entry)) { + list_add_counter(counter, ctx); ctx->nr_counters++; } spin_unlock_irq(&ctx->lock); } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (!counter->active) + return; + + hw_perf_counter_disable(counter); + counter->active = 0; + counter->oncpu = -1; + + cpuctx->active_oncpu--; + ctx->nr_active--; +} + +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -306,21 +381,48 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { - if (!ctx->nr_active) - break; - if (counter->active) { - hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; - ctx->nr_active--; - cpuctx->active_oncpu--; - } + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); } spin_unlock(&ctx->lock); cpuctx->task_ctx = NULL; } +static void +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (!counter->active) + return; + + hw_perf_counter_enable(counter); + counter->active = 1; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + + cpuctx->active_oncpu++; + ctx->nr_active++; +} + +static void +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter; + + counter_sched_in(group_counter, cpuctx, ctx, cpu); + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_in(counter, cpuctx, ctx, cpu); +} + /* * Called from scheduler to add the counters of the current task * with interrupts disabled. @@ -342,19 +444,21 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) return; spin_lock(&ctx->lock); - list_for_each_entry(counter, &ctx->counters, list) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (ctx->nr_active == cpuctx->max_pertask) break; + + /* + * Listen to the 'cpu' scheduling filter constraint + * of counters: + */ if (counter->cpu != -1 && counter->cpu != cpu) continue; - hw_perf_counter_enable(counter); - counter->active = 1; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; + group_sched_in(counter, cpuctx, ctx, cpu); } spin_unlock(&ctx->lock); + cpuctx->task_ctx = ctx; } @@ -371,12 +475,12 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) spin_lock(&ctx->lock); /* - * Rotate the first entry last: + * Rotate the first entry last (works just fine for group counters too): */ hw_perf_disable_all(); - list_for_each_entry(counter, &ctx->counters, list) { - list_del(&counter->list); - list_add_tail(&counter->list, &ctx->counters); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + list_del(&counter->list_entry); + list_add_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_enable_all(); @@ -386,17 +490,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->nr_counters = 0; + ctx->task = task; +} /* * Initialize the perf_counter context in task_struct */ void perf_counter_init_task(struct task_struct *task) { - struct perf_counter_context *ctx = &task->perf_counter_ctx; - - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counters); - ctx->nr_counters = 0; - ctx->task = task; + __perf_counter_init_context(&task->perf_counter_ctx, task); } /* @@ -407,7 +518,7 @@ static void __hw_perf_counter_read(void *info) hw_perf_counter_read(info); } -static u64 perf_read_counter(struct perf_counter *counter) +static u64 perf_counter_read(struct perf_counter *counter) { /* * If counter is enabled and currently active on a CPU, update the @@ -418,7 +529,7 @@ static u64 perf_read_counter(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_read_counter_safe(counter); + return perf_counter_read_safe(counter); } /* @@ -555,7 +666,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); @@ -577,7 +688,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return -EINVAL; mutex_lock(&counter->mutex); - cntval = perf_read_counter(counter); + cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); @@ -707,15 +818,25 @@ static const struct file_operations perf_fops = { * Allocate and initialize a counter structure */ static struct perf_counter * -perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) +perf_counter_alloc(struct perf_counter_hw_event *hw_event, + int cpu, + struct perf_counter *group_leader) { struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; + /* + * Single counters are their own group leaders, with an + * empty sibling list: + */ + if (!group_leader) + group_leader = counter; + mutex_init(&counter->mutex); - INIT_LIST_HEAD(&counter->list); + INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); counter->irqdata = &counter->data[0]; @@ -723,6 +844,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu) counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; + counter->group_leader = group_leader; return counter; } @@ -743,20 +865,45 @@ asmlinkage int sys_perf_counter_open( int group_fd) { - struct perf_counter_context *ctx; + struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; - struct perf_counter *counter; + struct perf_counter_context *ctx; + struct file *group_file = NULL; + int fput_needed = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; + /* + * Look up the group leader: + */ + group_leader = NULL; + if (group_fd != -1) { + ret = -EINVAL; + group_file = fget_light(group_fd, &fput_needed); + if (!group_file) + goto out_fput; + if (group_file->f_op != &perf_fops) + goto out_fput; + + group_leader = group_file->private_data; + /* + * Do not allow a recursive hierarchy: + */ + if (group_leader->group_leader) + goto out_fput; + } + + /* + * Get the target context (task or percpu): + */ ctx = find_get_context(pid, cpu); if (IS_ERR(ctx)) return PTR_ERR(ctx); ret = -ENOMEM; - counter = perf_counter_alloc(&hw_event, cpu); + counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; @@ -770,11 +917,14 @@ asmlinkage int sys_perf_counter_open( if (ret < 0) goto err_remove_free_put_context; +out_fput: + fput_light(group_file, fput_needed); + return ret; err_remove_free_put_context: mutex_lock(&counter->mutex); - perf_remove_from_context(counter); + perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); err_free_put_context: @@ -783,40 +933,40 @@ err_free_put_context: err_put_context: put_context(ctx); - return ret; + goto out_fput; } -static void __cpuinit perf_init_cpu(int cpu) +static void __cpuinit perf_counter_init_cpu(int cpu) { - struct perf_cpu_context *ctx; + struct perf_cpu_context *cpuctx; - ctx = &per_cpu(perf_cpu_context, cpu); - spin_lock_init(&ctx->ctx.lock); - INIT_LIST_HEAD(&ctx->ctx.counters); + cpuctx = &per_cpu(perf_cpu_context, cpu); + __perf_counter_init_context(&cpuctx->ctx, NULL); mutex_lock(&perf_resource_mutex); - ctx->max_pertask = perf_max_counters - perf_reserved_percpu; + cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); + hw_perf_counter_setup(); } #ifdef CONFIG_HOTPLUG_CPU -static void __perf_exit_cpu(void *info) +static void __perf_counter_exit_cpu(void *info) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = &cpuctx->ctx; struct perf_counter *counter, *tmp; - list_for_each_entry_safe(counter, tmp, &ctx->counters, list) - __perf_remove_from_context(counter); + list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) + __perf_counter_remove_from_context(counter); } -static void perf_exit_cpu(int cpu) +static void perf_counter_exit_cpu(int cpu) { - smp_call_function_single(cpu, __perf_exit_cpu, NULL, 1); + smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); } #else -static inline void perf_exit_cpu(int cpu) { } +static inline void perf_counter_exit_cpu(int cpu) { } #endif static int __cpuinit @@ -828,12 +978,12 @@ perf_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - perf_init_cpu(cpu); + perf_counter_init_cpu(cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: - perf_exit_cpu(cpu); + perf_counter_exit_cpu(cpu); break; default: From ccff286d85098ba5438e22aa2ea807fc1e18cf2f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 11:26:29 +0100 Subject: [PATCH 0014/2061] perf counters: group counter, fixes Impact: bugfix Check that a group does not span outside the context of a CPU or a task. Also, do not allow deep recursive hierarchies. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 39 ++++++++++++++++++++------------------- 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fa59fe8c02d..278209c547a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -107,9 +107,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); - if (list_empty(&counter->sibling_list)) - return; - /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -395,9 +392,6 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (!counter->active) - return; - hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -876,32 +870,39 @@ asmlinkage int sys_perf_counter_open( return -EFAULT; /* - * Look up the group leader: + * Get the target context (task or percpu): + */ + ctx = find_get_context(pid, cpu); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + /* + * Look up the group leader (we will attach this counter to it): */ group_leader = NULL; if (group_fd != -1) { ret = -EINVAL; group_file = fget_light(group_fd, &fput_needed); if (!group_file) - goto out_fput; + goto err_put_context; if (group_file->f_op != &perf_fops) - goto out_fput; + goto err_put_context; group_leader = group_file->private_data; /* - * Do not allow a recursive hierarchy: + * Do not allow a recursive hierarchy (this new sibling + * becoming part of another group-sibling): */ - if (group_leader->group_leader) - goto out_fput; + if (group_leader->group_leader != group_leader) + goto err_put_context; + /* + * Do not allow to attach to a group in a different + * task or CPU context: + */ + if (group_leader->ctx != ctx) + goto err_put_context; } - /* - * Get the target context (task or percpu): - */ - ctx = find_get_context(pid, cpu); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = -ENOMEM; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) From 621a01eac89b5e2f81a4cf576568b31f40a02724 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 12:46:46 +0100 Subject: [PATCH 0015/2061] perf counters: hw driver API Impact: restructure code, introduce hw_ops driver abstraction Introduce this abstraction to handle counter details: struct hw_perf_counter_ops { void (*hw_perf_counter_enable) (struct perf_counter *counter); void (*hw_perf_counter_disable) (struct perf_counter *counter); void (*hw_perf_counter_read) (struct perf_counter *counter); }; This will be useful to support assymetric hw details, and it will also be useful to implement "software counters". (Counters that count kernel managed sw events such as pagefaults, context-switches, wall-clock time or task-local time.) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 37 +++++++++++++++++------- include/linux/perf_counter.h | 15 ++++++++++ kernel/perf_counter.c | 45 ++++++++++++++++-------------- 3 files changed, 66 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 54b4ad0cce6..718b635dece 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -56,7 +56,7 @@ const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Setup the hardware configuration for a given hw_event_type */ -int hw_perf_counter_init(struct perf_counter *counter) +static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; @@ -135,7 +135,7 @@ u64 hw_perf_disable_all(void) EXPORT_SYMBOL_GPL(hw_perf_disable_all); static inline void -__hw_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) { wrmsr(hwc->config_base + idx, hwc->config, 0); } @@ -149,13 +149,13 @@ static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) wrmsr(hwc->counter_base + idx, hwc->next_count, 0); } -static void __hw_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -void hw_perf_counter_enable(struct perf_counter *counter) +static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -170,12 +170,12 @@ void hw_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); cpuc->counters[idx] = counter; __hw_perf_counter_set_period(hwc, idx); - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } #ifdef CONFIG_X86_64 @@ -282,20 +282,20 @@ void perf_counter_print_debug(void) local_irq_enable(); } -void hw_perf_counter_disable(struct perf_counter *counter) +static void x86_perf_counter_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __hw_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; __hw_perf_save_counter(counter, hwc, idx); } -void hw_perf_counter_read(struct perf_counter *counter) +static void x86_perf_counter_read(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; unsigned long addr = hwc->counter_base + hwc->idx; @@ -342,7 +342,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __hw_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(hwc, idx); } static void @@ -572,3 +572,20 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } + +static struct hw_perf_counter_ops x86_perf_counter_ops = { + .hw_perf_counter_enable = x86_perf_counter_enable, + .hw_perf_counter_disable = x86_perf_counter_disable, + .hw_perf_counter_read = x86_perf_counter_read, +}; + +struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +{ + int err; + + err = __hw_perf_counter_init(counter); + if (err) + return NULL; + + return &x86_perf_counter_ops; +} diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7af7d896546..27385641ecb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -113,6 +113,17 @@ struct perf_data { u8 data[PERF_DATA_BUFLEN]; }; +struct perf_counter; + +/** + * struct hw_perf_counter_ops - performance counter hw ops + */ +struct hw_perf_counter_ops { + void (*hw_perf_counter_enable) (struct perf_counter *counter); + void (*hw_perf_counter_disable) (struct perf_counter *counter); + void (*hw_perf_counter_read) (struct perf_counter *counter); +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -120,6 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; + struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -185,6 +197,9 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter); + extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 278209c547a..e6e41ca9546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,18 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ - -int __weak hw_perf_counter_init(struct perf_counter *counter) +extern __weak struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { - return -EINVAL; + return ERR_PTR(-EINVAL); } -void __weak hw_perf_counter_enable(struct perf_counter *counter) { } -void __weak hw_perf_counter_disable(struct perf_counter *counter) { } -void __weak hw_perf_counter_read(struct perf_counter *counter) { } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_disable_all(void) { } +void __weak hw_perf_enable_all(void) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -146,7 +143,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->active) { - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; ctx->nr_active--; cpuctx->active_oncpu--; @@ -257,7 +254,7 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; ctx->nr_active++; @@ -333,7 +330,7 @@ counter_sched_out(struct perf_counter *counter, if (!counter->active) return; - hw_perf_counter_disable(counter); + counter->hw_ops->hw_perf_counter_disable(counter); counter->active = 0; counter->oncpu = -1; @@ -392,7 +389,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - hw_perf_counter_enable(counter); + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -509,7 +506,9 @@ void perf_counter_init_task(struct task_struct *task) */ static void __hw_perf_counter_read(void *info) { - hw_perf_counter_read(info); + struct perf_counter *counter = info; + + counter->hw_ops->hw_perf_counter_read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -816,8 +815,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct perf_counter *counter = kzalloc(sizeof(*counter), GFP_KERNEL); + struct hw_perf_counter_ops *hw_ops; + struct perf_counter *counter; + counter = kzalloc(sizeof(*counter), GFP_KERNEL); if (!counter) return NULL; @@ -839,6 +840,14 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->hw_event = *hw_event; counter->wakeup_pending = 0; counter->group_leader = group_leader; + counter->hw_ops = NULL; + + hw_ops = hw_perf_counter_init(counter); + if (!hw_ops) { + kfree(counter); + return NULL; + } + counter->hw_ops = hw_ops; return counter; } @@ -908,10 +917,6 @@ asmlinkage int sys_perf_counter_open( if (!counter) goto err_put_context; - ret = hw_perf_counter_init(counter); - if (ret) - goto err_free_put_context; - perf_install_in_context(ctx, counter, cpu); ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -927,8 +932,6 @@ err_remove_free_put_context: mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); mutex_unlock(&counter->mutex); - -err_free_put_context: kfree(counter); err_put_context: From 5c92d12411dfe5f0f3d1b1c1e2f756245e6f7249 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:21:10 +0100 Subject: [PATCH 0016/2061] perf counters: implement PERF_COUNT_CPU_CLOCK Impact: add new perf-counter type The 'CPU clock' counter counts the amount of CPU clock time that is elapsing, in nanoseconds. (regardless of how much of it the task is spending on a CPU executing) This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 36 ++--------- include/linux/perf_counter.h | 9 ++- kernel/perf_counter.c | 95 ++++++++++++++++++++++++++---- 3 files changed, 92 insertions(+), 48 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 718b635dece..43c8e9a38b4 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -178,35 +178,6 @@ static void x86_perf_counter_enable(struct perf_counter *counter) __x86_perf_counter_enable(hwc, idx); } -#ifdef CONFIG_X86_64 -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} -#else -/* - * Todo: add proper atomic64_t support to 32-bit x86: - */ -static inline void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -static inline u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} -#endif - static void __hw_perf_save_counter(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -309,7 +280,7 @@ static void x86_perf_counter_read(struct perf_counter *counter) } while (offs != hwc->prev_count); val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; + val = (s64)hwc->irq_period + (s64)val32; atomic64_counter_set(counter, hwc->prev_count + val); } @@ -573,13 +544,14 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } -static struct hw_perf_counter_ops x86_perf_counter_ops = { +static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, .hw_perf_counter_read = x86_perf_counter_read, }; -struct hw_perf_counter_ops *hw_perf_counter_init(struct perf_counter *counter) +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) { int err; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 27385641ecb..9a1713a1be2 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -131,7 +131,7 @@ struct perf_counter { struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; int active; #if BITS_PER_LONG == 64 @@ -197,7 +197,7 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern struct hw_perf_counter_ops * +extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); @@ -208,6 +208,9 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern void hw_perf_restore_ctrl(u64 ctrl); extern u64 hw_perf_disable_all(void); +extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); +extern u64 atomic64_counter_read(struct perf_counter *counter); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -219,7 +222,7 @@ static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline u64 hw_perf_disable_all(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e6e41ca9546..506286e5ba6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -37,15 +37,15 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak struct hw_perf_counter_ops * +extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { return ERR_PTR(-EINVAL); } -void __weak hw_perf_disable_all(void) { } -void __weak hw_perf_enable_all(void) { } -void __weak hw_perf_counter_setup(void) { } +u64 __weak hw_perf_disable_all(void) { return 0; } +void __weak hw_perf_restore_ctrl(u64 ctrl) { } +void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -58,6 +58,16 @@ static inline u64 perf_counter_read_safe(struct perf_counter *counter) return (u64) atomic64_read(&counter->count); } +void atomic64_counter_set(struct perf_counter *counter, u64 val) +{ + atomic64_set(&counter->count, val); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic64_read(&counter->count); +} + #else /* @@ -79,6 +89,20 @@ static u64 perf_counter_read_safe(struct perf_counter *counter) return cntl | ((u64) cnth) << 32; } +void atomic64_counter_set(struct perf_counter *counter, u64 val64) +{ + u32 *val32 = (void *)&val64; + + atomic_set(counter->count32 + 0, *(val32 + 0)); + atomic_set(counter->count32 + 1, *(val32 + 1)); +} + +u64 atomic64_counter_read(struct perf_counter *counter) +{ + return atomic_read(counter->count32 + 0) | + (u64) atomic_read(counter->count32 + 1) << 32; +} + #endif static void @@ -131,6 +155,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -155,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_del_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); if (!ctx->task) { /* @@ -232,6 +257,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -247,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_add_counter(counter, ctx); - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); ctx->nr_counters++; @@ -457,6 +483,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + u64 perf_flags; if (likely(!ctx->nr_counters)) return; @@ -468,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - hw_perf_disable_all(); + perf_flags = hw_perf_disable_all(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_enable_all(); + hw_perf_restore_ctrl(perf_flags); spin_unlock(&ctx->lock); @@ -807,6 +834,42 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; +static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void cpu_clock_perf_counter_read(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + + atomic64_counter_set(counter, cpu_clock(cpu)); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_clock = { + .hw_perf_counter_enable = cpu_clock_perf_counter_enable, + .hw_perf_counter_disable = cpu_clock_perf_counter_disable, + .hw_perf_counter_read = cpu_clock_perf_counter_read, +}; + +static const struct hw_perf_counter_ops * +sw_perf_counter_init(struct perf_counter *counter) +{ + const struct hw_perf_counter_ops *hw_ops = NULL; + + switch (counter->hw_event.type) { + case PERF_COUNT_CPU_CLOCK: + hw_ops = &perf_ops_cpu_clock; + break; + default: + break; + } + return hw_ops; +} + /* * Allocate and initialize a counter structure */ @@ -815,7 +878,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, struct perf_counter *group_leader) { - struct hw_perf_counter_ops *hw_ops; + const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; counter = kzalloc(sizeof(*counter), GFP_KERNEL); @@ -842,7 +905,13 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; - hw_ops = hw_perf_counter_init(counter); + hw_ops = NULL; + if (!hw_event->raw && hw_event->type < 0) + hw_ops = sw_perf_counter_init(counter); + if (!hw_ops) { + hw_ops = hw_perf_counter_init(counter); + } + if (!hw_ops) { kfree(counter); return NULL; @@ -912,7 +981,7 @@ asmlinkage int sys_perf_counter_open( goto err_put_context; } - ret = -ENOMEM; + ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, group_leader); if (!counter) goto err_put_context; From 01b2838c4298c5e0d30b4993c195ac34dd9df61e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 13:45:51 +0100 Subject: [PATCH 0017/2061] perf counters: consolidate hw_perf save/restore APIs Impact: cleanup Rename them to better match up the usual IRQ disable/enable APIs: hw_perf_disable_all() => hw_perf_save_disable() hw_perf_restore_ctrl() => hw_perf_restore() Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- drivers/acpi/processor_idle.c | 10 +++++----- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 16 ++++++++-------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 43c8e9a38b4..3e1dbebe22b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -118,13 +118,13 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore_ctrl(u64 ctrl) +void hw_perf_restore(u64 ctrl) { wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } -EXPORT_SYMBOL_GPL(hw_perf_restore_ctrl); +EXPORT_SYMBOL_GPL(hw_perf_restore); -u64 hw_perf_disable_all(void) +u64 hw_perf_save_disable(void) { u64 ctrl; @@ -132,7 +132,7 @@ u64 hw_perf_disable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); return ctrl; } -EXPORT_SYMBOL_GPL(hw_perf_disable_all); +EXPORT_SYMBOL_GPL(hw_perf_save_disable); static inline void __x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index cca804e6f1d..a3e66a33b7a 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -270,11 +270,11 @@ static atomic_t c3_cpu_count; /* Common C-state entry for C2, C3, .. */ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) { - u64 pctrl; + u64 perf_flags; /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); if (cstate->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cstate); @@ -287,7 +287,7 @@ static void acpi_cstate_enter(struct acpi_processor_cx *cstate) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(perf_flags); start_critical_timings(); } #endif /* !CONFIG_CPU_IDLE */ @@ -1433,7 +1433,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) /* Don't trace irqs off for idle */ stop_critical_timings(); - pctrl = hw_perf_disable_all(); + pctrl = hw_perf_save_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -1448,7 +1448,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore_ctrl(pctrl); + hw_perf_restore(pctrl); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 9a1713a1be2..68f6e3ad531 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -67,7 +67,7 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - u64 type; + s64 type; u64 irq_period; u32 record_type; @@ -206,8 +206,8 @@ extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *task); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); -extern void hw_perf_restore_ctrl(u64 ctrl); -extern u64 hw_perf_disable_all(void); +extern u64 hw_perf_save_disable(void); +extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); @@ -221,8 +221,8 @@ perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *task) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } -static inline void hw_perf_restore_ctrl(u64 ctrl) { } -static inline u64 hw_perf_disable_all(void) { return 0; } +static inline void hw_perf_restore(u64 ctrl) { } +static inline u64 hw_perf_save_disable(void) { return 0; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 506286e5ba6..0e93fea1712 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -43,8 +43,8 @@ hw_perf_counter_init(struct perf_counter *counter) return ERR_PTR(-EINVAL); } -u64 __weak hw_perf_disable_all(void) { return 0; } -void __weak hw_perf_restore_ctrl(u64 ctrl) { } +u64 __weak hw_perf_save_disable(void) { return 0; } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } #if BITS_PER_LONG == 64 @@ -180,9 +180,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_del_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); if (!ctx->task) { /* @@ -273,9 +273,9 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_add_counter(counter, ctx); - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); ctx->nr_counters++; @@ -495,13 +495,13 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_disable_all(); + perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_del(&counter->list_entry); list_add_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore_ctrl(perf_flags); + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); From bae43c9945ebeef15e7952e317efb02393d3bfc7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:03:20 +0100 Subject: [PATCH 0018/2061] perf counters: implement PERF_COUNT_TASK_CLOCK Impact: add new perf-counter type The 'task clock' counter counts the amount of time a task is executing, in nanoseconds. It stops ticking when a task is scheduled out either due to it blocking, sleeping or it being preempted. This counter type is a Linux kernel based abstraction, it is available even if the hardware does not support native hardware performance counters. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++-- kernel/perf_counter.c | 22 ++++++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 68f6e3ad531..30c0ec8c1ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -50,8 +50,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, + /* + * Future software events: + */ + /* PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, */ }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e93fea1712..a0fe8474ee2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -855,6 +855,25 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ +} + +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + atomic64_counter_set(counter, current->se.sum_exec_runtime); +} + +static const struct hw_perf_counter_ops perf_ops_task_clock = { + .hw_perf_counter_enable = task_clock_perf_counter_enable, + .hw_perf_counter_disable = task_clock_perf_counter_disable, + .hw_perf_counter_read = task_clock_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -864,6 +883,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; break; + case PERF_COUNT_TASK_CLOCK: + hw_ops = &perf_ops_task_clock; + break; default: break; } From 1d1c7ddbfab358445a542715551301b7fc363e28 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 14:59:31 +0100 Subject: [PATCH 0019/2061] perf counters: add prctl interface to disable/enable counters Add a way for self-monitoring tasks to disable/enable counters summarily, via a prctl: PR_TASK_PERF_COUNTERS_DISABLE 31 PR_TASK_PERF_COUNTERS_ENABLE 32 Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++ include/linux/prctl.h | 3 ++ kernel/perf_counter.c | 86 +++++++++++++++++++++++++++++++++--- kernel/sys.c | 7 +++ 4 files changed, 93 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 30c0ec8c1ee..97d86c293ee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -213,6 +213,8 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); extern u64 atomic64_counter_read(struct perf_counter *counter); +extern int perf_counter_task_disable(void); +extern int perf_counter_task_enable(void); #else static inline void @@ -226,6 +228,8 @@ static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } +static inline int perf_counter_task_disable(void) { return -EINVAL; } +static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/prctl.h b/include/linux/prctl.h index 48d887e3c6e..b00df4c79c6 100644 --- a/include/linux/prctl.h +++ b/include/linux/prctl.h @@ -85,4 +85,7 @@ #define PR_SET_TIMERSLACK 29 #define PR_GET_TIMERSLACK 30 +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a0fe8474ee2..4e679b91d8b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -415,6 +415,9 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { + if (counter->active == -1) + return; + counter->hw_ops->hw_perf_counter_enable(counter); counter->active = 1; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -479,6 +482,79 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) cpuctx->task_ctx = ctx; } +int perf_counter_task_disable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + perf_counter_task_sched_out(curr, cpu); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + WARN_ON_ONCE(counter->active == 1); + counter->active = -1; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + local_irq_enable(); + + return 0; +} + +int perf_counter_task_enable(void) +{ + struct task_struct *curr = current; + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter *counter; + u64 perf_flags; + int cpu; + + if (likely(!ctx->nr_counters)) + return 0; + + local_irq_disable(); + cpu = smp_processor_id(); + + spin_lock(&ctx->lock); + + /* + * Disable all the counters: + */ + perf_flags = hw_perf_save_disable(); + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->active != -1) + continue; + counter->active = 0; + } + hw_perf_restore(perf_flags); + + spin_unlock(&ctx->lock); + + perf_counter_task_sched_in(curr, cpu); + + local_irq_enable(); + + return 0; +} + void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_counter_context *ctx = &curr->perf_counter_ctx; @@ -951,13 +1027,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd) - +asmlinkage int +sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, int cpu, int group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; diff --git a/kernel/sys.c b/kernel/sys.c index 31deba8f7d1..0f66633be31 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -1716,6 +1717,12 @@ asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3, case PR_SET_TSC: error = SET_TSC_CTL(arg2); break; + case PR_TASK_PERF_COUNTERS_DISABLE: + error = perf_counter_task_disable(); + break; + case PR_TASK_PERF_COUNTERS_ENABLE: + error = perf_counter_task_enable(); + break; case PR_GET_TIMERSLACK: error = current->timer_slack_ns; break; From 6a930700c8b655a9e25e42fc4adc0b225ebbcefc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 15:17:03 +0100 Subject: [PATCH 0020/2061] perf counters: clean up state transitions Impact: cleanup Introduce a proper enum for the 3 states of a counter: PERF_COUNTER_STATE_OFF = -1 PERF_COUNTER_STATE_INACTIVE = 0 PERF_COUNTER_STATE_ACTIVE = 1 and rename counter->active to counter->state and propagate the changes everywhere. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 29 ++++++++++++++--------------- 3 files changed, 25 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3e1dbebe22b..4854cca7fff 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -332,7 +332,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Then store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { /* * When counter was not in the overflow mask, we have to * read it from hardware. We read it as well, when it diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 97d86c293ee..8cb095fa442 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,6 +127,15 @@ struct hw_perf_counter_ops { void (*hw_perf_counter_read) (struct perf_counter *counter); }; +/** + * enum perf_counter_active_state - the states of a counter + */ +enum perf_counter_active_state { + PERF_COUNTER_STATE_OFF = -1, + PERF_COUNTER_STATE_INACTIVE = 0, + PERF_COUNTER_STATE_ACTIVE = 1, +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -136,7 +145,7 @@ struct perf_counter { struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; - int active; + enum perf_counter_active_state state; #if BITS_PER_LONG == 64 atomic64_t count; #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4e679b91d8b..559130b8774 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -167,9 +167,9 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; @@ -281,7 +281,7 @@ static void __perf_install_in_context(void *info) if (cpuctx->active_oncpu < perf_max_counters) { counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; @@ -328,7 +328,6 @@ retry: spin_lock_irq(&ctx->lock); /* - * If the context is active and the counter has not been added * we need to retry the smp call. */ if (ctx->nr_active && list_empty(&counter->list_entry)) { @@ -353,12 +352,12 @@ counter_sched_out(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx) { - if (!counter->active) + if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; counter->hw_ops->hw_perf_counter_disable(counter); - counter->active = 0; - counter->oncpu = -1; + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; cpuctx->active_oncpu--; ctx->nr_active--; @@ -415,11 +414,11 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->active == -1) + if (counter->state == PERF_COUNTER_STATE_OFF) return; counter->hw_ops->hw_perf_counter_enable(counter); - counter->active = 1; + counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; @@ -506,8 +505,8 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->active == 1); - counter->active = -1; + WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + counter->state = PERF_COUNTER_STATE_OFF; } hw_perf_restore(perf_flags); @@ -540,9 +539,9 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->active != -1) + if (counter->state != PERF_COUNTER_STATE_OFF) continue; - counter->active = 0; + counter->state = PERF_COUNTER_STATE_INACTIVE; } hw_perf_restore(perf_flags); @@ -620,7 +619,7 @@ static u64 perf_counter_read(struct perf_counter *counter) * If counter is enabled and currently active on a CPU, update the * value in the counter structure: */ - if (counter->active) { + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __hw_perf_counter_read, counter, 1); } @@ -673,7 +672,7 @@ static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) retry: spin_lock_irq(&ctx->lock); - if (!counter->active) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) { counter->irqdata = counter->usrdata; counter->usrdata = oldirqdata; spin_unlock_irq(&ctx->lock); From 447557ac7ce120306b4a31d6003faef39cb1bf14 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Dec 2008 20:40:18 +0100 Subject: [PATCH 0021/2061] perf counters: update docs Impact: update docs Signed-off-by: Ingo Molnar --- Documentation/perf-counters.txt | 105 ++++++++++++++++++++++---------- 1 file changed, 74 insertions(+), 31 deletions(-) diff --git a/Documentation/perf-counters.txt b/Documentation/perf-counters.txt index 19033a0bb52..fddd32189a5 100644 --- a/Documentation/perf-counters.txt +++ b/Documentation/perf-counters.txt @@ -10,8 +10,8 @@ trigger interrupts when a threshold number of events have passed - and can thus be used to profile the code that runs on that CPU. The Linux Performance Counter subsystem provides an abstraction of these -hardware capabilities. It provides per task and per CPU counters, and -it provides event capabilities on top of those. +hardware capabilities. It provides per task and per CPU counters, counter +groups, and it provides event capabilities on top of those. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. @@ -19,12 +19,8 @@ There's one file descriptor per virtual counter used. The special file descriptor is opened via the perf_counter_open() system call: - int - perf_counter_open(u32 hw_event_type, - u32 hw_event_period, - u32 record_type, - pid_t pid, - int cpu); + int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, + pid_t pid, int cpu, int group_fd); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() @@ -33,39 +29,78 @@ can be used to set the blocking mode, etc. Multiple counters can be kept open at a time, and the counters can be poll()ed. -When creating a new counter fd, 'hw_event_type' is one of: +When creating a new counter fd, 'perf_counter_hw_event' is: - enum hw_event_types { - PERF_COUNT_CYCLES, - PERF_COUNT_INSTRUCTIONS, - PERF_COUNT_CACHE_REFERENCES, - PERF_COUNT_CACHE_MISSES, - PERF_COUNT_BRANCH_INSTRUCTIONS, - PERF_COUNT_BRANCH_MISSES, - }; +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + s64 type; + + u64 irq_period; + u32 record_type; + + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + __reserved_1 : 29; + + u64 __reserved_2; +}; + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + /* + * Future software events: + */ + /* PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, */ +}; These are standardized types of events that work uniformly on all CPUs that implements Performance Counters support under Linux. If a CPU is not able to count branch-misses, then the system call will return -EINVAL. -[ Note: more hw_event_types are supported as well, but they are CPU - specific and are enumerated via /sys on a per CPU basis. Raw hw event - types can be passed in as negative numbers. For example, to count - "External bus cycles while bus lock signal asserted" events on Intel - Core CPUs, pass in a -0x4064 event type value. ] - -The parameter 'hw_event_period' is the number of events before waking up -a read() that is blocked on a counter fd. Zero value means a non-blocking -counter. +More hw_event_types are supported as well, but they are CPU +specific and are enumerated via /sys on a per CPU basis. Raw hw event +types can be passed in under hw_event.type if hw_event.raw is 1. +For example, to count "External bus cycles while bus lock signal asserted" +events on Intel Core CPUs, pass in a 0x4064 event type value and set +hw_event.raw to 1. 'record_type' is the type of data that a read() will provide for the counter, and it can be one of: - enum perf_record_type { - PERF_RECORD_SIMPLE, - PERF_RECORD_IRQ, - }; +/* + * IRQ-notification data record type: + */ +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, +}; a "simple" counter is one that counts hardware events and allows them to be read out into a u64 count value. (read() returns 8 on @@ -76,6 +111,10 @@ the IP of the interrupted context. In this case read() will return the 8-byte counter value, plus the Instruction Pointer address of the interrupted context. +The parameter 'hw_event_period' is the number of events before waking up +a read() that is blocked on a counter fd. Zero value means a non-blocking +counter. + The 'pid' parameter allows the counter to be specific to a task: pid == 0: if the pid parameter is zero, the counter is attached to the @@ -92,7 +131,7 @@ CPU: cpu >= 0: the counter is restricted to a specific CPU cpu == -1: the counter counts on all CPUs -Note: the combination of 'pid == -1' and 'cpu == -1' is not valid. +(Note: the combination of 'pid == -1' and 'cpu == -1' is not valid.) A 'pid > 0' and 'cpu == -1' counter is a per task counter that counts events of that task and 'follows' that task to whatever CPU the task @@ -102,3 +141,7 @@ their own tasks. A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. +Group counters are created by passing in a group_fd of another counter. +Groups are scheduled at once and can be used with PERF_RECORD_GROUP +to record multi-dimensional timestamps. + From 9b194e831fb2c322ed81a373e49620f34edc2778 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 20:22:35 +0100 Subject: [PATCH 0022/2061] x86: implement atomic64_t on 32-bit Impact: new API Implement the atomic64_t APIs on 32-bit as well. Will be used by the performance counters code. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 218 +++++++++++++++++++++++++++++++ 1 file changed, 218 insertions(+) diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index ad5b9f6ecdd..9927e01b03c 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -255,5 +255,223 @@ static inline int atomic_add_unless(atomic_t *v, int a, int u) #define smp_mb__before_atomic_inc() barrier() #define smp_mb__after_atomic_inc() barrier() +/* An 64bit atomic type */ + +typedef struct { + unsigned long long counter; +} atomic64_t; + +#define ATOMIC64_INIT(val) { (val) } + +/** + * atomic64_read - read atomic64 variable + * @v: pointer of type atomic64_t + * + * Atomically reads the value of @v. + * Doesn't imply a read memory barrier. + */ +#define __atomic64_read(ptr) ((ptr)->counter) + +static inline unsigned long long +cmpxchg8b(unsigned long long *ptr, unsigned long long old, unsigned long long new) +{ + asm volatile( + + LOCK_PREFIX "cmpxchg8b (%[ptr])\n" + + : "=A" (old) + + : [ptr] "D" (ptr), + "A" (old), + "b" (ll_low(new)), + "c" (ll_high(new)) + + : "memory"); + + return old; +} + +static inline unsigned long long +atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, + unsigned long long new_val) +{ + return cmpxchg8b(&ptr->counter, old_val, new_val); +} + +/** + * atomic64_set - set atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * + * Atomically sets the value of @ptr to @new_val. + */ +static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) +{ + unsigned long long old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); +} + +/** + * atomic64_read - read atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically reads the value of @ptr and returns it. + */ +static inline unsigned long long atomic64_read(atomic64_t *ptr) +{ + unsigned long long curr_val; + + do { + curr_val = __atomic64_read(ptr); + } while (atomic64_cmpxchg(ptr, curr_val, curr_val) != curr_val); + + return curr_val; +} + +/** + * atomic64_add_return - add and return + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns @delta + *@ptr + */ +static inline unsigned long long +atomic64_add_return(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val, new_val; + + do { + old_val = atomic_read(ptr); + new_val = old_val + delta; + + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return new_val; +} + +static inline long atomic64_sub_return(unsigned long long delta, atomic64_t *ptr) +{ + return atomic64_add_return(-delta, ptr); +} + +static inline long atomic64_inc_return(atomic64_t *ptr) +{ + return atomic64_add_return(1, ptr); +} + +static inline long atomic64_dec_return(atomic64_t *ptr) +{ + return atomic64_sub_return(1, ptr); +} + +/** + * atomic64_add - add integer to atomic64 variable + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr. + */ +static inline void atomic64_add(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add_return(delta, ptr); +} + +/** + * atomic64_sub - subtract the atomic64 variable + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr. + */ +static inline void atomic64_sub(unsigned long long delta, atomic64_t *ptr) +{ + atomic64_add(-delta, ptr); +} + +/** + * atomic64_sub_and_test - subtract value from variable and test result + * @delta: integer value to subtract + * @ptr: pointer to type atomic64_t + * + * Atomically subtracts @delta from @ptr and returns + * true if the result is zero, or false for all + * other cases. + */ +static inline int +atomic64_sub_and_test(unsigned long long delta, atomic64_t *ptr) +{ + unsigned long long old_val = atomic64_sub_return(delta, ptr); + + return old_val == 0; +} + +/** + * atomic64_inc - increment atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1. + */ +static inline void atomic64_inc(atomic64_t *ptr) +{ + atomic64_add(1, ptr); +} + +/** + * atomic64_dec - decrement atomic64 variable + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1. + */ +static inline void atomic64_dec(atomic64_t *ptr) +{ + atomic64_sub(1, ptr); +} + +/** + * atomic64_dec_and_test - decrement and test + * @ptr: pointer to type atomic64_t + * + * Atomically decrements @ptr by 1 and + * returns true if the result is 0, or false for all other + * cases. + */ +static inline int atomic64_dec_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(1, ptr); +} + +/** + * atomic64_inc_and_test - increment and test + * @ptr: pointer to type atomic64_t + * + * Atomically increments @ptr by 1 + * and returns true if the result is zero, or false for all + * other cases. + */ +static inline int atomic64_inc_and_test(atomic64_t *ptr) +{ + return atomic64_sub_and_test(-1, ptr); +} + +/** + * atomic64_add_negative - add and test if negative + * @delta: integer value to add + * @ptr: pointer to type atomic64_t + * + * Atomically adds @delta to @ptr and returns true + * if the result is negative, or false when + * result is greater than or equal to zero. + */ +static inline int +atomic64_add_negative(unsigned long long delta, atomic64_t *ptr) +{ + long long old_val = atomic64_add_return(delta, ptr); + + return old_val < 0; +} + #include #endif /* _ASM_X86_ATOMIC_32_H */ From ee06094f8279e1312fc0a31591320cc7b6f0ab1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Dec 2008 09:00:03 +0100 Subject: [PATCH 0023/2061] perfcounters: restructure x86 counter math Impact: restructure code Change counter math from absolute values to clear delta logic. We try to extract elapsed deltas from the raw hw counter - and put that into the generic counter. Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 2 +- arch/x86/kernel/cpu/perf_counter.c | 230 ++++++++++++++++------------- include/linux/perf_counter.h | 15 +- kernel/perf_counter.c | 68 +-------- 4 files changed, 137 insertions(+), 178 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index f2fdc186724..fe94490bab6 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -643,7 +643,7 @@ config X86_UP_IOAPIC config X86_LOCAL_APIC def_bool y depends on X86_64 || (X86_32 && (X86_UP_APIC || (SMP && !X86_VOYAGER) || X86_GENERICARCH)) - select HAVE_PERF_COUNTERS + select HAVE_PERF_COUNTERS if (!M386 && !M486) config X86_IO_APIC def_bool y diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b903f8df72b..5afae13d8d5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -53,6 +53,48 @@ const int intel_perfmon_event_map[] = const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +/* + * Propagate counter elapsed time into the generic counter. + * Can only be executed on the CPU where the counter is active. + * Returns the delta events processed. + */ +static void +x86_perf_counter_update(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) +{ + u64 prev_raw_count, new_raw_count, delta; + + WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); + /* + * Careful: an NMI might modify the previous counter value. + * + * Our tactic to handle this is to first atomically read and + * exchange a new raw count - then add that new-prev delta + * count to the generic counter atomically: + */ +again: + prev_raw_count = atomic64_read(&hwc->prev_count); + rdmsrl(hwc->counter_base + idx, new_raw_count); + + if (atomic64_cmpxchg(&hwc->prev_count, prev_raw_count, + new_raw_count) != prev_raw_count) + goto again; + + /* + * Now we have the new raw value and have updated the prev + * timestamp already. We can now calculate the elapsed delta + * (counter-)time and add that to the generic counter. + * + * Careful, not all hw sign-extends above the physical width + * of the count, so we do that by clipping the delta to 32 bits: + */ + delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + WARN_ON_ONCE((int)delta < 0); + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -90,10 +132,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if (!hwc->irq_period) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) hwc->irq_period = 0x7FFFFFFF; - hwc->next_count = -(s32)hwc->irq_period; + atomic64_set(&hwc->period_left, hwc->irq_period); /* * Raw event type provide the config in the event structure @@ -118,12 +160,6 @@ void hw_perf_enable_all(void) wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } -void hw_perf_restore(u64 ctrl) -{ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); -} -EXPORT_SYMBOL_GPL(hw_perf_restore); - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -134,27 +170,74 @@ u64 hw_perf_save_disable(void) } EXPORT_SYMBOL_GPL(hw_perf_save_disable); +void hw_perf_restore(u64 ctrl) +{ + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); +} +EXPORT_SYMBOL_GPL(hw_perf_restore); + static inline void -__x86_perf_counter_disable(struct hw_perf_counter *hwc, unsigned int idx) +__x86_perf_counter_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { - wrmsr(hwc->config_base + idx, hwc->config, 0); + int err; + + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + WARN_ON_ONCE(err); } -static DEFINE_PER_CPU(u64, prev_next_count[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); -static void __hw_perf_counter_set_period(struct hw_perf_counter *hwc, int idx) +/* + * Set the next IRQ period, based on the hwc->period_left value. + * To be called with the counter disabled in hw: + */ +static void +__hw_perf_counter_set_period(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { - per_cpu(prev_next_count[idx], smp_processor_id()) = hwc->next_count; + s32 left = atomic64_read(&hwc->period_left); + s32 period = hwc->irq_period; - wrmsr(hwc->counter_base + idx, hwc->next_count, 0); + WARN_ON_ONCE(period <= 0); + + /* + * If we are way outside a reasoable range then just skip forward: + */ + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_set(&hwc->period_left, left); + } + + WARN_ON_ONCE(left <= 0); + + per_cpu(prev_left[idx], smp_processor_id()) = left; + + /* + * The hw counter starts counting from this counter offset, + * mark it to be able to extra future deltas: + */ + atomic64_set(&hwc->prev_count, (u64)(s64)-left); + + wrmsr(hwc->counter_base + idx, -left, 0); } -static void __x86_perf_counter_enable(struct hw_perf_counter *hwc, int idx) +static void +__x86_perf_counter_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +/* + * Find a PMC slot for the freshly enabled / scheduled in counter: + */ static void x86_perf_counter_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -170,55 +253,17 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - __hw_perf_counter_set_period(hwc, idx); - __x86_perf_counter_enable(hwc, idx); -} - -static void __hw_perf_save_counter(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) -{ - s64 raw = -1; - s64 delta; - - /* - * Get the raw hw counter value: - */ - rdmsrl(hwc->counter_base + idx, raw); - - /* - * Rebase it to zero (it started counting at -irq_period), - * to see the delta since ->prev_count: - */ - delta = (s64)hwc->irq_period + (s64)(s32)raw; - - atomic64_counter_set(counter, hwc->prev_count + delta); - - /* - * Adjust the ->prev_count offset - if we went beyond - * irq_period of units, then we got an IRQ and the counter - * was set back to -irq_period: - */ - while (delta >= (s64)hwc->irq_period) { - hwc->prev_count += hwc->irq_period; - delta -= (s64)hwc->irq_period; - } - - /* - * Calculate the next raw counter value we'll write into - * the counter at the next sched-in time: - */ - delta -= (s64)hwc->irq_period; - - hwc->next_count = (s32)delta; + __hw_perf_counter_set_period(counter, hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, next_count; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; if (!nr_hw_counters) @@ -241,14 +286,14 @@ void perf_counter_print_debug(void) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); - next_count = per_cpu(prev_next_count[idx], cpu); + prev_left = per_cpu(prev_left[idx], cpu); printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d next: %016llx\n", - cpu, idx, next_count); + printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + cpu, idx, prev_left); } local_irq_enable(); } @@ -259,29 +304,16 @@ static void x86_perf_counter_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(hwc, idx); + __x86_perf_counter_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; - __hw_perf_save_counter(counter, hwc, idx); -} -static void x86_perf_counter_read(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - unsigned long addr = hwc->counter_base + hwc->idx; - s64 offs, val = -1LL; - s32 val32; - - /* Careful: NMI might modify the counter offset */ - do { - offs = hwc->prev_count; - rdmsrl(addr, val); - } while (offs != hwc->prev_count); - - val32 = (s32) val; - val = (s64)hwc->irq_period + (s64)val32; - atomic64_counter_set(counter, hwc->prev_count + val); + /* + * Drain the remaining delta count out of a counter + * that we are disabling: + */ + x86_perf_counter_update(counter, hwc, idx); } static void perf_store_irq_data(struct perf_counter *counter, u64 data) @@ -299,7 +331,8 @@ static void perf_store_irq_data(struct perf_counter *counter, u64 data) } /* - * NMI-safe enable method: + * Save and restart an expired counter. Called by NMI contexts, + * so it has to be careful about preempting normal counter ops: */ static void perf_save_and_restart(struct perf_counter *counter) { @@ -309,45 +342,25 @@ static void perf_save_and_restart(struct perf_counter *counter) rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - __hw_perf_save_counter(counter, hwc, idx); - __hw_perf_counter_set_period(hwc, idx); + x86_perf_counter_update(counter, hwc, idx); + __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(hwc, idx); + __x86_perf_counter_enable(counter, hwc, idx); } static void perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) { struct perf_counter *counter, *group_leader = sibling->group_leader; - int bit; /* - * Store the counter's own timestamp first: - */ - perf_store_irq_data(sibling, sibling->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(sibling)); - - /* - * Then store sibling timestamps (if any): + * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - /* - * When counter was not in the overflow mask, we have to - * read it from hardware. We read it as well, when it - * has not been read yet and clear the bit in the - * status mask. - */ - bit = counter->hw.idx; - if (!test_bit(bit, (unsigned long *) overflown) || - test_bit(bit, (unsigned long *) status)) { - clear_bit(bit, (unsigned long *) status); - perf_save_and_restart(counter); - } - } + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); - perf_store_irq_data(sibling, atomic64_counter_read(counter)); + perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } @@ -540,6 +553,11 @@ void __init init_hw_perf_counters(void) perf_counters_initialized = true; } +static void x86_perf_counter_read(struct perf_counter *counter) +{ + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); +} + static const struct hw_perf_counter_ops x86_perf_counter_ops = { .hw_perf_counter_enable = x86_perf_counter_enable, .hw_perf_counter_disable = x86_perf_counter_disable, diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8cb095fa442..72460289c65 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -91,14 +91,16 @@ struct perf_counter_hw_event { * struct hw_perf_counter - performance counter hardware details: */ struct hw_perf_counter { +#ifdef CONFIG_PERF_COUNTERS u64 config; unsigned long config_base; unsigned long counter_base; int nmi; unsigned int idx; - u64 prev_count; + atomic64_t prev_count; u64 irq_period; - s32 next_count; + atomic64_t period_left; +#endif }; /* @@ -140,17 +142,15 @@ enum perf_counter_active_state { * struct perf_counter - performance counter kernel representation: */ struct perf_counter { +#ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; -#if BITS_PER_LONG == 64 atomic64_t count; -#else - atomic_t count32[2]; -#endif + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -172,6 +172,7 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; +#endif }; /** @@ -220,8 +221,6 @@ extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); -extern void atomic64_counter_set(struct perf_counter *counter, u64 val64); -extern u64 atomic64_counter_read(struct perf_counter *counter); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 559130b8774..416861ce8b2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,67 +44,9 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } +void __weak hw_perf_restore(u64 ctrl) { } void __weak hw_perf_counter_setup(void) { } -#if BITS_PER_LONG == 64 - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 64 bit version - no complications. - */ -static inline u64 perf_counter_read_safe(struct perf_counter *counter) -{ - return (u64) atomic64_read(&counter->count); -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val) -{ - atomic64_set(&counter->count, val); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic64_read(&counter->count); -} - -#else - -/* - * Read the cached counter in counter safe against cross CPU / NMI - * modifications. 32 bit version. - */ -static u64 perf_counter_read_safe(struct perf_counter *counter) -{ - u32 cntl, cnth; - - local_irq_disable(); - do { - cnth = atomic_read(&counter->count32[1]); - cntl = atomic_read(&counter->count32[0]); - } while (cnth != atomic_read(&counter->count32[1])); - - local_irq_enable(); - - return cntl | ((u64) cnth) << 32; -} - -void atomic64_counter_set(struct perf_counter *counter, u64 val64) -{ - u32 *val32 = (void *)&val64; - - atomic_set(counter->count32 + 0, *(val32 + 0)); - atomic_set(counter->count32 + 1, *(val32 + 1)); -} - -u64 atomic64_counter_read(struct perf_counter *counter) -{ - return atomic_read(counter->count32 + 0) | - (u64) atomic_read(counter->count32 + 1) << 32; -} - -#endif - static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -280,11 +222,11 @@ static void __perf_install_in_context(void *info) ctx->nr_counters++; if (cpuctx->active_oncpu < perf_max_counters) { - counter->hw_ops->hw_perf_counter_enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; + counter->hw_ops->hw_perf_counter_enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -624,7 +566,7 @@ static u64 perf_counter_read(struct perf_counter *counter) __hw_perf_counter_read, counter, 1); } - return perf_counter_read_safe(counter); + return atomic64_read(&counter->count); } /* @@ -921,7 +863,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); - atomic64_counter_set(counter, cpu_clock(cpu)); + atomic64_set(&counter->count, cpu_clock(cpu)); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { @@ -940,7 +882,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_counter_set(counter, current->se.sum_exec_runtime); + atomic64_set(&counter->count, current->se.sum_exec_runtime); } static const struct hw_perf_counter_ops perf_ops_task_clock = { From 9b51f66dcb09ac5eb6bc68fc111d5c7a1e0131d6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 13:49:45 +0100 Subject: [PATCH 0024/2061] perfcounters: implement "counter inheritance" Impact: implement new performance feature Counter inheritance can be used to run performance counters in a workload, transparently - and pipe back the counter results to the parent counter. Inheritance for performance counters works the following way: when creating a counter it can be marked with the .inherit=1 flag. Such counters are then 'inherited' by all child tasks (be they fork()-ed or clone()-ed). These counters get inherited through exec() boundaries as well (except through setuid boundaries). The counter values get added back to the parent counter(s) when the child task(s) exit - much like stime/utime statistics are gathered. So inherited counters are ideal to gather summary statistics about an application's behavior via shell commands, without having to modify that application. The timec.c command utilizes counter inheritance: http://redhat.com/~mingo/perfcounters/timec.c Sample output: $ ./timec -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null Performance counter stats for 'ls': 163516953 instructions 2295 cache-misses 2855182 branch-misses Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 +++- kernel/exit.c | 7 +- kernel/perf_counter.c | 248 +++++++++++++++++++++++++++++------ 3 files changed, 228 insertions(+), 51 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 72460289c65..e5d25bf8f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -75,10 +75,11 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - __reserved_1 : 29; + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + __reserved_1 : 28; u64 __reserved_2; }; @@ -138,6 +139,8 @@ enum perf_counter_active_state { PERF_COUNTER_STATE_ACTIVE = 1, }; +struct file; + /** * struct perf_counter - performance counter kernel representation: */ @@ -156,7 +159,10 @@ struct perf_counter { struct perf_counter_context *ctx; struct task_struct *task; + struct file *filp; + unsigned int nr_inherited; + struct perf_counter *parent; /* * Protect attach/detach: */ @@ -210,13 +216,16 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS +extern void +perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *task); +extern void perf_counter_init_task(struct task_struct *child); +extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); extern u64 hw_perf_save_disable(void); @@ -226,12 +235,15 @@ extern int perf_counter_task_enable(void); #else static inline void +perf_counter_show(struct perf_counter *counter, char *str, int trace) { } +static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *task) { } +static inline void perf_counter_init_task(struct task_struct *child) { } +static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void hw_perf_restore(u64 ctrl) { } diff --git a/kernel/exit.c b/kernel/exit.c index 2d8be7ebb0f..d336c90a5f1 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1093,11 +1093,12 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif -#ifdef CONFIG_FUTEX /* - * This must happen late, after the PID is not - * hashed anymore: + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: */ + perf_counter_exit_task(tsk); +#ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); if (unlikely(current->pi_state_cache)) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 416861ce8b2..f5e81dd193d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -80,8 +80,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&sibling->list_entry); list_add_tail(&sibling->list_entry, &ctx->counter_list); - WARN_ON_ONCE(!sibling->group_leader); - WARN_ON_ONCE(sibling->group_leader == sibling); sibling->group_leader = sibling; } } @@ -97,6 +95,7 @@ static void __perf_counter_remove_from_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; u64 perf_flags; /* @@ -107,7 +106,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->hw_perf_counter_disable(counter); @@ -136,7 +135,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -199,6 +198,7 @@ static void __perf_install_in_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; int cpu = smp_processor_id(); + unsigned long flags; u64 perf_flags; /* @@ -209,7 +209,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * Protect the list operation against NMI by disabling the @@ -232,7 +232,7 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock(&ctx->lock); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -446,10 +446,9 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - WARN_ON_ONCE(counter->state == PERF_COUNTER_STATE_ACTIVE); + list_for_each_entry(counter, &ctx->counter_list, list_entry) counter->state = PERF_COUNTER_STATE_OFF; - } + hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -525,26 +524,6 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) perf_counter_task_sched_in(curr, cpu); } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - spin_lock_init(&ctx->lock); - INIT_LIST_HEAD(&ctx->counter_list); - ctx->nr_counters = 0; - ctx->task = task; -} -/* - * Initialize the perf_counter context in task_struct - */ -void perf_counter_init_task(struct task_struct *task) -{ - __perf_counter_init_context(&task->perf_counter_ctx, task); -} - /* * Cross CPU call to read the hardware counter */ @@ -663,7 +642,6 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &cpuctx->ctx; - WARN_ON_ONCE(ctx->task); return ctx; } @@ -915,12 +893,13 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, - struct perf_counter *group_leader) + struct perf_counter *group_leader, + gfp_t gfpflags) { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; - counter = kzalloc(sizeof(*counter), GFP_KERNEL); + counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) return NULL; @@ -947,9 +926,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) { + if (!hw_ops) hw_ops = hw_perf_counter_init(counter); - } if (!hw_ops) { kfree(counter); @@ -975,8 +953,10 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; struct perf_counter_context *ctx; + struct file *counter_file = NULL; struct file *group_file = NULL; int fput_needed = 0; + int fput_needed2 = 0; int ret; if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) @@ -1017,25 +997,29 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader); + counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); if (!counter) goto err_put_context; - perf_install_in_context(ctx, counter, cpu); - ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); if (ret < 0) - goto err_remove_free_put_context; + goto err_free_put_context; + + counter_file = fget_light(ret, &fput_needed2); + if (!counter_file) + goto err_free_put_context; + + counter->filp = counter_file; + perf_install_in_context(ctx, counter, cpu); + + fput_light(counter_file, fput_needed2); out_fput: fput_light(group_file, fput_needed); return ret; -err_remove_free_put_context: - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - mutex_unlock(&counter->mutex); +err_free_put_context: kfree(counter); err_put_context: @@ -1044,6 +1028,186 @@ err_put_context: goto out_fput; } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + INIT_LIST_HEAD(&ctx->counter_list); + ctx->task = task; +} + +/* + * inherit a counter from parent task to child task: + */ +static int +inherit_counter(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *child_counter; + + child_counter = perf_counter_alloc(&parent_counter->hw_event, + parent_counter->cpu, NULL, + GFP_ATOMIC); + if (!child_counter) + return -ENOMEM; + + /* + * Link it up in the child's context: + */ + child_counter->ctx = child_ctx; + child_counter->task = child; + list_add_counter(child_counter, child_ctx); + child_ctx->nr_counters++; + + child_counter->parent = parent_counter; + parent_counter->nr_inherited++; + /* + * inherit into child's child as well: + */ + child_counter->hw_event.inherit = 1; + + /* + * Get a reference to the parent filp - we will fput it + * when the child counter exits. This is safe to do because + * we are in the parent and we know that the filp still + * exists and has a nonzero count: + */ + atomic_long_inc(&parent_counter->filp->f_count); + + return 0; +} + +static void +__perf_counter_exit_task(struct task_struct *child, + struct perf_counter *child_counter, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *parent_counter; + u64 parent_val, child_val; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + local_irq_disable(); + perf_flags = hw_perf_save_disable(); + + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + list_del_init(&child_counter->list_entry); + + hw_perf_restore(perf_flags); + local_irq_enable(); + + parent_counter = child_counter->parent; + /* + * It can happen that parent exits first, and has counters + * that are still around due to the child reference. These + * counters need to be zapped - but otherwise linger. + */ + if (!parent_counter) + return; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + fput(parent_counter->filp); + + kfree(child_counter); +} + +/* + * When a child task exist, feed back counter values to parent counters. + * + * Note: we are running in child context, but the PID is not hashed + * anymore so new counters will not be added. + */ +void perf_counter_exit_task(struct task_struct *child) +{ + struct perf_counter *child_counter, *tmp; + struct perf_counter_context *child_ctx; + + child_ctx = &child->perf_counter_ctx; + + if (likely(!child_ctx->nr_counters)) + return; + + list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, + list_entry) + __perf_counter_exit_task(child, child_counter, child_ctx); +} + +/* + * Initialize the perf_counter context in task_struct + */ +void perf_counter_init_task(struct task_struct *child) +{ + struct perf_counter_context *child_ctx, *parent_ctx; + struct perf_counter *counter, *parent_counter; + struct task_struct *parent = current; + unsigned long flags; + + child_ctx = &child->perf_counter_ctx; + parent_ctx = &parent->perf_counter_ctx; + + __perf_counter_init_context(child_ctx, child); + + /* + * This is executed from the parent task context, so inherit + * counters that have been marked for cloning: + */ + + if (likely(!parent_ctx->nr_counters)) + return; + + /* + * Lock the parent list. No need to lock the child - not PID + * hashed yet and not running, so nobody can access it. + */ + spin_lock_irqsave(&parent_ctx->lock, flags); + + /* + * We dont have to disable NMIs - we are only looking at + * the list, not manipulating it: + */ + list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + if (!counter->hw_event.inherit || counter->group_leader != counter) + continue; + + /* + * Instead of creating recursive hierarchies of counters, + * we link inheritd counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + parent_counter = counter; + if (counter->parent) + parent_counter = counter->parent; + + if (inherit_counter(parent_counter, parent, + parent_ctx, child, child_ctx)) + break; + } + + spin_unlock_irqrestore(&parent_ctx->lock, flags); +} + static void __cpuinit perf_counter_init_cpu(int cpu) { struct perf_cpu_context *cpuctx; From 8cb391e8786c8072367f0aeb90551903fef074ba Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:22:31 +0100 Subject: [PATCH 0025/2061] perfcounters: fix task clock counter Impact: bugfix Update the task clock counter to the new math. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f5e81dd193d..1f81cde0dc4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -850,17 +850,36 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .hw_perf_counter_read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static void task_clock_perf_counter_update(struct perf_counter *counter) { -} + u64 prev, now; + s64 delta; -static void task_clock_perf_counter_disable(struct perf_counter *counter) -{ + prev = atomic64_read(&counter->hw.prev_count); + now = current->se.sum_exec_runtime; + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); } static void task_clock_perf_counter_read(struct perf_counter *counter) { - atomic64_set(&counter->count, current->se.sum_exec_runtime); + task_clock_perf_counter_update(counter); +} + +static void task_clock_perf_counter_enable(struct perf_counter *counter) +{ + atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); +} + +static void task_clock_perf_counter_disable(struct perf_counter *counter) +{ + task_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_task_clock = { From 5d6a27d8a096868ae313f71f563b06074a7e34fe Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:28:33 +0100 Subject: [PATCH 0026/2061] perfcounters: add context switch counter Impact: add new feature, new sw counter Add a counter that counts the number of context-switches a task is doing. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +-- kernel/perf_counter.c | 51 ++++++++++++++++++++++++++++++++++++ 2 files changed, 53 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e5d25bf8f74..d2a16563415 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,8 +53,8 @@ enum hw_event_types { /* * Future software events: */ - /* PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, */ + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1f81cde0dc4..09287091c52 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_context_switches(void) +{ + struct task_struct *curr = current; + + return curr->nvcsw + curr->nivcsw; +} + +static void context_switches_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_context_switches(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void context_switches_perf_counter_read(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static void context_switches_perf_counter_enable(struct perf_counter *counter) +{ + /* + * ->nvcsw + curr->nivcsw is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void context_switches_perf_counter_disable(struct perf_counter *counter) +{ + context_switches_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_context_switches = { + .hw_perf_counter_enable = context_switches_perf_counter_enable, + .hw_perf_counter_disable = context_switches_perf_counter_disable, + .hw_perf_counter_read = context_switches_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -900,6 +948,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_CONTEXT_SWITCHES: + hw_ops = &perf_ops_context_switches; + break; default: break; } From 6c594c21fcb02c662f11c97be4d7d2b73060a205 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 12:34:15 +0100 Subject: [PATCH 0027/2061] perfcounters: add task migrations counter Impact: add new feature, new sw counter Add a counter that counts the number of cross-CPU migrations a task is suffering. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +++--- include/linux/sched.h | 3 ++- kernel/perf_counter.c | 49 ++++++++++++++++++++++++++++++++++++ kernel/sched.c | 7 ++++-- 4 files changed, 61 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d2a16563415..f30486fc55d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -42,6 +42,8 @@ enum hw_event_types { PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_HW_EVENTS_MAX = 6, + /* * Special "software" counters provided by the kernel, even if * the hardware does not support performance counters. These @@ -50,11 +52,11 @@ enum hw_event_types { */ PERF_COUNT_CPU_CLOCK = -1, PERF_COUNT_TASK_CLOCK = -2, - /* - * Future software events: - */ PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index 4c530278391..2e15be8fc79 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1014,6 +1014,8 @@ struct sched_entity { u64 last_wakeup; u64 avg_overlap; + u64 nr_migrations; + #ifdef CONFIG_SCHEDSTATS u64 wait_start; u64 wait_max; @@ -1029,7 +1031,6 @@ struct sched_entity { u64 exec_max; u64 slice_max; - u64 nr_migrations; u64 nr_migrations_cold; u64 nr_failed_migrations_affine; u64 nr_failed_migrations_running; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09287091c52..fb11e351e44 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -936,6 +936,52 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .hw_perf_counter_read = context_switches_perf_counter_read, }; +static inline u64 get_cpu_migrations(void) +{ + return current->se.nr_migrations; +} + +static void cpu_migrations_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_cpu_migrations(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void cpu_migrations_perf_counter_read(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +{ + /* + * se.nr_migrations is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) +{ + cpu_migrations_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { + .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, + .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, + .hw_perf_counter_read = cpu_migrations_perf_counter_read, +}; + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -951,6 +997,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; + case PERF_COUNT_CPU_MIGRATIONS: + hw_ops = &perf_ops_cpu_migrations; + break; default: break; } diff --git a/kernel/sched.c b/kernel/sched.c index 5c3f4106314..382cfdb5e38 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1852,12 +1852,14 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) p->se.sleep_start -= clock_offset; if (p->se.block_start) p->se.block_start -= clock_offset; +#endif if (old_cpu != new_cpu) { - schedstat_inc(p, se.nr_migrations); + p->se.nr_migrations++; +#ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); - } #endif + } p->se.vruntime -= old_cfsrq->min_vruntime - new_cfsrq->min_vruntime; @@ -2375,6 +2377,7 @@ static void __sched_fork(struct task_struct *p) p->se.exec_start = 0; p->se.sum_exec_runtime = 0; p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; p->se.last_wakeup = 0; p->se.avg_overlap = 0; From e06c61a879910869aa5bf3f8f634abfee1a7bebc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 14:44:31 +0100 Subject: [PATCH 0028/2061] perfcounters: add nr-of-faults counter Impact: add new feature, new sw counter Add a counter that counts the number of pagefaults a task is experiencing. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 51 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fb11e351e44..59c52f9ee43 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -888,6 +888,54 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .hw_perf_counter_read = task_clock_perf_counter_read, }; +static u64 get_page_faults(void) +{ + struct task_struct *curr = current; + + return curr->maj_flt + curr->min_flt; +} + +static void page_faults_perf_counter_update(struct perf_counter *counter) +{ + u64 prev, now; + s64 delta; + + prev = atomic64_read(&counter->hw.prev_count); + now = get_page_faults(); + + atomic64_set(&counter->hw.prev_count, now); + + delta = now - prev; + if (WARN_ON_ONCE(delta < 0)) + delta = 0; + + atomic64_add(delta, &counter->count); +} + +static void page_faults_perf_counter_read(struct perf_counter *counter) +{ + page_faults_perf_counter_update(counter); +} + +static void page_faults_perf_counter_enable(struct perf_counter *counter) +{ + /* + * page-faults is a per-task value already, + * so we dont have to clear it on switch-in. + */ +} + +static void page_faults_perf_counter_disable(struct perf_counter *counter) +{ + page_faults_perf_counter_update(counter); +} + +static const struct hw_perf_counter_ops perf_ops_page_faults = { + .hw_perf_counter_enable = page_faults_perf_counter_enable, + .hw_perf_counter_disable = page_faults_perf_counter_disable, + .hw_perf_counter_read = page_faults_perf_counter_read, +}; + static u64 get_context_switches(void) { struct task_struct *curr = current; @@ -994,6 +1042,9 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_TASK_CLOCK: hw_ops = &perf_ops_task_clock; break; + case PERF_COUNT_PAGE_FAULTS: + hw_ops = &perf_ops_page_faults; + break; case PERF_COUNT_CONTEXT_SWITCHES: hw_ops = &perf_ops_context_switches; break; From 2b9ff0db19b5e2c77000b7201525f9c3d6e8328d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 18:36:30 +0100 Subject: [PATCH 0029/2061] perfcounters: fix non-intel-perfmon CPUs Do not write MSR_CORE_PERF_GLOBAL_CTRL on CPUs where it does not exist. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5afae13d8d5..6d30f603b62 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -157,6 +157,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) void hw_perf_enable_all(void) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); } @@ -164,14 +167,21 @@ u64 hw_perf_save_disable(void) { u64 ctrl; + if (unlikely(!perf_counters_initialized)) + return 0; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + return ctrl; } EXPORT_SYMBOL_GPL(hw_perf_save_disable); void hw_perf_restore(u64 ctrl) { + if (unlikely(!perf_counters_initialized)) + return; + wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); } EXPORT_SYMBOL_GPL(hw_perf_restore); From 088e2852c858159d47f71ee8da38e0fb1b21f806 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 20:21:00 +0100 Subject: [PATCH 0030/2061] perfcounters, x86: fix sw counters on non-PMC CPUs Make perf_max_counters default to at least 1 - this allows the sw counters to be used. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 59c52f9ee43..539fa8283a0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,7 +25,7 @@ */ DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); -int perf_max_counters __read_mostly; +int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; From 75f224cf7700ed6006574dc3f2efa29860727570 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 21:58:46 +0100 Subject: [PATCH 0031/2061] perfcounters: fix lapic initialization Fix non-working NMI sampling in certain bootup scenarios. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6d30f603b62..8a154bd7ba9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -557,10 +557,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + perf_counters_initialized = true; + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); - - perf_counters_initialized = true; } static void x86_perf_counter_read(struct perf_counter *counter) From 0cc0c027d4e028632933f1be2dc4cd730358183b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Dec 2008 23:20:36 +0100 Subject: [PATCH 0032/2061] perfcounters: release CPU context when exiting task counters If counters are exiting via do_exit() not via filp close, then the CPU context needs to be released - otherwise future percpu counter creations might fail. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 539fa8283a0..16396e9406f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1273,8 +1273,19 @@ __perf_counter_exit_task(struct task_struct *child, local_irq_disable(); perf_flags = hw_perf_save_disable(); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + struct perf_cpu_context *cpuctx; + + cpuctx = &__get_cpu_var(perf_cpu_context); + child_counter->hw_ops->hw_perf_counter_disable(child_counter); + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + child_counter->oncpu = -1; + + cpuctx->active_oncpu--; + child_ctx->nr_active--; + } + list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); @@ -1539,4 +1550,3 @@ static int __init perf_counter_sysfs_init(void) &perfclass_attr_group); } device_initcall(perf_counter_sysfs_init); - From f65cb45cba63f249458b669aa67069eabc37b2f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 16 Dec 2008 13:40:44 +0100 Subject: [PATCH 0033/2061] perfcounters: flush on setuid exec Pavel Machek pointed out that performance counters should be flushed when crossing protection domains on setuid execution. Reported-by: Pavel Machek Acked-by: Pavel Machek Signed-off-by: Ingo Molnar --- fs/exec.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/exec.c b/fs/exec.c index ec5df9a3831..d5165d899a4 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include @@ -1017,6 +1018,13 @@ int flush_old_exec(struct linux_binprm * bprm) set_dumpable(current->mm, suid_dumpable); } + /* + * Flush performance counters when crossing a + * security domain: + */ + if (!get_dumpable(current->mm)) + perf_counter_exit_task(current); + /* An exec changes our domain. We are no longer part of the thread group */ From a86ed50859d65a08beec9474df97b88438a996df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 00:43:10 +0100 Subject: [PATCH 0034/2061] perfcounters: use hw_event.disable flag Impact: implement default-off counters Make sure that counters that are created with counter.hw_event.disabled=1, get created in disabled state. They can be enabled via: prctl(PR_TASK_PERF_COUNTERS_ENABLE); Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16396e9406f..5431e790b5d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1093,6 +1093,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; + if (hw_event->disabled) + counter->state = PERF_COUNTER_STATE_OFF; + hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); From 94c46572a6d9bb497eda0a14099d9f1360d57d5d Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Date: Fri, 19 Dec 2008 22:37:58 +0530 Subject: [PATCH 0035/2061] x86: perf_counter.c intel_perfmon_event_map and max_intel_perfmon_events should be static Impact: cleanup, avoid sparse warnings, reduce kernel size a bit Fixes these sparse warnings: arch/x86/kernel/cpu/perf_counter.c:44:11: warning: symbol 'intel_perfmon_event_map' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:54:11: warning: symbol 'max_intel_perfmon_events' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8a154bd7ba9..bdbdb56eaa3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,7 +41,7 @@ struct cpu_hw_counters { */ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); -const int intel_perfmon_event_map[] = +static const int intel_perfmon_event_map[] = { [PERF_COUNT_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -51,7 +51,7 @@ const int intel_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); /* * Propagate counter elapsed time into the generic counter. From 8fb9331391af95ca1f4e5c0a0da8120b13cbae01 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:04:16 +0100 Subject: [PATCH 0036/2061] perfcounters: remove warnings Impact: remove debug checks Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ------- include/linux/perf_counter.h | 4 ---- kernel/perf_counter.c | 8 -------- 3 files changed, 19 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index bdbdb56eaa3..89fad5d4fb3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -64,7 +64,6 @@ x86_perf_counter_update(struct perf_counter *counter, { u64 prev_raw_count, new_raw_count, delta; - WARN_ON_ONCE(counter->state != PERF_COUNTER_STATE_ACTIVE); /* * Careful: an NMI might modify the previous counter value. * @@ -89,7 +88,6 @@ again: * of the count, so we do that by clipping the delta to 32 bits: */ delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); - WARN_ON_ONCE((int)delta < 0); atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); @@ -193,7 +191,6 @@ __x86_perf_counter_disable(struct perf_counter *counter, int err; err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); - WARN_ON_ONCE(err); } static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); @@ -209,8 +206,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, s32 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; - WARN_ON_ONCE(period <= 0); - /* * If we are way outside a reasoable range then just skip forward: */ @@ -224,8 +219,6 @@ __hw_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->period_left, left); } - WARN_ON_ONCE(left <= 0); - per_cpu(prev_left[idx], smp_processor_id()) = left; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f30486fc55d..d038450de87 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -218,8 +218,6 @@ struct perf_cpu_context { extern int perf_max_counters; #ifdef CONFIG_PERF_COUNTERS -extern void -perf_counter_show(struct perf_counter *counter, char *str, int trace); extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); @@ -237,8 +235,6 @@ extern int perf_counter_task_enable(void); #else static inline void -perf_counter_show(struct perf_counter *counter, char *str, int trace) { } -static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void perf_counter_task_sched_out(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5431e790b5d..aab6c123b02 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -861,8 +861,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -906,8 +904,6 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -954,8 +950,6 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } @@ -1000,8 +994,6 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) atomic64_set(&counter->hw.prev_count, now); delta = now - prev; - if (WARN_ON_ONCE(delta < 0)) - delta = 0; atomic64_add(delta, &counter->count); } From 7995888fcb0246543ee8027bf2835a250ba8c925 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 08:54:56 +0100 Subject: [PATCH 0037/2061] perfcounters: tweak group scheduling Impact: schedule in groups atomically If there are multiple groups in a task, make sure they are scheduled in and out atomically. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index aab6c123b02..f8a4d9a5d5d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -367,21 +367,26 @@ counter_sched_in(struct perf_counter *counter, ctx->nr_active++; } -static void +static int group_sched_in(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { struct perf_counter *counter; + int was_group = 0; counter_sched_in(group_counter, cpuctx, ctx, cpu); /* * Schedule in siblings as one group (if any): */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { counter_sched_in(counter, cpuctx, ctx, cpu); + was_group = 1; + } + + return was_group; } /* @@ -416,7 +421,12 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (counter->cpu != -1 && counter->cpu != cpu) continue; - group_sched_in(counter, cpuctx, ctx, cpu); + /* + * If we scheduled in a group atomically and + * exclusively, break out: + */ + if (group_sched_in(counter, cpuctx, ctx, cpu)) + break; } spin_unlock(&ctx->lock); From 5c167b8585c8d91206b395d57011ead7711e322f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:02:19 +0100 Subject: [PATCH 0038/2061] x86, perfcounters: rename intel_arch_perfmon.h => perf_counter.h Impact: rename include file We'll be providing an asm/perf_counter.h to the generic perfcounter code, so use the already existing x86 file for this purpose and rename it. Signed-off-by: Ingo Molnar --- .../include/asm/{intel_arch_perfmon.h => perf_counter.h} | 6 +++--- arch/x86/kernel/apic.c | 2 +- arch/x86/kernel/cpu/common.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/kernel/cpu/perfctr-watchdog.c | 2 +- arch/x86/oprofile/op_model_ppro.c | 2 +- 6 files changed, 8 insertions(+), 8 deletions(-) rename arch/x86/include/asm/{intel_arch_perfmon.h => perf_counter.h} (90%) diff --git a/arch/x86/include/asm/intel_arch_perfmon.h b/arch/x86/include/asm/perf_counter.h similarity index 90% rename from arch/x86/include/asm/intel_arch_perfmon.h rename to arch/x86/include/asm/perf_counter.h index 71598a9eab6..9dadce1124e 100644 --- a/arch/x86/include/asm/intel_arch_perfmon.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,5 +1,5 @@ -#ifndef _ASM_X86_INTEL_ARCH_PERFMON_H -#define _ASM_X86_INTEL_ARCH_PERFMON_H +#ifndef _ASM_X86_PERF_COUNTER_H +#define _ASM_X86_PERF_COUNTER_H #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -38,4 +38,4 @@ static inline void init_hw_perf_counters(void) { } static inline void perf_counters_lapic_init(int nmi) { } #endif -#endif /* _ASM_X86_INTEL_ARCH_PERFMON_H */ +#endif /* _ASM_X86_PERF_COUNTER_H */ diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 0579ec1cd6e..4f859acb156 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -31,7 +31,7 @@ #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 4461011db47..ad331b4d623 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 89fad5d4fb3..a4a3a09a654 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,7 +16,7 @@ #include #include -#include +#include #include static bool perf_counters_initialized __read_mostly; diff --git a/arch/x86/kernel/cpu/perfctr-watchdog.c b/arch/x86/kernel/cpu/perfctr-watchdog.c index 9abd48b2267..d6f5b9fbde3 100644 --- a/arch/x86/kernel/cpu/perfctr-watchdog.c +++ b/arch/x86/kernel/cpu/perfctr-watchdog.c @@ -20,7 +20,7 @@ #include #include -#include +#include struct nmi_watchdog_ctlblk { unsigned int cccr_msr; diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index e9f80c744cf..07c914555a5 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include "op_x86_model.h" #include "op_counter.h" From eb2b861810d4ff72454c83996b891df4e0aaff9a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 09:09:13 +0100 Subject: [PATCH 0039/2061] x86, perfcounters: prepare for fixed-mode PMCs Impact: refactor the x86 code for fixed-mode PMCs Extend the data structures and rename the existing facilities to allow for a 'generic' versus 'fixed' counter distinction. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 11 ++++++ arch/x86/kernel/cpu/perf_counter.c | 53 ++++++++++++++--------------- include/linux/perf_counter.h | 1 + 3 files changed, 38 insertions(+), 27 deletions(-) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 9dadce1124e..dd5a4a559e2 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -1,6 +1,13 @@ #ifndef _ASM_X86_PERF_COUNTER_H #define _ASM_X86_PERF_COUNTER_H +/* + * Performance counter hw details: + */ + +#define X86_PMC_MAX_GENERIC 8 +#define X86_PMC_MAX_FIXED 3 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -20,6 +27,10 @@ #define ARCH_PERFMON_BRANCH_MISSES_RETIRED 6 +/* + * Intel "Architectural Performance Monitoring" CPUID + * detection/enumeration details: + */ union cpuid10_eax { struct { unsigned int version_id:8; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a4a3a09a654..fc3af868823 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,13 +27,12 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; -/* No support for fixed function counters yet */ - -#define MAX_HW_COUNTERS 8 - struct cpu_hw_counters { - struct perf_counter *counters[MAX_HW_COUNTERS]; - unsigned long used[BITS_TO_LONGS(MAX_HW_COUNTERS)]; + struct perf_counter *generic[X86_PMC_MAX_GENERIC]; + unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; + + struct perf_counter *fixed[X86_PMC_MAX_FIXED]; + unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; }; /* @@ -185,7 +184,7 @@ void hw_perf_restore(u64 ctrl) EXPORT_SYMBOL_GPL(hw_perf_restore); static inline void -__x86_perf_counter_disable(struct perf_counter *counter, +__pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; @@ -193,7 +192,7 @@ __x86_perf_counter_disable(struct perf_counter *counter, err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[MAX_HW_COUNTERS]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -231,7 +230,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, } static void -__x86_perf_counter_enable(struct perf_counter *counter, +__pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { wrmsr(hwc->config_base + idx, @@ -241,7 +240,7 @@ __x86_perf_counter_enable(struct perf_counter *counter, /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void x86_perf_counter_enable(struct perf_counter *counter) +static void pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -256,12 +255,12 @@ static void x86_perf_counter_enable(struct perf_counter *counter) perf_counters_lapic_init(hwc->nmi); - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); - cpuc->counters[idx] = counter; + cpuc->generic[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } void perf_counter_print_debug(void) @@ -301,16 +300,16 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void x86_perf_counter_disable(struct perf_counter *counter) +static void pmc_generic_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __x86_perf_counter_disable(counter, hwc, idx); + __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; + cpuc->generic[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -349,7 +348,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) - __x86_perf_counter_enable(counter, hwc, idx); + __pmc_generic_enable(counter, hwc, idx); } static void @@ -392,7 +391,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -412,7 +411,7 @@ again: } /* * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these counters as + * do a task wakeup - but we mark these generic as * wakeup_pending and initate a wakeup callback: */ if (nmi) { @@ -462,7 +461,7 @@ void perf_counter_notify(struct pt_regs *regs) cpuc = &per_cpu(cpu_hw_counters, cpu); for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->counters[bit]; + struct perf_counter *counter = cpuc->generic[bit]; if (!counter) continue; @@ -539,10 +538,10 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > MAX_HW_COUNTERS) { - nr_hw_counters = MAX_HW_COUNTERS; + if (nr_hw_counters > X86_PMC_MAX_GENERIC) { + nr_hw_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, MAX_HW_COUNTERS); + nr_hw_counters, X86_PMC_MAX_GENERIC); } perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; @@ -556,15 +555,15 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_perf_counter_read(struct perf_counter *counter) +static void pmc_generic_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = x86_perf_counter_enable, - .hw_perf_counter_disable = x86_perf_counter_disable, - .hw_perf_counter_read = x86_perf_counter_read, + .hw_perf_counter_enable = pmc_generic_enable, + .hw_perf_counter_disable = pmc_generic_disable, + .hw_perf_counter_read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d038450de87..984da540224 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #include #include From 703e937c83bbad79075a7846e062e447c2fee6a4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 10:51:15 +0100 Subject: [PATCH 0040/2061] perfcounters: add fixed-mode PMC enumeration Enumerate fixed-mode PMCs based on CPUID, and feed that into the perfcounter code. Does not use fixed-mode PMCs yet. Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 23 +++++++++++++++++++++++ arch/x86/kernel/cpu/perf_counter.c | 23 +++++++++++++++++------ 2 files changed, 40 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index dd5a4a559e2..945a315e6d6 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -41,6 +41,29 @@ union cpuid10_eax { unsigned int full; }; +union cpuid10_edx { + struct { + unsigned int num_counters_fixed:4; + unsigned int reserved:28; + } split; + unsigned int full; +}; + + +/* + * Fixed-purpose performance counters: + */ + +/* Instr_Retired.Any: */ +#define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 + +/* CPU_CLK_Unhalted.Core: */ +#define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a + +/* CPU_CLK_Unhalted.Ref: */ +#define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b + + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc3af868823..2fca50c4597 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -27,6 +27,8 @@ static bool perf_counters_initialized __read_mostly; static int nr_hw_counters __read_mostly; static u32 perf_counter_mask __read_mostly; +static int nr_hw_counters_fixed __read_mostly; + struct cpu_hw_counters { struct perf_counter *generic[X86_PMC_MAX_GENERIC]; unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; @@ -519,8 +521,9 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { void __init init_hw_perf_counters(void) { union cpuid10_eax eax; - unsigned int unused; unsigned int ebx; + unsigned int unused; + union cpuid10_edx edx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -529,14 +532,14 @@ void __init init_hw_perf_counters(void) * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ - cpuid(10, &(eax.full), &ebx, &unused, &unused); + cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num_counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... version: %d\n", eax.split.version_id); + printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); nr_hw_counters = eax.split.num_counters; if (nr_hw_counters > X86_PMC_MAX_GENERIC) { nr_hw_counters = X86_PMC_MAX_GENERIC; @@ -546,8 +549,16 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_hw_counters) - 1; perf_max_counters = nr_hw_counters; - printk(KERN_INFO "... bit_width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask_length: %d\n", eax.split.mask_length); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + + nr_hw_counters_fixed = edx.split.num_counters_fixed; + if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { + nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", + nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + } + printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); perf_counters_initialized = true; From 862a1a5f346fe7e9181ea51eaae48cf2cd70f746 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 13:09:20 +0100 Subject: [PATCH 0041/2061] x86, perfcounters: refactor code for fixed-function PMCs Impact: clean up Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 14 +++++- arch/x86/kernel/cpu/perf_counter.c | 73 +++++++++++++++-------------- 2 files changed, 52 insertions(+), 35 deletions(-) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 945a315e6d6..13745deb16c 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -8,6 +8,10 @@ #define X86_PMC_MAX_GENERIC 8 #define X86_PMC_MAX_FIXED 3 +#define X86_PMC_IDX_GENERIC 0 +#define X86_PMC_IDX_FIXED 32 +#define X86_PMC_IDX_MAX 64 + #define MSR_ARCH_PERFMON_PERFCTR0 0xc1 #define MSR_ARCH_PERFMON_PERFCTR1 0xc2 @@ -54,6 +58,15 @@ union cpuid10_edx { * Fixed-purpose performance counters: */ +/* + * All 3 fixed-mode PMCs are configured via this single MSR: + */ +#define MSR_ARCH_PERFMON_FIXED_CTR_CTRL 0x38d + +/* + * The counts are available in three separate MSRs: + */ + /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 @@ -63,7 +76,6 @@ union cpuid10_edx { /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b - #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2fca50c4597..358af526640 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,17 +24,14 @@ static bool perf_counters_initialized __read_mostly; /* * Number of (generic) HW counters: */ -static int nr_hw_counters __read_mostly; -static u32 perf_counter_mask __read_mostly; +static int nr_counters_generic __read_mostly; +static u64 perf_counter_mask __read_mostly; -static int nr_hw_counters_fixed __read_mostly; +static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { - struct perf_counter *generic[X86_PMC_MAX_GENERIC]; - unsigned long used[BITS_TO_LONGS(X86_PMC_MAX_GENERIC)]; - - struct perf_counter *fixed[X86_PMC_MAX_FIXED]; - unsigned long used_fixed[BITS_TO_LONGS(X86_PMC_MAX_FIXED)]; + struct perf_counter *counters[X86_PMC_IDX_MAX]; + unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; }; /* @@ -159,7 +156,7 @@ void hw_perf_enable_all(void) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); } u64 hw_perf_save_disable(void) @@ -170,7 +167,7 @@ u64 hw_perf_save_disable(void) return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } @@ -181,7 +178,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, ctrl, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -239,6 +236,11 @@ __pmc_generic_enable(struct perf_counter *counter, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } +static int fixed_mode_idx(struct hw_perf_counter *hwc) +{ + return -1; +} + /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ @@ -250,7 +252,7 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_hw_counters); + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -259,7 +261,7 @@ static void pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); - cpuc->generic[idx] = counter; + cpuc->counters[idx] = counter; __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -270,7 +272,7 @@ void perf_counter_print_debug(void) u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; int cpu, idx; - if (!nr_hw_counters) + if (!nr_counters_generic) return; local_irq_disable(); @@ -286,7 +288,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - for (idx = 0; idx < nr_hw_counters; idx++) { + for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); @@ -311,7 +313,7 @@ static void pmc_generic_disable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); - cpuc->generic[idx] = NULL; + cpuc->counters[idx] = NULL; /* * Drain the remaining delta count out of a counter @@ -381,7 +383,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); /* Disable counters globally */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, 0, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -392,8 +394,8 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); if (!counter) @@ -424,7 +426,7 @@ again: } } - wrmsr(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); /* * Repeat if there is more work to be done: @@ -436,7 +438,7 @@ out: /* * Restore - do not reenable when global enable is off: */ - wrmsr(MSR_CORE_PERF_GLOBAL_CTRL, saved_global, 0); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); } void smp_perf_counter_interrupt(struct pt_regs *regs) @@ -462,8 +464,8 @@ void perf_counter_notify(struct pt_regs *regs) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - for_each_bit(bit, cpuc->used, nr_hw_counters) { - struct perf_counter *counter = cpuc->generic[bit]; + for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { + struct perf_counter *counter = cpuc->counters[bit]; if (!counter) continue; @@ -540,26 +542,29 @@ void __init init_hw_perf_counters(void) printk(KERN_INFO "... version: %d\n", eax.split.version_id); printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); - nr_hw_counters = eax.split.num_counters; - if (nr_hw_counters > X86_PMC_MAX_GENERIC) { - nr_hw_counters = X86_PMC_MAX_GENERIC; + nr_counters_generic = eax.split.num_counters; + if (nr_counters_generic > X86_PMC_MAX_GENERIC) { + nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_hw_counters, X86_PMC_MAX_GENERIC); + nr_counters_generic, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_hw_counters) - 1; - perf_max_counters = nr_hw_counters; + perf_counter_mask = (1 << nr_counters_generic) - 1; + perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - nr_hw_counters_fixed = edx.split.num_counters_fixed; - if (nr_hw_counters_fixed > X86_PMC_MAX_FIXED) { - nr_hw_counters_fixed = X86_PMC_MAX_FIXED; + nr_counters_fixed = edx.split.num_counters_fixed; + if (nr_counters_fixed > X86_PMC_MAX_FIXED) { + nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_hw_counters_fixed, X86_PMC_MAX_FIXED); + nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_hw_counters_fixed); + printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + + printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); From 7671581f1666ef4b54a1c1e598c51ac44c060a9b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:20:28 +0100 Subject: [PATCH 0042/2061] perfcounters: hw ops rename Impact: rename field names Shorten them. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++-- include/linux/perf_counter.h | 6 ++-- kernel/perf_counter.c | 50 +++++++++++++++--------------- 3 files changed, 31 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 358af526640..b6755712142 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -577,9 +577,9 @@ static void pmc_generic_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .hw_perf_counter_enable = pmc_generic_enable, - .hw_perf_counter_disable = pmc_generic_disable, - .hw_perf_counter_read = pmc_generic_read, + .enable = pmc_generic_enable, + .disable = pmc_generic_disable, + .read = pmc_generic_read, }; const struct hw_perf_counter_ops * diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 984da540224..48f76d2e54c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,9 +128,9 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*hw_perf_counter_enable) (struct perf_counter *counter); - void (*hw_perf_counter_disable) (struct perf_counter *counter); - void (*hw_perf_counter_read) (struct perf_counter *counter); + void (*enable) (struct perf_counter *counter); + void (*disable) (struct perf_counter *counter); + void (*read) (struct perf_counter *counter); }; /** diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f8a4d9a5d5d..961d651aa57 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -109,7 +109,7 @@ static void __perf_counter_remove_from_context(void *info) spin_lock_irqsave(&ctx->lock, flags); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; ctx->nr_active--; cpuctx->active_oncpu--; @@ -226,7 +226,7 @@ static void __perf_install_in_context(void *info) counter->oncpu = cpu; ctx->nr_active++; cpuctx->active_oncpu++; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); } if (!ctx->task && cpuctx->max_pertask) @@ -297,7 +297,7 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->hw_perf_counter_disable(counter); + counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; @@ -327,7 +327,7 @@ group_sched_out(struct perf_counter *group_counter, * * We stop each counter and update the counter value in counter->count. * - * This does not protect us against NMI, but hw_perf_counter_disable() + * This does not protect us against NMI, but disable() * sets the disabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * not restart the counter. @@ -359,7 +359,7 @@ counter_sched_in(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_OFF) return; - counter->hw_ops->hw_perf_counter_enable(counter); + counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ @@ -395,7 +395,7 @@ group_sched_in(struct perf_counter *group_counter, * * We restore the counter value and then enable it. * - * This does not protect us against NMI, but hw_perf_counter_enable() + * This does not protect us against NMI, but enable() * sets the enabled bit in the control field of counter _before_ * accessing the counter control register. If a NMI hits, then it will * keep the counter running. @@ -537,11 +537,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) /* * Cross CPU call to read the hardware counter */ -static void __hw_perf_counter_read(void *info) +static void __read(void *info) { struct perf_counter *counter = info; - counter->hw_ops->hw_perf_counter_read(counter); + counter->hw_ops->read(counter); } static u64 perf_counter_read(struct perf_counter *counter) @@ -552,7 +552,7 @@ static u64 perf_counter_read(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, - __hw_perf_counter_read, counter, 1); + __read, counter, 1); } return atomic64_read(&counter->count); @@ -855,9 +855,9 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { - .hw_perf_counter_enable = cpu_clock_perf_counter_enable, - .hw_perf_counter_disable = cpu_clock_perf_counter_disable, - .hw_perf_counter_read = cpu_clock_perf_counter_read, + .enable = cpu_clock_perf_counter_enable, + .disable = cpu_clock_perf_counter_disable, + .read = cpu_clock_perf_counter_read, }; static void task_clock_perf_counter_update(struct perf_counter *counter) @@ -891,9 +891,9 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_task_clock = { - .hw_perf_counter_enable = task_clock_perf_counter_enable, - .hw_perf_counter_disable = task_clock_perf_counter_disable, - .hw_perf_counter_read = task_clock_perf_counter_read, + .enable = task_clock_perf_counter_enable, + .disable = task_clock_perf_counter_disable, + .read = task_clock_perf_counter_read, }; static u64 get_page_faults(void) @@ -937,9 +937,9 @@ static void page_faults_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_page_faults = { - .hw_perf_counter_enable = page_faults_perf_counter_enable, - .hw_perf_counter_disable = page_faults_perf_counter_disable, - .hw_perf_counter_read = page_faults_perf_counter_read, + .enable = page_faults_perf_counter_enable, + .disable = page_faults_perf_counter_disable, + .read = page_faults_perf_counter_read, }; static u64 get_context_switches(void) @@ -983,9 +983,9 @@ static void context_switches_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_context_switches = { - .hw_perf_counter_enable = context_switches_perf_counter_enable, - .hw_perf_counter_disable = context_switches_perf_counter_disable, - .hw_perf_counter_read = context_switches_perf_counter_read, + .enable = context_switches_perf_counter_enable, + .disable = context_switches_perf_counter_disable, + .read = context_switches_perf_counter_read, }; static inline u64 get_cpu_migrations(void) @@ -1027,9 +1027,9 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) } static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { - .hw_perf_counter_enable = cpu_migrations_perf_counter_enable, - .hw_perf_counter_disable = cpu_migrations_perf_counter_disable, - .hw_perf_counter_read = cpu_migrations_perf_counter_read, + .enable = cpu_migrations_perf_counter_enable, + .disable = cpu_migrations_perf_counter_disable, + .read = cpu_migrations_perf_counter_read, }; static const struct hw_perf_counter_ops * @@ -1283,7 +1283,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->hw_perf_counter_disable(child_counter); + child_counter->hw_ops->disable(child_counter); child_counter->state = PERF_COUNTER_STATE_INACTIVE; child_counter->oncpu = -1; From aa9c4c0f967fdb482ea95e8473ec3d201e6e0781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 17 Dec 2008 14:10:57 +0100 Subject: [PATCH 0043/2061] perfcounters: fix task clock counter Impact: fix per task clock counter precision Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 8 +++++ kernel/exit.c | 17 ++++++--- kernel/perf_counter.c | 70 ++++++++++++++++++++++++++++--------- kernel/sched.c | 49 ++++++++++++++++++++++++-- 4 files changed, 120 insertions(+), 24 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..1b2e3242497 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -66,7 +66,15 @@ static inline unsigned int kstat_irqs(unsigned int irq) return sum; } + +/* + * Lock/unlock the current runqueue - to extract task statistics: + */ +extern void curr_rq_lock_irq_save(unsigned long *flags); +extern void curr_rq_unlock_irq_restore(unsigned long *flags); +extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); + extern void account_user_time(struct task_struct *, cputime_t); extern void account_user_time_scaled(struct task_struct *, cputime_t); extern void account_system_time(struct task_struct *, int, cputime_t); diff --git a/kernel/exit.c b/kernel/exit.c index d336c90a5f1..244edfd9686 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -922,6 +922,12 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1093,11 +1099,6 @@ NORET_TYPE void do_exit(long code) mpol_put(tsk->mempolicy); tsk->mempolicy = NULL; #endif - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); #ifdef CONFIG_FUTEX if (unlikely(!list_empty(&tsk->pi_state_list))) exit_pi_state_list(tsk); @@ -1121,6 +1122,12 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); + /* + * These must happen late, after the PID is not + * hashed anymore, but still at a point that may sleep: + */ + perf_counter_exit_task(tsk); + preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 961d651aa57..f1110ac1267 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -18,6 +18,7 @@ #include #include #include +#include #include /* @@ -106,7 +107,8 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { counter->hw_ops->disable(counter); @@ -135,7 +137,8 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } @@ -209,7 +212,8 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - spin_lock_irqsave(&ctx->lock, flags); + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); /* * Protect the list operation against NMI by disabling the @@ -232,7 +236,8 @@ static void __perf_install_in_context(void *info) if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; - spin_unlock_irqrestore(&ctx->lock, flags); + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); } /* @@ -438,15 +443,19 @@ int perf_counter_task_disable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -463,7 +472,7 @@ int perf_counter_task_disable(void) spin_unlock(&ctx->lock); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -473,15 +482,19 @@ int perf_counter_task_enable(void) struct task_struct *curr = current; struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; + unsigned long flags; u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) return 0; - local_irq_disable(); + curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); + /* force the update of the task clock: */ + __task_delta_exec(curr, 1); + spin_lock(&ctx->lock); /* @@ -493,6 +506,7 @@ int perf_counter_task_enable(void) if (counter->state != PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -500,7 +514,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); return 0; } @@ -540,8 +554,11 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + unsigned long flags; + curr_rq_lock_irq_save(&flags); counter->hw_ops->read(counter); + curr_rq_unlock_irq_restore(&flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -860,13 +877,27 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; -static void task_clock_perf_counter_update(struct perf_counter *counter) +/* + * Called from within the scheduler: + */ +static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) { - u64 prev, now; + struct task_struct *curr = counter->task; + u64 delta; + + WARN_ON_ONCE(counter->task != current); + + delta = __task_delta_exec(curr, update); + + return curr->se.sum_exec_runtime + delta; +} + +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) +{ + u64 prev; s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = current->se.sum_exec_runtime; atomic64_set(&counter->hw.prev_count, now); @@ -877,17 +908,23 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 1); + + task_clock_perf_counter_update(counter, now); } static void task_clock_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, current->se.sum_exec_runtime); + u64 now = task_clock_perf_counter_val(counter, 0); + + atomic64_set(&counter->hw.prev_count, now); } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - task_clock_perf_counter_update(counter); + u64 now = task_clock_perf_counter_val(counter, 0); + + task_clock_perf_counter_update(counter, now); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1267,6 +1304,7 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; + unsigned long flags; u64 perf_flags; /* @@ -1275,7 +1313,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - local_irq_disable(); + curr_rq_lock_irq_save(&flags); perf_flags = hw_perf_save_disable(); if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { @@ -1294,7 +1332,7 @@ __perf_counter_exit_task(struct task_struct *child, list_del_init(&child_counter->list_entry); hw_perf_restore(perf_flags); - local_irq_enable(); + curr_rq_unlock_irq_restore(&flags); parent_counter = child_counter->parent; /* diff --git a/kernel/sched.c b/kernel/sched.c index 382cfdb5e38..4d84ff4c877 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -638,7 +638,7 @@ static inline int cpu_of(struct rq *rq) #define task_rq(p) cpu_rq(task_cpu(p)) #define cpu_curr(cpu) (cpu_rq(cpu)->curr) -static inline void update_rq_clock(struct rq *rq) +inline void update_rq_clock(struct rq *rq) { rq->clock = sched_clock_cpu(cpu_of(rq)); } @@ -969,6 +969,26 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } +void curr_rq_lock_irq_save(unsigned long *flags) + __acquires(rq->lock) +{ + struct rq *rq; + + local_irq_save(*flags); + rq = cpu_rq(smp_processor_id()); + spin_lock(&rq->lock); +} + +void curr_rq_unlock_irq_restore(unsigned long *flags) + __releases(rq->lock) +{ + struct rq *rq; + + rq = cpu_rq(smp_processor_id()); + spin_unlock(&rq->lock); + local_irq_restore(*flags); +} + void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); @@ -2558,7 +2578,6 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, struct task_struct *next) { fire_sched_out_preempt_notifiers(prev, next); - perf_counter_task_sched_out(prev, cpu_of(rq)); prepare_lock_switch(rq, next); prepare_arch_switch(next); } @@ -4089,6 +4108,29 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); +/* + * Return any ns on the sched_clock that have not yet been banked in + * @p in case that task is currently running. + */ +unsigned long long __task_delta_exec(struct task_struct *p, int update) +{ + s64 delta_exec; + struct rq *rq; + + rq = task_rq(p); + WARN_ON_ONCE(!runqueue_is_locked()); + WARN_ON_ONCE(!task_current(rq, p)); + + if (update) + update_rq_clock(rq); + + delta_exec = rq->clock - p->se.exec_start; + + WARN_ON_ONCE(delta_exec < 0); + + return delta_exec; +} + /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. @@ -4316,13 +4358,13 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); + perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); #endif - perf_counter_task_tick(curr, cpu); } #if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ @@ -4512,6 +4554,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); + perf_counter_task_sched_out(prev, cpu); rq->nr_switches++; rq->curr = next; From eef6cbf5844c620d9db9be99e4908cdf92492fb9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 19 Dec 2008 10:20:42 +0100 Subject: [PATCH 0044/2061] perfcounters: pull inherited counters Change counter inheritance from a 'push' to a 'pull' model: instead of child tasks pushing their final counts to the parent, reuse the wait4 infrastructure to pull counters as child tasks are exit-processed, much like how cutime/cstime is collected. Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 9 +++++++++ kernel/exit.c | 21 +++++++++------------ 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 23fd8909b9e..54fa2fa2c8e 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -113,6 +113,14 @@ extern struct group_info init_groups; # define CAP_INIT_BSET CAP_INIT_EFF_SET #endif +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_ctx.counter_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -180,6 +188,7 @@ extern struct group_info init_groups; INIT_IDS \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ + INIT_PERF_COUNTERS(tsk) \ } diff --git a/kernel/exit.c b/kernel/exit.c index 244edfd9686..101b7eeff44 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -153,6 +153,9 @@ static void delayed_put_task_struct(struct rcu_head *rhp) { struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); +#ifdef CONFIG_PERF_COUNTERS + WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); +#endif trace_sched_process_free(tsk); put_task_struct(tsk); } @@ -922,12 +925,6 @@ static void exit_notify(struct task_struct *tsk, int group_dead) forget_original_parent(tsk); exit_task_namespaces(tsk); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - write_lock_irq(&tasklist_lock); if (group_dead) kill_orphaned_pgrp(tsk->group_leader, NULL); @@ -1122,12 +1119,6 @@ NORET_TYPE void do_exit(long code) if (tsk->splice_pipe) __free_pipe_info(tsk->splice_pipe); - /* - * These must happen late, after the PID is not - * hashed anymore, but still at a point that may sleep: - */ - perf_counter_exit_task(tsk); - preempt_disable(); /* causes final put_task_struct in finish_task_switch(). */ tsk->state = TASK_DEAD; @@ -1371,6 +1362,12 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; From 78b6084c907cea15bb40a564b974e072f5163781 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 15:07:49 +0100 Subject: [PATCH 0045/2061] perfcounters: fix init context lock Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 54fa2fa2c8e..467cff545c3 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -116,7 +116,9 @@ extern struct group_info init_groups; #ifdef CONFIG_PERF_COUNTERS # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), + LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.lock = \ + __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else # define INIT_PERF_COUNTERS(tsk) #endif From 95cdd2e7851cce79ab839cb0b3cbe68d7911d0f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 13:50:42 +0100 Subject: [PATCH 0046/2061] perfcounters: enable lowlevel pmc code to schedule counters Allow lowlevel ->enable() op to return an error if a counter can not be added. This can be used to handle counter constraints. Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 ++- include/linux/perf_counter.h | 2 +- kernel/perf_counter.c | 62 ++++++++++++++++++++++-------- 3 files changed, 51 insertions(+), 19 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b6755712142..74090a393a7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -244,7 +244,7 @@ static int fixed_mode_idx(struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static void pmc_generic_enable(struct perf_counter *counter) +static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -253,6 +253,8 @@ static void pmc_generic_enable(struct perf_counter *counter) /* Try to get the previous counter again */ if (test_and_set_bit(idx, cpuc->used)) { idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -265,6 +267,8 @@ static void pmc_generic_enable(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); + + return 0; } void perf_counter_print_debug(void) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48f76d2e54c..53af11d3767 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -128,7 +128,7 @@ struct perf_counter; * struct hw_perf_counter_ops - performance counter hw ops */ struct hw_perf_counter_ops { - void (*enable) (struct perf_counter *counter); + int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f1110ac1267..2e73929a695 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -355,21 +355,25 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } -static void +static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) { if (counter->state == PERF_COUNTER_STATE_OFF) - return; + return 0; + + if (counter->hw_ops->enable(counter)) + return -EAGAIN; - counter->hw_ops->enable(counter); counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ cpuctx->active_oncpu++; ctx->nr_active++; + + return 0; } static int @@ -378,20 +382,38 @@ group_sched_in(struct perf_counter *group_counter, struct perf_counter_context *ctx, int cpu) { - struct perf_counter *counter; - int was_group = 0; + struct perf_counter *counter, *partial_group; + int ret = 0; - counter_sched_in(group_counter, cpuctx, ctx, cpu); + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; /* * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter_sched_in(counter, cpuctx, ctx, cpu); - was_group = 1; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + ret = -EAGAIN; } - return was_group; + return ret; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; } /* @@ -416,9 +438,6 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) spin_lock(&ctx->lock); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (ctx->nr_active == cpuctx->max_pertask) - break; - /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -856,8 +875,9 @@ static const struct file_operations perf_fops = { .poll = perf_poll, }; -static void cpu_clock_perf_counter_enable(struct perf_counter *counter) +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + return 0; } static void cpu_clock_perf_counter_disable(struct perf_counter *counter) @@ -913,11 +933,13 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, now); } -static void task_clock_perf_counter_enable(struct perf_counter *counter) +static int task_clock_perf_counter_enable(struct perf_counter *counter) { u64 now = task_clock_perf_counter_val(counter, 0); atomic64_set(&counter->hw.prev_count, now); + + return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) @@ -960,12 +982,14 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) page_faults_perf_counter_update(counter); } -static void page_faults_perf_counter_enable(struct perf_counter *counter) +static int page_faults_perf_counter_enable(struct perf_counter *counter) { /* * page-faults is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void page_faults_perf_counter_disable(struct perf_counter *counter) @@ -1006,12 +1030,14 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) context_switches_perf_counter_update(counter); } -static void context_switches_perf_counter_enable(struct perf_counter *counter) +static int context_switches_perf_counter_enable(struct perf_counter *counter) { /* * ->nvcsw + curr->nivcsw is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void context_switches_perf_counter_disable(struct perf_counter *counter) @@ -1050,12 +1076,14 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static void cpu_migrations_perf_counter_enable(struct perf_counter *counter) +static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { /* * se.nr_migrations is a per-task value already, * so we dont have to clear it on switch-in. */ + + return 0; } static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) From 0dff86aa7b9ec65a6d07167b7afb050b5fc98ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:28:12 +0100 Subject: [PATCH 0047/2061] x86, perfcounters: print out the ->used bitmask Impact: extend debug printouts Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 74090a393a7..f3359c2b391 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -255,6 +255,7 @@ static int pmc_generic_enable(struct perf_counter *counter) idx = find_first_zero_bit(cpuc->used, nr_counters_generic); if (idx == nr_counters_generic) return -EAGAIN; + set_bit(idx, cpuc->used); hwc->idx = idx; } @@ -274,6 +275,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + struct cpu_hw_counters *cpuc; int cpu, idx; if (!nr_counters_generic) @@ -282,6 +284,7 @@ void perf_counter_print_debug(void) local_irq_disable(); cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); @@ -291,6 +294,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); From 8fe91e61cdc407c7556d3cd71cf20141a25bbcea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:29:25 +0100 Subject: [PATCH 0048/2061] perfcounters: remove ->nr_inherited Impact: remove dead code nr_inherited was not maintained correctly (not decremented) - and also not used - remove it. Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 1 - 2 files changed, 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 53af11d3767..1ea08e9f31c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -164,7 +164,6 @@ struct perf_counter { struct task_struct *task; struct file *filp; - unsigned int nr_inherited; struct perf_counter *parent; /* * Protect attach/detach: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2e73929a695..48e1dbcdc1c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1308,7 +1308,6 @@ inherit_counter(struct perf_counter *parent_counter, child_ctx->nr_counters++; child_counter->parent = parent_counter; - parent_counter->nr_inherited++; /* * inherit into child's child as well: */ From 235c7fc7c500e4fd1700c4ad01b5612bcdc1b449 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 21 Dec 2008 14:43:25 +0100 Subject: [PATCH 0049/2061] perfcounters: generalize the counter scheduler Impact: clean up and refactor code refactor the counter scheduler: separate out in/out functions and introduce a counter-rotation function as well. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 222 +++++++++++++++++++++++++++--------------- 1 file changed, 143 insertions(+), 79 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 48e1dbcdc1c..d7a79f321b1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,11 +111,12 @@ static void __perf_counter_remove_from_context(void *info) spin_lock(&ctx->lock); if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); ctx->nr_active--; cpuctx->active_oncpu--; counter->task = NULL; + counter->oncpu = -1; } ctx->nr_counters--; @@ -192,8 +193,36 @@ retry: spin_unlock_irq(&ctx->lock); } +static int +counter_sched_in(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + if (counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + counter->state = PERF_COUNTER_STATE_ACTIVE; + counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ + /* + * The new state must be visible before we turn it on in the hardware: + */ + smp_wmb(); + + if (counter->hw_ops->enable(counter)) { + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->oncpu = -1; + return -EAGAIN; + } + + cpuctx->active_oncpu++; + ctx->nr_active++; + + return 0; +} + /* - * Cross CPU call to install and enable a preformance counter + * Cross CPU call to install and enable a performance counter */ static void __perf_install_in_context(void *info) { @@ -220,22 +249,17 @@ static void __perf_install_in_context(void *info) * counters on a global level. NOP for non NMI based counters. */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - hw_perf_restore(perf_flags); + list_add_counter(counter, ctx); ctx->nr_counters++; - if (cpuctx->active_oncpu < perf_max_counters) { - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; - ctx->nr_active++; - cpuctx->active_oncpu++; - counter->hw_ops->enable(counter); - } + counter_sched_in(counter, cpuctx, ctx, cpu); if (!ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + hw_perf_restore(perf_flags); + spin_unlock(&ctx->lock); curr_rq_unlock_irq_restore(&flags); } @@ -302,8 +326,8 @@ counter_sched_out(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return; - counter->hw_ops->disable(counter); counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); counter->oncpu = -1; cpuctx->active_oncpu--; @@ -326,6 +350,22 @@ group_sched_out(struct perf_counter *group_counter, counter_sched_out(counter, cpuctx, ctx); } +void __perf_counter_sched_out(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx) +{ + struct perf_counter *counter; + + if (likely(!ctx->nr_counters)) + return; + + spin_lock(&ctx->lock); + if (ctx->nr_active) { + list_for_each_entry(counter, &ctx->counter_list, list_entry) + group_sched_out(counter, cpuctx, ctx); + } + spin_unlock(&ctx->lock); +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -341,39 +381,18 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; - struct perf_counter *counter; if (likely(!cpuctx->task_ctx)) return; - spin_lock(&ctx->lock); - if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) - group_sched_out(counter, cpuctx, ctx); - } - spin_unlock(&ctx->lock); + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; } -static int -counter_sched_in(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) +static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { - if (counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - if (counter->hw_ops->enable(counter)) - return -EAGAIN; - - counter->state = PERF_COUNTER_STATE_ACTIVE; - counter->oncpu = cpu; /* TODO: put 'cpu' into cpuctx->cpu */ - - cpuctx->active_oncpu++; - ctx->nr_active++; - - return 0; + __perf_counter_sched_out(&cpuctx->ctx, cpuctx); } static int @@ -416,21 +435,10 @@ group_error: return -EAGAIN; } -/* - * Called from scheduler to add the counters of the current task - * with interrupts disabled. - * - * We restore the counter value and then enable it. - * - * This does not protect us against NMI, but enable() - * sets the enabled bit in the control field of counter _before_ - * accessing the counter control register. If a NMI hits, then it will - * keep the counter running. - */ -void perf_counter_task_sched_in(struct task_struct *task, int cpu) +static void +__perf_counter_sched_in(struct perf_counter_context *ctx, + struct perf_cpu_context *cpuctx, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; struct perf_counter *counter; if (likely(!ctx->nr_counters)) @@ -453,10 +461,35 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) break; } spin_unlock(&ctx->lock); +} +/* + * Called from scheduler to add the counters of the current task + * with interrupts disabled. + * + * We restore the counter value and then enable it. + * + * This does not protect us against NMI, but enable() + * sets the enabled bit in the control field of counter _before_ + * accessing the counter control register. If a NMI hits, then it will + * keep the counter running. + */ +void perf_counter_task_sched_in(struct task_struct *task, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &task->perf_counter_ctx; + + __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } +static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) +{ + struct perf_counter_context *ctx = &cpuctx->ctx; + + __perf_counter_sched_in(ctx, cpuctx, cpu); +} + int perf_counter_task_disable(void) { struct task_struct *curr = current; @@ -514,6 +547,8 @@ int perf_counter_task_enable(void) /* force the update of the task clock: */ __task_delta_exec(curr, 1); + perf_counter_task_sched_out(curr, cpu); + spin_lock(&ctx->lock); /* @@ -538,19 +573,18 @@ int perf_counter_task_enable(void) return 0; } -void perf_counter_task_tick(struct task_struct *curr, int cpu) +/* + * Round-robin a context's counters: + */ +static void rotate_ctx(struct perf_counter_context *ctx) { - struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; u64 perf_flags; - if (likely(!ctx->nr_counters)) + if (!ctx->nr_counters) return; - perf_counter_task_sched_out(curr, cpu); - spin_lock(&ctx->lock); - /* * Rotate the first entry last (works just fine for group counters too): */ @@ -563,7 +597,24 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); +} +void perf_counter_task_tick(struct task_struct *curr, int cpu) +{ + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &curr->perf_counter_ctx; + const int rotate_percpu = 0; + + if (rotate_percpu) + perf_counter_cpu_sched_out(cpuctx); + perf_counter_task_sched_out(curr, cpu); + + if (rotate_percpu) + rotate_ctx(&cpuctx->ctx); + rotate_ctx(ctx); + + if (rotate_percpu) + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } @@ -905,8 +956,6 @@ static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) struct task_struct *curr = counter->task; u64 delta; - WARN_ON_ONCE(counter->task != current); - delta = __task_delta_exec(curr, update); return curr->se.sum_exec_runtime + delta; @@ -1160,6 +1209,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; @@ -1331,36 +1381,50 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; u64 parent_val, child_val; - unsigned long flags; - u64 perf_flags; /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: + * If we do not self-reap then we have to wait for the + * child task to unschedule (it will happen for sure), + * so that its counter is at its final count. (This + * condition triggers rarely - child tasks usually get + * off their CPU before the parent has a chance to + * get this far into the reaping action) */ - curr_rq_lock_irq_save(&flags); - perf_flags = hw_perf_save_disable(); - - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + if (child != current) { + wait_task_inactive(child, 0); + list_del_init(&child_counter->list_entry); + } else { struct perf_cpu_context *cpuctx; + unsigned long flags; + u64 perf_flags; + + /* + * Disable and unlink this counter. + * + * Be careful about zapping the list - IRQ/NMI context + * could still be processing it: + */ + curr_rq_lock_irq_save(&flags); + perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); - child_counter->hw_ops->disable(child_counter); - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->oncpu = -1; + if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + child_counter->hw_ops->disable(child_counter); + cpuctx->active_oncpu--; + child_ctx->nr_active--; + child_counter->oncpu = -1; + } - cpuctx->active_oncpu--; - child_ctx->nr_active--; + list_del_init(&child_counter->list_entry); + + child_ctx->nr_counters--; + + hw_perf_restore(perf_flags); + curr_rq_unlock_irq_restore(&flags); } - list_del_init(&child_counter->list_entry); - - hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); - parent_counter = child_counter->parent; /* * It can happen that parent exits first, and has counters From f650a672359819454c3d8d4135ecd1558cde0b24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 23 Dec 2008 12:17:29 +0100 Subject: [PATCH 0050/2061] perfcounters: add PERF_COUNT_BUS_CYCLES Generalize "bus cycles" hw events - and map them to CPU_CLK_Unhalted.Ref on x86. (which is a good enough approximation) Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 5 +++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f3359c2b391..86b2fdd344a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -41,12 +41,13 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); static const int intel_perfmon_event_map[] = { - [PERF_COUNT_CYCLES] = 0x003c, + [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, [PERF_COUNT_CACHE_REFERENCES] = 0x4f2e, [PERF_COUNT_CACHE_MISSES] = 0x412e, [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, [PERF_COUNT_BRANCH_MISSES] = 0x00c5, + [PERF_COUNT_BUS_CYCLES] = 0x013c, }; static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 1ea08e9f31c..ec77d1643d3 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -36,14 +36,15 @@ enum hw_event_types { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, + PERF_COUNT_CPU_CYCLES = 0, PERF_COUNT_INSTRUCTIONS = 1, PERF_COUNT_CACHE_REFERENCES = 2, PERF_COUNT_CACHE_MISSES = 3, PERF_COUNT_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 6, + PERF_HW_EVENTS_MAX = 7, /* * Special "software" counters provided by the kernel, even if From 2f18d1e8d07ae67dd0afce875287756d4bd31a46 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 22 Dec 2008 11:10:42 +0100 Subject: [PATCH 0051/2061] x86, perfcounters: add support for fixed-function pmcs Impact: extend performance counter support on x86 Intel CPUs Modern Intel CPUs have 3 "fixed-function" performance counters, which count these hardware events: Instr_Retired.Any CPU_CLK_Unhalted.Core CPU_CLK_Unhalted.Ref Add support for them to the performance counters subsystem. Their use is transparent to user-space: the counter scheduler is extended to automatically recognize the cases where a fixed-function PMC can be utilized instead of a generic PMC. In such cases the generic PMC is kept available for more counters. The above fixed-function events map to these generic counter hw events: PERF_COUNT_INSTRUCTIONS PERF_COUNT_CPU_CYCLES PERF_COUNT_BUS_CYCLES (The 'bus' cycles are in reality often CPU-ish cycles, just with a fixed frequency.) Signed-off-by: Ingo Molnar --- arch/x86/include/asm/perf_counter.h | 8 ++ arch/x86/kernel/cpu/perf_counter.c | 149 +++++++++++++++++++++++----- 2 files changed, 133 insertions(+), 24 deletions(-) diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 13745deb16c..2e08ed73664 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -23,6 +23,11 @@ #define ARCH_PERFMON_EVENTSEL_OS (1 << 17) #define ARCH_PERFMON_EVENTSEL_USR (1 << 16) +/* + * Includes eventsel and unit mask as well: + */ +#define ARCH_PERFMON_EVENT_MASK 0xffff + #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_SEL 0x3c #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_UMASK (0x00 << 8) #define ARCH_PERFMON_UNHALTED_CORE_CYCLES_INDEX 0 @@ -69,12 +74,15 @@ union cpuid10_edx { /* Instr_Retired.Any: */ #define MSR_ARCH_PERFMON_FIXED_CTR0 0x309 +#define X86_PMC_IDX_FIXED_INSTRUCTIONS (X86_PMC_IDX_FIXED + 0) /* CPU_CLK_Unhalted.Core: */ #define MSR_ARCH_PERFMON_FIXED_CTR1 0x30a +#define X86_PMC_IDX_FIXED_CPU_CYCLES (X86_PMC_IDX_FIXED + 1) /* CPU_CLK_Unhalted.Ref: */ #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b +#define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 86b2fdd344a..da46eca1254 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -26,6 +26,7 @@ static bool perf_counters_initialized __read_mostly; */ static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; +static u64 counter_value_mask __read_mostly; static int nr_counters_fixed __read_mostly; @@ -120,9 +121,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; - hwc->irq_period = hw_event->irq_period; /* * Intel PMCs cannot be accessed sanely above 32 bit width, @@ -183,16 +181,34 @@ void hw_perf_restore(u64 ctrl) } EXPORT_SYMBOL_GPL(hw_perf_restore); +static inline void +__pmc_fixed_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, mask; + int err; + + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + err = checking_wrmsrl(hwc->config_base, ctrl_val); +} + static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { int err; + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_disable(counter, hwc, idx); + err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } -static DEFINE_PER_CPU(u64, prev_left[X86_PMC_MAX_GENERIC]); +static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); /* * Set the next IRQ period, based on the hwc->period_left value. @@ -202,8 +218,9 @@ static void __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - s32 left = atomic64_read(&hwc->period_left); + s64 left = atomic64_read(&hwc->period_left); s32 period = hwc->irq_period; + int err; /* * If we are way outside a reasoable range then just skip forward: @@ -224,21 +241,64 @@ __hw_perf_counter_set_period(struct perf_counter *counter, * The hw counter starts counting from this counter offset, * mark it to be able to extra future deltas: */ - atomic64_set(&hwc->prev_count, (u64)(s64)-left); + atomic64_set(&hwc->prev_count, (u64)-left); - wrmsr(hwc->counter_base + idx, -left, 0); + err = checking_wrmsrl(hwc->counter_base + idx, + (u64)(-left) & counter_value_mask); +} + +static inline void +__pmc_fixed_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int __idx) +{ + int idx = __idx - X86_PMC_IDX_FIXED; + u64 ctrl_val, bits, mask; + int err; + + /* + * Enable IRQ generation (0x8) and ring-3 counting (0x2), + * and enable ring-0 counting if allowed: + */ + bits = 0x8ULL | 0x2ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) + bits |= 0x1; + bits <<= (idx * 4); + mask = 0xfULL << (idx * 4); + + rdmsrl(hwc->config_base, ctrl_val); + ctrl_val &= ~mask; + ctrl_val |= bits; + err = checking_wrmsrl(hwc->config_base, ctrl_val); } static void __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) + return __pmc_fixed_enable(counter, hwc, idx); + wrmsr(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } -static int fixed_mode_idx(struct hw_perf_counter *hwc) +static int +fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { + unsigned int event; + + if (unlikely(hwc->nmi)) + return -1; + + event = hwc->config & ARCH_PERFMON_EVENT_MASK; + + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + return X86_PMC_IDX_FIXED_INSTRUCTIONS; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + return X86_PMC_IDX_FIXED_CPU_CYCLES; + if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + return X86_PMC_IDX_FIXED_BUS_CYCLES; + return -1; } @@ -249,16 +309,39 @@ static int pmc_generic_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - int idx = hwc->idx; + int idx; - /* Try to get the previous counter again */ - if (test_and_set_bit(idx, cpuc->used)) { - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) - return -EAGAIN; + idx = fixed_mode_idx(counter, hwc); + if (idx >= 0) { + /* + * Try to get the fixed counter, if that is already taken + * then try to get a generic counter: + */ + if (test_and_set_bit(idx, cpuc->used)) + goto try_generic; - set_bit(idx, cpuc->used); + hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; + /* + * We set it so that counter_base + idx in wrmsr/rdmsr maps to + * MSR_ARCH_PERFMON_FIXED_CTR0 ... CTR2: + */ + hwc->counter_base = + MSR_ARCH_PERFMON_FIXED_CTR0 - X86_PMC_IDX_FIXED; hwc->idx = idx; + } else { + idx = hwc->idx; + /* Try to get the previous generic counter again */ + if (test_and_set_bit(idx, cpuc->used)) { +try_generic: + idx = find_first_zero_bit(cpuc->used, nr_counters_generic); + if (idx == nr_counters_generic) + return -EAGAIN; + + set_bit(idx, cpuc->used); + hwc->idx = idx; + } + hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; + hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; } perf_counters_lapic_init(hwc->nmi); @@ -266,6 +349,10 @@ static int pmc_generic_enable(struct perf_counter *counter) __pmc_generic_disable(counter, hwc, idx); cpuc->counters[idx] = counter; + /* + * Make it visible before enabling the hw: + */ + smp_wmb(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -275,7 +362,7 @@ static int pmc_generic_enable(struct perf_counter *counter) void perf_counter_print_debug(void) { - u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left; + u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; int cpu, idx; @@ -290,11 +377,13 @@ void perf_counter_print_debug(void) rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); printk(KERN_INFO "\n"); printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); + printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -303,13 +392,19 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: PMC%d ctrl: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: PMC%d count: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: PMC%d left: %016llx\n", + printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } + for (idx = 0; idx < nr_counters_fixed; idx++) { + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); + + printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + cpu, idx, pmc_count); + } local_irq_enable(); } @@ -323,6 +418,11 @@ static void pmc_generic_disable(struct perf_counter *counter) clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; + /* + * Make sure the cleared pointer becomes visible before we + * (potentially) free the counter: + */ + smp_wmb(); /* * Drain the remaining delta count out of a counter @@ -353,14 +453,11 @@ static void perf_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; - u64 pmc_ctrl; - - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); x86_perf_counter_update(counter, hwc, idx); __hw_perf_counter_set_period(counter, hwc, idx); - if (pmc_ctrl & ARCH_PERFMON_EVENTSEL0_ENABLE) + if (counter->state == PERF_COUNTER_STATE_ACTIVE) __pmc_generic_enable(counter, hwc, idx); } @@ -373,6 +470,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * Store sibling timestamps (if any): */ list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); perf_store_irq_data(sibling, counter->hw_event.type); perf_store_irq_data(sibling, atomic64_read(&counter->count)); @@ -403,7 +501,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) again: ack = status; - for_each_bit(bit, (unsigned long *) &status, nr_counters_generic) { + for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); @@ -561,6 +659,9 @@ void __init init_hw_perf_counters(void) perf_max_counters = nr_counters_generic; printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); nr_counters_fixed = edx.split.num_counters_fixed; From e44aef58ecfbe061eb4c53b939bcc35fb1bee82d Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 25 Dec 2008 09:02:11 +0100 Subject: [PATCH 0052/2061] perfcounters: include asm/perf_counter.h only if CONFIG_PERF_COUNTERS=y Impact: build fix on ia64 KOSAKI Motohiro reported that -tip doesnt build on ia64 because asm/perf_counter.h only exists on x86 for now. Fix it. Reported-by: KOSAKI Motohiro Tested-by: KOSAKI Motohiro Acked-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ec77d1643d3..cc3a75a239a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,7 +14,10 @@ #define _LINUX_PERF_COUNTER_H #include -#include + +#ifdef CONFIG_PERF_COUNTERS +# include +#endif #include #include From 01ea1ccaa24dea3552e103be13b7897211607a8b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 26 Dec 2008 21:05:06 -0800 Subject: [PATCH 0053/2061] perf_counter: more barrier in blank weak function Impact: fix panic possible panic Some versions of GCC inline the weak global function if it's empty. Add a barrier() to work it around. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d7a79f321b1..37f771691f9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,8 +45,8 @@ hw_perf_counter_init(struct perf_counter *counter) } u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { } -void __weak hw_perf_counter_setup(void) { } +void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_counter_setup(void) { barrier(); } static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) From 2b583d8bc8d7105b58d7481a4a0ceb718dac49c6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 27 Dec 2008 19:15:43 +0530 Subject: [PATCH 0054/2061] x86: perf_counter remove unwanted hw_perf_enable_all Impact: clean, reduce kernel size a bit, avoid sparse warnings Fixes sparse warnings: arch/x86/kernel/cpu/perf_counter.c:153:6: warning: symbol 'hw_perf_enable_all' was not declared. Should it be static? arch/x86/kernel/cpu/perf_counter.c:279:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression arch/x86/kernel/cpu/perf_counter.c:206:3: warning: returning void-valued expression Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da46eca1254..9376771f757 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -150,14 +150,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -void hw_perf_enable_all(void) -{ - if (unlikely(!perf_counters_initialized)) - return; - - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, perf_counter_mask); -} - u64 hw_perf_save_disable(void) { u64 ctrl; @@ -200,12 +192,10 @@ static inline void __pmc_generic_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int idx) { - int err; - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_disable(counter, hwc, idx); - - err = wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + __pmc_fixed_disable(counter, hwc, idx); + else + wrmsr_safe(hwc->config_base + idx, hwc->config, 0); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -276,10 +266,10 @@ __pmc_generic_enable(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - return __pmc_fixed_enable(counter, hwc, idx); - - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + __pmc_fixed_enable(counter, hwc, idx); + else + wrmsr(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); } static int From ff6f05416ece2caec1a7a1f8180d6598e0ab9272 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:19:25 +1100 Subject: [PATCH 0055/2061] perf_counter: Fix return value from dummy hw_perf_counter_init Impact: fix oops-causing bug Currently, if you try to use perf_counters on an architecture that has no hardware support, and you select an event that doesn't map to any of the defined software counters, you get an oops rather than an error. This is because the dummy hw_perf_counter_init returns ERR_PTR(-EINVAL) but the caller (perf_counter_alloc) only tests for NULL. This makes the dummy hw_perf_counter_init return NULL instead. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 37f771691f9..4be1a8d872b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -41,7 +41,7 @@ static DEFINE_MUTEX(perf_resource_mutex); extern __weak const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { - return ERR_PTR(-EINVAL); + return NULL; } u64 __weak hw_perf_save_disable(void) { return 0; } From 9abf8a08bc8f18a3b125f834f00e2e71b49c15d2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:26:43 +1100 Subject: [PATCH 0056/2061] perf_counter: Fix the cpu_clock software counter Impact: bug fix Currently if you do (e.g.) timec -e -1 ls, it will report 0 for the value of the cpu_clock counter. The reason is that the core assumes that a counter's count field is up-to-date when the counter is inactive, and doesn't call the counter's read function. However, the cpu_clock counter code only updates the count in the read function. This fixes it by making both the read and disable functions update the count. It also makes the counter ignore time passing while the counter is disabled, by making the enable function update the hw.prev_count field. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 20 +++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4be1a8d872b..b7a027a2ef0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -928,18 +928,32 @@ static const struct file_operations perf_fops = { static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { + int cpu = raw_smp_processor_id(); + + atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); return 0; } +static void cpu_clock_perf_counter_update(struct perf_counter *counter) +{ + int cpu = raw_smp_processor_id(); + s64 prev; + u64 now; + + now = cpu_clock(cpu); + prev = atomic64_read(&counter->hw.prev_count); + atomic64_set(&counter->hw.prev_count, now); + atomic64_add(now - prev, &counter->count); +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + cpu_clock_perf_counter_update(counter); } static void cpu_clock_perf_counter_read(struct perf_counter *counter) { - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->count, cpu_clock(cpu)); + cpu_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_cpu_clock = { From 3cbed429a9ccdb7a243f733b1056fe5c39e9004c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:43:42 +1100 Subject: [PATCH 0057/2061] perf_counter: Add optional hw_perf_group_sched_in arch function Impact: extend perf_counter infrastructure This adds an optional hw_perf_group_sched_in() arch function that enables a whole group of counters in one go. It returns 1 if it added the group successfully, 0 if it did nothing (and therefore the core needs to add the counters individually), or a negative number if an error occurred. It should add all the counters and enable any software counters in the group, or else add none of them and return an error. There are a couple of related changes/improvements in the group handling here: * As an optimization, group_sched_out() and group_sched_in() now check the state of the group leader, and do nothing if the leader is not active or disabled. * We now call hw_perf_save_disable/hw_perf_restore around the complete set of counter enable/disable calls in __perf_counter_sched_in/out, to give the arch code the opportunity to defer updating the hardware state until the hw_perf_restore call if it wants. * We no longer stop adding groups after we get to a group that has more than one counter. We will ultimately add an option for a group to be exclusive. The current code doesn't really implement exclusive groups anyway, since a group could end up going on with other counters that get added before it. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 3 +++ kernel/perf_counter.c | 31 ++++++++++++++++++++++++++----- 2 files changed, 29 insertions(+), 5 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index cc3a75a239a..b21d1ea4c05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -236,6 +236,9 @@ extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); +extern int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu); #else static inline void diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b7a027a2ef0..9ad11e44d9a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -47,6 +47,12 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } void __weak hw_perf_counter_setup(void) { barrier(); } +int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + return 0; +} static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -341,6 +347,9 @@ group_sched_out(struct perf_counter *group_counter, { struct perf_counter *counter; + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + counter_sched_out(group_counter, cpuctx, ctx); /* @@ -354,15 +363,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } @@ -402,7 +414,14 @@ group_sched_in(struct perf_counter *group_counter, int cpu) { struct perf_counter *counter, *partial_group; - int ret = 0; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -415,10 +434,9 @@ group_sched_in(struct perf_counter *group_counter, partial_group = counter; goto group_error; } - ret = -EAGAIN; } - return ret; + return 0; group_error: /* @@ -440,11 +458,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; + u64 flags; if (likely(!ctx->nr_counters)) return; spin_lock(&ctx->lock); + flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { /* * Listen to the 'cpu' scheduling filter constraint @@ -454,12 +474,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, continue; /* - * If we scheduled in a group atomically and - * exclusively, break out: + * If we scheduled in a group atomically and exclusively, + * or if this group can't go on, break out: */ if (group_sched_in(counter, cpuctx, ctx, cpu)) break; } + hw_perf_restore(flags); spin_unlock(&ctx->lock); } From 4eb96fcfe07b7f2a05577e57533840f8e26bea53 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 17:24:34 +1100 Subject: [PATCH 0058/2061] perf_counter: Add dummy perf_counter_print_debug function Impact: minimize requirements on architectures Currently, an architecture just enabling CONFIG_PERF_COUNTERS but not providing any extra functions will fail to build with perf_counter_print_debug being undefined, since we don't provide an empty dummy definition like we do with the hw_perf_* functions. This provides an empty dummy perf_counter_print_debug() to make it easier for architectures to turn on CONFIG_PERF_COUNTERS. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9ad11e44d9a..4c0dccb756a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -54,6 +54,8 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, return 0; } +void __weak perf_counter_print_debug(void) { } + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { From d662ed26734473d4cb5f3d78cebfec8f9126e97c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 17:01:53 +1100 Subject: [PATCH 0059/2061] powerpc/perf_counter: Add perf_counter system call on powerpc ... with an empty/dummy asm/perf_counter.h so it builds. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/perf_counter.h | 10 ++++++++++ arch/powerpc/include/asm/systbl.h | 1 + arch/powerpc/include/asm/unistd.h | 3 ++- arch/powerpc/platforms/Kconfig.cputype | 1 + 4 files changed, 14 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/include/asm/perf_counter.h diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h new file mode 100644 index 00000000000..59530ae1d53 --- /dev/null +++ b/arch/powerpc/include/asm/perf_counter.h @@ -0,0 +1,10 @@ +/* + * Performance counter support - PowerPC-specific definitions. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 803def23665..da300c4d288 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,3 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) +SYSCALL(perf_counter_open) diff --git a/arch/powerpc/include/asm/unistd.h b/arch/powerpc/include/asm/unistd.h index e07d0c76ed7..7cef5afe89d 100644 --- a/arch/powerpc/include/asm/unistd.h +++ b/arch/powerpc/include/asm/unistd.h @@ -341,10 +341,11 @@ #define __NR_dup3 316 #define __NR_pipe2 317 #define __NR_inotify_init1 318 +#define __NR_perf_counter_open 319 #ifdef __KERNEL__ -#define __NR_syscalls 319 +#define __NR_syscalls 320 #define __NR__exit __NR_exit #define NR_syscalls __NR_syscalls diff --git a/arch/powerpc/platforms/Kconfig.cputype b/arch/powerpc/platforms/Kconfig.cputype index 3d0c776f888..94dd1fb9a00 100644 --- a/arch/powerpc/platforms/Kconfig.cputype +++ b/arch/powerpc/platforms/Kconfig.cputype @@ -1,6 +1,7 @@ config PPC64 bool "64-bit kernel" default n + select HAVE_PERF_COUNTERS help This option selects whether a 32-bit or a 64-bit kernel will be built. From 93a6d3ce6962044fe9badf528fed46b455d58292 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 16:52:19 +1100 Subject: [PATCH 0060/2061] powerpc: Provide a way to defer perf counter work until interrupts are enabled Because 64-bit powerpc uses lazy (soft) interrupt disabling, it is possible for a performance monitor exception to come in when the kernel thinks interrupts are disabled (i.e. when they are soft-disabled but hard-enabled). In such a situation the performance monitor exception handler might have some processing to do (such as process wakeups) which can't be done in what is effectively an NMI handler. This provides a way to defer that work until interrupts get enabled, either in raw_local_irq_restore() or by returning from an interrupt handler to code that had interrupts enabled. We have a per-processor flag that indicates that there is work pending to do when interrupts subsequently get re-enabled. This flag is checked in the interrupt return path and in raw_local_irq_restore(), and if it is set, perf_counter_do_pending() is called to do the pending work. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hw_irq.h | 31 +++++++++++++++++++++++++++++++ arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_64.S | 9 +++++++++ arch/powerpc/kernel/irq.c | 10 ++++++++++ 5 files changed, 52 insertions(+) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index f75a5fc64d2..e10f151c3db 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -131,5 +131,36 @@ static inline int irqs_disabled_flags(unsigned long flags) */ struct hw_interrupt_type; +#ifdef CONFIG_PERF_COUNTERS +static inline unsigned long get_perf_counter_pending(void) +{ + unsigned long x; + + asm volatile("lbz %0,%1(13)" + : "=r" (x) + : "i" (offsetof(struct paca_struct, perf_counter_pending))); + return x; +} + +static inline void set_perf_counter_pending(int x) +{ + asm volatile("stb %0,%1(13)" : : + "r" (x), + "i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +extern void perf_counter_do_pending(void); + +#else + +static inline unsigned long get_perf_counter_pending(void) +{ + return 0; +} + +static inline void set_perf_counter_pending(int x) {} +static inline void perf_counter_do_pending(void) {} +#endif /* CONFIG_PERF_COUNTERS */ + #endif /* __KERNEL__ */ #endif /* _ASM_POWERPC_HW_IRQ_H */ diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index 082b3aedf14..6ef05572301 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -99,6 +99,7 @@ struct paca_struct { u8 soft_enabled; /* irq soft-enable flag */ u8 hard_enabled; /* set if irqs are enabled in MSR */ u8 io_sync; /* writel() needs spin_unlock sync */ + u8 perf_counter_pending; /* PM interrupt while soft-disabled */ /* Stuff for accurate time accounting */ u64 user_time; /* accumulated usermode TB ticks */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 661d07d2146..cea46290011 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -127,6 +127,7 @@ int main(void) DEFINE(PACAKMSR, offsetof(struct paca_struct, kernel_msr)); DEFINE(PACASOFTIRQEN, offsetof(struct paca_struct, soft_enabled)); DEFINE(PACAHARDIRQEN, offsetof(struct paca_struct, hard_enabled)); + DEFINE(PACAPERFPEND, offsetof(struct paca_struct, perf_counter_pending)); DEFINE(PACASLBCACHE, offsetof(struct paca_struct, slb_cache)); DEFINE(PACASLBCACHEPTR, offsetof(struct paca_struct, slb_cache_ptr)); DEFINE(PACACONTEXTID, offsetof(struct paca_struct, context.id)); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 383ed6eb008..f30b4e553c5 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -526,6 +526,15 @@ ALT_FW_FTR_SECTION_END_IFCLR(FW_FEATURE_ISERIES) 2: TRACE_AND_RESTORE_IRQ(r5); +#ifdef CONFIG_PERF_COUNTERS + /* check paca->perf_counter_pending if we're enabling ints */ + lbz r3,PACAPERFPEND(r13) + and. r3,r3,r5 + beq 27f + bl .perf_counter_do_pending +27: +#endif /* CONFIG_PERF_COUNTERS */ + /* extract EE bit and use it to restore paca->hard_enabled */ ld r3,_MSR(r1) rldicl r4,r3,49,63 /* r0 = (r3 >> 15) & 1 */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index ac222d0ab12..4efb886ea43 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,6 +104,13 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } +#ifdef CONFIG_PERF_COUNTERS +notrace void __weak perf_counter_do_pending(void) +{ + set_perf_counter_pending(0); +} +#endif + notrace void raw_local_irq_restore(unsigned long en) { /* @@ -135,6 +142,9 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } + if (get_perf_counter_pending()) + perf_counter_do_pending(); + /* * if (get_paca()->hard_enabled) return; * But again we need to take care that gcc gets hard_enabled directly From 4574910e5087085a1f330ff8373cee4503f5c77c Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 20:21:55 +1100 Subject: [PATCH 0061/2061] powerpc/perf_counter: Add generic support for POWER-family PMU hardware This provides the architecture-specific functions needed to access PMU hardware on the 64-bit PowerPC processors. It has been designed for the IBM POWER family (POWER 4/4+/5/5+/6 and PPC970) but will hopefully also suit other 64-bit PowerPC machines (although probably not Cell given how different it is in this area). This doesn't include back-ends for any specific processors. This implements a system which allows back-ends to express the constraints that their hardware has on what events can be counted simultaneously. The constraints are expressed as a 64-bit mask + 64-bit value for each event, and the encoding is capable of expressing the constraints arising from having a set of multiplexers feeding an event bus, with some events being available through multiple multiplexer settings, such as we get on POWER4 and PPC970. Furthermore, the back-end can supply alternative event codes for each event, and the constraint checking code will try all possible combinations of alternative event codes to try to find a combination that will fit. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/perf_counter.h | 62 ++ arch/powerpc/kernel/Makefile | 1 + arch/powerpc/kernel/perf_counter.c | 754 ++++++++++++++++++++++++ 3 files changed, 817 insertions(+) create mode 100644 arch/powerpc/kernel/perf_counter.c diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 59530ae1d53..9d7ff6d7fb5 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -8,3 +8,65 @@ * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. */ +#include + +#define MAX_HWCOUNTERS 8 +#define MAX_EVENT_ALTERNATIVES 8 + +/* + * This struct provides the constants and functions needed to + * describe the PMU on a particular POWER-family CPU. + */ +struct power_pmu { + int n_counter; + int max_alternatives; + u64 add_fields; + u64 test_adder; + int (*compute_mmcr)(unsigned int events[], int n_ev, + unsigned int hwc[], u64 mmcr[]); + int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); + int (*get_alternatives)(unsigned int event, unsigned int alt[]); + void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); + int n_generic; + int *generic_events; +}; + +extern struct power_pmu *ppmu; + +/* + * The power_pmu.get_constraint function returns a 64-bit value and + * a 64-bit mask that express the constraints between this event and + * other events. + * + * The value and mask are divided up into (non-overlapping) bitfields + * of three different types: + * + * Select field: this expresses the constraint that some set of bits + * in MMCR* needs to be set to a specific value for this event. For a + * select field, the mask contains 1s in every bit of the field, and + * the value contains a unique value for each possible setting of the + * MMCR* bits. The constraint checking code will ensure that two events + * that set the same field in their masks have the same value in their + * value dwords. + * + * Add field: this expresses the constraint that there can be at most + * N events in a particular class. A field of k bits can be used for + * N <= 2^(k-1) - 1. The mask has the most significant bit of the field + * set (and the other bits 0), and the value has only the least significant + * bit of the field set. In addition, the 'add_fields' and 'test_adder' + * in the struct power_pmu for this processor come into play. The + * add_fields value contains 1 in the LSB of the field, and the + * test_adder contains 2^(k-1) - 1 - N in the field. + * + * NAND field: this expresses the constraint that you may not have events + * in all of a set of classes. (For example, on PPC970, you can't select + * events from the FPU, ISU and IDU simultaneously, although any two are + * possible.) For N classes, the field is N+1 bits wide, and each class + * is assigned one bit from the least-significant N bits. The mask has + * only the most-significant bit set, and the value has only the bit + * for the event's class set. The test_adder has the least significant + * bit set in the field. + * + * If an event is not subject to the constraint expressed by a particular + * field, then it will have 0 in both the mask and value for that field. + */ diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 1308a86e907..fde190bbb2b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c new file mode 100644 index 00000000000..c7d4c2966a5 --- /dev/null +++ b/arch/powerpc/kernel/perf_counter.c @@ -0,0 +1,754 @@ +/* + * Performance counter support - powerpc architecture code + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include +#include +#include +#include +#include + +struct cpu_hw_counters { + int n_counters; + int n_percpu; + int disabled; + int n_added; + struct perf_counter *counter[MAX_HWCOUNTERS]; + unsigned int events[MAX_HWCOUNTERS]; + u64 mmcr[3]; +}; +DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); + +struct power_pmu *ppmu; + +void perf_counter_print_debug(void) +{ +} + +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + +/* + * Read one performance monitor counter (PMC). + */ +static unsigned long read_pmc(int idx) +{ + unsigned long val; + + switch (idx) { + case 1: + val = mfspr(SPRN_PMC1); + break; + case 2: + val = mfspr(SPRN_PMC2); + break; + case 3: + val = mfspr(SPRN_PMC3); + break; + case 4: + val = mfspr(SPRN_PMC4); + break; + case 5: + val = mfspr(SPRN_PMC5); + break; + case 6: + val = mfspr(SPRN_PMC6); + break; + case 7: + val = mfspr(SPRN_PMC7); + break; + case 8: + val = mfspr(SPRN_PMC8); + break; + default: + printk(KERN_ERR "oops trying to read PMC%d\n", idx); + val = 0; + } + return val; +} + +/* + * Write one PMC. + */ +static void write_pmc(int idx, unsigned long val) +{ + switch (idx) { + case 1: + mtspr(SPRN_PMC1, val); + break; + case 2: + mtspr(SPRN_PMC2, val); + break; + case 3: + mtspr(SPRN_PMC3, val); + break; + case 4: + mtspr(SPRN_PMC4, val); + break; + case 5: + mtspr(SPRN_PMC5, val); + break; + case 6: + mtspr(SPRN_PMC6, val); + break; + case 7: + mtspr(SPRN_PMC7, val); + break; + case 8: + mtspr(SPRN_PMC8, val); + break; + default: + printk(KERN_ERR "oops trying to write PMC%d\n", idx); + } +} + +/* + * Check if a set of events can all go on the PMU at once. + * If they can't, this will look at alternative codes for the events + * and see if any combination of alternative codes is feasible. + * The feasible set is returned in event[]. + */ +static int power_check_constraints(unsigned int event[], int n_ev) +{ + u64 mask, value, nv; + unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; + int n_alt[MAX_HWCOUNTERS], choice[MAX_HWCOUNTERS]; + int i, j; + u64 addf = ppmu->add_fields; + u64 tadd = ppmu->test_adder; + + if (n_ev > ppmu->n_counter) + return -1; + + /* First see if the events will go on as-is */ + for (i = 0; i < n_ev; ++i) { + alternatives[i][0] = event[i]; + if (ppmu->get_constraint(event[i], &amasks[i][0], + &avalues[i][0])) + return -1; + choice[i] = 0; + } + value = mask = 0; + for (i = 0; i < n_ev; ++i) { + nv = (value | avalues[i][0]) + (value & avalues[i][0] & addf); + if ((((nv + tadd) ^ value) & mask) != 0 || + (((nv + tadd) ^ avalues[i][0]) & amasks[i][0]) != 0) + break; + value = nv; + mask |= amasks[i][0]; + } + if (i == n_ev) + return 0; /* all OK */ + + /* doesn't work, gather alternatives... */ + if (!ppmu->get_alternatives) + return -1; + for (i = 0; i < n_ev; ++i) { + n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); + for (j = 1; j < n_alt[i]; ++j) + ppmu->get_constraint(alternatives[i][j], + &amasks[i][j], &avalues[i][j]); + } + + /* enumerate all possibilities and see if any will work */ + i = 0; + j = -1; + value = mask = nv = 0; + while (i < n_ev) { + if (j >= 0) { + /* we're backtracking, restore context */ + value = svalues[i]; + mask = smasks[i]; + j = choice[i]; + } + /* + * See if any alternative k for event i, + * where k > j, will satisfy the constraints. + */ + while (++j < n_alt[i]) { + nv = (value | avalues[i][j]) + + (value & avalues[i][j] & addf); + if ((((nv + tadd) ^ value) & mask) == 0 && + (((nv + tadd) ^ avalues[i][j]) + & amasks[i][j]) == 0) + break; + } + if (j >= n_alt[i]) { + /* + * No feasible alternative, backtrack + * to event i-1 and continue enumerating its + * alternatives from where we got up to. + */ + if (--i < 0) + return -1; + } else { + /* + * Found a feasible alternative for event i, + * remember where we got up to with this event, + * go on to the next event, and start with + * the first alternative for it. + */ + choice[i] = j; + svalues[i] = value; + smasks[i] = mask; + value = nv; + mask |= amasks[i][j]; + ++i; + j = -1; + } + } + + /* OK, we have a feasible combination, tell the caller the solution */ + for (i = 0; i < n_ev; ++i) + event[i] = alternatives[i][choice[i]]; + return 0; +} + +static void power_perf_read(struct perf_counter *counter) +{ + long val, delta, prev; + + if (!counter->hw.idx) + return; + /* + * Performance monitor interrupts come even when interrupts + * are soft-disabled, as long as interrupts are hard-enabled. + * Therefore we treat them like NMIs. + */ + do { + prev = atomic64_read(&counter->hw.prev_count); + barrier(); + val = read_pmc(counter->hw.idx); + } while (atomic64_cmpxchg(&counter->hw.prev_count, prev, val) != prev); + + /* The counters are only 32 bits wide */ + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &counter->hw.period_left); +} + +/* + * Disable all counters to prevent PMU interrupts and to allow + * counters to be added or removed. + */ +u64 hw_perf_save_disable(void) +{ + struct cpu_hw_counters *cpuhw; + unsigned long ret; + unsigned long flags; + + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); + + ret = cpuhw->disabled; + if (!ret) { + cpuhw->disabled = 1; + cpuhw->n_added = 0; + + /* + * Set the 'freeze counters' bit. + * The barrier is to make sure the mtspr has been + * executed and the PMU has frozen the counters + * before we return. + */ + mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + mb(); + } + local_irq_restore(flags); + return ret; +} + +/* + * Re-enable all counters if disable == 0. + * If we were previously disabled and counters were added, then + * put the new config on the PMU. + */ +void hw_perf_restore(u64 disable) +{ + struct perf_counter *counter; + struct cpu_hw_counters *cpuhw; + unsigned long flags; + long i; + unsigned long val; + s64 left; + unsigned int hwc_index[MAX_HWCOUNTERS]; + + if (disable) + return; + local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); + cpuhw->disabled = 0; + + /* + * If we didn't change anything, or only removed counters, + * no need to recalculate MMCR* settings and reset the PMCs. + * Just reenable the PMU with the current MMCR* settings + * (possibly updated for removal of counters). + */ + if (!cpuhw->n_added) { + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + goto out; + } + + /* + * Compute MMCR* values for the new set of counters + */ + if (ppmu->compute_mmcr(cpuhw->events, cpuhw->n_counters, hwc_index, + cpuhw->mmcr)) { + /* shouldn't ever get here */ + printk(KERN_ERR "oops compute_mmcr failed\n"); + goto out; + } + + /* + * Write the new configuration to MMCR* with the freeze + * bit set and set the hardware counters to their initial values. + * Then unfreeze the counters. + */ + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); + mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) + | MMCR0_FC); + + /* + * Read off any pre-existing counters that need to move + * to another PMC. + */ + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { + power_perf_read(counter); + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + } + } + + /* + * Initialize the PMCs for all the new and moved counters. + */ + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter->hw.idx) + continue; + val = 0; + if (counter->hw_event.irq_period) { + left = atomic64_read(&counter->hw.period_left); + if (left < 0x80000000L) + val = 0x80000000L - left; + } + atomic64_set(&counter->hw.prev_count, val); + counter->hw.idx = hwc_index[i] + 1; + write_pmc(counter->hw.idx, val); + } + mb(); + cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + + out: + local_irq_restore(flags); +} + +static int collect_events(struct perf_counter *group, int max_count, + struct perf_counter *ctrs[], unsigned int *events) +{ + int n = 0; + struct perf_counter *counter; + + if (!is_software_counter(group)) { + if (n >= max_count) + return -1; + ctrs[n] = group; + events[n++] = group->hw.config; + } + list_for_each_entry(counter, &group->sibling_list, list_entry) { + if (!is_software_counter(counter) && + counter->state != PERF_COUNTER_STATE_OFF) { + if (n >= max_count) + return -1; + ctrs[n] = counter; + events[n++] = counter->hw.config; + } + } + return n; +} + +static void counter_sched_in(struct perf_counter *counter, int cpu) +{ + counter->state = PERF_COUNTER_STATE_ACTIVE; + counter->oncpu = cpu; + if (is_software_counter(counter)) + counter->hw_ops->enable(counter); +} + +/* + * Called to enable a whole group of counters. + * Returns 1 if the group was enabled, or -EAGAIN if it could not be. + * Assumes the caller has disabled interrupts and has + * frozen the PMU with hw_perf_save_disable. + */ +int hw_perf_group_sched_in(struct perf_counter *group_leader, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, int cpu) +{ + struct cpu_hw_counters *cpuhw; + long i, n, n0; + struct perf_counter *sub; + + cpuhw = &__get_cpu_var(cpu_hw_counters); + n0 = cpuhw->n_counters; + n = collect_events(group_leader, ppmu->n_counter - n0, + &cpuhw->counter[n0], &cpuhw->events[n0]); + if (n < 0) + return -EAGAIN; + if (power_check_constraints(cpuhw->events, n + n0)) + return -EAGAIN; + cpuhw->n_counters = n0 + n; + cpuhw->n_added += n; + + /* + * OK, this group can go on; update counter states etc., + * and enable any software counters + */ + for (i = n0; i < n0 + n; ++i) + cpuhw->counter[i]->hw.config = cpuhw->events[i]; + n = 1; + counter_sched_in(group_leader, cpu); + list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { + if (sub->state != PERF_COUNTER_STATE_OFF) { + counter_sched_in(sub, cpu); + ++n; + } + } + cpuctx->active_oncpu += n; + ctx->nr_active += n; + + return 1; +} + +/* + * Add a counter to the PMU. + * If all counters are not already frozen, then we disable and + * re-enable the PMU in order to get hw_perf_restore to do the + * actual work of reconfiguring the PMU. + */ +static int power_perf_enable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuhw; + unsigned long flags; + u64 pmudis; + int n0; + int ret = -EAGAIN; + + local_irq_save(flags); + pmudis = hw_perf_save_disable(); + + /* + * Add the counter to the list (if there is room) + * and check whether the total set is still feasible. + */ + cpuhw = &__get_cpu_var(cpu_hw_counters); + n0 = cpuhw->n_counters; + if (n0 >= ppmu->n_counter) + goto out; + cpuhw->counter[n0] = counter; + cpuhw->events[n0] = counter->hw.config; + if (power_check_constraints(cpuhw->events, n0 + 1)) + goto out; + + counter->hw.config = cpuhw->events[n0]; + ++cpuhw->n_counters; + ++cpuhw->n_added; + + ret = 0; + out: + hw_perf_restore(pmudis); + local_irq_restore(flags); + return ret; +} + +/* + * Remove a counter from the PMU. + */ +static void power_perf_disable(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuhw; + long i; + u64 pmudis; + unsigned long flags; + + local_irq_save(flags); + pmudis = hw_perf_save_disable(); + + power_perf_read(counter); + + cpuhw = &__get_cpu_var(cpu_hw_counters); + for (i = 0; i < cpuhw->n_counters; ++i) { + if (counter == cpuhw->counter[i]) { + while (++i < cpuhw->n_counters) + cpuhw->counter[i-1] = cpuhw->counter[i]; + --cpuhw->n_counters; + ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + break; + } + } + if (cpuhw->n_counters == 0) { + /* disable exceptions if no counters are running */ + cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); + } + + hw_perf_restore(pmudis); + local_irq_restore(flags); +} + +struct hw_perf_counter_ops power_perf_ops = { + .enable = power_perf_enable, + .disable = power_perf_disable, + .read = power_perf_read +}; + +const struct hw_perf_counter_ops * +hw_perf_counter_init(struct perf_counter *counter) +{ + unsigned long ev; + struct perf_counter *ctrs[MAX_HWCOUNTERS]; + unsigned int events[MAX_HWCOUNTERS]; + int n; + + if (!ppmu) + return NULL; + if ((s64)counter->hw_event.irq_period < 0) + return NULL; + ev = counter->hw_event.type; + if (!counter->hw_event.raw) { + if (ev >= ppmu->n_generic || + ppmu->generic_events[ev] == 0) + return NULL; + ev = ppmu->generic_events[ev]; + } + counter->hw.config_base = ev; + counter->hw.idx = 0; + + /* + * If this is in a group, check if it can go on with all the + * other hardware counters in the group. We assume the counter + * hasn't been linked into its leader's sibling list at this point. + */ + n = 0; + if (counter->group_leader != counter) { + n = collect_events(counter->group_leader, ppmu->n_counter - 1, + ctrs, events); + if (n < 0) + return NULL; + } + events[n++] = ev; + if (power_check_constraints(events, n)) + return NULL; + + counter->hw.config = events[n - 1]; + atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + return &power_perf_ops; +} + +/* + * Handle wakeups. + */ +void perf_counter_do_pending(void) +{ + int i; + struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); + struct perf_counter *counter; + + set_perf_counter_pending(0); + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + if (counter && counter->wakeup_pending) { + counter->wakeup_pending = 0; + wake_up(&counter->waitq); + } + } +} + +/* + * Record data for an irq counter. + * This function was lifted from the x86 code; maybe it should + * go in the core? + */ +static void perf_store_irq_data(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +/* + * Record all the values of the counters in a group + */ +static void perf_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, atomic64_read(&sub->count)); + } +} + +/* + * A counter has overflowed; update its count and record + * things if requested. Note that interrupts are hard-disabled + * here so there is no possibility of being interrupted. + */ +static void record_and_restart(struct perf_counter *counter, long val, + struct pt_regs *regs) +{ + s64 prev, delta, left; + int record = 0; + + /* we don't have to worry about interrupts here */ + prev = atomic64_read(&counter->hw.prev_count); + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + + /* + * See if the total period for this counter has expired, + * and update for the next period. + */ + val = 0; + left = atomic64_read(&counter->hw.period_left) - delta; + if (counter->hw_event.irq_period) { + if (left <= 0) { + left += counter->hw_event.irq_period; + if (left <= 0) + left = counter->hw_event.irq_period; + record = 1; + } + if (left < 0x80000000L) + val = 0x80000000L - left; + } + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + + /* + * Finally record data if requested. + */ + if (record) { + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + case PERF_RECORD_IRQ: + perf_store_irq_data(counter, instruction_pointer(regs)); + counter->wakeup_pending = 1; + break; + case PERF_RECORD_GROUP: + perf_handle_group(counter); + counter->wakeup_pending = 1; + break; + } + } +} + +/* + * Performance monitor interrupt stuff + */ +static void perf_counter_interrupt(struct pt_regs *regs) +{ + int i; + struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); + struct perf_counter *counter; + long val; + int need_wakeup = 0, found = 0; + + for (i = 0; i < cpuhw->n_counters; ++i) { + counter = cpuhw->counter[i]; + val = read_pmc(counter->hw.idx); + if ((int)val < 0) { + /* counter has overflowed */ + found = 1; + record_and_restart(counter, val, regs); + if (counter->wakeup_pending) + need_wakeup = 1; + } + } + + /* + * In case we didn't find and reset the counter that caused + * the interrupt, scan all counters and reset any that are + * negative, to avoid getting continual interrupts. + * Any that we processed in the previous loop will not be negative. + */ + if (!found) { + for (i = 0; i < ppmu->n_counter; ++i) { + val = read_pmc(i + 1); + if ((int)val < 0) + write_pmc(i + 1, 0); + } + } + + /* + * Reset MMCR0 to its normal value. This will set PMXE and + * clear FC (freeze counters) and PMAO (perf mon alert occurred) + * and thus allow interrupts to occur again. + * XXX might want to use MSR.PM to keep the counters frozen until + * we get back out of this interrupt. + */ + mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + + /* + * If we need a wakeup, check whether interrupts were soft-enabled + * when we took the interrupt. If they were, we can wake stuff up + * immediately; otherwise we'll have to set a flag and do the + * wakeup when interrupts get soft-enabled. + */ + if (need_wakeup) { + if (regs->softe) { + irq_enter(); + perf_counter_do_pending(); + irq_exit(); + } else { + set_perf_counter_pending(1); + } + } +} + +static int init_perf_counters(void) +{ + if (reserve_pmc_hardware(perf_counter_interrupt)) { + printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); + return -EBUSY; + } + + return 0; +} + +arch_initcall(init_perf_counters); From 16b067993dee3dfde61b20027e0b168dc06201ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 10 Jan 2009 16:34:07 +1100 Subject: [PATCH 0062/2061] powerpc/perf_counter: Add support for PPC970 family This adds the back-end for the PMU on the PPC970 family. The PPC970 allows events from the ISU to be selected in two different ways. Rather than use alternative event codes to express this, we instead use a single encoding for ISU events and express the resulting constraint (that you can't select events from all three of FPU/IFU/VPU, ISU and IDU/STS at the same time, since they all come in through only 2 multiplexers) using a NAND constraint field, and work out which multiplexer is used for ISU events at compute_mmcr time. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 13 + arch/powerpc/kernel/ppc970-pmu.c | 375 +++++++++++++++++++++++++++++ 3 files changed, 389 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/ppc970-pmu.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index fde190bbb2b..45798f6fb13 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c7d4c2966a5..5561ecb02a4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -741,13 +741,26 @@ static void perf_counter_interrupt(struct pt_regs *regs) } } +extern struct power_pmu ppc970_pmu; + static int init_perf_counters(void) { + unsigned long pvr; + if (reserve_pmc_hardware(perf_counter_interrupt)) { printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); return -EBUSY; } + /* XXX should get this from cputable */ + pvr = mfspr(SPRN_PVR); + switch (PVR_VER(pvr)) { + case PV_970: + case PV_970FX: + case PV_970MP: + ppmu = &ppc970_pmu; + break; + } return 0; } diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c new file mode 100644 index 00000000000..c3256580be1 --- /dev/null +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -0,0 +1,375 @@ +/* + * Performance counter support for PPC970-family processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for PPC970 + */ +#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 4 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_PMCSEL_MSK 0xf + +/* Values in PM_UNIT field */ +#define PM_NONE 0 +#define PM_FPU 1 +#define PM_VPU 2 +#define PM_ISU 3 +#define PM_IFU 4 +#define PM_IDU 5 +#define PM_STS 6 +#define PM_LSU0 7 +#define PM_LSU1U 8 +#define PM_LSU1L 9 +#define PM_LASTUNIT 9 + +/* + * Bits in MMCR0 for PPC970 + */ +#define MMCR0_PMC1SEL_SH 8 +#define MMCR0_PMC2SEL_SH 1 +#define MMCR_PMCSEL_MSK 0x1f + +/* + * Bits in MMCR1 for PPC970 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 59 +#define MMCR1_TTM3SEL_SH 53 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 50 +#define MMCR1_TD_CP_DBG1SEL_SH 48 +#define MMCR1_TD_CP_DBG2SEL_SH 46 +#define MMCR1_TD_CP_DBG3SEL_SH 44 +#define MMCR1_PMC1_ADDER_SEL_SH 39 +#define MMCR1_PMC2_ADDER_SEL_SH 38 +#define MMCR1_PMC6_ADDER_SEL_SH 37 +#define MMCR1_PMC5_ADDER_SEL_SH 36 +#define MMCR1_PMC8_ADDER_SEL_SH 35 +#define MMCR1_PMC7_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC3SEL_SH 27 +#define MMCR1_PMC4SEL_SH 22 +#define MMCR1_PMC5SEL_SH 17 +#define MMCR1_PMC6SEL_SH 12 +#define MMCR1_PMC7SEL_SH 7 +#define MMCR1_PMC8SEL_SH 2 + +static short mmcr1_adder_bits[8] = { + MMCR1_PMC1_ADDER_SEL_SH, + MMCR1_PMC2_ADDER_SEL_SH, + MMCR1_PMC3_ADDER_SEL_SH, + MMCR1_PMC4_ADDER_SEL_SH, + MMCR1_PMC5_ADDER_SEL_SH, + MMCR1_PMC6_ADDER_SEL_SH, + MMCR1_PMC7_ADDER_SEL_SH, + MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> + * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * + * T0 - TTM0 constraint + * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 + * + * T1 - TTM1 constraint + * 44-45: TTM1SEL value (0=IDU, 3=STS) 0x3000_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|VPU, ISU, IDU|STS + * 43: UC3 error 0x0800_0000_0000 + * 42: FPU|IFU|VPU events needed 0x0400_0000_0000 + * 41: ISU events needed 0x0200_0000_0000 + * 40: IDU|STS events needed 0x0100_0000_0000 + * + * PS1 + * 39: PS1 error 0x0080_0000_0000 + * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + * 35: PS2 error 0x0008_0000_0000 + * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + * 28-31: Byte 0 event source 0xf000_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P1 + * 15: P1 error 0x8000 + * 14-15: Count of events needing PMC1 + * + * P2..P8 + * 0-13: Count of events needing PMC2..PMC8 + */ + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, + [PM_VPU] = { 0xc80000000000ull, 0xc40000000000ull }, + [PM_ISU] = { 0x080000000000ull, 0x020000000000ull }, + [PM_IFU] = { 0xc80000000000ull, 0x840000000000ull }, + [PM_IDU] = { 0x380000000000ull, 0x010000000000ull }, + [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, +}; + +static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, sh; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 8) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + grp = ((pmc - 1) >> 1) & 1; + } + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit) { + if (unit > PM_LASTUNIT) + return -1; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. + */ + if (!pmc) + grp = byte & 1; + /* Set byte lane select field */ + mask |= 0xfULL << (28 - 4 * byte); + value |= (u64)unit << (28 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2/5/6 field */ + mask |= 0x8000000000ull; + value |= 0x1000000000ull; + } else if (grp == 1) { + /* increment PMC3/4/7/8 field */ + mask |= 0x800000000ull; + value |= 0x100000000ull; + } + *maskp = mask; + *valp = value; + return 0; +} + +static int p970_get_alternatives(unsigned int event, unsigned int alt[]) +{ + alt[0] = event; + + /* 2 alternatives for LSU empty */ + if (event == 0x2002 || event == 0x3002) { + alt[1] = event ^ 0x1000; + return 2; + } + + return 1; +} + +static int p970_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm, grp; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + unsigned char unitmap[] = { 0, 0<<3, 3<<3, 1<<3, 2<<3, 0|4, 3|4 }; + unsigned char ttmuse[2]; + unsigned char pmcsel[8]; + int i; + + if (n_ev > 8) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + /* count 1/2/5/6 vs 3/4/7/8 use */ + ++pmc_grp_use[((pmc - 1) >> 1) & 1]; + } + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit) { + if (unit > PM_LASTUNIT) + return -1; + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_VPU])) + unitmap[PM_ISU] = 2 | 4; /* move ISU to TTM1 */ + /* Set TTM[01]SEL fields. */ + ttmuse[0] = ttmuse[1] = 0; + for (i = PM_FPU; i <= PM_STS; ++i) { + if (!unituse[i]) + continue; + ttm = unitmap[i]; + ++ttmuse[(ttm >> 2) & 1]; + mmcr1 |= (u64)(ttm & ~4) << MMCR1_TTM1SEL_SH; + } + /* Check only one unit per TTMx */ + if (ttmuse[0] > 1 || ttmuse[1] > 1) + return -1; + + /* Set byte lane select fields and TTM3SEL. */ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit <= PM_STS) + ttm = (unitmap[unit] >> 2) & 1; + else if (unit == PM_LSU0) + ttm = 2; + else { + ttm = 3; + if (unit == PM_LSU1L && byte >= 2) + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + memset(pmcsel, 0x8, sizeof(pmcsel)); /* 8 means don't count */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + if (unit) + psel |= 0x10 | ((byte & 2) << 2); + else + psel |= 8; + for (pmc = 0; pmc < 8; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (unit) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 4) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (psel == 0 && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; + } + pmcsel[pmc] = psel; + hwc[i] = pmc; + } + for (pmc = 0; pmc < 2; ++pmc) + mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); + for (; pmc < 8; ++pmc) + mmcr1 |= (u64)pmcsel[pmc] << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); + if (pmc_inuse & 1) + mmcr0 |= MMCR0_PMC1CE; + if (pmc_inuse & 0xfe) + mmcr0 |= MMCR0_PMCjCE; + + mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ + + /* Return MMCRx values */ + mmcr[0] = mmcr0; + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + return 0; +} + +static void p970_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + int shift, i; + + if (pmc <= 1) { + shift = MMCR0_PMC1SEL_SH - 7 * pmc; + i = 0; + } else { + shift = MMCR1_PMC3SEL_SH - 5 * (pmc - 2); + i = 1; + } + /* + * Setting the PMCxSEL field to 0x08 disables PMC x. + */ + mmcr[i] = (mmcr[i] & ~(0x1fUL << shift)) | (0x08UL << shift); +} + +static int ppc970_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 7, + [PERF_COUNT_INSTRUCTIONS] = 1, + [PERF_COUNT_CACHE_REFERENCES] = 0x8810, /* PM_LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3810, /* PM_LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x431, /* PM_BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x327, /* PM_GRP_BR_MPRED */ +}; + +struct power_pmu ppc970_pmu = { + .n_counter = 8, + .max_alternatives = 2, + .add_fields = 0x001100005555ull, + .test_adder = 0x013300000000ull, + .compute_mmcr = p970_compute_mmcr, + .get_constraint = p970_get_constraint, + .get_alternatives = p970_get_alternatives, + .disable_pmc = p970_disable_pmc, + .n_generic = ARRAY_SIZE(ppc970_generic_events), + .generic_events = ppc970_generic_events, +}; From f78628374a13bc150db77c6e02d4f2c0a7f932ef Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 9 Jan 2009 21:05:35 +1100 Subject: [PATCH 0063/2061] powerpc/perf_counter: Add support for POWER6 This adds the back-end for the PMU on the POWER6 processor. Fortunately, the event selection hardware is somewhat simpler on POWER6 than on other POWER family processors, so the constraints fit into only 32 bits. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power6-pmu.c | 283 +++++++++++++++++++++++++++++ 3 files changed, 288 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power6-pmu.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 45798f6fb13..0ebf4d04d4b 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5561ecb02a4..df3fe057dee 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -742,6 +742,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) } extern struct power_pmu ppc970_pmu; +extern struct power_pmu power6_pmu; static int init_perf_counters(void) { @@ -760,6 +761,9 @@ static int init_perf_counters(void) case PV_970MP: ppmu = &ppc970_pmu; break; + case 0x3e: + ppmu = &power6_pmu; + break; } return 0; } diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c new file mode 100644 index 00000000000..b1f61f3c97b --- /dev/null +++ b/arch/powerpc/kernel/power6-pmu.c @@ -0,0 +1,283 @@ +/* + * Performance counter support for POWER6 processors. + * + * Copyright 2008-2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for POWER6 + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0x7 +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* Unit event comes (TTMxSEL encoding) */ +#define PM_UNIT_MSK 0xf +#define PM_UNIT_MSKS (PM_UNIT_MSK << PM_UNIT_SH) +#define PM_LLAV 0x8000 /* Load lookahead match value */ +#define PM_LLA 0x4000 /* Load lookahead match enable */ +#define PM_BYTE_SH 12 /* Byte of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_SUBUNIT_SH 8 /* Subunit event comes from (NEST_SEL enc.) */ +#define PM_SUBUNIT_MSK 7 +#define PM_SUBUNIT_MSKS (PM_SUBUNIT_MSK << PM_SUBUNIT_SH) +#define PM_PMCSEL_MSK 0xff /* PMCxSEL value */ +#define PM_BUSEVENT_MSK 0xf3700 + +/* + * Bits in MMCR1 for POWER6 + */ +#define MMCR1_TTM0SEL_SH 60 +#define MMCR1_TTMSEL_SH(n) (MMCR1_TTM0SEL_SH - (n) * 4) +#define MMCR1_TTMSEL_MSK 0xf +#define MMCR1_TTMSEL(m, n) (((m) >> MMCR1_TTMSEL_SH(n)) & MMCR1_TTMSEL_MSK) +#define MMCR1_NESTSEL_SH 45 +#define MMCR1_NESTSEL_MSK 0x7 +#define MMCR1_NESTSEL(m) (((m) >> MMCR1_NESTSEL_SH) & MMCR1_NESTSEL_MSK) +#define MMCR1_PMC1_LLA ((u64)1 << 44) +#define MMCR1_PMC1_LLA_VALUE ((u64)1 << 39) +#define MMCR1_PMC1_ADDR_SEL ((u64)1 << 35) +#define MMCR1_PMC1SEL_SH 24 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0xff + +/* + * Assign PMC numbers and compute MMCR1 value for a set of events + */ +static int p6_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + int i; + unsigned int pmc, ev, b, u, s, psel; + unsigned int ttmset = 0; + unsigned int pmc_inuse = 0; + + if (n_ev > 4) + return -1; + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; /* collision! */ + pmc_inuse |= 1 << (pmc - 1); + } + } + for (i = 0; i < n_ev; ++i) { + ev = event[i]; + pmc = (ev >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + --pmc; + } else { + /* can go on any PMC; find a free one */ + for (pmc = 0; pmc < 4; ++pmc) + if (!(pmc_inuse & (1 << pmc))) + break; + pmc_inuse |= 1 << pmc; + } + hwc[i] = pmc; + psel = ev & PM_PMCSEL_MSK; + if (ev & PM_BUSEVENT_MSK) { + /* this event uses the event bus */ + b = (ev >> PM_BYTE_SH) & PM_BYTE_MSK; + u = (ev >> PM_UNIT_SH) & PM_UNIT_MSK; + /* check for conflict on this byte of event bus */ + if ((ttmset & (1 << b)) && MMCR1_TTMSEL(mmcr1, b) != u) + return -1; + mmcr1 |= (u64)u << MMCR1_TTMSEL_SH(b); + ttmset |= 1 << b; + if (u == 5) { + /* Nest events have a further mux */ + s = (ev >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; + if ((ttmset & 0x10) && + MMCR1_NESTSEL(mmcr1) != s) + return -1; + ttmset |= 0x10; + mmcr1 |= (u64)s << MMCR1_NESTSEL_SH; + } + if (0x30 <= psel && psel <= 0x3d) { + /* these need the PMCx_ADDR_SEL bits */ + if (b >= 2) + mmcr1 |= MMCR1_PMC1_ADDR_SEL >> pmc; + } + /* bus select values are different for PMC3/4 */ + if (pmc >= 2 && (psel & 0x90) == 0x80) + psel ^= 0x20; + } + if (ev & PM_LLA) { + mmcr1 |= MMCR1_PMC1_LLA >> pmc; + if (ev & PM_LLAV) + mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; + } + mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); + } + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0xe) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +/* + * Layout of constraint bits: + * + * 0-1 add field: number of uses of PMC1 (max 1) + * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 + * 8-10 select field: nest (subunit) event selector + * 16-19 select field: unit on byte 0 of event bus + * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 + */ +static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, sh; + unsigned int mask = 0, value = 0; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + } + if (event & PM_BUSEVENT_MSK) { + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + sh = byte * 4; + mask |= PM_UNIT_MSKS << sh; + value |= (event & PM_UNIT_MSKS) << sh; + if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { + mask |= PM_SUBUNIT_MSKS; + value |= event & PM_SUBUNIT_MSKS; + } + } + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 4 /* at most 4 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ + { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ + { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ + { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ + { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ + { 0x10000e, 0x400010 }, /* PM_PURR */ + { 0x100010, 0x4000f8 }, /* PM_FLUSH */ + { 0x10001a, 0x200010 }, /* PM_MRK_INST_DISP */ + { 0x100026, 0x3000f8 }, /* PM_TB_BIT_TRANS */ + { 0x100054, 0x2000f0 }, /* PM_ST_FIN */ + { 0x100056, 0x2000fc }, /* PM_L1_ICACHE_MISS */ + { 0x1000f0, 0x40000a }, /* PM_INST_IMC_MATCH_CMPL */ + { 0x1000f8, 0x200008 }, /* PM_GCT_EMPTY_CYC */ + { 0x1000fc, 0x400006 }, /* PM_LSU_DERAT_MISS_CYC */ + { 0x20000e, 0x400007 }, /* PM_LSU_DERAT_MISS */ + { 0x200012, 0x300012 }, /* PM_INST_DISP */ + { 0x2000f2, 0x3000f2 }, /* PM_INST_DISP */ + { 0x2000f8, 0x300010 }, /* PM_EXT_INT */ + { 0x2000fe, 0x300056 }, /* PM_DATA_FROM_L2MISS */ + { 0x2d0030, 0x30001a }, /* PM_MRK_FPU_FIN */ + { 0x30000a, 0x400018 }, /* PM_MRK_INST_FIN */ + { 0x3000f6, 0x40000e }, /* PM_L1_DCACHE_RELOAD_VALID */ + { 0x3000fe, 0x400056 }, /* PM_DATA_FROM_L3MISS */ +}; + +/* + * This could be made more efficient with a binary search on + * a presorted list, if necessary + */ +static int find_alternatives_list(unsigned int event) +{ + int i, j; + unsigned int alt; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + return -1; + for (j = 0; j < MAX_ALT; ++j) { + alt = event_alternatives[i][j]; + if (!alt || event < alt) + break; + if (event == alt) + return i; + } + } + return -1; +} + +static int p6_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j; + unsigned int aevent, psel, pmc; + unsigned int nalt = 1; + + alt[0] = event; + + /* check the alternatives table */ + i = find_alternatives_list(event); + if (i >= 0) { + /* copy out alternatives from list */ + for (j = 0; j < MAX_ALT; ++j) { + aevent = event_alternatives[i][j]; + if (!aevent) + break; + if (aevent != event) + alt[nalt++] = aevent; + } + + } else { + /* Check for alternative ways of computing sum events */ + /* PMCSEL 0x32 counter N == PMCSEL 0x34 counter 5-N */ + psel = event & (PM_PMCSEL_MSK & ~1); /* ignore edge bit */ + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc && (psel == 0x32 || psel == 0x34)) + alt[nalt++] = ((event ^ 0x6) & ~PM_PMC_MSKS) | + ((5 - pmc) << PM_PMC_SH); + + /* PMCSEL 0x38 counter N == PMCSEL 0x3a counter N+/-2 */ + if (pmc && (psel == 0x38 || psel == 0x3a)) + alt[nalt++] = ((event ^ 0x2) & ~PM_PMC_MSKS) | + ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); + } + + return nalt; +} + +static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + /* Set PMCxSEL to 0 to disable PMCx */ + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power6_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0x1e, + [PERF_COUNT_INSTRUCTIONS] = 2, + [PERF_COUNT_CACHE_REFERENCES] = 0x280030, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x30000c, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x410a0, /* BR_PRED */ + [PERF_COUNT_BRANCH_MISSES] = 0x400052, /* BR_MPRED */ +}; + +struct power_pmu power6_pmu = { + .n_counter = 4, + .max_alternatives = MAX_ALT, + .add_fields = 0x55, + .test_adder = 0, + .compute_mmcr = p6_compute_mmcr, + .get_constraint = p6_get_constraint, + .get_alternatives = p6_get_alternatives, + .disable_pmc = p6_disable_pmc, + .n_generic = ARRAY_SIZE(power6_generic_events), + .generic_events = power6_generic_events, +}; From dd0e6ba22ea21bcc2c420b385a170593c58f4c08 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 12 Jan 2009 15:11:00 +1100 Subject: [PATCH 0064/2061] perf_counter: Always schedule all software counters in Software counters aren't subject to the limitations imposed by the fixed number of hardware counter registers, so there is no reason not to enable them all in __perf_counter_sched_in. Previously we used to break out of the loop when we got to a group that wouldn't fit on the PMU; with this we continue through the list but only schedule in software counters (or groups containing only software counters) from there on. Signed-off-by: Paul Mackerras --- kernel/perf_counter.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c0dccb756a..3aef3062ff7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -455,12 +455,37 @@ group_error: return -EAGAIN; } +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; u64 flags; + int can_add_hw = 1; if (likely(!ctx->nr_counters)) return; @@ -477,10 +502,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, /* * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, break out: + * or if this group can't go on, don't add any more + * hardware counters. */ - if (group_sched_in(counter, cpuctx, ctx, cpu)) - break; + if (can_add_hw || is_software_only_group(counter)) + if (group_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; } hw_perf_restore(flags); spin_unlock(&ctx->lock); From 01d0287f068de2934109ba9b989d8807526cccc2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 13:44:19 +1100 Subject: [PATCH 0065/2061] powerpc/perf_counter: Make sure PMU gets enabled properly This makes sure that we call the platform-specific ppc_md.enable_pmcs function on each CPU before we try to use the PMU on that CPU. If the CPU goes off-line and then on-line, we need to do the enable_pmcs call again, so we use the hw_perf_counter_setup hook to ensure that. It gets called as each CPU comes online, but it isn't called on the CPU that is coming up, so this adds the CPU number as an argument to it (there were no non-empty instances of hw_perf_counter_setup before). This also arranges to set the pmcregs_in_use field of the lppaca (data structure shared with the hypervisor) on each CPU when we are using the PMU and clear it when we are not. This allows the hypervisor to optimize partition switches by not saving/restoring the PMU registers when we aren't using the PMU. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 22 ++++++++++++++++++++++ kernel/perf_counter.c | 4 ++-- 2 files changed, 24 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df3fe057dee..85ad25923c2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -15,6 +15,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -24,6 +25,7 @@ struct cpu_hw_counters { struct perf_counter *counter[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; u64 mmcr[3]; + u8 pmcs_enabled; }; DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); @@ -261,6 +263,15 @@ u64 hw_perf_save_disable(void) cpuhw->disabled = 1; cpuhw->n_added = 0; + /* + * Check if we ever enabled the PMU on this cpu. + */ + if (!cpuhw->pmcs_enabled) { + if (ppc_md.enable_pmcs) + ppc_md.enable_pmcs(); + cpuhw->pmcs_enabled = 1; + } + /* * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been @@ -305,6 +316,8 @@ void hw_perf_restore(u64 disable) mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + if (cpuhw->n_counters == 0) + get_lppaca()->pmcregs_in_use = 0; goto out; } @@ -323,6 +336,7 @@ void hw_perf_restore(u64 disable) * bit set and set the hardware counters to their initial values. * Then unfreeze the counters. */ + get_lppaca()->pmcregs_in_use = 1; mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) @@ -741,6 +755,14 @@ static void perf_counter_interrupt(struct pt_regs *regs) } } +void hw_perf_counter_setup(int cpu) +{ + struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); + + memset(cpuhw, 0, sizeof(*cpuhw)); + cpuhw->mmcr[0] = MMCR0_FC; +} + extern struct power_pmu ppc970_pmu; extern struct power_pmu power6_pmu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3aef3062ff7..52f2f526248 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,7 +46,7 @@ hw_perf_counter_init(struct perf_counter *counter) u64 __weak hw_perf_save_disable(void) { return 0; } void __weak hw_perf_restore(u64 ctrl) { barrier(); } -void __weak hw_perf_counter_setup(void) { barrier(); } +void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu) @@ -1598,7 +1598,7 @@ static void __cpuinit perf_counter_init_cpu(int cpu) cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; mutex_unlock(&perf_resource_mutex); - hw_perf_counter_setup(); + hw_perf_counter_setup(cpu); } #ifdef CONFIG_HOTPLUG_CPU From 3b6f9e5cb21964b7ce12bf81076f830885563ec8 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 14 Jan 2009 21:00:30 +1100 Subject: [PATCH 0066/2061] perf_counter: Add support for pinned and exclusive counter groups Impact: New perf_counter features A pinned counter group is one that the user wants to have on the CPU whenever possible, i.e. whenever the associated task is running, for a per-task group, or always for a per-cpu group. If the system cannot satisfy that, it puts the group into an error state where it is not scheduled any more and reads from it return EOF (i.e. 0 bytes read). The group can be released from error state and made readable again using prctl(PR_TASK_PERF_COUNTERS_ENABLE). When we have finer-grained enable/disable controls on counters we'll be able to reset the error state on individual groups. An exclusive group is one that the user wants to be the only group using the CPU performance monitor hardware whenever it is on. The counter group scheduler will not schedule an exclusive group if there are already other groups on the CPU and will not schedule other groups onto the CPU if there is an exclusive group scheduled (that statement does not apply to groups containing only software counters, which can always go on and which do not prevent an exclusive group from going on). With an exclusive group, we will be able to let users program PMU registers at a low level without the concern that those settings will perturb other measurements. Along the way this reorganizes things a little: - is_software_counter() is moved to perf_counter.h. - cpuctx->active_oncpu now records the number of hardware counters on the CPU, i.e. it now excludes software counters. Nothing was reading cpuctx->active_oncpu before, so this change is harmless. - A new cpuctx->exclusive field records whether we currently have an exclusive group on the CPU. - counter_sched_out moves higher up in perf_counter.c and gets called from __perf_counter_remove_from_context and __perf_counter_exit_task, where we used to have essentially the same code. - __perf_counter_sched_in now goes through the counter list twice, doing the pinned counters in the first loop and the non-pinned counters in the second loop, in order to give the pinned counters the best chance to be scheduled in. Note that only a group leader can be exclusive or pinned, and that attribute applies to the whole group. This avoids some awkwardness in some corner cases (e.g. where a group leader is closed and the other group members get added to the context list). If we want to relax that restriction later, we can, and it is easier to relax a restriction than to apply a new one. This doesn't yet handle the case where a pinned counter is inherited and goes into error state in the child - the error state is not propagated up to the parent when the child exits, and arguably it should. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 10 +- include/linux/perf_counter.h | 15 +- kernel/perf_counter.c | 226 ++++++++++++++++++++--------- 3 files changed, 169 insertions(+), 82 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 85ad25923c2..5b0211348c7 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -35,14 +35,6 @@ void perf_counter_print_debug(void) { } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - /* * Read one performance monitor counter (PMC). */ @@ -443,6 +435,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, */ for (i = n0; i < n0 + n; ++i) cpuhw->counter[i]->hw.config = cpuhw->events[i]; + cpuctx->active_oncpu += n; n = 1; counter_sched_in(group_leader, cpu); list_for_each_entry(sub, &group_leader->sibling_list, list_entry) { @@ -451,7 +444,6 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, ++n; } } - cpuctx->active_oncpu += n; ctx->nr_active += n; return 1; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b21d1ea4c05..7ab8e5f96f5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -86,7 +86,10 @@ struct perf_counter_hw_event { nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ - __reserved_1 : 28; + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only counter on PMU */ + + __reserved_1 : 26; u64 __reserved_2; }; @@ -141,6 +144,7 @@ struct hw_perf_counter_ops { * enum perf_counter_active_state - the states of a counter */ enum perf_counter_active_state { + PERF_COUNTER_STATE_ERROR = -2, PERF_COUNTER_STATE_OFF = -1, PERF_COUNTER_STATE_INACTIVE = 0, PERF_COUNTER_STATE_ACTIVE = 1, @@ -214,6 +218,7 @@ struct perf_cpu_context { struct perf_counter_context *task_ctx; int active_oncpu; int max_pertask; + int exclusive; }; /* @@ -240,6 +245,14 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +/* + * Return 1 for a software counter, 0 for a hardware counter + */ +static inline int is_software_counter(struct perf_counter *counter) +{ + return !counter->hw_event.raw && counter->hw_event.type < 0; +} + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 52f2f526248..faf671b2956 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -93,6 +93,25 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } } +static void +counter_sched_out(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->hw_ops->disable(counter); + counter->oncpu = -1; + + if (!is_software_counter(counter)) + cpuctx->active_oncpu--; + ctx->nr_active--; + if (counter->hw_event.exclusive || !cpuctx->active_oncpu) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -118,14 +137,9 @@ static void __perf_counter_remove_from_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - if (counter->state == PERF_COUNTER_STATE_ACTIVE) { - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - ctx->nr_active--; - cpuctx->active_oncpu--; - counter->task = NULL; - counter->oncpu = -1; - } + counter_sched_out(counter, cpuctx, ctx); + + counter->task = NULL; ctx->nr_counters--; /* @@ -207,7 +221,7 @@ counter_sched_in(struct perf_counter *counter, struct perf_counter_context *ctx, int cpu) { - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state <= PERF_COUNTER_STATE_OFF) return 0; counter->state = PERF_COUNTER_STATE_ACTIVE; @@ -223,12 +237,63 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - cpuctx->active_oncpu++; + if (!is_software_counter(counter)) + cpuctx->active_oncpu++; ctx->nr_active++; + if (counter->hw_event.exclusive) + cpuctx->exclusive = 1; + return 0; } +/* + * Return 1 for a group consisting entirely of software counters, + * 0 if the group contains any hardware counters. + */ +static int is_software_only_group(struct perf_counter *leader) +{ + struct perf_counter *counter; + + if (!is_software_counter(leader)) + return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) + if (!is_software_counter(counter)) + return 0; + return 1; +} + +/* + * Work out whether we can put this counter group on the CPU now. + */ +static int group_can_go_on(struct perf_counter *counter, + struct perf_cpu_context *cpuctx, + int can_add_hw) +{ + /* + * Groups consisting entirely of software counters can always go on. + */ + if (is_software_only_group(counter)) + return 1; + /* + * If an exclusive group is already on, no other hardware + * counters can go on. + */ + if (cpuctx->exclusive) + return 0; + /* + * If this group is exclusive and there are already + * counters on the CPU, it can't go on. + */ + if (counter->hw_event.exclusive && cpuctx->active_oncpu) + return 0; + /* + * Otherwise, try to add it if all previous groups were able + * to go on. + */ + return can_add_hw; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -240,6 +305,7 @@ static void __perf_install_in_context(void *info) int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; + int err; /* * If this is a task context, we need to check whether it is @@ -261,9 +327,21 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; - counter_sched_in(counter, cpuctx, ctx, cpu); + /* + * An exclusive counter can't go on if there are already active + * hardware counters, and no hardware counter can go on if there + * is already an exclusive counter on. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE && + !group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, cpu); - if (!ctx->task && cpuctx->max_pertask) + if (err && counter->hw_event.pinned) + counter->state = PERF_COUNTER_STATE_ERROR; + + if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; hw_perf_restore(perf_flags); @@ -326,22 +404,6 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -counter_sched_out(struct perf_counter *counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) -{ - if (counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->hw_ops->disable(counter); - counter->oncpu = -1; - - cpuctx->active_oncpu--; - ctx->nr_active--; -} - static void group_sched_out(struct perf_counter *group_counter, struct perf_cpu_context *cpuctx, @@ -359,6 +421,9 @@ group_sched_out(struct perf_counter *group_counter, */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -455,30 +520,6 @@ group_error: return -EAGAIN; } -/* - * Return 1 for a software counter, 0 for a hardware counter - */ -static inline int is_software_counter(struct perf_counter *counter) -{ - return !counter->hw_event.raw && counter->hw_event.type < 0; -} - -/* - * Return 1 for a group consisting entirely of software counters, - * 0 if the group contains any hardware counters. - */ -static int is_software_only_group(struct perf_counter *leader) -{ - struct perf_counter *counter; - - if (!is_software_counter(leader)) - return 0; - list_for_each_entry(counter, &leader->sibling_list, list_entry) - if (!is_software_counter(counter)) - return 0; - return 1; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) @@ -492,7 +533,38 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, spin_lock(&ctx->lock); flags = hw_perf_save_disable(); + + /* + * First go through the list and put on any pinned groups + * in order to give them the best chance of going on. + */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state <= PERF_COUNTER_STATE_OFF || + !counter->hw_event.pinned) + continue; + if (counter->cpu != -1 && counter->cpu != cpu) + continue; + + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + + /* + * If this pinned group hasn't been scheduled, + * put it in error state. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_ERROR; + } + + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + /* + * Ignore counters in OFF or ERROR state, and + * ignore pinned counters since we did them already. + */ + if (counter->state <= PERF_COUNTER_STATE_OFF || + counter->hw_event.pinned) + continue; + /* * Listen to the 'cpu' scheduling filter constraint * of counters: @@ -500,14 +572,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - /* - * If we scheduled in a group atomically and exclusively, - * or if this group can't go on, don't add any more - * hardware counters. - */ - if (can_add_hw || is_software_only_group(counter)) + if (group_can_go_on(counter, cpuctx, can_add_hw)) { if (group_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } } hw_perf_restore(flags); spin_unlock(&ctx->lock); @@ -567,8 +635,10 @@ int perf_counter_task_disable(void) */ perf_flags = hw_perf_save_disable(); - list_for_each_entry(counter, &ctx->counter_list, list_entry) - counter->state = PERF_COUNTER_STATE_OFF; + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + } hw_perf_restore(perf_flags); @@ -607,7 +677,7 @@ int perf_counter_task_enable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_OFF) + if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->hw_event.disabled = 0; @@ -849,6 +919,14 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (count != sizeof(cntval)) return -EINVAL; + /* + * Return end-of-file for a read on a counter that is in + * error state (i.e. because it was pinned but it couldn't be + * scheduled on to the CPU at some point). + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + return 0; + mutex_lock(&counter->mutex); cntval = perf_counter_read(counter); mutex_unlock(&counter->mutex); @@ -884,7 +962,7 @@ perf_read_irq_data(struct perf_counter *counter, { struct perf_data *irqdata, *usrdata; DECLARE_WAITQUEUE(wait, current); - ssize_t res; + ssize_t res, res2; irqdata = counter->irqdata; usrdata = counter->usrdata; @@ -905,6 +983,9 @@ perf_read_irq_data(struct perf_counter *counter, if (signal_pending(current)) break; + if (counter->state == PERF_COUNTER_STATE_ERROR) + break; + spin_unlock_irq(&counter->waitq.lock); schedule(); spin_lock_irq(&counter->waitq.lock); @@ -913,7 +994,8 @@ perf_read_irq_data(struct perf_counter *counter, __set_current_state(TASK_RUNNING); spin_unlock_irq(&counter->waitq.lock); - if (usrdata->len + irqdata->len < count) + if (usrdata->len + irqdata->len < count && + counter->state != PERF_COUNTER_STATE_ERROR) return -ERESTARTSYS; read_pending: mutex_lock(&counter->mutex); @@ -925,11 +1007,12 @@ read_pending: /* Switch irq buffer: */ usrdata = perf_switch_irq_data(counter); - if (perf_copy_usrdata(usrdata, buf + res, count - res) < 0) { + res2 = perf_copy_usrdata(usrdata, buf + res, count - res); + if (res2 < 0) { if (!res) res = -EFAULT; } else { - res = count; + res += res2; } out: mutex_unlock(&counter->mutex); @@ -1348,6 +1431,11 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, */ if (group_leader->ctx != ctx) goto err_put_context; + /* + * Only a group leader can be exclusive or pinned + */ + if (hw_event.exclusive || hw_event.pinned) + goto err_put_context; } ret = -EINVAL; @@ -1473,13 +1561,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - if (child_counter->state == PERF_COUNTER_STATE_ACTIVE) { - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - child_counter->hw_ops->disable(child_counter); - cpuctx->active_oncpu--; - child_ctx->nr_active--; - child_counter->oncpu = -1; - } + counter_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); From d859e29fe34cb833071b20aef860ee94fbad9bb2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 17 Jan 2009 18:10:22 +1100 Subject: [PATCH 0067/2061] perf_counter: Add counter enable/disable ioctls Impact: New perf_counter features This primarily adds a way for perf_counter users to enable and disable counters and groups. Enabling or disabling a counter or group also enables or disables all of the child counters that have been cloned from it to monitor children of the task monitored by the top-level counter. The userspace interface to enable/disable counters is via ioctl on the counter file descriptor. Along the way this extends the code that handles child counters to handle child counter groups properly. A group with multiple counters will be cloned to child tasks if and only if the group leader has the hw_event.inherit bit set - if it is set the whole group is cloned as a group in the child task. In order to be able to enable or disable all child counters of a given top-level counter, we need a way to find them all. Hence I have added a child_list field to struct perf_counter, which is the head of the list of children for a top-level counter, or the link in that list for a child counter. That list is protected by the perf_counter.mutex field. This also adds a mutex to the perf_counter_context struct. Previously the list of counters was protected just by the lock field in the context, which meant that perf_counter_init_task had to take that lock and then take whatever lock/mutex protects the top-level counter's child_list. But the counter enable/disable functions need to take that lock in order to traverse the list, then for each counter take the lock in that counter's context in order to change the counter's state safely, which will lead to a deadlock. To solve this, we now have both a mutex and a spinlock in the context, and taking either is sufficient to ensure the list of counters can't change - you have to take both before changing the list. Now perf_counter_init_task takes the mutex instead of the lock (which incidentally means that inherit_counter can use GFP_KERNEL instead of GFP_ATOMIC) and thus avoids the possible deadlock. Similarly the new enable/disable functions can take the mutex while traversing the list of child counters without incurring a possible deadlock when the counter manipulation code locks the context for a child counter. We also had an misfeature that the first counter added to a context would possibly not go on until the next sched-in, because we were using ctx->nr_active to detect if the context was running on a CPU. But nr_active is the number of active counters, and if that was zero (because the context didn't have any counters yet) it would look like the context wasn't running on a cpu and so the retry code in __perf_install_in_context wouldn't retry. So this adds an 'is_active' field that is set when the context is on a CPU, even if it has no counters. The is_active field is only used for task contexts, not for per-cpu contexts. If we enable a subsidiary counter in a group that is active on a CPU, and the arch code can't enable the counter, then we have to pull the whole group off the CPU. We do this with group_sched_out, which gets moved up in the file so it comes before all its callers. This also adds similar logic to __perf_install_in_context so that the "all on, or none" invariant of groups is preserved when adding a new counter to a group. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 21 +- kernel/perf_counter.c | 465 ++++++++++++++++++++++++++++++----- 2 files changed, 420 insertions(+), 66 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7ab8e5f96f5..33ba9fe0a78 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -14,6 +14,7 @@ #define _LINUX_PERF_COUNTER_H #include +#include #ifdef CONFIG_PERF_COUNTERS # include @@ -94,6 +95,12 @@ struct perf_counter_hw_event { u64 __reserved_2; }; +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + /* * Kernel-internal data types: */ @@ -173,8 +180,10 @@ struct perf_counter { struct file *filp; struct perf_counter *parent; + struct list_head child_list; + /* - * Protect attach/detach: + * Protect attach/detach and child_list: */ struct mutex mutex; @@ -199,13 +208,21 @@ struct perf_counter { struct perf_counter_context { #ifdef CONFIG_PERF_COUNTERS /* - * Protect the list of counters: + * Protect the states of the counters in the list, + * nr_active, and the list: */ spinlock_t lock; + /* + * Protect the list of counters. Locking either mutex or lock + * is sufficient to ensure the list doesn't change; to change + * the list you need to lock both the mutex and the spinlock. + */ + struct mutex mutex; struct list_head counter_list; int nr_counters; int nr_active; + int is_active; struct task_struct *task; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index faf671b2956..1ac18daa424 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -112,6 +112,28 @@ counter_sched_out(struct perf_counter *counter, cpuctx->exclusive = 0; } +static void +group_sched_out(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + + if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) + return; + + counter_sched_out(group_counter, cpuctx, ctx); + + /* + * Schedule out siblings (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) + counter_sched_out(counter, cpuctx, ctx); + + if (group_counter->hw_event.exclusive) + cpuctx->exclusive = 0; +} + /* * Cross CPU call to remove a performance counter * @@ -168,7 +190,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex held. + * Must be called with counter->mutex and ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -215,6 +237,99 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Cross CPU call to disable a performance counter + */ +static void __perf_counter_disable(void *info) +{ + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + unsigned long flags; + + /* + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. + */ + if (ctx->task && cpuctx->task_ctx != ctx) + return; + + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + /* + * If the counter is on, turn it off. + * If it is in error state, leave it in error state. + */ + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + if (counter == counter->group_leader) + group_sched_out(counter, cpuctx, ctx); + else + counter_sched_out(counter, cpuctx, ctx); + counter->state = PERF_COUNTER_STATE_OFF; + } + + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Disable a counter. + */ +static void perf_counter_disable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Disable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_disable, + counter, 1); + return; + } + + retry: + task_oncpu_function_call(task, __perf_counter_disable, counter); + + spin_lock_irq(&ctx->lock); + /* + * If the counter is still active, we need to retry the cross-call. + */ + if (counter->state == PERF_COUNTER_STATE_ACTIVE) { + spin_unlock_irq(&ctx->lock); + goto retry; + } + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + counter->state = PERF_COUNTER_STATE_OFF; + + spin_unlock_irq(&ctx->lock); +} + +/* + * Disable a counter and all its children. + */ +static void perf_counter_disable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_disable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_disable(child); + mutex_unlock(&counter->mutex); +} + static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -302,6 +417,7 @@ static void __perf_install_in_context(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; u64 perf_flags; @@ -327,23 +443,40 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + /* + * Don't put the counter on if it is disabled or if + * it is in a group and the group isn't on. + */ + if (counter->state != PERF_COUNTER_STATE_INACTIVE || + (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE)) + goto unlock; + /* * An exclusive counter can't go on if there are already active * hardware counters, and no hardware counter can go on if there * is already an exclusive counter on. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE && - !group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; else err = counter_sched_in(counter, cpuctx, ctx, cpu); - if (err && counter->hw_event.pinned) - counter->state = PERF_COUNTER_STATE_ERROR; + if (err) { + /* + * This counter couldn't go on. If it is in a group + * then we have to pull the whole group off. + * If the counter group is pinned then put it in error state. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } if (!err && !ctx->task && cpuctx->max_pertask) cpuctx->max_pertask--; + unlock: hw_perf_restore(perf_flags); spin_unlock(&ctx->lock); @@ -359,6 +492,8 @@ static void __perf_install_in_context(void *info) * If the counter is attached to a task which is on a CPU we use a smp * call to enable it in the task context. The task might have been * scheduled away, but we check this in the smp call again. + * + * Must be called with ctx->mutex held. */ static void perf_install_in_context(struct perf_counter_context *ctx, @@ -387,7 +522,7 @@ retry: /* * we need to retry the smp call. */ - if (ctx->nr_active && list_empty(&counter->list_entry)) { + if (ctx->is_active && list_empty(&counter->list_entry)) { spin_unlock_irq(&ctx->lock); goto retry; } @@ -404,26 +539,131 @@ retry: spin_unlock_irq(&ctx->lock); } -static void -group_sched_out(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx) +/* + * Cross CPU call to enable a performance counter + */ +static void __perf_counter_enable(void *info) { - struct perf_counter *counter; - - if (group_counter->state != PERF_COUNTER_STATE_ACTIVE) - return; - - counter_sched_out(group_counter, cpuctx, ctx); + struct perf_counter *counter = info; + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *leader = counter->group_leader; + unsigned long flags; + int err; /* - * Schedule out siblings (if any): + * If this is a per-task counter, need to check whether this + * counter's task is the current task on this cpu. */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) - counter_sched_out(counter, cpuctx, ctx); + if (ctx->task && cpuctx->task_ctx != ctx) + return; - if (group_counter->hw_event.exclusive) - cpuctx->exclusive = 0; + curr_rq_lock_irq_save(&flags); + spin_lock(&ctx->lock); + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto unlock; + counter->state = PERF_COUNTER_STATE_INACTIVE; + + /* + * If the counter is in a group and isn't the group leader, + * then don't put it on unless the group is on. + */ + if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) + goto unlock; + + if (!group_can_go_on(counter, cpuctx, 1)) + err = -EEXIST; + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + + if (err) { + /* + * If this counter can't go on and it's part of a + * group, then the whole group has to come off. + */ + if (leader != counter) + group_sched_out(leader, cpuctx, ctx); + if (leader->hw_event.pinned) + leader->state = PERF_COUNTER_STATE_ERROR; + } + + unlock: + spin_unlock(&ctx->lock); + curr_rq_unlock_irq_restore(&flags); +} + +/* + * Enable a counter. + */ +static void perf_counter_enable(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + struct task_struct *task = ctx->task; + + if (!task) { + /* + * Enable the counter on the cpu that it's on + */ + smp_call_function_single(counter->cpu, __perf_counter_enable, + counter, 1); + return; + } + + spin_lock_irq(&ctx->lock); + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + goto out; + + /* + * If the counter is in error state, clear that first. + * That way, if we see the counter in error state below, we + * know that it has gone back into error state, as distinct + * from the task having been scheduled away before the + * cross-call arrived. + */ + if (counter->state == PERF_COUNTER_STATE_ERROR) + counter->state = PERF_COUNTER_STATE_OFF; + + retry: + spin_unlock_irq(&ctx->lock); + task_oncpu_function_call(task, __perf_counter_enable, counter); + + spin_lock_irq(&ctx->lock); + + /* + * If the context is active and the counter is still off, + * we need to retry the cross-call. + */ + if (ctx->is_active && counter->state == PERF_COUNTER_STATE_OFF) + goto retry; + + /* + * Since we have the lock this context can't be scheduled + * in, so we can change the state safely. + */ + if (counter->state == PERF_COUNTER_STATE_OFF) + counter->state = PERF_COUNTER_STATE_INACTIVE; + out: + spin_unlock_irq(&ctx->lock); +} + +/* + * Enable a counter and all its children. + */ +static void perf_counter_enable_family(struct perf_counter *counter) +{ + struct perf_counter *child; + + perf_counter_enable(counter); + + /* + * Lock the mutex to protect the list of children + */ + mutex_lock(&counter->mutex); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_enable(child); + mutex_unlock(&counter->mutex); } void __perf_counter_sched_out(struct perf_counter_context *ctx, @@ -432,16 +672,18 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_counter *counter; u64 flags; - if (likely(!ctx->nr_counters)) - return; - spin_lock(&ctx->lock); + ctx->is_active = 0; + if (likely(!ctx->nr_counters)) + goto out; + flags = hw_perf_save_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -528,10 +770,11 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, u64 flags; int can_add_hw = 1; - if (likely(!ctx->nr_counters)) - return; - spin_lock(&ctx->lock); + ctx->is_active = 1; + if (likely(!ctx->nr_counters)) + goto out; + flags = hw_perf_save_disable(); /* @@ -578,6 +821,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, } } hw_perf_restore(flags); + out: spin_unlock(&ctx->lock); } @@ -896,12 +1140,14 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; + mutex_lock(&ctx->mutex); mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); put_context(ctx); mutex_unlock(&counter->mutex); + mutex_unlock(&ctx->mutex); kfree(counter); @@ -1053,10 +1299,30 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) +{ + struct perf_counter *counter = file->private_data; + int err = 0; + + switch (cmd) { + case PERF_COUNTER_IOC_ENABLE: + perf_counter_enable_family(counter); + break; + case PERF_COUNTER_IOC_DISABLE: + perf_counter_disable_family(counter); + break; + default: + err = -ENOTTY; + } + return err; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, + .unlocked_ioctl = perf_ioctl, + .compat_ioctl = perf_ioctl, }; static int cpu_clock_perf_counter_enable(struct perf_counter *counter) @@ -1348,6 +1614,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + INIT_LIST_HEAD(&counter->child_list); + counter->irqdata = &counter->data[0]; counter->usrdata = &counter->data[1]; counter->cpu = cpu; @@ -1452,7 +1720,9 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, goto err_free_put_context; counter->filp = counter_file; + mutex_lock(&ctx->mutex); perf_install_in_context(ctx, counter, cpu); + mutex_unlock(&ctx->mutex); fput_light(counter_file, fput_needed2); @@ -1479,6 +1749,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, { memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); ctx->task = task; } @@ -1486,20 +1757,30 @@ __perf_counter_init_context(struct perf_counter_context *ctx, /* * inherit a counter from parent task to child task: */ -static int +static struct perf_counter * inherit_counter(struct perf_counter *parent_counter, struct task_struct *parent, struct perf_counter_context *parent_ctx, struct task_struct *child, + struct perf_counter *group_leader, struct perf_counter_context *child_ctx) { struct perf_counter *child_counter; + /* + * Instead of creating recursive hierarchies of counters, + * we link inherited counters back to the original parent, + * which has a filp for sure, which we use as the reference + * count: + */ + if (parent_counter->parent) + parent_counter = parent_counter->parent; + child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, NULL, - GFP_ATOMIC); + parent_counter->cpu, group_leader, + GFP_KERNEL); if (!child_counter) - return -ENOMEM; + return NULL; /* * Link it up in the child's context: @@ -1523,16 +1804,82 @@ inherit_counter(struct perf_counter *parent_counter, */ atomic_long_inc(&parent_counter->filp->f_count); + /* + * Link this into the parent counter's child list + */ + mutex_lock(&parent_counter->mutex); + list_add_tail(&child_counter->child_list, &parent_counter->child_list); + + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + + mutex_unlock(&parent_counter->mutex); + + return child_counter; +} + +static int inherit_group(struct perf_counter *parent_counter, + struct task_struct *parent, + struct perf_counter_context *parent_ctx, + struct task_struct *child, + struct perf_counter_context *child_ctx) +{ + struct perf_counter *leader; + struct perf_counter *sub; + + leader = inherit_counter(parent_counter, parent, parent_ctx, + child, NULL, child_ctx); + if (!leader) + return -ENOMEM; + list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { + if (!inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx)) + return -ENOMEM; + } return 0; } +static void sync_child_counter(struct perf_counter *child_counter, + struct perf_counter *parent_counter) +{ + u64 parent_val, child_val; + + parent_val = atomic64_read(&parent_counter->count); + child_val = atomic64_read(&child_counter->count); + + /* + * Add back the child's count to the parent's count: + */ + atomic64_add(child_val, &parent_counter->count); + + /* + * Remove this counter from the parent's list + */ + mutex_lock(&parent_counter->mutex); + list_del_init(&child_counter->child_list); + mutex_unlock(&parent_counter->mutex); + + /* + * Release the parent counter, if this was the last + * reference to it. + */ + fput(parent_counter->filp); +} + static void __perf_counter_exit_task(struct task_struct *child, struct perf_counter *child_counter, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - u64 parent_val, child_val; + struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -1561,7 +1908,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); - counter_sched_out(child_counter, cpuctx, child_ctx); + group_sched_out(child_counter, cpuctx, child_ctx); list_del_init(&child_counter->list_entry); @@ -1577,26 +1924,23 @@ __perf_counter_exit_task(struct task_struct *child, * that are still around due to the child reference. These * counters need to be zapped - but otherwise linger. */ - if (!parent_counter) - return; - - parent_val = atomic64_read(&parent_counter->count); - child_val = atomic64_read(&child_counter->count); - - /* - * Add back the child's count to the parent's count: - */ - atomic64_add(child_val, &parent_counter->count); - - fput(parent_counter->filp); + if (parent_counter) { + sync_child_counter(child_counter, parent_counter); + list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, + list_entry) { + if (sub->parent) + sync_child_counter(sub, sub->parent); + kfree(sub); + } + } kfree(child_counter); } /* - * When a child task exist, feed back counter values to parent counters. + * When a child task exits, feed back counter values to parent counters. * - * Note: we are running in child context, but the PID is not hashed + * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. */ void perf_counter_exit_task(struct task_struct *child) @@ -1620,9 +1964,8 @@ void perf_counter_exit_task(struct task_struct *child) void perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; - struct perf_counter *counter, *parent_counter; + struct perf_counter *counter; struct task_struct *parent = current; - unsigned long flags; child_ctx = &child->perf_counter_ctx; parent_ctx = &parent->perf_counter_ctx; @@ -1641,32 +1984,22 @@ void perf_counter_init_task(struct task_struct *child) * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. */ - spin_lock_irqsave(&parent_ctx->lock, flags); + mutex_lock(&parent_ctx->mutex); /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { - if (!counter->hw_event.inherit || counter->group_leader != counter) + if (!counter->hw_event.inherit) continue; - /* - * Instead of creating recursive hierarchies of counters, - * we link inheritd counters back to the original parent, - * which has a filp for sure, which we use as the reference - * count: - */ - parent_counter = counter; - if (counter->parent) - parent_counter = counter->parent; - - if (inherit_counter(parent_counter, parent, + if (inherit_group(counter, parent, parent_ctx, child, child_ctx)) break; } - spin_unlock_irqrestore(&parent_ctx->lock, flags); + mutex_unlock(&parent_ctx->mutex); } static void __cpuinit perf_counter_init_cpu(int cpu) @@ -1692,11 +2025,15 @@ static void __perf_counter_exit_cpu(void *info) list_for_each_entry_safe(counter, tmp, &ctx->counter_list, list_entry) __perf_counter_remove_from_context(counter); - } static void perf_counter_exit_cpu(int cpu) { + struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); + struct perf_counter_context *ctx = &cpuctx->ctx; + + mutex_lock(&ctx->mutex); smp_call_function_single(cpu, __perf_counter_exit_cpu, NULL, 1); + mutex_unlock(&ctx->mutex); } #else static inline void perf_counter_exit_cpu(int cpu) { } From 1b023a96d9b44f50f4d8ff28c15f5b80e354760f Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 10:13:01 +0100 Subject: [PATCH 0068/2061] perfcounters: throttle on too high IRQ rates Starting kerneltop with only -c 100 seems to be a bad idea, it can easily lock the system due to perfcounter IRQ overload. So add throttling: if a new IRQ arrives in a shorter than PERFMON_MIN_PERIOD_NS time, turn off perfcounters and untrottle them from the next timer tick. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic.c | 2 ++ arch/x86/kernel/cpu/perf_counter.c | 38 +++++++++++++++++++++++++----- include/linux/perf_counter.h | 4 ++++ 3 files changed, 38 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/apic.c b/arch/x86/kernel/apic.c index 7b434e5b14c..849c23009bf 100644 --- a/arch/x86/kernel/apic.c +++ b/arch/x86/kernel/apic.c @@ -781,6 +781,8 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); + + perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9376771f757..1a040b179b5 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,6 +33,9 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + u64 last_interrupt; + u64 global_enable; + int throttled; }; /* @@ -474,16 +477,19 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, saved_global; - struct cpu_hw_counters *cpuc; + u64 ack, status, now; + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); /* Disable counters globally */ wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - cpuc = &per_cpu(cpu_hw_counters, cpu); + now = sched_clock(); + if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) + cpuc->throttled = 1; + cpuc->last_interrupt = now; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) @@ -533,9 +539,29 @@ again: goto again; out: /* - * Restore - do not reenable when global enable is off: + * Restore - do not reenable when global enable is off or throttled: */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, saved_global); + if (!cpuc->throttled) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); +} + +void perf_counter_unthrottle(void) +{ + struct cpu_hw_counters *cpuc; + + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + if (unlikely(!perf_counters_initialized)) + return; + + cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + if (cpuc->throttled) { + if (printk_ratelimit()) + printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttled = 0; + } } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 33ba9fe0a78..91f1ca4c01c 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -254,6 +254,7 @@ extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_notify(struct pt_regs *regs); extern void perf_counter_print_debug(void); +extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); extern void hw_perf_restore(u64 ctrl); extern int perf_counter_task_disable(void); @@ -270,6 +271,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +#define PERFMON_MIN_PERIOD_NS 10000 + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -281,6 +284,7 @@ static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } +static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } From 4b39fd96855254a244f71245b41a91cdecb87d63 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:36:16 +0100 Subject: [PATCH 0069/2061] perfcounters: ratelimit performance counter interrupts Ratelimit performance counter interrupts to 100KHz per CPU. This replaces the irq-delta-time based method. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 +++++++++++++++------------ include/linux/perf_counter.h | 2 -- 2 files changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1a040b179b5..a56d4cf92f3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -33,9 +33,8 @@ static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - u64 last_interrupt; + unsigned long interrupts; u64 global_enable; - int throttled; }; /* @@ -470,6 +469,11 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) } } +/* + * Maximum interrupt frequency of 100KHz per CPU + */ +#define PERFMON_MAX_INTERRUPTS 100000/HZ + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -477,7 +481,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); - u64 ack, status, now; + u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); @@ -486,11 +490,6 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); ack_APIC_irq(); - now = sched_clock(); - if (now - cpuc->last_interrupt < PERFMON_MIN_PERIOD_NS) - cpuc->throttled = 1; - cpuc->last_interrupt = now; - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); if (!status) goto out; @@ -541,13 +540,14 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (!cpuc->throttled) + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; + u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; @@ -556,12 +556,15 @@ void perf_counter_unthrottle(void) return; cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); - if (cpuc->throttled) { + if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max event frequency exceeded!\n"); + printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); - cpuc->throttled = 0; } + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); + if (unlikely(cpuc->global_enable && !global_enable)) + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 91f1ca4c01c..f55381fbcac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -271,8 +271,6 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } -#define PERFMON_MIN_PERIOD_NS 10000 - #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } From 3415dd9146c574bffe8f012c096bfc2bc62b9508 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 23 Jan 2009 14:16:53 +0100 Subject: [PATCH 0070/2061] perfcounters fix section mismatch warning in perf_counter.c::perf_counters_lapic_init() Fix: WARNING: arch/x86/kernel/built-in.o(.text+0xdd0f): Section mismatch in reference from the function pmc_generic_enable() to the function .cpuinit.text:perf_counters_lapic_init() The function pmc_generic_enable() references the function __cpuinit perf_counters_lapic_init(). This is often because pmc_generic_enable lacks a __cpuinit annotation or the annotation of perf_counters_lapic_init is wrong. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a56d4cf92f3..46c436cdd73 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -605,7 +605,7 @@ void perf_counter_notify(struct pt_regs *regs) local_irq_restore(flags); } -void __cpuinit perf_counters_lapic_init(int nmi) +void perf_counters_lapic_init(int nmi) { u32 apic_val; From bb3f0b59ad005d2d2ecbbe9bd048eab6d1ecbd31 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sun, 25 Jan 2009 02:38:09 -0800 Subject: [PATCH 0071/2061] x86: make irqinit_32.c more like irqinit_64.c, v2 Impact: cleanup 1. add smp_intr_init and apic_intr_init for 32bit, the same as 64bit 2. move the apic_intr_init() call before set gate with interrupt[i] 3. for 64bit, if ia32_emulation is not used, will make per_cpu to use 0x80 vector. [ v2: should use !test_bit() instead of test_bit() with 32bit ] Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 56 +++++++++++++++++++++--------------- arch/x86/kernel/irqinit_64.c | 7 +++-- arch/x86/kernel/traps.c | 15 ++++------ 3 files changed, 43 insertions(+), 35 deletions(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index c56496f8c6f..ddf3eb72f86 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -120,28 +120,8 @@ int vector_used_by_percpu_irq(unsigned int vector) return 0; } -/* Overridden in paravirt.c */ -void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); - -void __init native_init_IRQ(void) +static void __init smp_intr_init(void) { - int i; - - /* all the set up before the call gates are initialised */ - pre_intr_init_hook(); - - /* - * Cover the whole vector space, no vector can escape - * us. (some of these will be overridden and become - * 'special' SMP interrupts) - */ - for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* SYSCALL_VECTOR was reserved in trap_init. */ - if (i != SYSCALL_VECTOR) - set_intr_gate(i, interrupt[i-FIRST_EXTERNAL_VECTOR]); - } - - #if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_SMP) /* * The reschedule interrupt is a CPU-to-CPU reschedule-helper @@ -170,8 +150,13 @@ void __init native_init_IRQ(void) set_intr_gate(IRQ_MOVE_CLEANUP_VECTOR, irq_move_cleanup_interrupt); set_bit(IRQ_MOVE_CLEANUP_VECTOR, used_vectors); #endif +} +static void __init apic_intr_init(void) +{ #ifdef CONFIG_X86_LOCAL_APIC + smp_intr_init(); + /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); @@ -181,12 +166,37 @@ void __init native_init_IRQ(void) # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); # endif -#endif -#if defined(CONFIG_X86_LOCAL_APIC) && defined(CONFIG_X86_MCE_P4THERMAL) +# ifdef CONFIG_X86_MCE_P4THERMAL /* thermal monitor LVT interrupt */ alloc_intr_gate(THERMAL_APIC_VECTOR, thermal_interrupt); +# endif #endif +} + +/* Overridden in paravirt.c */ +void init_IRQ(void) __attribute__((weak, alias("native_init_IRQ"))); + +void __init native_init_IRQ(void) +{ + int i; + + /* all the set up before the call gates are initialised */ + pre_intr_init_hook(); + + apic_intr_init(); + + /* + * Cover the whole vector space, no vector can escape + * us. (some of these will be overridden and become + * 'special' SMP interrupts) + */ + for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { + int vector = FIRST_EXTERNAL_VECTOR + i; + /* SYSCALL_VECTOR was reserved in trap_init. */ + if (!test_bit(vector, used_vectors)) + set_intr_gate(vector, interrupt[i]); + } if (!acpi_ioapic) setup_irq(2, &irq2); diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 6a71bfc51e5..16e1fc68750 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -162,6 +162,9 @@ void __init native_init_IRQ(void) int i; init_ISA_irqs(); + + apic_intr_init(); + /* * Cover the whole vector space, no vector can escape * us. (some of these will be overridden and become @@ -169,12 +172,10 @@ void __init native_init_IRQ(void) */ for (i = 0; i < (NR_VECTORS - FIRST_EXTERNAL_VECTOR); i++) { int vector = FIRST_EXTERNAL_VECTOR + i; - if (vector != IA32_SYSCALL_VECTOR) + if (!test_bit(vector, used_vectors)) set_intr_gate(vector, interrupt[i]); } - apic_intr_init(); - if (!acpi_ioapic) setup_irq(2, &irq2); } diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index ed5aee5f3fc..d36a502d87a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -979,8 +979,13 @@ void __init trap_init(void) #endif set_intr_gate(19, &simd_coprocessor_error); + /* Reserve all the builtin and the syscall vector: */ + for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) + set_bit(i, used_vectors); + #ifdef CONFIG_IA32_EMULATION set_system_intr_gate(IA32_SYSCALL_VECTOR, ia32_syscall); + set_bit(IA32_SYSCALL_VECTOR, used_vectors); #endif #ifdef CONFIG_X86_32 @@ -997,17 +1002,9 @@ void __init trap_init(void) } set_system_trap_gate(SYSCALL_VECTOR, &system_call); -#endif - - /* Reserve all the builtin and the syscall vector: */ - for (i = 0; i < FIRST_EXTERNAL_VECTOR; i++) - set_bit(i, used_vectors); - -#ifdef CONFIG_X86_64 - set_bit(IA32_SYSCALL_VECTOR, used_vectors); -#else set_bit(SYSCALL_VECTOR, used_vectors); #endif + /* * Should be a barrier for any external CPU state: */ From 65d370862f64973611a271ced61864b5f9bb6fc0 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 29 Jan 2009 14:06:52 +0100 Subject: [PATCH 0072/2061] perfcounters: fix refcounting bug don't kfree in use counters. Running... while true; do perfstat -e 1 -c true; done ...on all cores for a while doesn't seem to be eating ram, and my oops is gone. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1ac18daa424..f27a7e9f3c4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1934,7 +1934,8 @@ __perf_counter_exit_task(struct task_struct *child, } } - kfree(child_counter); + if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count)) + kfree(child_counter); } /* From 15081c61362618a0c81cc8d04e45e7427bc1ed71 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 1 Feb 2009 22:07:39 +0530 Subject: [PATCH 0073/2061] x86: irqinit_32.c fix compilation warning Fix: arch/x86/kernel/irqinit_32.c:124: warning: 'smp_intr_init' defined but not used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/irqinit_32.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index ddf3eb72f86..520e6c1c5d2 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -154,9 +154,9 @@ static void __init smp_intr_init(void) static void __init apic_intr_init(void) { -#ifdef CONFIG_X86_LOCAL_APIC smp_intr_init(); +#ifdef CONFIG_X86_LOCAL_APIC /* self generated IPI for local APIC timer */ alloc_intr_gate(LOCAL_TIMER_VECTOR, apic_timer_interrupt); From 5b75af0a02fcf3b8899f38ff6f22164c5d8e2fdd Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 4 Feb 2009 17:11:34 +0100 Subject: [PATCH 0074/2061] perfcounters: fix "perf counters kill oprofile" bug With oprofile as a module, and unloaded by profiling script, both oprofile and kerneltop work fine.. unless you leave kerneltop running when you start profiling, then you may see badness. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 +++- arch/x86/oprofile/nmi_int.c | 7 ++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46c436cdd73..8bb213323fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -643,7 +643,9 @@ perf_counter_nmi_handler(struct notifier_block *self, } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { - .notifier_call = perf_counter_nmi_handler + .notifier_call = perf_counter_nmi_handler, + .next = NULL, + .priority = 1 }; void __init init_hw_perf_counters(void) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..c638685136e 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -40,8 +40,9 @@ static int profile_exceptions_notify(struct notifier_block *self, switch (val) { case DIE_NMI: - if (model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu))) - ret = NOTIFY_STOP; + case DIE_NMI_IPI: + model->check_ctrs(args->regs, &per_cpu(cpu_msrs, cpu)); + ret = NOTIFY_STOP; break; default: break; @@ -134,7 +135,7 @@ static void nmi_cpu_setup(void *dummy) static struct notifier_block profile_exceptions_nb = { .notifier_call = profile_exceptions_notify, .next = NULL, - .priority = 0 + .priority = 2 }; static int nmi_setup(void) From 82aa9a1829199233f9bdaf26e2ee271114f4701e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Feb 2009 15:23:08 +0100 Subject: [PATCH 0075/2061] perfcounters: fix "perf counters kills oprofile" bug, v2 Impact: fix kernel crash Both oprofile and perfcounters register an NMI die handler, but only one can handle the NMI. Conveniently, oprofile unregisters it's notifier when not actively in use, so setting it's notifier priority higher than perfcounter's allows oprofile to borrow the NMI for the duration of it's run. Tested/works both as module and built-in. While testing, I found that if kerneltop was generating NMIs at very high frequency, the kernel may panic when oprofile registered it's handler. This turned out to be because oprofile registers it's handler before reset_value has been allocated, so if an NMI comes in while it's still setting up, kabOom. Rather than try more invasive changes, I followed the lead of other places in op_model_ppro.c, and simply returned in that highly unlikely event. (debug warnings attached) Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- arch/x86/oprofile/op_model_ppro.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/oprofile/op_model_ppro.c b/arch/x86/oprofile/op_model_ppro.c index 07c914555a5..85eb6268374 100644 --- a/arch/x86/oprofile/op_model_ppro.c +++ b/arch/x86/oprofile/op_model_ppro.c @@ -126,6 +126,13 @@ static int ppro_check_ctrs(struct pt_regs * const regs, u64 val; int i; + /* + * This can happen if perf counters are in use when + * we steal the die notifier NMI. + */ + if (unlikely(!reset_value)) + goto out; + for (i = 0 ; i < num_counters; ++i) { if (!reset_value[i]) continue; @@ -136,6 +143,7 @@ static int ppro_check_ctrs(struct pt_regs * const regs, } } +out: /* Only P6 based Pentium M need to re-unmask the apic vector but it * doesn't hurt other P6 variant */ apic_write(APIC_LVTPC, apic_read(APIC_LVTPC) & ~APIC_LVT_MASKED); From 23a185ca8abbeef64b6ffc33059b1d630e43ec10 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 9 Feb 2009 22:42:47 +1100 Subject: [PATCH 0076/2061] perf_counters: make software counters work as per-cpu counters Impact: kernel crash fix Yanmin Zhang reported that using a PERF_COUNT_TASK_CLOCK software counter as a per-cpu counter would reliably crash the system, because it calls __task_delta_exec with a null pointer. The page fault, context switch and cpu migration counters also won't function correctly as per-cpu counters since they reference the current task. This fixes the problem by redirecting the task_clock counter to the cpu_clock counter when used as a per-cpu counter, and by implementing per-cpu page fault, context switch and cpu migration counters. Along the way, this: - Initializes counter->ctx earlier, in perf_counter_alloc, so that sw_perf_counter_init can use it - Adds code to kernel/sched.c to count task migrations into each cpu, in rq->nr_migrations_in - Exports the per-cpu context switch and task migration counts via new functions added to kernel/sched.c - Makes sure that if sw_perf_counter_init fails, we don't try to initialize the counter as a hardware counter. Since the user has passed a negative, non-raw event type, they clearly don't intend for it to be interpreted as a hardware event. Reported-by: "Zhang Yanmin" Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/perf_counter.c | 80 +++++++++++++++++++++++++------------------ kernel/sched.c | 17 +++++++++ 3 files changed, 65 insertions(+), 34 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b85b10abf77..1e5f70062a9 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -137,6 +137,8 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern u64 cpu_nr_switches(int cpu); +extern u64 cpu_nr_migrations(int cpu); struct seq_file; struct cfs_rq; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f27a7e9f3c4..544193cbc47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -20,6 +20,8 @@ #include #include #include +#include +#include /* * Each CPU has a list of per CPU counters: @@ -502,7 +504,6 @@ perf_install_in_context(struct perf_counter_context *ctx, { struct task_struct *task = ctx->task; - counter->ctx = ctx; if (!task) { /* * Per cpu counters are installed via an smp call and @@ -1417,11 +1418,19 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -static u64 get_page_faults(void) -{ - struct task_struct *curr = current; +#ifdef CONFIG_VM_EVENT_COUNTERS +#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] +#else +#define cpu_page_faults() 0 +#endif - return curr->maj_flt + curr->min_flt; +static u64 get_page_faults(struct perf_counter *counter) +{ + struct task_struct *curr = counter->ctx->task; + + if (curr) + return curr->maj_flt + curr->min_flt; + return cpu_page_faults(); } static void page_faults_perf_counter_update(struct perf_counter *counter) @@ -1430,7 +1439,7 @@ static void page_faults_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(); + now = get_page_faults(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1446,11 +1455,7 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - /* - * page-faults is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1465,11 +1470,13 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; -static u64 get_context_switches(void) +static u64 get_context_switches(struct perf_counter *counter) { - struct task_struct *curr = current; + struct task_struct *curr = counter->ctx->task; - return curr->nvcsw + curr->nivcsw; + if (curr) + return curr->nvcsw + curr->nivcsw; + return cpu_nr_switches(smp_processor_id()); } static void context_switches_perf_counter_update(struct perf_counter *counter) @@ -1478,7 +1485,7 @@ static void context_switches_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(); + now = get_context_switches(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1494,11 +1501,7 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - /* - * ->nvcsw + curr->nivcsw is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); return 0; } @@ -1513,9 +1516,13 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; -static inline u64 get_cpu_migrations(void) +static inline u64 get_cpu_migrations(struct perf_counter *counter) { - return current->se.nr_migrations; + struct task_struct *curr = counter->ctx->task; + + if (curr) + return curr->se.nr_migrations; + return cpu_nr_migrations(smp_processor_id()); } static void cpu_migrations_perf_counter_update(struct perf_counter *counter) @@ -1524,7 +1531,7 @@ static void cpu_migrations_perf_counter_update(struct perf_counter *counter) s64 delta; prev = atomic64_read(&counter->hw.prev_count); - now = get_cpu_migrations(); + now = get_cpu_migrations(counter); atomic64_set(&counter->hw.prev_count, now); @@ -1540,11 +1547,7 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - /* - * se.nr_migrations is a per-task value already, - * so we dont have to clear it on switch-in. - */ - + atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); return 0; } @@ -1569,7 +1572,14 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: - hw_ops = &perf_ops_task_clock; + /* + * If the user instantiates this as a per-cpu counter, + * use the cpu_clock counter instead. + */ + if (counter->ctx->task) + hw_ops = &perf_ops_task_clock; + else + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: hw_ops = &perf_ops_page_faults; @@ -1592,6 +1602,7 @@ sw_perf_counter_init(struct perf_counter *counter) static struct perf_counter * perf_counter_alloc(struct perf_counter_hw_event *hw_event, int cpu, + struct perf_counter_context *ctx, struct perf_counter *group_leader, gfp_t gfpflags) { @@ -1623,6 +1634,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; + counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -1631,7 +1643,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; if (!hw_event->raw && hw_event->type < 0) hw_ops = sw_perf_counter_init(counter); - if (!hw_ops) + else hw_ops = hw_perf_counter_init(counter); if (!hw_ops) { @@ -1707,7 +1719,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, } ret = -EINVAL; - counter = perf_counter_alloc(&hw_event, cpu, group_leader, GFP_KERNEL); + counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, + GFP_KERNEL); if (!counter) goto err_put_context; @@ -1777,15 +1790,14 @@ inherit_counter(struct perf_counter *parent_counter, parent_counter = parent_counter->parent; child_counter = perf_counter_alloc(&parent_counter->hw_event, - parent_counter->cpu, group_leader, - GFP_KERNEL); + parent_counter->cpu, child_ctx, + group_leader, GFP_KERNEL); if (!child_counter) return NULL; /* * Link it up in the child's context: */ - child_counter->ctx = child_ctx; child_counter->task = child; list_add_counter(child_counter, child_ctx); child_ctx->nr_counters++; diff --git a/kernel/sched.c b/kernel/sched.c index 8db1a4cf208..173768f142a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -558,6 +558,7 @@ struct rq { struct load_weight load; unsigned long nr_load_updates; u64 nr_switches; + u64 nr_migrations_in; struct cfs_rq cfs; struct rt_rq rt; @@ -1908,6 +1909,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) #endif if (old_cpu != new_cpu) { p->se.nr_migrations++; + new_rq->nr_migrations_in++; #ifdef CONFIG_SCHEDSTATS if (task_hot(p, old_rq->clock, NULL)) schedstat_inc(p, se.nr_forced2_migrations); @@ -2810,6 +2812,21 @@ unsigned long nr_active(void) return running + uninterruptible; } +/* + * Externally visible per-cpu scheduler statistics: + * cpu_nr_switches(cpu) - number of context switches on that cpu + * cpu_nr_migrations(cpu) - number of migrations into that cpu + */ +u64 cpu_nr_switches(int cpu) +{ + return cpu_rq(cpu)->nr_switches; +} + +u64 cpu_nr_migrations(int cpu) +{ + return cpu_rq(cpu)->nr_migrations_in; +} + /* * Update rq->cpu_load[] statistics. This function is usually called every * scheduler tick (TICK_NSEC). From d278c48435625cb6b7edcf6a547620768b175709 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 9 Feb 2009 07:38:50 +0100 Subject: [PATCH 0077/2061] perf_counters: account NMI interrupts I noticed that kerneltop interrupts were accounted as NMI, but not their perf counter origin. Account NMI performance counter interrupts. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8bb213323fe..9901e46998d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -495,6 +495,7 @@ static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) goto out; again: + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { struct perf_counter *counter = cpuc->counters[bit]; @@ -570,7 +571,6 @@ void perf_counter_unthrottle(void) void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); - inc_irq_stat(apic_perf_irqs); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); __smp_perf_counter_interrupt(regs, 0); From 0475f9ea8e2cc030298908949e0d5da9f2fc2cfe Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 14:35:35 +1100 Subject: [PATCH 0078/2061] perf_counters: allow users to count user, kernel and/or hypervisor events Impact: new perf_counter feature This extends the perf_counter_hw_event struct with bits that specify that events in user, kernel and/or hypervisor mode should not be counted (i.e. should be excluded), and adds code to program the PMU mode selection bits accordingly on x86 and powerpc. For software counters, we don't currently have the infrastructure to distinguish which mode an event occurs in, so we currently fail the counter initialization if the setting of the hw_event.exclude_* bits would require us to distinguish. Context switches and CPU migrations are currently considered to occur in kernel mode. On x86, this changes the previous policy that only root can count kernel events. Now non-root users can count kernel events or exclude them. Non-root users still can't use NMI events, though. On x86 we don't appear to have any way to control whether hypervisor events are counted or not, so hw_event.exclude_hv is ignored. On powerpc, the selection of whether to count events in user, kernel and/or hypervisor mode is PMU-wide, not per-counter, so this adds a check that the hw_event.exclude_* settings are the same as other events on the PMU. Counters being added to a group have to have the same settings as the other hardware counters in the group. Counters and groups can only be enabled in hw_perf_group_sched_in or power_perf_enable if they have the same settings as any other counters already on the PMU. If we are not running on a hypervisor, the exclude_hv setting is ignored (by forcing it to 0) since we can't ever get any hypervisor events. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 68 ++++++++++++++++++++++++++++-- arch/x86/kernel/cpu/perf_counter.c | 31 ++++++++------ include/linux/perf_counter.h | 17 +++++--- kernel/perf_counter.c | 26 ++++++++++-- 4 files changed, 116 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5b0211348c7..bd6ba85beb5 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -16,6 +16,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -214,6 +215,36 @@ static int power_check_constraints(unsigned int event[], int n_ev) return 0; } +/* + * Check if newly-added counters have consistent settings for + * exclude_{user,kernel,hv} with each other and any previously + * added counters. + */ +static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +{ + int eu, ek, eh; + int i, n; + struct perf_counter *counter; + + n = n_prev + n_new; + if (n <= 1) + return 0; + + eu = ctrs[0]->hw_event.exclude_user; + ek = ctrs[0]->hw_event.exclude_kernel; + eh = ctrs[0]->hw_event.exclude_hv; + if (n_prev == 0) + n_prev = 1; + for (i = n_prev; i < n; ++i) { + counter = ctrs[i]; + if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) + return -EAGAIN; + } + return 0; +} + static void power_perf_read(struct perf_counter *counter) { long val, delta, prev; @@ -323,6 +354,20 @@ void hw_perf_restore(u64 disable) goto out; } + /* + * Add in MMCR0 freeze bits corresponding to the + * hw_event.exclude_* bits for the first counter. + * We have already checked that all counters have the + * same values for these bits as the first counter. + */ + counter = cpuhw->counter[0]; + if (counter->hw_event.exclude_user) + cpuhw->mmcr[0] |= MMCR0_FCP; + if (counter->hw_event.exclude_kernel) + cpuhw->mmcr[0] |= MMCR0_FCS; + if (counter->hw_event.exclude_hv) + cpuhw->mmcr[0] |= MMCR0_FCHV; + /* * Write the new configuration to MMCR* with the freeze * bit set and set the hardware counters to their initial values. @@ -424,6 +469,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, &cpuhw->counter[n0], &cpuhw->events[n0]); if (n < 0) return -EAGAIN; + if (check_excludes(cpuhw->counter, n0, n)) + return -EAGAIN; if (power_check_constraints(cpuhw->events, n + n0)) return -EAGAIN; cpuhw->n_counters = n0 + n; @@ -476,6 +523,8 @@ static int power_perf_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; + if (check_excludes(cpuhw->counter, n0, 1)) + goto out; if (power_check_constraints(cpuhw->events, n0 + 1)) goto out; @@ -554,6 +603,17 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config_base = ev; counter->hw.idx = 0; + /* + * If we are not running on a hypervisor, force the + * exclude_hv bit to 0 so that we don't care what + * the user set it to. This also means that we don't + * set the MMCR0_FCHV bit, which unconditionally freezes + * the counters on the PPC970 variants used in Apple G5 + * machines (since MSR.HV is always 1 on those machines). + */ + if (!firmware_has_feature(FW_FEATURE_LPAR)) + counter->hw_event.exclude_hv = 0; + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. We assume the counter @@ -566,11 +626,13 @@ hw_perf_counter_init(struct perf_counter *counter) if (n < 0) return NULL; } - events[n++] = ev; - if (power_check_constraints(events, n)) + events[n] = ev; + if (check_excludes(ctrs, n, 1)) + return NULL; + if (power_check_constraints(events, n + 1)) return NULL; - counter->hw.config = events[n - 1]; + counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); return &power_perf_ops; } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9901e46998d..383d4c6423a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -107,21 +107,25 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EINVAL; /* - * Count user events, and generate PMC IRQs: + * Generate PMC IRQs: * (keep 'enabled' bit clear for now) */ - hwc->config = ARCH_PERFMON_EVENTSEL_USR | ARCH_PERFMON_EVENTSEL_INT; + hwc->config = ARCH_PERFMON_EVENTSEL_INT; /* - * If privileged enough, count OS events too, and allow - * NMI events as well: + * Count user and OS events unless requested not to. + */ + if (!hw_event->exclude_user) + hwc->config |= ARCH_PERFMON_EVENTSEL_USR; + if (!hw_event->exclude_kernel) + hwc->config |= ARCH_PERFMON_EVENTSEL_OS; + + /* + * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN)) { - hwc->config |= ARCH_PERFMON_EVENTSEL_OS; - if (hw_event->nmi) - hwc->nmi = 1; - } + if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; /* @@ -248,10 +252,13 @@ __pmc_fixed_enable(struct perf_counter *counter, int err; /* - * Enable IRQ generation (0x8) and ring-3 counting (0x2), - * and enable ring-0 counting if allowed: + * Enable IRQ generation (0x8), + * and enable ring-3 counting (0x2) and ring-0 counting (0x1) + * if requested: */ - bits = 0x8ULL | 0x2ULL; + bits = 0x8ULL; + if (hwc->config & ARCH_PERFMON_EVENTSEL_USR) + bits |= 0x2; if (hwc->config & ARCH_PERFMON_EVENTSEL_OS) bits |= 0x1; bits <<= (idx * 4); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f55381fbcac..c83f51d6e35 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,14 +83,17 @@ struct perf_counter_hw_event { u64 irq_period; u32 record_type; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only counter on PMU */ + u32 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ - __reserved_1 : 26; + __reserved_1 : 23; u64 __reserved_2; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 544193cbc47..89d5e3fe970 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1567,11 +1567,25 @@ sw_perf_counter_init(struct perf_counter *counter) { const struct hw_perf_counter_ops *hw_ops = NULL; + /* + * Software counters (currently) can't in general distinguish + * between user, kernel and hypervisor events. + * However, context switches and cpu migrations are considered + * to be kernel events, and page faults are never hypervisor + * events. + */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv)) + hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_TASK_CLOCK: + if (counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel || + counter->hw_event.exclude_hv) + break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1582,13 +1596,17 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - hw_ops = &perf_ops_page_faults; + if (!(counter->hw_event.exclude_user || + counter->hw_event.exclude_kernel)) + hw_ops = &perf_ops_page_faults; break; case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_context_switches; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_context_switches; break; case PERF_COUNT_CPU_MIGRATIONS: - hw_ops = &perf_ops_cpu_migrations; + if (!counter->hw_event.exclude_kernel) + hw_ops = &perf_ops_cpu_migrations; break; default: break; From 5af759176cc767e7426f89764bde4996ebaaf419 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 11 Feb 2009 10:53:37 +0100 Subject: [PATCH 0079/2061] perfcounters: fix use after free in perf_release() running... while true; do foo -d 1 -f 1 -c 100000 & sleep 1 kerneltop -d 1 -f 1 -e 1 -c 25000 -p `pidof foo` done while true; do killall foo; killall kerneltop; sleep 2 done ...in two shells with SLUB_DEBUG enabled produces flood of: BUG task_struct: Poison overwritten. Fix the use-after-free bug in perf_release(). Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 89d5e3fe970..e0576c3fdb5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1145,12 +1145,12 @@ static int perf_release(struct inode *inode, struct file *file) mutex_lock(&counter->mutex); perf_counter_remove_from_context(counter); - put_context(ctx); mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); kfree(counter); + put_context(ctx); return 0; } From 4bcf349a0f90d1e69eb35c6df0fa285c886c1cd6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 11 Feb 2009 13:53:19 +0100 Subject: [PATCH 0080/2061] perfcounters: fix refcounting bug, take 2 Only free child_counter if it has a parent; if it doesn't, then it has a file pointing to it and we'll free it in perf_release. Signed-off-by: Mike Galbraith Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e0576c3fdb5..fcefb0a726f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1958,14 +1958,13 @@ __perf_counter_exit_task(struct task_struct *child, sync_child_counter(child_counter, parent_counter); list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, list_entry) { - if (sub->parent) + if (sub->parent) { sync_child_counter(sub, sub->parent); - kfree(sub); + kfree(sub); + } } - } - - if (!child_counter->filp || !atomic_long_read(&child_counter->filp->f_count)) kfree(child_counter); + } } /* From c07c99b67233ccaad38a961c17405dc1e1542aa4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 13 Feb 2009 22:10:34 +1100 Subject: [PATCH 0081/2061] perfcounters: make context switch and migration software counters work again Jaswinder Singh Rajput reported that commit 23a185ca8abbeef caused the context switch and migration software counters to report zero always. With that commit, the software counters only count events that occur between sched-in and sched-out for a task. This is necessary for the counter enable/disable prctls and ioctls to work. However, the context switch and migration counts are incremented after sched-out for one task and before sched-in for the next. Since the increment doesn't occur while a task is scheduled in (as far as the software counters are concerned) it doesn't count towards any counter. Thus the context switch and migration counters need to count events that occur at any time, provided the counter is enabled, not just those that occur while the task is scheduled in (from the perf_counter subsystem's point of view). The problem though is that the software counter code can't tell the difference between being enabled and being scheduled in, and between being disabled and being scheduled out, since we use the one pair of enable/disable entry points for both. That is, the high-level disable operation simply arranges for the counter to not be scheduled in any more, and the high-level enable operation arranges for it to be scheduled in again. One way to solve this would be to have sched_in/out operations in the hw_perf_counter_ops struct as well as enable/disable. However, this takes a simpler approach: it adds a 'prev_state' field to the perf_counter struct that allows a counter's enable method to know whether the counter was previously disabled or just inactive (scheduled out), and therefore whether the enable method is being called as a result of a high-level enable or a schedule-in operation. This then allows the context switch, migration and page fault counters to reset their hw.prev_count value in their enable functions only if they are called as a result of a high-level enable operation. Although page faults would normally only occur while the counter is scheduled in, this changes the page fault counter code too in case there are ever circumstances where page faults get counted against a task while its counters are not scheduled in. Reported-by: Jaswinder Singh Rajput Signed-off-by: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 21 +++++++++++++++------ 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c83f51d6e35..32cd1acb738 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -173,6 +173,7 @@ struct perf_counter { const struct hw_perf_counter_ops *hw_ops; enum perf_counter_active_state state; + enum perf_counter_active_state prev_state; atomic64_t count; struct perf_counter_hw_event hw_event; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcefb0a726f..ad62965828d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -444,6 +444,7 @@ static void __perf_install_in_context(void *info) list_add_counter(counter, ctx); ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; /* * Don't put the counter on if it is disabled or if @@ -562,6 +563,7 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; @@ -733,6 +735,7 @@ group_sched_in(struct perf_counter *group_counter, if (ret) return ret < 0 ? ret : 0; + group_counter->prev_state = group_counter->state; if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) return -EAGAIN; @@ -740,6 +743,7 @@ group_sched_in(struct perf_counter *group_counter, * Schedule in siblings as one group (if any): */ list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; if (counter_sched_in(counter, cpuctx, ctx, cpu)) { partial_group = counter; goto group_error; @@ -1398,9 +1402,9 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) static int task_clock_perf_counter_enable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); - - atomic64_set(&counter->hw.prev_count, now); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + task_clock_perf_counter_val(counter, 0)); return 0; } @@ -1455,7 +1459,8 @@ static void page_faults_perf_counter_read(struct perf_counter *counter) static int page_faults_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); return 0; } @@ -1501,7 +1506,9 @@ static void context_switches_perf_counter_read(struct perf_counter *counter) static int context_switches_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_context_switches(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_context_switches(counter)); return 0; } @@ -1547,7 +1554,9 @@ static void cpu_migrations_perf_counter_read(struct perf_counter *counter) static int cpu_migrations_perf_counter_enable(struct perf_counter *counter) { - atomic64_set(&counter->hw.prev_count, get_cpu_migrations(counter)); + if (counter->prev_state <= PERF_COUNTER_STATE_OFF) + atomic64_set(&counter->hw.prev_count, + get_cpu_migrations(counter)); return 0; } From 73ca2f8380311115723c7afe811f3ed1f0ba945e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Feb 2009 01:08:17 +0100 Subject: [PATCH 0082/2061] perfcounters: remove duplicate definition of LOCAL_PERF_VECTOR Signed-off-by: Ingo Molnar --- arch/x86/include/asm/irq_vectors.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index b66b518ff00..b07278c55e9 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -111,11 +111,6 @@ */ #define LOCAL_PERF_VECTOR 0xee -/* - * Performance monitoring interrupt vector: - */ -#define LOCAL_PERF_VECTOR 0xee - /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority From 37a25424252b6cff4dd4b1937ab6a1dbfcadabcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 16 Feb 2009 15:32:23 +0100 Subject: [PATCH 0083/2061] perfcounters: fix acpi_idle_do_entry() workaround Fix merge error in drivers/acpi/processor_idle.c. This resulted in non-working perfcounters on certain Nehalem systems. Signed-off-by: Ingo Molnar --- drivers/acpi/processor_idle.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 259f6e80631..08def2f20cd 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -826,12 +826,9 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { u64 perf_flags; - u64 pctrl; - /* Don't trace irqs off for idle */ stop_critical_timings(); perf_flags = hw_perf_save_disable(); - pctrl = hw_perf_save_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -847,7 +844,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } hw_perf_restore(perf_flags); - hw_perf_restore(pctrl); start_critical_timings(); } From d095cd46dac104e4d2a4967c7c19b55a12f78240 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Feb 2009 23:01:28 +1100 Subject: [PATCH 0084/2061] perfcounters/powerpc: Make exclude_kernel bit work on Apple G5 processors Currently, setting hw_event.exclude_kernel does nothing on the PPC970 variants used in Apple G5 machines, because they have the HV (hypervisor) bit in the MSR forced to 1, so as far as the PMU is concerned, the kernel runs in hypervisor mode. Thus we have to use the MMCR0_FCHV (freeze counters in hypervisor mode) bit rather than the MMCR0_FCS (freeze counters in supervisor mode) bit. This checks the MSR.HV bit at startup, and if it is set, we set the freeze_counters_kernel variable to MMCR0_FCHV (it was initialized to MMCR0_FCS). We then use that whenever we need to exclude kernel events. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd6ba85beb5..6e27913ec0d 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -32,6 +32,15 @@ DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); struct power_pmu *ppmu; +/* + * Normally, to ignore kernel events we set the FCS (freeze counters + * in supervisor mode) bit in MMCR0, but if the kernel runs with the + * hypervisor bit set in the MSR, or if we are running on a processor + * where the hypervisor bit is forced to 1 (as on Apple G5 processors), + * then we need to use the FCHV bit to ignore kernel events. + */ +static unsigned int freeze_counters_kernel = MMCR0_FCS; + void perf_counter_print_debug(void) { } @@ -364,7 +373,7 @@ void hw_perf_restore(u64 disable) if (counter->hw_event.exclude_user) cpuhw->mmcr[0] |= MMCR0_FCP; if (counter->hw_event.exclude_kernel) - cpuhw->mmcr[0] |= MMCR0_FCS; + cpuhw->mmcr[0] |= freeze_counters_kernel; if (counter->hw_event.exclude_hv) cpuhw->mmcr[0] |= MMCR0_FCHV; @@ -606,10 +615,7 @@ hw_perf_counter_init(struct perf_counter *counter) /* * If we are not running on a hypervisor, force the * exclude_hv bit to 0 so that we don't care what - * the user set it to. This also means that we don't - * set the MMCR0_FCHV bit, which unconditionally freezes - * the counters on the PPC970 variants used in Apple G5 - * machines (since MSR.HV is always 1 on those machines). + * the user set it to. */ if (!firmware_has_feature(FW_FEATURE_LPAR)) counter->hw_event.exclude_hv = 0; @@ -841,6 +847,13 @@ static int init_perf_counters(void) ppmu = &power6_pmu; break; } + + /* + * Use FCHV to ignore kernel events if MSR.HV is set. + */ + if (mfmsr() & MSR_HV) + freeze_counters_kernel = MMCR0_FCHV; + return 0; } From 742bd95ba96e19b3f7196c3a0834ebc17c8ba006 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 24 Feb 2009 11:33:56 +1100 Subject: [PATCH 0085/2061] perfcounters/powerpc: Add support for POWER5 processors This adds the back-end for the PMU on the POWER5 processor. This knows how to use the fixed-function PMC5 and PMC6 (instructions completed and run cycles). Unlike POWER6, PMC5/6 obey the freeze conditions and can generate interrupts, so their use doesn't impose any extra restrictions. POWER5+ is different and is not supported by this patch. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 3 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power5-pmu.c | 475 +++++++++++++++++++++++++++++ 3 files changed, 481 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power5-pmu.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 7c941ec3b23..b4c6f466164 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,8 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power6-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power5-pmu.o \ + power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6e27913ec0d..112332d07fc 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu) } extern struct power_pmu ppc970_pmu; +extern struct power_pmu power5_pmu; extern struct power_pmu power6_pmu; static int init_perf_counters(void) @@ -843,6 +844,9 @@ static int init_perf_counters(void) case PV_970MP: ppmu = &ppc970_pmu; break; + case PV_POWER5: + ppmu = &power5_pmu; + break; case 0x3e: ppmu = &power6_pmu; break; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c new file mode 100644 index 00000000000..379ed1087cc --- /dev/null +++ b/arch/powerpc/kernel/power5-pmu.c @@ -0,0 +1,475 @@ +/* + * Performance counter support for POWER5 (not POWER5++) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for POWER5 (not POWER5++) + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 12 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 7 +#define PM_GRS_SH 8 /* Storage subsystem mux select */ +#define PM_GRS_MSK 7 +#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ +#define PM_PMCSEL_MSK 0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU 0 +#define PM_ISU0 1 +#define PM_IFU 2 +#define PM_ISU1 3 +#define PM_IDU 4 +#define PM_ISU0_ALT 6 +#define PM_GRS 7 +#define PM_LSU0 8 +#define PM_LSU1 0xc +#define PM_LASTUNIT 0xc + +/* + * Bits in MMCR1 for POWER5 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 60 +#define MMCR1_TTM2SEL_SH 58 +#define MMCR1_TTM3SEL_SH 56 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 54 +#define MMCR1_TD_CP_DBG1SEL_SH 52 +#define MMCR1_TD_CP_DBG2SEL_SH 50 +#define MMCR1_TD_CP_DBG3SEL_SH 48 +#define MMCR1_GRS_L2SEL_SH 46 +#define MMCR1_GRS_L2SEL_MSK 3 +#define MMCR1_GRS_L3SEL_SH 44 +#define MMCR1_GRS_L3SEL_MSK 3 +#define MMCR1_GRS_MCSEL_SH 41 +#define MMCR1_GRS_MCSEL_MSK 7 +#define MMCR1_GRS_FABSEL_SH 39 +#define MMCR1_GRS_FABSEL_MSK 3 +#define MMCR1_PMC1_ADDER_SEL_SH 35 +#define MMCR1_PMC2_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC1SEL_SH 25 +#define MMCR1_PMC2SEL_SH 17 +#define MMCR1_PMC3SEL_SH 9 +#define MMCR1_PMC4SEL_SH 1 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * <><>[ ><><>< ><> [ >[ >[ >< >< >< >< ><><><><><><> + * T0T1 NC G0G1G2 G3 UC PS1PS2 B0 B1 B2 B3 P6P5P4P3P2P1 + * + * T0 - TTM0 constraint + * 54-55: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0xc0_0000_0000_0000 + * + * T1 - TTM1 constraint + * 52-53: TTM1SEL value (0=IDU, 3=GRS) 0x30_0000_0000_0000 + * + * NC - number of counters + * 51: NC error 0x0008_0000_0000_0000 + * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + * 46-47: GRS_L2SEL value + * 44-45: GRS_L3SEL value + * 41-44: GRS_MCSEL value + * 39-40: GRS_FABSEL value + * Note that these match up with their bit positions in MMCR1 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + * 37: UC3 error 0x20_0000_0000 + * 36: FPU|IFU|ISU1 events needed 0x10_0000_0000 + * 35: ISU0 events needed 0x08_0000_0000 + * 34: IDU|GRS events needed 0x04_0000_0000 + * + * PS1 + * 33: PS1 error 0x2_0000_0000 + * 31-32: count of events needing PMC1/2 0x1_8000_0000 + * + * PS2 + * 30: PS2 error 0x4000_0000 + * 28-29: count of events needing PMC3/4 0x3000_0000 + * + * B0 + * 24-27: Byte 0 event source 0x0f00_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources + * + * P1..P6 + * 0-11: Count of events needing PMC1..PMC6 + */ + +static const int grsel_shift[8] = { + MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, + MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, + MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0xc0002000000000ull, 0x00001000000000ull }, + [PM_ISU0] = { 0x00002000000000ull, 0x00000800000000ull }, + [PM_ISU1] = { 0xc0002000000000ull, 0xc0001000000000ull }, + [PM_IFU] = { 0xc0002000000000ull, 0x80001000000000ull }, + [PM_IDU] = { 0x30002000000000ull, 0x00000400000000ull }, + [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, +}; + +static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, sh; + int bit, fmask; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + if (pmc <= 4) + grp = (pmc - 1) >> 1; + else if (event != 0x500009 && event != 0x600005) + return -1; + } + if (event & PM_BUSEVENT_MSK) { + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ + ++unit; + byte &= 3; + } + if (unit == PM_GRS) { + bit = event & 7; + fmask = (bit == 6)? 7: 3; + sh = grsel_shift[bit]; + mask |= (u64)fmask << sh; + value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; + } + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2; bytes 1 and 3 on PMC3/4. + */ + if (!pmc) + grp = byte & 1; + /* Set byte lane select field */ + mask |= 0xfULL << (24 - 4 * byte); + value |= (u64)unit << (24 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2 field */ + mask |= 0x200000000ull; + value |= 0x080000000ull; + } else if (grp == 1) { + /* increment PMC3/4 field */ + mask |= 0x40000000ull; + value |= 0x10000000ull; + } + if (pmc < 5) { + /* need a counter from PMC1-4 set */ + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; + } + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 3 /* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ + { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ + { 0x100005, 0x600005 }, /* PM_RUN_CYC */ + { 0x100009, 0x200009, 0x500009 }, /* PM_INST_CMPL */ + { 0x300009, 0x400009 }, /* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(unsigned int event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { + /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, + /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, + /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, + /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters. This returns the alternative + * event code for those that do, or -1 otherwise. + */ +static int find_alternative_bdecode(unsigned int event) +{ + int pmc, altpmc, pp, j; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc == 0 || pmc > 4) + return -1; + altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ + pp = event & PM_PMCSEL_MSK; + for (j = 0; j < 4; ++j) { + if (bytedecode_alternatives[pmc - 1][j] == pp) { + return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | + (altpmc << PM_PMC_SH) | + bytedecode_alternatives[altpmc - 1][j]; + } + } + return -1; +} + +static int power5_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, ae, nalt = 1; + + alt[0] = event; + nalt = 1; + i = find_alternative(event); + if (i >= 0) { + for (j = 0; j < MAX_ALT; ++j) { + ae = event_alternatives[i][j]; + if (ae && ae != event) + alt[nalt++] = ae; + } + } else { + ae = find_alternative_bdecode(event); + if (ae > 0) + alt[nalt++] = ae; + } + return nalt; +} + +static int power5_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm, grp; + int i, isbus, bit, grsel; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + int ttmuse; + + if (n_ev > 6) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 6) + return -1; + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + /* count 1/2 vs 3/4 use */ + if (pmc <= 4) + ++pmc_grp_use[(pmc - 1) >> 1]; + } + if (event[i] & PM_BUSEVENT_MSK) { + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + ++unit; + byte &= 3; + } + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + if (pmc_grp_use[0] > 2 || pmc_grp_use[1] > 2) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU0 can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU0] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { + unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ + unituse[PM_ISU0] = 0; + } + /* Set TTM[01]SEL fields. */ + ttmuse = 0; + for (i = PM_FPU; i <= PM_ISU1; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; + } + ttmuse = 0; + for (; i <= PM_GRS; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; + } + if (ttmuse > 1) + return -1; + + /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { + /* get ISU0 through TTM1 rather than TTM0 */ + unit = PM_ISU0_ALT; + } else if (unit == PM_LSU1 + 1) { + /* select lower word of LSU1 for this byte */ + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + isbus = event[i] & PM_BUSEVENT_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + for (pmc = 0; pmc < 4; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (isbus) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 2) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else if (pmc <= 4) { + /* Direct event */ + --pmc; + if ((psel == 8 || psel == 0x10) && isbus && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } else { + /* Instructions or run cycles on PMC5/6 */ + --pmc; + } + if (isbus && unit == PM_GRS) { + bit = psel & 7; + grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; + mmcr1 |= (u64)grsel << grsel_shift[bit]; + } + if (pmc <= 3) + mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); + hwc[i] = pmc; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0x3e) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +static void power5_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0xf, + [PERF_COUNT_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_CACHE_REFERENCES] = 0x4c1090, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ +}; + +struct power_pmu power5_pmu = { + .n_counter = 6, + .max_alternatives = MAX_ALT, + .add_fields = 0x7000090000555ull, + .test_adder = 0x3000490000000ull, + .compute_mmcr = power5_compute_mmcr, + .get_constraint = power5_get_constraint, + .get_alternatives = power5_get_alternatives, + .disable_pmc = power5_disable_pmc, + .n_generic = ARRAY_SIZE(power5_generic_events), + .generic_events = power5_generic_events, +}; From f3dfd2656deb81a0addee4f4ceff66b50a387388 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 26 Feb 2009 22:43:46 +1100 Subject: [PATCH 0086/2061] perfcounters: fix a few minor cleanliness issues This fixes three issues noticed by Arnd Bergmann: - Add #ifdef __KERNEL__ and move some things around in perf_counter.h to make sure only the bits that userspace needs are exported to userspace. - Use __u64, __s64, __u32 types in the structs exported to userspace rather than u64, s64, u32. - Make the sys_perf_counter_open syscall available to the SPUs on Cell platforms. And one issue that I noticed in looking at the code again: - Wrap the perf_counter_open syscall with SYSCALL_DEFINE4 so we get the proper handling of int arguments on ppc64 (and some other 64-bit architectures). Reported-by: Arnd Bergmann Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/systbl.h | 2 +- include/linux/perf_counter.h | 43 +++++++++++++++++-------------- include/linux/syscalls.h | 9 +++---- kernel/perf_counter.c | 6 ++--- 4 files changed, 30 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/include/asm/systbl.h b/arch/powerpc/include/asm/systbl.h index 4c8095f6bec..d312eec8abb 100644 --- a/arch/powerpc/include/asm/systbl.h +++ b/arch/powerpc/include/asm/systbl.h @@ -322,4 +322,4 @@ SYSCALL_SPU(epoll_create1) SYSCALL_SPU(dup3) SYSCALL_SPU(pipe2) SYSCALL(inotify_init1) -SYSCALL(perf_counter_open) +SYSCALL_SPU(perf_counter_open) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 32cd1acb738..186efaf4966 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -13,20 +13,8 @@ #ifndef _LINUX_PERF_COUNTER_H #define _LINUX_PERF_COUNTER_H -#include -#include - -#ifdef CONFIG_PERF_COUNTERS -# include -#endif - -#include -#include -#include -#include -#include - -struct task_struct; +#include +#include /* * User-space ABI bits: @@ -78,12 +66,12 @@ enum perf_counter_record_type { * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __s64 type; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u32 record_type; - u32 disabled : 1, /* off by default */ + __u32 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -95,7 +83,7 @@ struct perf_counter_hw_event { __reserved_1 : 23; - u64 __reserved_2; + __u64 __reserved_2; }; /* @@ -104,10 +92,24 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#ifdef __KERNEL__ /* - * Kernel-internal data types: + * Kernel-internal data types and definitions: */ +#ifdef CONFIG_PERF_COUNTERS +# include +#endif + +#include +#include +#include +#include +#include +#include + +struct task_struct; + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -293,4 +295,5 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } #endif +#endif /* __KERNEL__ */ #endif /* _LINUX_PERF_COUNTER_H */ diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 88255d3261a..28ef2be839c 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -696,10 +696,7 @@ asmlinkage long sys_pipe(int __user *); int kernel_execve(const char *filename, char *const argv[], char *const envp[]); -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd); +asmlinkage long sys_perf_counter_open( + const struct perf_counter_hw_event __user *hw_event_uptr, + pid_t pid, int cpu, int group_fd); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ad62965828d..16b14ba99d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1690,9 +1690,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, * @cpu: target cpu * @group_fd: group leader counter fd */ -asmlinkage int -sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, int cpu, int group_fd) +SYSCALL_DEFINE4(perf_counter_open, + const struct perf_counter_hw_event __user *, hw_event_uptr, + pid_t, pid, int, cpu, int, group_fd) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; From b56a3802dc6df29aa27d2c12edf420258091ad66 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 18:09:09 +0530 Subject: [PATCH 0087/2061] x86: prepare perf_counter to add more cpus Introduced struct pmc_x86_ops to add more cpus. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 106 +++++++++++++++++++++-------- 1 file changed, 78 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 383d4c6423a..a3c88529bb7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -3,6 +3,7 @@ * * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar + * Copyright(C) 2009 Jaswinder Singh Rajput * * For licencing details see kernel-base/COPYING */ @@ -38,10 +39,24 @@ struct cpu_hw_counters { }; /* - * Intel PerfMon v3. Used on Core2 and later. + * struct pmc_x86_ops - performance counter x86 ops */ +struct pmc_x86_ops { + u64 (*save_disable_all) (void); + void (*restore_all) (u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map) (int event); + int max_events; +}; + +static struct pmc_x86_ops *pmc_ops; + static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +/* + * Intel PerfMon v3. Used on Core2 and later. + */ static const int intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, @@ -53,7 +68,10 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static const int max_intel_perfmon_events = ARRAY_SIZE(intel_perfmon_event_map); +static int pmc_intel_event_map(int event) +{ + return intel_perfmon_event_map[event]; +} /* * Propagate counter elapsed time into the generic counter. @@ -144,38 +162,48 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (hw_event->raw) { hwc->config |= hw_event->type; } else { - if (hw_event->type >= max_intel_perfmon_events) + if (hw_event->type >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= intel_perfmon_event_map[hw_event->type]; + hwc->config |= pmc_ops->event_map(hw_event->type); } counter->wakeup_pending = 0; return 0; } -u64 hw_perf_save_disable(void) +static u64 pmc_intel_save_disable_all(void) { u64 ctrl; - if (unlikely(!perf_counters_initialized)) - return 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); return ctrl; } + +u64 hw_perf_save_disable(void) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->save_disable_all(); +} EXPORT_SYMBOL_GPL(hw_perf_save_disable); +static void pmc_intel_restore_all(u64 ctrl) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) return; - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + pmc_ops->restore_all(ctrl); } EXPORT_SYMBOL_GPL(hw_perf_restore); @@ -291,11 +319,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_INSTRUCTIONS])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_CPU_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == intel_perfmon_event_map[PERF_COUNT_BUS_CYCLES])) + if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -339,8 +367,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = MSR_ARCH_PERFMON_EVENTSEL0; - hwc->counter_base = MSR_ARCH_PERFMON_PERFCTR0; + hwc->config_base = pmc_ops->eventsel; + hwc->counter_base = pmc_ops->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -386,8 +414,8 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, pmc_ctrl); - rdmsrl(MSR_ARCH_PERFMON_PERFCTR0 + idx, pmc_count); + rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); + rdmsrl(pmc_ops->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -655,29 +683,56 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -void __init init_hw_perf_counters(void) +static struct pmc_x86_ops pmc_intel_ops = { + .save_disable_all = pmc_intel_save_disable_all, + .restore_all = pmc_intel_restore_all, + .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, + .perfctr = MSR_ARCH_PERFMON_PERFCTR0, + .event_map = pmc_intel_event_map, + .max_events = ARRAY_SIZE(intel_perfmon_event_map), +}; + +static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; unsigned int ebx; unsigned int unused; union cpuid10_edx edx; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return; + return NULL; printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... num counters: %d\n", eax.split.num_counters); + printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); + printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + nr_counters_generic = eax.split.num_counters; + nr_counters_fixed = edx.split.num_counters_fixed; + counter_value_mask = (1ULL << eax.split.bit_width) - 1; + + return &pmc_intel_ops; +} + +void __init init_hw_perf_counters(void) +{ + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return; + + switch (boot_cpu_data.x86_vendor) { + case X86_VENDOR_INTEL: + pmc_ops = pmc_intel_init(); + break; + } + if (!pmc_ops) + return; + + printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -686,13 +741,8 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - counter_value_mask = (1ULL << eax.split.bit_width) - 1; printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); - - nr_counters_fixed = edx.split.num_counters_fixed; if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", From f87ad35d37fa543925210550f7db20a54c83ed70 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 27 Feb 2009 20:15:14 +0530 Subject: [PATCH 0088/2061] x86: AMD Support for perf_counter Supported basic performance counter for AMD K7 and later: $ perfstat -e 0,1,2,3,4,5,-1,-2,-3,-4,-5 ls > /dev/null Performance counter stats for 'ls': 12.298610 task clock ticks (msecs) 3298477 CPU cycles (events) 1406354 instructions (events) 749035 cache references (events) 16939 cache misses (events) 100589 branches (events) 11159 branch misses (events) 7.627540 cpu clock ticks (msecs) 12.298610 task clock ticks (msecs) 500 pagefaults (events) 6 context switches (events) 3 CPU migrations (events) Wall-clock time elapsed: 8.672290 msecs Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ++ arch/x86/kernel/cpu/perf_counter.c | 83 +++++++++++++++++++++++++++++- 2 files changed, 85 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index 25423a5b80e..edcde52bd17 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -368,6 +368,10 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); + /* Enable Performance counter for K7 and later */ + if (c->x86 > 6 && c->x86 <= 0x11) + set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); + if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a3c88529bb7..266618aa1a0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -73,6 +73,24 @@ static int pmc_intel_event_map(int event) return intel_perfmon_event_map[event]; } +/* + * AMD Performance Monitor K7 and later. + */ +static const int amd_perfmon_event_map[] = +{ + [PERF_COUNT_CPU_CYCLES] = 0x0076, + [PERF_COUNT_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_CACHE_MISSES] = 0x0081, + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x00c4, + [PERF_COUNT_BRANCH_MISSES] = 0x00c5, +}; + +static int pmc_amd_event_map(int event) +{ + return amd_perfmon_event_map[event]; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -151,8 +169,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * so we install an artificial 1<<31 period regardless of * the generic counter period: */ - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) + hwc->irq_period = 0x7FFFFFFF; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -184,6 +203,22 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } +static u64 pmc_amd_save_disable_all(void) +{ + int idx; + u64 val, ctrl = 0; + + for (idx = 0; idx < nr_counters_generic; idx++) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + ctrl |= (1 << idx); + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + + return ctrl; +} + u64 hw_perf_save_disable(void) { if (unlikely(!perf_counters_initialized)) @@ -198,6 +233,20 @@ static void pmc_intel_restore_all(u64 ctrl) wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } +static void pmc_amd_restore_all(u64 ctrl) +{ + u64 val; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + if (ctrl & (1 << idx)) { + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } + } +} + void hw_perf_restore(u64 ctrl) { if (unlikely(!perf_counters_initialized)) @@ -314,6 +363,9 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -1; + if (unlikely(hwc->nmi)) return -1; @@ -401,6 +453,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -411,6 +464,7 @@ void perf_counter_print_debug(void) printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + } printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { @@ -588,6 +642,9 @@ void perf_counter_unthrottle(void) if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return; + if (unlikely(!perf_counters_initialized)) return; @@ -692,6 +749,15 @@ static struct pmc_x86_ops pmc_intel_ops = { .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; +static struct pmc_x86_ops pmc_amd_ops = { + .save_disable_all = pmc_amd_save_disable_all, + .restore_all = pmc_amd_restore_all, + .eventsel = MSR_K7_EVNTSEL0, + .perfctr = MSR_K7_PERFCTR0, + .event_map = pmc_amd_event_map, + .max_events = ARRAY_SIZE(amd_perfmon_event_map), +}; + static struct pmc_x86_ops *pmc_intel_init(void) { union cpuid10_eax eax; @@ -719,6 +785,16 @@ static struct pmc_x86_ops *pmc_intel_init(void) return &pmc_intel_ops; } +static struct pmc_x86_ops *pmc_amd_init(void) +{ + nr_counters_generic = 4; + nr_counters_fixed = 0; + + printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + + return &pmc_amd_ops; +} + void __init init_hw_perf_counters(void) { if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) @@ -728,6 +804,9 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); break; + case X86_VENDOR_AMD: + pmc_ops = pmc_amd_init(); + break; } if (!pmc_ops) return; From 169e41eb7f5464c077a7e0e129f025759d04cc54 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:37:49 +0530 Subject: [PATCH 0089/2061] x86: decent declarations in perf_counter.c Impact: cleanup making decent declrations for struct pmc_x86_ops and fix checkpatch error: ERROR: Macros with complex values should be enclosed in parenthesis Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 266618aa1a0..a1f3646a3e8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -42,12 +42,12 @@ struct cpu_hw_counters { * struct pmc_x86_ops - performance counter x86 ops */ struct pmc_x86_ops { - u64 (*save_disable_all) (void); - void (*restore_all) (u64 ctrl); - unsigned eventsel; - unsigned perfctr; - int (*event_map) (int event); - int max_events; + u64 (*save_disable_all)(void); + void (*restore_all)(u64 ctrl); + unsigned eventsel; + unsigned perfctr; + int (*event_map)(int event); + int max_events; }; static struct pmc_x86_ops *pmc_ops; @@ -561,7 +561,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) /* * Maximum interrupt frequency of 100KHz per CPU */ -#define PERFMON_MAX_INTERRUPTS 100000/HZ +#define PERFMON_MAX_INTERRUPTS (100000/HZ) /* * This handler is triggered by the local APIC, so the APIC IRQ handling From a1ef58f442542d8b3e3b963339fbc522c36e827c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 28 Feb 2009 18:45:39 +0530 Subject: [PATCH 0090/2061] x86: use pr_info in perf_counter.c Impact: cleanup using pr_info in perf_counter.c fixes various 80 characters warnings and also indenting for conditional statement Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 46 +++++++++++++++--------------- 1 file changed, 23 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a1f3646a3e8..3b65f19a668 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -454,18 +454,18 @@ void perf_counter_print_debug(void) cpuc = &per_cpu(cpu_hw_counters, cpu); if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); - rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); - rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); + rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR_CTRL, fixed); - printk(KERN_INFO "\n"); - printk(KERN_INFO "CPU#%d: ctrl: %016llx\n", cpu, ctrl); - printk(KERN_INFO "CPU#%d: status: %016llx\n", cpu, status); - printk(KERN_INFO "CPU#%d: overflow: %016llx\n", cpu, overflow); - printk(KERN_INFO "CPU#%d: fixed: %016llx\n", cpu, fixed); + pr_info("\n"); + pr_info("CPU#%d: ctrl: %016llx\n", cpu, ctrl); + pr_info("CPU#%d: status: %016llx\n", cpu, status); + pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); + pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - printk(KERN_INFO "CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); @@ -473,17 +473,17 @@ void perf_counter_print_debug(void) prev_left = per_cpu(prev_left[idx], cpu); - printk(KERN_INFO "CPU#%d: gen-PMC%d ctrl: %016llx\n", + pr_info("CPU#%d: gen-PMC%d ctrl: %016llx\n", cpu, idx, pmc_ctrl); - printk(KERN_INFO "CPU#%d: gen-PMC%d count: %016llx\n", + pr_info("CPU#%d: gen-PMC%d count: %016llx\n", cpu, idx, pmc_count); - printk(KERN_INFO "CPU#%d: gen-PMC%d left: %016llx\n", + pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } for (idx = 0; idx < nr_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); - printk(KERN_INFO "CPU#%d: fixed-PMC%d count: %016llx\n", + pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } local_irq_enable(); @@ -773,10 +773,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; - printk(KERN_INFO "Intel Performance Monitoring support detected.\n"); - printk(KERN_INFO "... version: %d\n", eax.split.version_id); - printk(KERN_INFO "... bit width: %d\n", eax.split.bit_width); - printk(KERN_INFO "... mask length: %d\n", eax.split.mask_length); + pr_info("Intel Performance Monitoring support detected.\n"); + pr_info("... version: %d\n", eax.split.version_id); + pr_info("... bit width: %d\n", eax.split.bit_width); + pr_info("... mask length: %d\n", eax.split.mask_length); nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -790,7 +790,7 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - printk(KERN_INFO "AMD Performance Monitoring support detected.\n"); + pr_info("AMD Performance Monitoring support detected.\n"); return &pmc_amd_ops; } @@ -811,7 +811,7 @@ void __init init_hw_perf_counters(void) if (!pmc_ops) return; - printk(KERN_INFO "... num counters: %d\n", nr_counters_generic); + pr_info("... num counters: %d\n", nr_counters_generic); if (nr_counters_generic > X86_PMC_MAX_GENERIC) { nr_counters_generic = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", @@ -820,18 +820,18 @@ void __init init_hw_perf_counters(void) perf_counter_mask = (1 << nr_counters_generic) - 1; perf_max_counters = nr_counters_generic; - printk(KERN_INFO "... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", counter_value_mask); if (nr_counters_fixed > X86_PMC_MAX_FIXED) { nr_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", nr_counters_fixed, X86_PMC_MAX_FIXED); } - printk(KERN_INFO "... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", nr_counters_fixed); perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; - printk(KERN_INFO "... counter mask: %016Lx\n", perf_counter_mask); + pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; perf_counters_lapic_init(0); From 2743a5b0fa6f309da904f2190a9cc25deee34dbd Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 4 Mar 2009 20:36:51 +1100 Subject: [PATCH 0091/2061] perfcounters: provide expansion room in the ABI Impact: ABI change This expands several fields in the perf_counter_hw_event struct and adds a "flags" argument to the perf_counter_open system call, in order that features can be added in future without ABI changes. In particular the record_type field is expanded to 64 bits, and the space for flag bits has been expanded from 32 to 64 bits. This also adds some new fields: * read_format (64 bits) is intended to provide a way to specify what userspace wants to get back when it does a read() on a simple (non-interrupting) counter; * exclude_idle (1 bit) provides a way for userspace to ask that events that occur when the cpu is idle be excluded; * extra_config_len will provide a way for userspace to supply an arbitrary amount of extra machine-specific PMU configuration data immediately following the perf_counter_hw_event struct, to allow sophisticated users to program things such as instruction matching CAMs and address range registers; * __reserved_3 and __reserved_4 provide space for future expansion. Signed-off-by: Paul Mackerras --- include/linux/perf_counter.h | 12 +++++++++--- include/linux/syscalls.h | 2 +- kernel/perf_counter.c | 10 +++++++--- 3 files changed, 17 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 186efaf4966..c42455ab155 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -69,9 +69,10 @@ struct perf_counter_hw_event { __s64 type; __u64 irq_period; - __u32 record_type; + __u64 record_type; + __u64 read_format; - __u32 disabled : 1, /* off by default */ + __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ @@ -80,10 +81,15 @@ struct perf_counter_hw_event { exclude_user : 1, /* don't count user */ exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 23; + __reserved_1 : 55; + + __u32 extra_config_len; + __u32 __reserved_4; __u64 __reserved_2; + __u64 __reserved_3; }; /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 28ef2be839c..ab1d7724739 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -698,5 +698,5 @@ int kernel_execve(const char *filename, char *const argv[], char *const envp[]); asmlinkage long sys_perf_counter_open( const struct perf_counter_hw_event __user *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, unsigned long flags); #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 16b14ba99d3..b2e838959f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1683,16 +1683,16 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, } /** - * sys_perf_task_open - open a performance counter, associate it to a task/cpu + * sys_perf_counter_open - open a performance counter, associate it to a task/cpu * * @hw_event_uptr: event type attributes for monitoring/sampling * @pid: target pid * @cpu: target cpu * @group_fd: group leader counter fd */ -SYSCALL_DEFINE4(perf_counter_open, +SYSCALL_DEFINE5(perf_counter_open, const struct perf_counter_hw_event __user *, hw_event_uptr, - pid_t, pid, int, cpu, int, group_fd) + pid_t, pid, int, cpu, int, group_fd, unsigned long, flags) { struct perf_counter *counter, *group_leader; struct perf_counter_hw_event hw_event; @@ -1703,6 +1703,10 @@ SYSCALL_DEFINE4(perf_counter_open, int fput_needed2 = 0; int ret; + /* for future expandability... */ + if (flags) + return -EINVAL; + if (copy_from_user(&hw_event, hw_event_uptr, sizeof(hw_event)) != 0) return -EFAULT; From 2485e5184452ccbda34ff83883883d9107800dff Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 5 Mar 2009 12:33:16 +0100 Subject: [PATCH 0092/2061] perfcounters: fix reserved bits sizing The sum of bits is 65 currently not 64 - so reduce the # of reserved bits from 55 to 54. Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c42455ab155..dde564517b6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -83,7 +83,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; From b0f3f28e0f14eb335f67bfaae33ce8b8d74fd58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 18:08:27 +0100 Subject: [PATCH 0093/2061] perfcounters: IRQ and NMI support on AMD CPUs The below completes the K7+ performance counter support: - IRQ support - NMI support KernelTop output works now as well. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput Cc: Paul Mackerras LKML-Reference: <1236273633.5187.286.camel@laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 272 ++++++++++++++++++++++++----- 1 file changed, 228 insertions(+), 44 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3b65f19a668..6ebe9abf6ae 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,6 +28,7 @@ static bool perf_counters_initialized __read_mostly; static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; static u64 counter_value_mask __read_mostly; +static int counter_value_bits __read_mostly; static int nr_counters_fixed __read_mostly; @@ -35,7 +36,9 @@ struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 global_enable; + u64 throttle_ctrl; + u64 active_mask; + int enabled; }; /* @@ -43,21 +46,28 @@ struct cpu_hw_counters { */ struct pmc_x86_ops { u64 (*save_disable_all)(void); - void (*restore_all)(u64 ctrl); + void (*restore_all)(u64); + u64 (*get_status)(u64); + void (*ack_status)(u64); + void (*enable)(int, u64); + void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; - int (*event_map)(int event); + u64 (*event_map)(int); + u64 (*raw_event)(u64); int max_events; }; static struct pmc_x86_ops *pmc_ops; -static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); +static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { + .enabled = 1, +}; /* * Intel PerfMon v3. Used on Core2 and later. */ -static const int intel_perfmon_event_map[] = +static const u64 intel_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x003c, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -68,15 +78,29 @@ static const int intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static int pmc_intel_event_map(int event) +static u64 pmc_intel_event_map(int event) { return intel_perfmon_event_map[event]; } +static u64 pmc_intel_raw_event(u64 event) +{ +#define CORE_EVNTSEL_EVENT_MASK 0x000000FF +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 + +#define CORE_EVNTSEL_MASK \ + (CORE_EVNTSEL_EVENT_MASK | \ + CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_COUNTER_MASK) + + return event & CORE_EVNTSEL_MASK; +} + /* * AMD Performance Monitor K7 and later. */ -static const int amd_perfmon_event_map[] = +static const u64 amd_perfmon_event_map[] = { [PERF_COUNT_CPU_CYCLES] = 0x0076, [PERF_COUNT_INSTRUCTIONS] = 0x00c0, @@ -86,11 +110,25 @@ static const int amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static int pmc_amd_event_map(int event) +static u64 pmc_amd_event_map(int event) { return amd_perfmon_event_map[event]; } +static u64 pmc_amd_raw_event(u64 event) +{ +#define K7_EVNTSEL_EVENT_MASK 0x7000000FF +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 + +#define K7_EVNTSEL_MASK \ + (K7_EVNTSEL_EVENT_MASK | \ + K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_COUNTER_MASK) + + return event & K7_EVNTSEL_MASK; +} + /* * Propagate counter elapsed time into the generic counter. * Can only be executed on the CPU where the counter is active. @@ -179,7 +217,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (hw_event->raw) { - hwc->config |= hw_event->type; + hwc->config |= pmc_ops->raw_event(hw_event->type); } else { if (hw_event->type >= pmc_ops->max_events) return -EINVAL; @@ -205,18 +243,24 @@ static u64 pmc_intel_save_disable_all(void) static u64 pmc_amd_save_disable_all(void) { - int idx; - u64 val, ctrl = 0; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + int enabled, idx; + + enabled = cpuc->enabled; + cpuc->enabled = 0; + barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) - ctrl |= (1 << idx); - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); + } } - return ctrl; + return enabled; } u64 hw_perf_save_disable(void) @@ -226,6 +270,9 @@ u64 hw_perf_save_disable(void) return pmc_ops->save_disable_all(); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); static void pmc_intel_restore_all(u64 ctrl) @@ -235,11 +282,18 @@ static void pmc_intel_restore_all(u64 ctrl) static void pmc_amd_restore_all(u64 ctrl) { - u64 val; + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; + cpuc->enabled = ctrl; + barrier(); + if (!ctrl) + return; + for (idx = 0; idx < nr_counters_generic; idx++) { - if (ctrl & (1 << idx)) { + if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + u64 val; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); val |= ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -254,8 +308,112 @@ void hw_perf_restore(u64 ctrl) pmc_ops->restore_all(ctrl); } +/* + * Exported because of ACPI idle + */ EXPORT_SYMBOL_GPL(hw_perf_restore); +static u64 pmc_intel_get_status(u64 mask) +{ + u64 status; + + rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + + return status; +} + +static u64 pmc_amd_get_status(u64 mask) +{ + u64 status = 0; + int idx; + + for (idx = 0; idx < nr_counters_generic; idx++) { + s64 val; + + if (!(mask & (1 << idx))) + continue; + + rdmsrl(MSR_K7_PERFCTR0 + idx, val); + val <<= (64 - counter_value_bits); + if (val >= 0) + status |= (1 << idx); + } + + return status; +} + +static u64 hw_perf_get_status(u64 mask) +{ + if (unlikely(!perf_counters_initialized)) + return 0; + + return pmc_ops->get_status(mask); +} + +static void pmc_intel_ack_status(u64 ack) +{ + wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); +} + +static void pmc_amd_ack_status(u64 ack) +{ +} + +static void hw_perf_ack_status(u64 ack) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->ack_status(ack); +} + +static void pmc_intel_enable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, + config | ARCH_PERFMON_EVENTSEL0_ENABLE); +} + +static void pmc_amd_enable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + set_bit(idx, (unsigned long *)&cpuc->active_mask); + if (cpuc->enabled) + config |= ARCH_PERFMON_EVENTSEL0_ENABLE; + + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); +} + +static void hw_perf_enable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->enable(idx, config); +} + +static void pmc_intel_disable(int idx, u64 config) +{ + wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); +} + +static void pmc_amd_disable(int idx, u64 config) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + clear_bit(idx, (unsigned long *)&cpuc->active_mask); + wrmsrl(MSR_K7_EVNTSEL0 + idx, config); + +} + +static void hw_perf_disable(int idx, u64 config) +{ + if (unlikely(!perf_counters_initialized)) + return; + + pmc_ops->disable(idx, config); +} + static inline void __pmc_fixed_disable(struct perf_counter *counter, struct hw_perf_counter *hwc, unsigned int __idx) @@ -278,7 +436,7 @@ __pmc_generic_disable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); else - wrmsr_safe(hwc->config_base + idx, hwc->config, 0); + hw_perf_disable(idx, hwc->config); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -354,8 +512,7 @@ __pmc_generic_enable(struct perf_counter *counter, if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); else - wrmsr(hwc->config_base + idx, - hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE, 0); + hw_perf_enable(idx, hwc->config); } static int @@ -567,22 +724,20 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static void __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + int ret = 0; - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + cpuc->throttle_ctrl = hw_perf_save_disable(); - /* Disable counters globally */ - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - ack_APIC_irq(); - - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (!status) goto out; + ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -618,12 +773,12 @@ again: } } - wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); + hw_perf_ack_status(ack); /* * Repeat if there is more work to be done: */ - rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); + status = hw_perf_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -631,32 +786,27 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); + + return ret; } void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - u64 global_enable; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return; - if (unlikely(!perf_counters_initialized)) return; - cpuc = &per_cpu(cpu_hw_counters, smp_processor_id()); + cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); + hw_perf_restore(cpuc->throttle_ctrl); } - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, global_enable); - if (unlikely(cpuc->global_enable && !global_enable)) - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, cpuc->global_enable); cpuc->interrupts = 0; } @@ -664,8 +814,8 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); + ack_APIC_irq(); __smp_perf_counter_interrupt(regs, 0); - irq_exit(); } @@ -722,16 +872,23 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; + int ret; - if (likely(cmd != DIE_NMI_IPI)) + switch (cmd) { + case DIE_NMI: + case DIE_NMI_IPI: + break; + + default: return NOTIFY_DONE; + } regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - __smp_perf_counter_interrupt(regs, 1); + ret = __smp_perf_counter_interrupt(regs, 1); - return NOTIFY_STOP; + return ret ? NOTIFY_STOP : NOTIFY_OK; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { @@ -743,18 +900,28 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct pmc_x86_ops pmc_intel_ops = { .save_disable_all = pmc_intel_save_disable_all, .restore_all = pmc_intel_restore_all, + .get_status = pmc_intel_get_status, + .ack_status = pmc_intel_ack_status, + .enable = pmc_intel_enable, + .disable = pmc_intel_disable, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, .event_map = pmc_intel_event_map, + .raw_event = pmc_intel_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; static struct pmc_x86_ops pmc_amd_ops = { .save_disable_all = pmc_amd_save_disable_all, .restore_all = pmc_amd_restore_all, + .get_status = pmc_amd_get_status, + .ack_status = pmc_amd_ack_status, + .enable = pmc_amd_enable, + .disable = pmc_amd_disable, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, .event_map = pmc_amd_event_map, + .raw_event = pmc_amd_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; @@ -787,8 +954,25 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { + u64 old; + int bits; + nr_counters_generic = 4; nr_counters_fixed = 0; + counter_value_mask = ~0ULL; + + rdmsrl(MSR_K7_PERFCTR0, old); + wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); + /* + * read the truncated mask + */ + rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); + wrmsrl(MSR_K7_PERFCTR0, old); + + bits = 32 + fls(counter_value_mask >> 32); + if (bits == 32) + bits = fls((u32)counter_value_mask); + counter_value_bits = bits; pr_info("AMD Performance Monitoring support detected.\n"); From b5e8acf66ff5db707c7e08df49fdf6b415878442 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 5 Mar 2009 20:34:21 +0100 Subject: [PATCH 0094/2061] perfcounters: IRQ and NMI support on AMD CPUs, fix The BKGD suggests that counter width on AMD CPUs is 48 for all existing models (it certainly is for mine). Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6ebe9abf6ae..f5853718d4d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -959,20 +959,8 @@ static struct pmc_x86_ops *pmc_amd_init(void) nr_counters_generic = 4; nr_counters_fixed = 0; - counter_value_mask = ~0ULL; - - rdmsrl(MSR_K7_PERFCTR0, old); - wrmsrl(MSR_K7_PERFCTR0, counter_value_mask); - /* - * read the truncated mask - */ - rdmsrl(MSR_K7_PERFCTR0, counter_value_mask); - wrmsrl(MSR_K7_PERFCTR0, old); - - bits = 32 + fls(counter_value_mask >> 32); - if (bits == 32) - bits = fls((u32)counter_value_mask); - counter_value_bits = bits; + counter_value_mask = 0x0000FFFFFFFFFFFFULL; + counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); From 86028598de16538f02519141756ccf4accfc29a6 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 5 Mar 2009 14:05:57 +1100 Subject: [PATCH 0095/2061] perfcounters/powerpc: fix oops with multiple counters in a group Impact: fix oops-causing bug This fixes a bug in the powerpc hw_perf_counter_init where the code didn't initialize ctrs[n] before passing the ctrs array to check_excludes, leading to possible oopses and other incorrect behaviour. This fixes it by initializing ctrs[n] correctly. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 112332d07fc..4fec112386f 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -633,6 +633,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; } events[n] = ev; + ctrs[n] = counter; if (check_excludes(ctrs, n, 1)) return NULL; if (power_check_constraints(events, n + 1)) From aabbaa6036fd847c583f585c6bae82b5a033e6c7 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 6 Mar 2009 16:27:10 +1100 Subject: [PATCH 0096/2061] perfcounters/powerpc: add support for POWER5+ processors Impact: more hardware support This adds the back-end for the PMU on the POWER5+ processors (i.e. GS, including GS DD3 aka POWER5++). This doesn't use the fixed-function PMC5 and PMC6 since they don't respect the freeze conditions and don't generate interrupts, as on POWER6. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 4 +- arch/powerpc/kernel/perf_counter.c | 4 + arch/powerpc/kernel/power5+-pmu.c | 452 +++++++++++++++++++++++++++++ 3 files changed, 458 insertions(+), 2 deletions(-) create mode 100644 arch/powerpc/kernel/power5+-pmu.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index b4c6f466164..49851e0d8fd 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,8 +94,8 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o power5-pmu.o \ - power6-pmu.o +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o \ + power5-pmu.o power5+-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 4fec112386f..162f3981fa2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -826,6 +826,7 @@ void hw_perf_counter_setup(int cpu) extern struct power_pmu ppc970_pmu; extern struct power_pmu power5_pmu; +extern struct power_pmu power5p_pmu; extern struct power_pmu power6_pmu; static int init_perf_counters(void) @@ -848,6 +849,9 @@ static int init_perf_counters(void) case PV_POWER5: ppmu = &power5_pmu; break; + case PV_POWER5p: + ppmu = &power5p_pmu; + break; case 0x3e: ppmu = &power6_pmu; break; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c new file mode 100644 index 00000000000..cec21ea65b0 --- /dev/null +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -0,0 +1,452 @@ +/* + * Performance counter support for POWER5 (not POWER5++) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for POWER5+ (POWER5 GS) and POWER5++ (POWER5 GS DD3) + */ +#define PM_PMC_SH 20 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_PMC_MSKS (PM_PMC_MSK << PM_PMC_SH) +#define PM_UNIT_SH 16 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_BYTE_SH 12 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 7 +#define PM_GRS_SH 8 /* Storage subsystem mux select */ +#define PM_GRS_MSK 7 +#define PM_BUSEVENT_MSK 0x80 /* Set if event uses event bus */ +#define PM_PMCSEL_MSK 0x7f + +/* Values in PM_UNIT field */ +#define PM_FPU 0 +#define PM_ISU0 1 +#define PM_IFU 2 +#define PM_ISU1 3 +#define PM_IDU 4 +#define PM_ISU0_ALT 6 +#define PM_GRS 7 +#define PM_LSU0 8 +#define PM_LSU1 0xc +#define PM_LASTUNIT 0xc + +/* + * Bits in MMCR1 for POWER5+ + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTM1SEL_SH 60 +#define MMCR1_TTM2SEL_SH 58 +#define MMCR1_TTM3SEL_SH 56 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 54 +#define MMCR1_TD_CP_DBG1SEL_SH 52 +#define MMCR1_TD_CP_DBG2SEL_SH 50 +#define MMCR1_TD_CP_DBG3SEL_SH 48 +#define MMCR1_GRS_L2SEL_SH 46 +#define MMCR1_GRS_L2SEL_MSK 3 +#define MMCR1_GRS_L3SEL_SH 44 +#define MMCR1_GRS_L3SEL_MSK 3 +#define MMCR1_GRS_MCSEL_SH 41 +#define MMCR1_GRS_MCSEL_MSK 7 +#define MMCR1_GRS_FABSEL_SH 39 +#define MMCR1_GRS_FABSEL_MSK 3 +#define MMCR1_PMC1_ADDER_SEL_SH 35 +#define MMCR1_PMC2_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC1SEL_SH 25 +#define MMCR1_PMC2SEL_SH 17 +#define MMCR1_PMC3SEL_SH 9 +#define MMCR1_PMC4SEL_SH 1 +#define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) +#define MMCR1_PMCSEL_MSK 0x7f + +/* + * Bits in MMCRA + */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> + * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 + * + * NC - number of counters + * 51: NC error 0x0008_0000_0000_0000 + * 48-50: number of events needing PMC1-4 0x0007_0000_0000_0000 + * + * G0..G3 - GRS mux constraints + * 46-47: GRS_L2SEL value + * 44-45: GRS_L3SEL value + * 41-44: GRS_MCSEL value + * 39-40: GRS_FABSEL value + * Note that these match up with their bit positions in MMCR1 + * + * T0 - TTM0 constraint + * 36-37: TTM0SEL value (0=FPU, 2=IFU, 3=ISU1) 0x30_0000_0000 + * + * T1 - TTM1 constraint + * 34-35: TTM1SEL value (0=IDU, 3=GRS) 0x0c_0000_0000 + * + * UC - unit constraint: can't have all three of FPU|IFU|ISU1, ISU0, IDU|GRS + * 33: UC3 error 0x02_0000_0000 + * 32: FPU|IFU|ISU1 events needed 0x01_0000_0000 + * 31: ISU0 events needed 0x01_8000_0000 + * 30: IDU|GRS events needed 0x00_4000_0000 + * + * B0 + * 20-23: Byte 0 event source 0x00f0_0000 + * Encoding as for the event code + * + * B1, B2, B3 + * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources + * + * P4 + * 7: P1 error 0x80 + * 6-7: Count of events needing PMC4 + * + * P1..P3 + * 0-6: Count of events needing PMC1..PMC3 + */ + +static const int grsel_shift[8] = { + MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, MMCR1_GRS_L2SEL_SH, + MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, MMCR1_GRS_L3SEL_SH, + MMCR1_GRS_MCSEL_SH, MMCR1_GRS_FABSEL_SH +}; + +/* Masks and values for using events from the various units */ +static u64 unit_cons[PM_LASTUNIT+1][2] = { + [PM_FPU] = { 0x3200000000ull, 0x0100000000ull }, + [PM_ISU0] = { 0x0200000000ull, 0x0080000000ull }, + [PM_ISU1] = { 0x3200000000ull, 0x3100000000ull }, + [PM_IFU] = { 0x3200000000ull, 0x2100000000ull }, + [PM_IDU] = { 0x0e00000000ull, 0x0040000000ull }, + [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, +}; + +static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, sh; + int bit, fmask; + u64 mask = 0, value = 0; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + } + if (event & PM_BUSEVENT_MSK) { + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + mask |= unit_cons[unit][0]; + value |= unit_cons[unit][1]; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + /* Map LSU1 low word (bytes 4-7) to unit LSU1+1 */ + ++unit; + byte &= 3; + } + if (unit == PM_GRS) { + bit = event & 7; + fmask = (bit == 6)? 7: 3; + sh = grsel_shift[bit]; + mask |= (u64)fmask << sh; + value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; + } + /* Set byte lane select field */ + mask |= 0xfULL << (20 - 4 * byte); + value |= (u64)unit << (20 - 4 * byte); + } + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; + *maskp = mask; + *valp = value; + return 0; +} + +#define MAX_ALT 3 /* at most 3 alternatives for any event */ + +static const unsigned int event_alternatives[][MAX_ALT] = { + { 0x100c0, 0x40001f }, /* PM_GCT_FULL_CYC */ + { 0x120e4, 0x400002 }, /* PM_GRP_DISP_REJECT */ + { 0x230e2, 0x323087 }, /* PM_BR_PRED_CR */ + { 0x230e3, 0x223087, 0x3230a0 }, /* PM_BR_PRED_TA */ + { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ + { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ + { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ + { 0x100009, 0x200009 }, /* PM_INST_CMPL */ + { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ + { 0x300009, 0x400009 }, /* PM_INST_DISP */ +}; + +/* + * Scan the alternatives table for a match and return the + * index into the alternatives table if found, else -1. + */ +static int find_alternative(unsigned int event) +{ + int i, j; + + for (i = 0; i < ARRAY_SIZE(event_alternatives); ++i) { + if (event < event_alternatives[i][0]) + break; + for (j = 0; j < MAX_ALT && event_alternatives[i][j]; ++j) + if (event == event_alternatives[i][j]) + return i; + } + return -1; +} + +static const unsigned char bytedecode_alternatives[4][4] = { + /* PMC 1 */ { 0x21, 0x23, 0x25, 0x27 }, + /* PMC 2 */ { 0x07, 0x17, 0x0e, 0x1e }, + /* PMC 3 */ { 0x20, 0x22, 0x24, 0x26 }, + /* PMC 4 */ { 0x07, 0x17, 0x0e, 0x1e } +}; + +/* + * Some direct events for decodes of event bus byte 3 have alternative + * PMCSEL values on other counters. This returns the alternative + * event code for those that do, or -1 otherwise. This also handles + * alternative PCMSEL values for add events. + */ +static int find_alternative_bdecode(unsigned int event) +{ + int pmc, altpmc, pp, j; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc == 0 || pmc > 4) + return -1; + altpmc = 5 - pmc; /* 1 <-> 4, 2 <-> 3 */ + pp = event & PM_PMCSEL_MSK; + for (j = 0; j < 4; ++j) { + if (bytedecode_alternatives[pmc - 1][j] == pp) { + return (event & ~(PM_PMC_MSKS | PM_PMCSEL_MSK)) | + (altpmc << PM_PMC_SH) | + bytedecode_alternatives[altpmc - 1][j]; + } + } + + /* new decode alternatives for power5+ */ + if (pmc == 1 && (pp == 0x0d || pp == 0x0e)) + return event + (2 << PM_PMC_SH) + (0x2e - 0x0d); + if (pmc == 3 && (pp == 0x2e || pp == 0x2f)) + return event - (2 << PM_PMC_SH) - (0x2e - 0x0d); + + /* alternative add event encodings */ + if (pp == 0x10 || pp == 0x28) + return ((event ^ (0x10 ^ 0x28)) & ~PM_PMC_MSKS) | + (altpmc << PM_PMC_SH); + + return -1; +} + +static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, ae, nalt = 1; + + alt[0] = event; + nalt = 1; + i = find_alternative(event); + if (i >= 0) { + for (j = 0; j < MAX_ALT; ++j) { + ae = event_alternatives[i][j]; + if (ae && ae != event) + alt[nalt++] = ae; + } + } else { + ae = find_alternative_bdecode(event); + if (ae > 0) + alt[nalt++] = ae; + } + return nalt; +} + +static int power5p_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr1 = 0; + unsigned int pmc, unit, byte, psel; + unsigned int ttm; + int i, isbus, bit, grsel; + unsigned int pmc_inuse = 0; + unsigned char busbyte[4]; + unsigned char unituse[16]; + int ttmuse; + + if (n_ev > 4) + return -1; + + /* First pass to count resource use */ + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 4) + return -1; + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + } + if (event[i] & PM_BUSEVENT_MSK) { + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit > PM_LASTUNIT) + return -1; + if (unit == PM_ISU0_ALT) + unit = PM_ISU0; + if (byte >= 4) { + if (unit != PM_LSU1) + return -1; + ++unit; + byte &= 3; + } + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + unituse[unit] = 1; + } + } + + /* + * Assign resources and set multiplexer selects. + * + * PM_ISU0 can go either on TTM0 or TTM1, but that's the only + * choice we have to deal with. + */ + if (unituse[PM_ISU0] & + (unituse[PM_FPU] | unituse[PM_IFU] | unituse[PM_ISU1])) { + unituse[PM_ISU0_ALT] = 1; /* move ISU to TTM1 */ + unituse[PM_ISU0] = 0; + } + /* Set TTM[01]SEL fields. */ + ttmuse = 0; + for (i = PM_FPU; i <= PM_ISU1; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)i << MMCR1_TTM0SEL_SH; + } + ttmuse = 0; + for (; i <= PM_GRS; ++i) { + if (!unituse[i]) + continue; + if (ttmuse++) + return -1; + mmcr1 |= (u64)(i & 3) << MMCR1_TTM1SEL_SH; + } + if (ttmuse > 1) + return -1; + + /* Set byte lane select fields, TTM[23]SEL and GRS_*SEL. */ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == PM_ISU0 && unituse[PM_ISU0_ALT]) { + /* get ISU0 through TTM1 rather than TTM0 */ + unit = PM_ISU0_ALT; + } else if (unit == PM_LSU1 + 1) { + /* select lower word of LSU1 for this byte */ + mmcr1 |= 1ull << (MMCR1_TTM3SEL_SH + 3 - byte); + } + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2 * byte); + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + isbus = event[i] & PM_BUSEVENT_MSK; + if (!pmc) { + /* Bus event or any-PMC direct event */ + for (pmc = 0; pmc < 4; ++pmc) { + if (!(pmc_inuse & (1 << pmc))) + break; + } + if (pmc >= 4) + return -1; + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (isbus && (byte & 2) && + (psel == 8 || psel == 0x10 || psel == 0x28)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } + if (isbus && unit == PM_GRS) { + bit = psel & 7; + grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; + mmcr1 |= (u64)grsel << grsel_shift[bit]; + } + if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) + /* select alternate byte lane */ + psel |= 0x10; + if (pmc <= 3) + mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); + hwc[i] = pmc; + } + + /* Return MMCRx values */ + mmcr[0] = 0; + if (pmc_inuse & 1) + mmcr[0] = MMCR0_PMC1CE; + if (pmc_inuse & 0x3e) + mmcr[0] |= MMCR0_PMCjCE; + mmcr[1] = mmcr1; + mmcr[2] = 0; + return 0; +} + +static void power5p_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + if (pmc <= 3) + mmcr[1] &= ~(0x7fUL << MMCR1_PMCSEL_SH(pmc)); +} + +static int power5p_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 0xf, + [PERF_COUNT_INSTRUCTIONS] = 0x100009, + [PERF_COUNT_CACHE_REFERENCES] = 0x1c10a8, /* LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c1088, /* LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x230e4, /* BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x230e5, /* BR_MPRED_CR */ +}; + +struct power_pmu power5p_pmu = { + .n_counter = 4, + .max_alternatives = MAX_ALT, + .add_fields = 0x7000000000055ull, + .test_adder = 0x3000040000000ull, + .compute_mmcr = power5p_compute_mmcr, + .get_constraint = power5p_get_constraint, + .get_alternatives = power5p_get_alternatives, + .disable_pmc = power5p_disable_pmc, + .n_generic = ARRAY_SIZE(power5p_generic_events), + .generic_events = power5p_generic_events, +}; From 880860e392d92c457e8116cdee39ec4d109174ee Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 6 Mar 2009 16:30:52 +1100 Subject: [PATCH 0097/2061] perfcounters/powerpc: add support for POWER4 processors Impact: more hardware support This adds the back-end for the PMU on the POWER4 and POWER4+ processors (GP and GQ). This is quite similar to the PPC970, with 8 PMCs, but has fewer events than the PPC970. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/Makefile | 2 +- arch/powerpc/kernel/perf_counter.c | 5 + arch/powerpc/kernel/power4-pmu.c | 557 +++++++++++++++++++++++++++++ 3 files changed, 563 insertions(+), 1 deletion(-) create mode 100644 arch/powerpc/kernel/power4-pmu.c diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile index 49851e0d8fd..8e5e2c74971 100644 --- a/arch/powerpc/kernel/Makefile +++ b/arch/powerpc/kernel/Makefile @@ -94,7 +94,7 @@ obj-$(CONFIG_AUDIT) += audit.o obj64-$(CONFIG_AUDIT) += compat_audit.o obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o -obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o ppc970-pmu.o \ +obj-$(CONFIG_PERF_COUNTERS) += perf_counter.o power4-pmu.o ppc970-pmu.o \ power5-pmu.o power5+-pmu.o power6-pmu.o obj-$(CONFIG_8XX_MINIMAL_FPEMU) += softemu8xx.o diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 162f3981fa2..0e33d27cd46 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -824,6 +824,7 @@ void hw_perf_counter_setup(int cpu) cpuhw->mmcr[0] = MMCR0_FC; } +extern struct power_pmu power4_pmu; extern struct power_pmu ppc970_pmu; extern struct power_pmu power5_pmu; extern struct power_pmu power5p_pmu; @@ -841,6 +842,10 @@ static int init_perf_counters(void) /* XXX should get this from cputable */ pvr = mfspr(SPRN_PVR); switch (PVR_VER(pvr)) { + case PV_POWER4: + case PV_POWER4p: + ppmu = &power4_pmu; + break; case PV_970: case PV_970FX: case PV_970MP: diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c new file mode 100644 index 00000000000..1407b19ab61 --- /dev/null +++ b/arch/powerpc/kernel/power4-pmu.c @@ -0,0 +1,557 @@ +/* + * Performance counter support for POWER4 (GP) and POWER4+ (GQ) processors. + * + * Copyright 2009 Paul Mackerras, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include +#include +#include + +/* + * Bits in event code for POWER4 + */ +#define PM_PMC_SH 12 /* PMC number (1-based) for direct events */ +#define PM_PMC_MSK 0xf +#define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ +#define PM_UNIT_MSK 0xf +#define PM_LOWER_SH 6 +#define PM_LOWER_MSK 1 +#define PM_LOWER_MSKS 0x40 +#define PM_BYTE_SH 4 /* Byte number of event bus to use */ +#define PM_BYTE_MSK 3 +#define PM_PMCSEL_MSK 7 + +/* + * Unit code values + */ +#define PM_FPU 1 +#define PM_ISU1 2 +#define PM_IFU 3 +#define PM_IDU0 4 +#define PM_ISU1_ALT 6 +#define PM_ISU2 7 +#define PM_IFU_ALT 8 +#define PM_LSU0 9 +#define PM_LSU1 0xc +#define PM_GPS 0xf + +/* + * Bits in MMCR0 for POWER4 + */ +#define MMCR0_PMC1SEL_SH 8 +#define MMCR0_PMC2SEL_SH 1 +#define MMCR_PMCSEL_MSK 0x1f + +/* + * Bits in MMCR1 for POWER4 + */ +#define MMCR1_TTM0SEL_SH 62 +#define MMCR1_TTC0SEL_SH 61 +#define MMCR1_TTM1SEL_SH 59 +#define MMCR1_TTC1SEL_SH 58 +#define MMCR1_TTM2SEL_SH 56 +#define MMCR1_TTC2SEL_SH 55 +#define MMCR1_TTM3SEL_SH 53 +#define MMCR1_TTC3SEL_SH 52 +#define MMCR1_TTMSEL_MSK 3 +#define MMCR1_TD_CP_DBG0SEL_SH 50 +#define MMCR1_TD_CP_DBG1SEL_SH 48 +#define MMCR1_TD_CP_DBG2SEL_SH 46 +#define MMCR1_TD_CP_DBG3SEL_SH 44 +#define MMCR1_DEBUG0SEL_SH 43 +#define MMCR1_DEBUG1SEL_SH 42 +#define MMCR1_DEBUG2SEL_SH 41 +#define MMCR1_DEBUG3SEL_SH 40 +#define MMCR1_PMC1_ADDER_SEL_SH 39 +#define MMCR1_PMC2_ADDER_SEL_SH 38 +#define MMCR1_PMC6_ADDER_SEL_SH 37 +#define MMCR1_PMC5_ADDER_SEL_SH 36 +#define MMCR1_PMC8_ADDER_SEL_SH 35 +#define MMCR1_PMC7_ADDER_SEL_SH 34 +#define MMCR1_PMC3_ADDER_SEL_SH 33 +#define MMCR1_PMC4_ADDER_SEL_SH 32 +#define MMCR1_PMC3SEL_SH 27 +#define MMCR1_PMC4SEL_SH 22 +#define MMCR1_PMC5SEL_SH 17 +#define MMCR1_PMC6SEL_SH 12 +#define MMCR1_PMC7SEL_SH 7 +#define MMCR1_PMC8SEL_SH 2 /* note bit 0 is in MMCRA for GP */ + +static short mmcr1_adder_bits[8] = { + MMCR1_PMC1_ADDER_SEL_SH, + MMCR1_PMC2_ADDER_SEL_SH, + MMCR1_PMC3_ADDER_SEL_SH, + MMCR1_PMC4_ADDER_SEL_SH, + MMCR1_PMC5_ADDER_SEL_SH, + MMCR1_PMC6_ADDER_SEL_SH, + MMCR1_PMC7_ADDER_SEL_SH, + MMCR1_PMC8_ADDER_SEL_SH +}; + +/* + * Bits in MMCRA + */ +#define MMCRA_PMC8SEL0_SH 17 /* PMC8SEL bit 0 for GP */ + +/* + * Layout of constraint bits: + * 6666555555555544444444443333333333222222222211111111110000000000 + * 3210987654321098765432109876543210987654321098765432109876543210 + * |[ >[ >[ >|||[ >[ >< >< >< >< ><><><><><><><><> + * | UC1 UC2 UC3 ||| PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * \SMPL ||\TTC3SEL + * |\TTC_IFU_SEL + * \TTM2SEL0 + * + * SMPL - SAMPLE_ENABLE constraint + * 56: SAMPLE_ENABLE value 0x0100_0000_0000_0000 + * + * UC1 - unit constraint 1: can't have all three of FPU/ISU1/IDU0|ISU2 + * 55: UC1 error 0x0080_0000_0000_0000 + * 54: FPU events needed 0x0040_0000_0000_0000 + * 53: ISU1 events needed 0x0020_0000_0000_0000 + * 52: IDU0|ISU2 events needed 0x0010_0000_0000_0000 + * + * UC2 - unit constraint 2: can't have all three of FPU/IFU/LSU0 + * 51: UC2 error 0x0008_0000_0000_0000 + * 50: FPU events needed 0x0004_0000_0000_0000 + * 49: IFU events needed 0x0002_0000_0000_0000 + * 48: LSU0 events needed 0x0001_0000_0000_0000 + * + * UC3 - unit constraint 3: can't have all four of LSU0/IFU/IDU0|ISU2/ISU1 + * 47: UC3 error 0x8000_0000_0000 + * 46: LSU0 events needed 0x4000_0000_0000 + * 45: IFU events needed 0x2000_0000_0000 + * 44: IDU0|ISU2 events needed 0x1000_0000_0000 + * 43: ISU1 events needed 0x0800_0000_0000 + * + * TTM2SEL0 + * 42: 0 = IDU0 events needed + * 1 = ISU2 events needed 0x0400_0000_0000 + * + * TTC_IFU_SEL + * 41: 0 = IFU.U events needed + * 1 = IFU.L events needed 0x0200_0000_0000 + * + * TTC3SEL + * 40: 0 = LSU1.U events needed + * 1 = LSU1.L events needed 0x0100_0000_0000 + * + * PS1 + * 39: PS1 error 0x0080_0000_0000 + * 36-38: count of events needing PMC1/2/5/6 0x0070_0000_0000 + * + * PS2 + * 35: PS2 error 0x0008_0000_0000 + * 32-34: count of events needing PMC3/4/7/8 0x0007_0000_0000 + * + * B0 + * 28-31: Byte 0 event source 0xf000_0000 + * 1 = FPU + * 2 = ISU1 + * 3 = IFU + * 4 = IDU0 + * 7 = ISU2 + * 9 = LSU0 + * c = LSU1 + * f = GPS + * + * B1, B2, B3 + * 24-27, 20-23, 16-19: Byte 1, 2, 3 event sources + * + * P8 + * 15: P8 error 0x8000 + * 14-15: Count of events needing PMC8 + * + * P1..P7 + * 0-13: Count of events needing PMC1..PMC7 + * + * Note: this doesn't allow events using IFU.U to be combined with events + * using IFU.L, though that is feasible (using TTM0 and TTM2). However + * there are no listed events for IFU.L (they are debug events not + * verified for performance monitoring) so this shouldn't cause a + * problem. + */ + +static struct unitinfo { + u64 value, mask; + int unit; + int lowerbit; +} p4_unitinfo[16] = { + [PM_FPU] = { 0x44000000000000ull, 0x88000000000000ull, PM_FPU, 0 }, + [PM_ISU1] = { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, + [PM_ISU1_ALT] = + { 0x20080000000000ull, 0x88000000000000ull, PM_ISU1, 0 }, + [PM_IFU] = { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, + [PM_IFU_ALT] = + { 0x02200000000000ull, 0x08820000000000ull, PM_IFU, 41 }, + [PM_IDU0] = { 0x10100000000000ull, 0x80840000000000ull, PM_IDU0, 1 }, + [PM_ISU2] = { 0x10140000000000ull, 0x80840000000000ull, PM_ISU2, 0 }, + [PM_LSU0] = { 0x01400000000000ull, 0x08800000000000ull, PM_LSU0, 0 }, + [PM_LSU1] = { 0x00000000000000ull, 0x00010000000000ull, PM_LSU1, 40 }, + [PM_GPS] = { 0x00000000000000ull, 0x00000000000000ull, PM_GPS, 0 } +}; + +static unsigned char direct_marked_event[8] = { + (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ + (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ + (1<<3), /* PMC3: PM_MRK_ST_CMPL_INT */ + (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ + (1<<4) | (1<<5), /* PMC5: PM_MRK_GRP_TIMEO */ + (1<<3) | (1<<4) | (1<<5), + /* PMC6: PM_MRK_ST_GPS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ + (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ + (1<<4), /* PMC8: PM_MRK_LSU_FIN */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int p4_marked_instr_event(unsigned int event) +{ + int pmc, psel, unit, byte, bit; + unsigned int mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc) { + if (direct_marked_event[pmc - 1] & (1 << psel)) + return 1; + if (psel == 0) /* add events */ + bit = (pmc <= 4)? pmc - 1: 8 - pmc; + else if (psel == 6) /* decode events */ + bit = 4; + else + return 0; + } else + bit = psel; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = 0; + switch (unit) { + case PM_LSU1: + if (event & PM_LOWER_MSKS) + mask = 1 << 28; /* byte 7 bit 4 */ + else + mask = 6 << 24; /* byte 3 bits 1 and 2 */ + break; + case PM_LSU0: + /* byte 3, bit 3; byte 2 bits 0,2,3,4,5; byte 1 */ + mask = 0x083dff00; + } + return (mask >> (byte * 8 + bit)) & 1; +} + +static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +{ + int pmc, byte, unit, lower, sh; + u64 mask = 0, value = 0; + int grp = -1; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc > 8) + return -1; + sh = (pmc - 1) * 2; + mask |= 2 << sh; + value |= 1 << sh; + grp = ((pmc - 1) >> 1) & 1; + } + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + if (unit) { + lower = (event >> PM_LOWER_SH) & PM_LOWER_MSK; + + /* + * Bus events on bytes 0 and 2 can be counted + * on PMC1/2/5/6; bytes 1 and 3 on PMC3/4/7/8. + */ + if (!pmc) + grp = byte & 1; + + if (!p4_unitinfo[unit].unit) + return -1; + mask |= p4_unitinfo[unit].mask; + value |= p4_unitinfo[unit].value; + sh = p4_unitinfo[unit].lowerbit; + if (sh > 1) + value |= (u64)lower << sh; + else if (lower != sh) + return -1; + unit = p4_unitinfo[unit].unit; + + /* Set byte lane select field */ + mask |= 0xfULL << (28 - 4 * byte); + value |= (u64)unit << (28 - 4 * byte); + } + if (grp == 0) { + /* increment PMC1/2/5/6 field */ + mask |= 0x8000000000ull; + value |= 0x1000000000ull; + } else { + /* increment PMC3/4/7/8 field */ + mask |= 0x800000000ull; + value |= 0x100000000ull; + } + + /* Marked instruction events need sample_enable set */ + if (p4_marked_instr_event(event)) { + mask |= 1ull << 56; + value |= 1ull << 56; + } + + /* PMCSEL=6 decode events on byte 2 need sample_enable clear */ + if (pmc && (event & PM_PMCSEL_MSK) == 6 && byte == 2) + mask |= 1ull << 56; + + *maskp = mask; + *valp = value; + return 0; +} + +static unsigned int ppc_inst_cmpl[] = { + 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 +}; + +static int p4_get_alternatives(unsigned int event, unsigned int alt[]) +{ + int i, j, na; + + alt[0] = event; + na = 1; + + /* 2 possibilities for PM_GRP_DISP_REJECT */ + if (event == 0x8003 || event == 0x0224) { + alt[1] = event ^ (0x8003 ^ 0x0224); + return 2; + } + + /* 2 possibilities for PM_ST_MISS_L1 */ + if (event == 0x0c13 || event == 0x0c23) { + alt[1] = event ^ (0x0c13 ^ 0x0c23); + return 2; + } + + /* several possibilities for PM_INST_CMPL */ + for (i = 0; i < ARRAY_SIZE(ppc_inst_cmpl); ++i) { + if (event == ppc_inst_cmpl[i]) { + for (j = 0; j < ARRAY_SIZE(ppc_inst_cmpl); ++j) + if (j != i) + alt[na++] = ppc_inst_cmpl[j]; + break; + } + } + + return na; +} + +static int p4_compute_mmcr(unsigned int event[], int n_ev, + unsigned int hwc[], u64 mmcr[]) +{ + u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; + unsigned int pmc, unit, byte, psel, lower; + unsigned int ttm, grp; + unsigned int pmc_inuse = 0; + unsigned int pmc_grp_use[2]; + unsigned char busbyte[4]; + unsigned char unituse[16]; + unsigned int unitlower = 0; + int i; + + if (n_ev > 8) + return -1; + + /* First pass to count resource use */ + pmc_grp_use[0] = pmc_grp_use[1] = 0; + memset(busbyte, 0, sizeof(busbyte)); + memset(unituse, 0, sizeof(unituse)); + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + if (pmc) { + if (pmc_inuse & (1 << (pmc - 1))) + return -1; + pmc_inuse |= 1 << (pmc - 1); + /* count 1/2/5/6 vs 3/4/7/8 use */ + ++pmc_grp_use[((pmc - 1) >> 1) & 1]; + } + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + lower = (event[i] >> PM_LOWER_SH) & PM_LOWER_MSK; + if (unit) { + if (!pmc) + ++pmc_grp_use[byte & 1]; + if (unit == 6 || unit == 8) + /* map alt ISU1/IFU codes: 6->2, 8->3 */ + unit = (unit >> 1) - 1; + if (busbyte[byte] && busbyte[byte] != unit) + return -1; + busbyte[byte] = unit; + lower <<= unit; + if (unituse[unit] && lower != (unitlower & lower)) + return -1; + unituse[unit] = 1; + unitlower |= lower; + } + } + if (pmc_grp_use[0] > 4 || pmc_grp_use[1] > 4) + return -1; + + /* + * Assign resources and set multiplexer selects. + * + * Units 1,2,3 are on TTM0, 4,6,7 on TTM1, 8,10 on TTM2. + * Each TTMx can only select one unit, but since + * units 2 and 6 are both ISU1, and 3 and 8 are both IFU, + * we have some choices. + */ + if (unituse[2] & (unituse[1] | (unituse[3] & unituse[9]))) { + unituse[6] = 1; /* Move 2 to 6 */ + unituse[2] = 0; + } + if (unituse[3] & (unituse[1] | unituse[2])) { + unituse[8] = 1; /* Move 3 to 8 */ + unituse[3] = 0; + unitlower = (unitlower & ~8) | ((unitlower & 8) << 5); + } + /* Check only one unit per TTMx */ + if (unituse[1] + unituse[2] + unituse[3] > 1 || + unituse[4] + unituse[6] + unituse[7] > 1 || + unituse[8] + unituse[9] > 1 || + (unituse[5] | unituse[10] | unituse[11] | + unituse[13] | unituse[14])) + return -1; + + /* Set TTMxSEL fields. Note, units 1-3 => TTM0SEL codes 0-2 */ + mmcr1 |= (u64)(unituse[3] * 2 + unituse[2]) << MMCR1_TTM0SEL_SH; + mmcr1 |= (u64)(unituse[7] * 3 + unituse[6] * 2) << MMCR1_TTM1SEL_SH; + mmcr1 |= (u64)unituse[9] << MMCR1_TTM2SEL_SH; + + /* Set TTCxSEL fields. */ + if (unitlower & 0xe) + mmcr1 |= 1ull << MMCR1_TTC0SEL_SH; + if (unitlower & 0xf0) + mmcr1 |= 1ull << MMCR1_TTC1SEL_SH; + if (unitlower & 0xf00) + mmcr1 |= 1ull << MMCR1_TTC2SEL_SH; + if (unitlower & 0x7000) + mmcr1 |= 1ull << MMCR1_TTC3SEL_SH; + + /* Set byte lane select fields. */ + for (byte = 0; byte < 4; ++byte) { + unit = busbyte[byte]; + if (!unit) + continue; + if (unit == 0xf) { + /* special case for GPS */ + mmcr1 |= 1ull << (MMCR1_DEBUG0SEL_SH - byte); + } else { + if (!unituse[unit]) + ttm = unit - 1; /* 2->1, 3->2 */ + else + ttm = unit >> 2; + mmcr1 |= (u64)ttm << (MMCR1_TD_CP_DBG0SEL_SH - 2*byte); + } + } + + /* Second pass: assign PMCs, set PMCxSEL and PMCx_ADDER_SEL fields */ + for (i = 0; i < n_ev; ++i) { + pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; + unit = (event[i] >> PM_UNIT_SH) & PM_UNIT_MSK; + byte = (event[i] >> PM_BYTE_SH) & PM_BYTE_MSK; + psel = event[i] & PM_PMCSEL_MSK; + if (!pmc) { + /* Bus event or 00xxx direct event (off or cycles) */ + if (unit) + psel |= 0x10 | ((byte & 2) << 2); + for (pmc = 0; pmc < 8; ++pmc) { + if (pmc_inuse & (1 << pmc)) + continue; + grp = (pmc >> 1) & 1; + if (unit) { + if (grp == (byte & 1)) + break; + } else if (pmc_grp_use[grp] < 4) { + ++pmc_grp_use[grp]; + break; + } + } + pmc_inuse |= 1 << pmc; + } else { + /* Direct event */ + --pmc; + if (psel == 0 && (byte & 2)) + /* add events on higher-numbered bus */ + mmcr1 |= 1ull << mmcr1_adder_bits[pmc]; + else if (psel == 6 && byte == 3) + /* seem to need to set sample_enable here */ + mmcra |= MMCRA_SAMPLE_ENABLE; + psel |= 8; + } + if (pmc <= 1) + mmcr0 |= psel << (MMCR0_PMC1SEL_SH - 7 * pmc); + else + mmcr1 |= psel << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2)); + if (pmc == 7) /* PMC8 */ + mmcra |= (psel & 1) << MMCRA_PMC8SEL0_SH; + hwc[i] = pmc; + if (p4_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; + } + + if (pmc_inuse & 1) + mmcr0 |= MMCR0_PMC1CE; + if (pmc_inuse & 0xfe) + mmcr0 |= MMCR0_PMCjCE; + + mmcra |= 0x2000; /* mark only one IOP per PPC instruction */ + + /* Return MMCRx values */ + mmcr[0] = mmcr0; + mmcr[1] = mmcr1; + mmcr[2] = mmcra; + return 0; +} + +static void p4_disable_pmc(unsigned int pmc, u64 mmcr[]) +{ + /* + * Setting the PMCxSEL field to 0 disables PMC x. + * (Note that pmc is 0-based here, not 1-based.) + */ + if (pmc <= 1) { + mmcr[0] &= ~(0x1fUL << (MMCR0_PMC1SEL_SH - 7 * pmc)); + } else { + mmcr[1] &= ~(0x1fUL << (MMCR1_PMC3SEL_SH - 5 * (pmc - 2))); + if (pmc == 7) + mmcr[2] &= ~(1UL << MMCRA_PMC8SEL0_SH); + } +} + +static int p4_generic_events[] = { + [PERF_COUNT_CPU_CYCLES] = 7, + [PERF_COUNT_INSTRUCTIONS] = 0x1001, + [PERF_COUNT_CACHE_REFERENCES] = 0x8c10, /* PM_LD_REF_L1 */ + [PERF_COUNT_CACHE_MISSES] = 0x3c10, /* PM_LD_MISS_L1 */ + [PERF_COUNT_BRANCH_INSTRUCTIONS] = 0x330, /* PM_BR_ISSUED */ + [PERF_COUNT_BRANCH_MISSES] = 0x331, /* PM_BR_MPRED_CR */ +}; + +struct power_pmu power4_pmu = { + .n_counter = 8, + .max_alternatives = 5, + .add_fields = 0x0000001100005555ull, + .test_adder = 0x0011083300000000ull, + .compute_mmcr = p4_compute_mmcr, + .get_constraint = p4_get_constraint, + .get_alternatives = p4_get_alternatives, + .disable_pmc = p4_disable_pmc, + .n_generic = ARRAY_SIZE(p4_generic_events), + .generic_events = p4_generic_events, +}; From 184fe4ab1f2e4dfa45584889bb3820031648386b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sun, 8 Mar 2009 11:34:19 +0100 Subject: [PATCH 0098/2061] x86: perf_counter cleanup Use and actual unsigned long bitmap instead of casting our way around. Signed-off-by: Peter Zijlstra Cc: Jaswinder Singh Rajput LKML-Reference: <1236508459.22914.3645.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f5853718d4d..1df421042b2 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -37,7 +37,7 @@ struct cpu_hw_counters { unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - u64 active_mask; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -291,7 +291,7 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, (unsigned long *)&cpuc->active_mask)) { + if (test_bit(idx, cpuc->active_mask)) { u64 val; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); @@ -377,7 +377,7 @@ static void pmc_amd_enable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, (unsigned long *)&cpuc->active_mask); + set_bit(idx, cpuc->active_mask); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -401,7 +401,7 @@ static void pmc_amd_disable(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, (unsigned long *)&cpuc->active_mask); + clear_bit(idx, cpuc->active_mask); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } From e255357764f92afcafafbd4879b222b8c752065a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 8 Mar 2009 17:09:49 +0530 Subject: [PATCH 0099/2061] x86: perf_counter cleanup Remove unused variables and duplicate header file. Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1df421042b2..155bc3c239b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -17,7 +17,6 @@ #include #include -#include #include static bool perf_counters_initialized __read_mostly; @@ -954,9 +953,6 @@ static struct pmc_x86_ops *pmc_intel_init(void) static struct pmc_x86_ops *pmc_amd_init(void) { - u64 old; - int bits; - nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; From bc44fb5f7d3e764ed7698c835a1a0f35aba2eb3d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:42:18 +0100 Subject: [PATCH 0100/2061] x86, bts: detect size of DS fields Impact: more robust DS feature enumeration Detect the size of the pointer-type fields in the DS area configuration via the DTES64 features rather than based on the cpuid. Rename a variable to denote that size to reflect that it only covers the pointer-type fields. Add more boot-time diagnostics giving the detected size and the sizes of BTS and PEBS records. Use the size of the BTS/PEBS record to indicate that the respective feature is not available (if the record size is zero). Signed-off-by: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 84 +++++++++++++++++++++++--------------------- 1 file changed, 44 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 87b67e3a765..6e5ec679a0c 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -39,7 +39,7 @@ struct ds_configuration { /* the size of one pointer-typed field in the DS structure and in the BTS and PEBS buffers in bytes; this covers the first 8 DS fields related to buffer management. */ - unsigned char sizeof_field; + unsigned char sizeof_ptr_field; /* the size of a BTS/PEBS record in bytes */ unsigned char sizeof_rec[2]; /* a series of bit-masks to control various features indexed @@ -142,14 +142,14 @@ enum ds_qualifier { static inline unsigned long ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } static inline void ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, unsigned long value) { - base += (ds_cfg.sizeof_field * (field + (4 * qual))); + base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; } @@ -410,7 +410,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, * Later architectures use 64bit pointers throughout, whereas earlier * architectures use 32bit pointers in 32bit mode. * - * We compute the base address for the first 8 fields based on: + * We compute the base address for the fields based on: * - the field size stored in the DS configuration * - the relative field position * @@ -441,13 +441,13 @@ enum bts_field { static inline unsigned long bts_get(const char *base, enum bts_field field) { - base += (ds_cfg.sizeof_field * field); + base += (ds_cfg.sizeof_ptr_field * field); return *(unsigned long *)base; } static inline void bts_set(char *base, enum bts_field field, unsigned long val) { - base += (ds_cfg.sizeof_field * field);; + base += (ds_cfg.sizeof_ptr_field * field);; (*(unsigned long *)base) = val; } @@ -593,6 +593,10 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, struct ds_context *context; int error; + error = -EOPNOTSUPP; + if (!ds_cfg.sizeof_rec[qual]) + goto out; + error = -EINVAL; if (!base) goto out; @@ -635,10 +639,6 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - error = -EOPNOTSUPP; - if (!ds_cfg.ctl[dsf_bts]) - goto out; - /* buffer overflow notification is not yet implemented */ error = -EOPNOTSUPP; if (ovfl) @@ -848,7 +848,8 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)); + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)); return &tracer->trace; } @@ -884,7 +885,8 @@ int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) if (!tracer) return -EINVAL; - *(u64 *)(tracer->ds.context->ds + (ds_cfg.sizeof_field * 8)) = value; + *(u64 *)(tracer->ds.context->ds + + (ds_cfg.sizeof_ptr_field * 8)) = value; return 0; } @@ -894,52 +896,54 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), - - .sizeof_field = sizeof(long), - .sizeof_rec[ds_bts] = sizeof(long) * 3, -#ifdef __i386__ - .sizeof_rec[ds_pebs] = sizeof(long) * 10, -#else - .sizeof_rec[ds_pebs] = sizeof(long) * 18, -#endif }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), - - .sizeof_field = 8, - .sizeof_rec[ds_bts] = 8 * 3, - .sizeof_rec[ds_pebs] = 8 * 18, }; static void -ds_configure(const struct ds_configuration *cfg) +ds_configure(const struct ds_configuration *cfg, + struct cpuinfo_x86 *cpu) { + unsigned long nr_pebs_fields = 0; + + printk(KERN_INFO "[ds] using %s configuration\n", cfg->name); + +#ifdef __i386__ + nr_pebs_fields = 10; +#else + nr_pebs_fields = 18; +#endif + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; - printk(KERN_INFO "[ds] using %s configuration\n", ds_cfg.name); + ds_cfg.sizeof_ptr_field = + (cpu_has(cpu, X86_FEATURE_DTES64) ? 8 : 4); - if (!cpu_has_bts) { - ds_cfg.ctl[dsf_bts] = 0; + ds_cfg.sizeof_rec[ds_bts] = ds_cfg.sizeof_ptr_field * 3; + ds_cfg.sizeof_rec[ds_pebs] = ds_cfg.sizeof_ptr_field * nr_pebs_fields; + + if (!cpu_has(cpu, X86_FEATURE_BTS)) { + ds_cfg.sizeof_rec[ds_bts] = 0; printk(KERN_INFO "[ds] bts not available\n"); } - if (!cpu_has_pebs) + if (!cpu_has(cpu, X86_FEATURE_PEBS)) { + ds_cfg.sizeof_rec[ds_pebs] = 0; printk(KERN_INFO "[ds] pebs not available\n"); + } + + printk(KERN_INFO "[ds] sizes: address: %u bit, ", + 8 * ds_cfg.sizeof_ptr_field); + printk("bts/pebs record: %u/%u bytes\n", + ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); } @@ -951,12 +955,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) switch (c->x86_model) { case 0x9: case 0xd: /* Pentium M */ - ds_configure(&ds_cfg_pentium_m); + ds_configure(&ds_cfg_pentium_m, c); break; case 0xf: case 0x17: /* Core2 */ case 0x1c: /* Atom */ - ds_configure(&ds_cfg_core2_atom); + ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* i7 */ default: @@ -969,7 +973,7 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x0: case 0x1: case 0x2: /* Netburst */ - ds_configure(&ds_cfg_netburst); + ds_configure(&ds_cfg_netburst, c); break; default: /* sorry, don't know about them */ From 8a327f6d1b05f5ce16572b4413a5df1d0e872283 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:45:07 +0100 Subject: [PATCH 0101/2061] x86, bts: add selftest for BTS Perform a selftest of branch trace store when a cpu is initialized. WARN and disable branch trace store support if the selftest fails. Signed-off-by: Markus Metzger LKML-Reference: <20090313104507.A30125@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 9 ++ arch/x86/kernel/Makefile | 1 + arch/x86/kernel/ds.c | 21 +++ arch/x86/kernel/ds_selftest.c | 241 ++++++++++++++++++++++++++++++++++ arch/x86/kernel/ds_selftest.h | 15 +++ 5 files changed, 287 insertions(+) create mode 100644 arch/x86/kernel/ds_selftest.c create mode 100644 arch/x86/kernel/ds_selftest.h diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index fdb45df608b..dfd74abc03f 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -175,6 +175,15 @@ config IOMMU_LEAK Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. +config X86_DS_SELFTEST + bool "DS selftest" + default y + depends on DEBUG_KERNEL + depends on X86_DS + ---help--- + Perform Debug Store selftests at boot time. + If in doubt, say "N". + config HAVE_MMIOTRACE_SUPPORT def_bool y diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index 339ce35648e..a0c9e138b00 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -44,6 +44,7 @@ obj-y += process.o obj-y += i387.o xsave.o obj-y += ptrace.o obj-$(CONFIG_X86_DS) += ds.o +obj-$(CONFIG_X86_DS_SELFTEST) += ds_selftest.o obj-$(CONFIG_X86_32) += tls.o obj-$(CONFIG_IA32_EMULATION) += tls.o obj-y += step.o diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 6e5ec679a0c..51c936c1a39 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -29,6 +29,7 @@ #include #include +#include "ds_selftest.h" /* * The configuration for a particular DS hardware implementation. @@ -940,6 +941,26 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c new file mode 100644 index 00000000000..8c46fbf38c4 --- /dev/null +++ b/arch/x86/kernel/ds_selftest.c @@ -0,0 +1,241 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#include "ds_selftest.h" + +#include +#include + +#include + + +#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ + + +static int ds_selftest_bts_consistency(const struct bts_trace *trace) +{ + int error = 0; + + if (!trace) { + printk(KERN_CONT "failed to access trace..."); + /* Bail out. Other tests are pointless. */ + return -1; + } + + if (!trace->read) { + printk(KERN_CONT "bts read not available..."); + error = -1; + } + + /* Do some sanity checks on the trace configuration. */ + if (!trace->ds.n) { + printk(KERN_CONT "empty bts buffer..."); + error = -1; + } + if (!trace->ds.size) { + printk(KERN_CONT "bad bts trace setup..."); + error = -1; + } + if (trace->ds.end != + (char *)trace->ds.begin + (trace->ds.n * trace->ds.size)) { + printk(KERN_CONT "bad bts buffer setup..."); + error = -1; + } + if ((trace->ds.top < trace->ds.begin) || + (trace->ds.end <= trace->ds.top)) { + printk(KERN_CONT "bts top out of bounds..."); + error = -1; + } + + return error; +} + +static int ds_selftest_bts_read(struct bts_tracer *tracer, + const struct bts_trace *trace, + const void *from, const void *to) +{ + const unsigned char *at; + + /* + * Check a few things which do not belong to this test. + * They should be covered by other tests. + */ + if (!trace) + return -1; + + if (!trace->read) + return -1; + + if (to < from) + return -1; + + if (from < trace->ds.begin) + return -1; + + if (trace->ds.end < to) + return -1; + + if (!trace->ds.size) + return -1; + + /* Now to the test itself. */ + for (at = from; (void *)at < to; at += trace->ds.size) { + struct bts_struct bts; + size_t index; + int error; + + if (((void *)at - trace->ds.begin) % trace->ds.size) { + printk(KERN_CONT + "read from non-integer index..."); + return -1; + } + index = ((void *)at - trace->ds.begin) / trace->ds.size; + + memset(&bts, 0, sizeof(bts)); + error = trace->read(tracer, at, &bts); + if (error < 0) { + printk(KERN_CONT + "error reading bts trace at [%lu] (0x%p)...", + index, at); + return error; + } + + switch (bts.qualifier) { + case BTS_BRANCH: + break; + default: + printk(KERN_CONT + "unexpected bts entry %llu at [%lu] (0x%p)...", + bts.qualifier, index, at); + return -1; + } + } + + return 0; +} + +int ds_selftest_bts(void) +{ + const struct bts_trace *trace; + struct bts_tracer *tracer; + int error = 0; + void *top; + unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + + printk(KERN_INFO "[ds] bts selftest..."); + + tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + if (IS_ERR(tracer)) { + error = PTR_ERR(tracer); + tracer = NULL; + + printk(KERN_CONT + "initialization failed (err: %d)...", error); + goto out; + } + + /* The return should already give us enough trace. */ + ds_suspend_bts(tracer); + + /* Let's see if we can access the trace. */ + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + /* If everything went well, we should have a few trace entries. */ + if (trace->ds.top == trace->ds.begin) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning, but continue. + */ + printk(KERN_CONT "no trace/overflow..."); + } + + /* Let's try to read the trace we collected. */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.top); + if (error < 0) + goto out; + + /* + * Let's read the trace again. + * Since we suspended tracing, we should get the same result. + */ + top = trace->ds.top; + + trace = ds_read_bts(tracer); + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (top != trace->ds.top) { + printk(KERN_CONT "suspend not working..."); + error = -1; + goto out; + } + + /* Let's collect some more trace - see if resume is working. */ + ds_resume_bts(tracer); + ds_suspend_bts(tracer); + + trace = ds_read_bts(tracer); + + error = ds_selftest_bts_consistency(trace); + if (error < 0) + goto out; + + if (trace->ds.top == top) { + /* + * It is possible but highly unlikely that we got a + * buffer overflow and end up at exactly the same + * position we started from. + * Let's issue a warning and check the full trace. + */ + printk(KERN_CONT + "no resume progress/overflow..."); + + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else if (trace->ds.top < top) { + /* + * We had a buffer overflow - the entire buffer should + * contain trace records. + */ + error = ds_selftest_bts_read(tracer, trace, + trace->ds.begin, trace->ds.end); + } else { + /* + * It is quite likely that the buffer did not overflow. + * Let's just check the delta trace. + */ + error = ds_selftest_bts_read(tracer, trace, + top, trace->ds.top); + } + if (error < 0) + goto out; + + error = 0; + + /* The final test: release the tracer while tracing is suspended. */ + out: + ds_release_bts(tracer); + + printk(KERN_CONT "%s.\n", (error ? "failed" : "passed")); + + return error; +} + +int ds_selftest_pebs(void) +{ + return 0; +} diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h new file mode 100644 index 00000000000..0e6e19d4c7d --- /dev/null +++ b/arch/x86/kernel/ds_selftest.h @@ -0,0 +1,15 @@ +/* + * Debug Store support - selftest + * + * + * Copyright (C) 2009 Intel Corporation. + * Markus Metzger , 2009 + */ + +#ifdef CONFIG_X86_DS_SELFTEST +extern int ds_selftest_bts(void); +extern int ds_selftest_pebs(void); +#else +static inline int ds_selftest_bts(void) { return 0; } +static inline int ds_selftest_pebs(void) { return 0; } +#endif /* CONFIG_X86_DS_SELFTEST */ From b8e47195451c5d3f62620b2b1b5928669afd56eb Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:46:42 +0100 Subject: [PATCH 0102/2061] x86, bts: correct comment style in ds.c Correct the comment style in ds.c. Signed-off-by: Markus Metzger LKML-Reference: <20090313104642.A30149@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 79 ++++++++++++++++++++++---------------------- 1 file changed, 39 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 51c936c1a39..d9cab716805 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -35,25 +35,22 @@ * The configuration for a particular DS hardware implementation. */ struct ds_configuration { - /* the name of the configuration */ + /* The name of the configuration. */ const char *name; - /* the size of one pointer-typed field in the DS structure and - in the BTS and PEBS buffers in bytes; - this covers the first 8 DS fields related to buffer management. */ + /* The size of pointer-typed fields in DS, BTS, and PEBS. */ unsigned char sizeof_ptr_field; - /* the size of a BTS/PEBS record in bytes */ + /* The size of a BTS/PEBS record in bytes. */ unsigned char sizeof_rec[2]; - /* a series of bit-masks to control various features indexed - * by enum ds_feature */ + /* Control bit-masks indexed by enum ds_feature. */ unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* maximal size of a DS configuration */ -#define MAX_SIZEOF_BTS (3 * 8) /* maximal size of a BTS record */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment */ +#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ +#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ +#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ #define BTS_CONTROL \ (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ @@ -67,28 +64,28 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); * to identify tracers. */ struct ds_tracer { - /* the DS context (partially) owned by this tracer */ + /* The DS context (partially) owned by this tracer. */ struct ds_context *context; - /* the buffer provided on ds_request() and its size in bytes */ + /* The buffer provided on ds_request() and its size in bytes. */ void *buffer; size_t size; }; struct bts_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct bts_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* the common DS part */ + /* The common DS part. */ struct ds_tracer ds; - /* the trace including the DS configuration */ + /* The trace including the DS configuration. */ struct pebs_trace trace; - /* buffer overflow notification function */ + /* Buffer overflow notification function. */ pebs_ovfl_callback_t ovfl; }; @@ -214,18 +211,16 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* pointer to the DS configuration; goes into MSR_IA32_DS_AREA */ + /* The DS configuration; goes into MSR_IA32_DS_AREA. */ unsigned char ds[MAX_SIZEOF_DS]; - /* the owner of the BTS and PEBS configuration, respectively */ + /* The owner of the BTS and PEBS configuration, respectively. */ struct bts_tracer *bts_master; struct pebs_tracer *pebs_master; - /* use count */ + /* Use count. */ unsigned long count; - /* a pointer to the context location inside the thread_struct - * or the per_cpu context array */ + /* Pointer to the context pointer field. */ struct ds_context **this; - /* a pointer to the task owning this context, or NULL, if the - * context is owned by a cpu */ + /* The traced task; NULL for current cpu. */ struct task_struct *task; }; @@ -350,14 +345,14 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, unsigned long write_size, adj_write_size; /* - * write as much as possible without producing an + * Write as much as possible without producing an * overflow interrupt. * - * interrupt_threshold must either be + * Interrupt_threshold must either be * - bigger than absolute_maximum or * - point to a record between buffer_base and absolute_maximum * - * index points to a valid record. + * Index points to a valid record. */ base = ds_get(context->ds, qual, ds_buffer_base); index = ds_get(context->ds, qual, ds_index); @@ -366,8 +361,10 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, write_end = min(end, int_th); - /* if we are already beyond the interrupt threshold, - * we fill the entire buffer */ + /* + * If we are already beyond the interrupt threshold, + * we fill the entire buffer. + */ if (write_end <= index) write_end = end; @@ -384,7 +381,7 @@ static int ds_write(struct ds_context *context, enum ds_qualifier qual, adj_write_size = write_size / ds_cfg.sizeof_rec[qual]; adj_write_size *= ds_cfg.sizeof_rec[qual]; - /* zero out trailing bytes */ + /* Zero out trailing bytes. */ memset((char *)index + write_size, 0, adj_write_size - write_size); index += adj_write_size; @@ -556,7 +553,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, unsigned int flags) { unsigned long buffer, adj; - /* adjust the buffer address and size to meet alignment + /* + * Adjust the buffer address and size to meet alignment * constraints: * - buffer is double-word aligned * - size is multiple of record size @@ -578,7 +576,8 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, trace->begin = (void *)buffer; trace->top = trace->begin; trace->end = (void *)(buffer + size); - /* The value for 'no threshold' is -1, which will set the + /* + * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ trace->ith = (void *)(buffer + size - ith); @@ -602,7 +601,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* we require some space to do alignment adjustments below */ + /* We require some space to do alignment adjustments below. */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -640,7 +639,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -700,7 +699,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, unsigned long irq; int error; - /* buffer overflow notification is not yet implemented */ + /* Buffer overflow notification is not yet implemented. */ error = -EOPNOTSUPP; if (ovfl) goto out; @@ -983,9 +982,9 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) case 0x1c: /* Atom */ ds_configure(&ds_cfg_core2_atom, c); break; - case 0x1a: /* i7 */ + case 0x1a: /* Core i7 */ default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; @@ -997,12 +996,12 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_netburst, c); break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } break; default: - /* sorry, don't know about them */ + /* Sorry, don't know about them. */ break; } } From ba9372a8f306c4e53a5f61dcbcd6c1e4a8c2e9ac Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:48:52 +0100 Subject: [PATCH 0103/2061] x86, hw-branch-tracer: keep resources on stop Distinguish init/reset and start/stop: init/reset will allocate and release bts tracing resources stop/start will suspend and resume bts tracing Return an error on init() if no cpu can be traced. Signed-off-by: Markus Metzger LKML-Reference: <20090313104852.A30168@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_hw_branches.c | 119 ++++++++++++++++++++++--------- 1 file changed, 85 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..a99a04c5e9c 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -19,7 +19,7 @@ #include "trace_output.h" -#define SIZEOF_BTS (1 << 13) +#define BTS_BUFFER_SIZE (1 << 13) /* * The tracer lock protects the below per-cpu tracer array. @@ -33,53 +33,68 @@ */ static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); -static DEFINE_PER_CPU(unsigned char[SIZEOF_BTS], buffer); +static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) #define this_buffer per_cpu(buffer, smp_processor_id()) -static int __read_mostly trace_hw_branches_enabled; +static int trace_hw_branches_enabled __read_mostly; +static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; /* - * Start tracing on the current cpu. + * Initialize the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_start_cpu(void *arg) +static void bts_trace_init_cpu(void *arg) { if (this_tracer) ds_release_bts(this_tracer); - this_tracer = - ds_request_bts(/* task = */ NULL, this_buffer, SIZEOF_BTS, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - BTS_KERNEL); + this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(this_tracer)) { this_tracer = NULL; return; } } -static void bts_trace_start(struct trace_array *tr) +static int bts_trace_init(struct trace_array *tr) { + int cpu, avail; + spin_lock(&bts_tracer_lock); - on_each_cpu(bts_trace_start_cpu, NULL, 1); - trace_hw_branches_enabled = 1; + hw_branch_trace = tr; + + on_each_cpu(bts_trace_init_cpu, NULL, 1); + + /* Check on how many cpus we could enable tracing */ + avail = 0; + for_each_online_cpu(cpu) + if (per_cpu(tracer, cpu)) + avail++; + + trace_hw_branches_enabled = (avail ? 1 : 0); + trace_hw_branches_suspended = 0; spin_unlock(&bts_tracer_lock); + + + /* If we could not enable tracing on a single cpu, we fail. */ + return avail ? 0 : -EOPNOTSUPP; } /* - * Stop tracing on the current cpu. + * Release the tracer for the current cpu. * The argument is ignored. * * pre: bts_tracer_lock must be locked. */ -static void bts_trace_stop_cpu(void *arg) +static void bts_trace_release_cpu(void *arg) { if (this_tracer) { ds_release_bts(this_tracer); @@ -87,12 +102,57 @@ static void bts_trace_stop_cpu(void *arg) } } +static void bts_trace_reset(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_release_cpu, NULL, 1); + trace_hw_branches_enabled = 0; + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Resume tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_resume_cpu(void *arg) +{ + if (this_tracer) + ds_resume_bts(this_tracer); +} + +static void bts_trace_start(struct trace_array *tr) +{ + spin_lock(&bts_tracer_lock); + + on_each_cpu(bts_trace_resume_cpu, NULL, 1); + trace_hw_branches_suspended = 0; + + spin_unlock(&bts_tracer_lock); +} + +/* + * Suspend tracing on the current cpu. + * The argument is ignored. + * + * pre: bts_tracer_lock must be locked. + */ +static void bts_trace_suspend_cpu(void *arg) +{ + if (this_tracer) + ds_suspend_bts(this_tracer); +} + static void bts_trace_stop(struct trace_array *tr) { spin_lock(&bts_tracer_lock); - trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_stop_cpu, NULL, 1); + on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + trace_hw_branches_suspended = 1; spin_unlock(&bts_tracer_lock); } @@ -110,10 +170,14 @@ static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_start_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + + if (trace_hw_branches_suspended) + smp_call_function_single(cpu, bts_trace_suspend_cpu, + NULL, 1); break; case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_stop_cpu, NULL, 1); + smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); break; } @@ -126,20 +190,6 @@ static struct notifier_block bts_hotcpu_notifier __cpuinitdata = { .notifier_call = bts_hotcpu_handler }; -static int bts_trace_init(struct trace_array *tr) -{ - hw_branch_trace = tr; - - bts_trace_start(tr); - - return 0; -} - -static void bts_trace_reset(struct trace_array *tr) -{ - bts_trace_stop(tr); -} - static void bts_trace_print_header(struct seq_file *m) { seq_puts(m, "# CPU# TO <- FROM\n"); @@ -228,7 +278,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) */ static void trace_bts_cpu(void *arg) { - struct trace_array *tr = (struct trace_array *) arg; + struct trace_array *tr = (struct trace_array *)arg; const struct bts_trace *trace; unsigned char *at; @@ -276,7 +326,8 @@ void trace_hw_branch_oops(void) { spin_lock(&bts_tracer_lock); - trace_bts_cpu(hw_branch_trace); + if (trace_hw_branches_enabled) + trace_bts_cpu(hw_branch_trace); spin_unlock(&bts_tracer_lock); } From 321bb5e1ac461c04b6a93f795010d6eb01d8c5ca Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 13 Mar 2009 10:50:27 +0100 Subject: [PATCH 0104/2061] x86, hw-branch-tracer: add selftest Add a selftest for the hw-branch-tracer. Signed-off-by: Markus Metzger LKML-Reference: <20090313105027.A30183@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 2 ++ kernel/trace/trace_hw_branches.c | 5 ++- kernel/trace/trace_selftest.c | 53 ++++++++++++++++++++++++++++++++ 3 files changed, 59 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 56ce34d90b0..e7fbc826f1e 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -576,6 +576,8 @@ extern int trace_selftest_startup_sysprof(struct tracer *trace, struct trace_array *tr); extern int trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr); +extern int trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr); #endif /* CONFIG_FTRACE_STARTUP_TEST */ extern void *head_page(struct trace_array_cpu *data); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index a99a04c5e9c..4ca82700c04 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -342,7 +342,10 @@ struct tracer bts_tracer __read_mostly = .start = bts_trace_start, .stop = bts_trace_stop, .open = trace_bts_prepare, - .close = trace_bts_close + .close = trace_bts_close, +#ifdef CONFIG_FTRACE_SELFTEST + .selftest = trace_selftest_startup_hw_branches, +#endif /* CONFIG_FTRACE_SELFTEST */ }; __init static int init_bts_trace(void) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index f907a2b2902..3c7b797d0d2 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -16,6 +16,7 @@ static inline int trace_valid_entry(struct trace_entry *entry) case TRACE_BRANCH: case TRACE_GRAPH_ENT: case TRACE_GRAPH_RET: + case TRACE_HW_BRANCHES: return 1; } return 0; @@ -691,3 +692,55 @@ trace_selftest_startup_branch(struct tracer *trace, struct trace_array *tr) return ret; } #endif /* CONFIG_BRANCH_TRACER */ + +#ifdef CONFIG_HW_BRANCH_TRACER +int +trace_selftest_startup_hw_branches(struct tracer *trace, + struct trace_array *tr) +{ + unsigned long count; + int ret; + struct trace_iterator iter; + struct tracer tracer; + + if (!trace->open) { + printk(KERN_CONT "missing open function..."); + return -1; + } + + ret = tracer_init(trace, tr); + if (ret) { + warn_failed_init_tracer(trace, ret); + return ret; + } + + /* + * The hw-branch tracer needs to collect the trace from the various + * cpu trace buffers - before tracing is stopped. + */ + memset(&iter, 0, sizeof(iter)); + memcpy(&tracer, trace, sizeof(tracer)); + + iter.trace = &tracer; + iter.tr = tr; + iter.pos = -1; + mutex_init(&iter.mutex); + + trace->open(&iter); + + mutex_destroy(&iter.mutex); + + tracing_stop(); + + ret = trace_test_buffer(tr, &count); + trace->reset(tr); + tracing_start(); + + if (!ret && !count) { + printk(KERN_CONT "no entries found.."); + ret = -1; + } + + return ret; +} +#endif /* CONFIG_HW_BRANCH_TRACER */ From e9a22d1fb94050b7d600019c32e6b672d539054b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 11:54:40 +0100 Subject: [PATCH 0105/2061] x86, bts: cleanups Impact: cleanup, no code changed Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 144 +++++++++++++++++-------------- arch/x86/kernel/ds_selftest.h | 2 +- kernel/trace/trace_hw_branches.c | 6 +- kernel/trace/trace_selftest.c | 5 +- 4 files changed, 88 insertions(+), 69 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index d9cab716805..7363e01ba08 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -19,43 +19,52 @@ * Markus Metzger , 2007-2009 */ +#include +#include +#include +#include +#include +#include #include -#include -#include -#include -#include -#include -#include - #include "ds_selftest.h" /* - * The configuration for a particular DS hardware implementation. + * The configuration for a particular DS hardware implementation: */ struct ds_configuration { - /* The name of the configuration. */ - const char *name; - /* The size of pointer-typed fields in DS, BTS, and PEBS. */ - unsigned char sizeof_ptr_field; - /* The size of a BTS/PEBS record in bytes. */ - unsigned char sizeof_rec[2]; - /* Control bit-masks indexed by enum ds_feature. */ - unsigned long ctl[dsf_ctl_max]; + /* The name of the configuration: */ + const char *name; + + /* The size of pointer-typed fields in DS, BTS, and PEBS: */ + unsigned char sizeof_ptr_field; + + /* The size of a BTS/PEBS record in bytes: */ + unsigned char sizeof_rec[2]; + + /* Control bit-masks indexed by enum ds_feature: */ + unsigned long ctl[dsf_ctl_max]; }; static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); #define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) -#define MAX_SIZEOF_DS (12 * 8) /* Maximal size of a DS configuration. */ -#define MAX_SIZEOF_BTS (3 * 8) /* Maximal size of a BTS record. */ -#define DS_ALIGNMENT (1 << 3) /* BTS and PEBS buffer alignment. */ +/* Maximal size of a DS configuration: */ +#define MAX_SIZEOF_DS (12 * 8) -#define BTS_CONTROL \ - (ds_cfg.ctl[dsf_bts] | ds_cfg.ctl[dsf_bts_kernel] | ds_cfg.ctl[dsf_bts_user] |\ - ds_cfg.ctl[dsf_bts_overflow]) +/* Maximal size of a BTS record: */ +#define MAX_SIZEOF_BTS (3 * 8) +/* BTS and PEBS buffer alignment: */ +#define DS_ALIGNMENT (1 << 3) + +/* Mask of control bits in the DS MSR register: */ +#define BTS_CONTROL \ + ( ds_cfg.ctl[dsf_bts] | \ + ds_cfg.ctl[dsf_bts_kernel] | \ + ds_cfg.ctl[dsf_bts_user] | \ + ds_cfg.ctl[dsf_bts_overflow] ) /* * A BTS or PEBS tracer. @@ -65,28 +74,32 @@ static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); */ struct ds_tracer { /* The DS context (partially) owned by this tracer. */ - struct ds_context *context; + struct ds_context *context; /* The buffer provided on ds_request() and its size in bytes. */ - void *buffer; - size_t size; + void *buffer; + size_t size; }; struct bts_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct bts_trace trace; - /* Buffer overflow notification function. */ - bts_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct bts_trace trace; + + /* Buffer overflow notification function: */ + bts_ovfl_callback_t ovfl; }; struct pebs_tracer { - /* The common DS part. */ - struct ds_tracer ds; - /* The trace including the DS configuration. */ - struct pebs_trace trace; - /* Buffer overflow notification function. */ - pebs_ovfl_callback_t ovfl; + /* The common DS part: */ + struct ds_tracer ds; + + /* The trace including the DS configuration: */ + struct pebs_trace trace; + + /* Buffer overflow notification function: */ + pebs_ovfl_callback_t ovfl; }; /* @@ -95,6 +108,7 @@ struct pebs_tracer { * * The DS configuration consists of the following fields; different * architetures vary in the size of those fields. + * * - double-word aligned base linear address of the BTS buffer * - write pointer into the BTS buffer * - end linear address of the BTS buffer (one byte beyond the end of @@ -133,19 +147,20 @@ enum ds_field { }; enum ds_qualifier { - ds_bts = 0, + ds_bts = 0, ds_pebs }; -static inline unsigned long ds_get(const unsigned char *base, - enum ds_qualifier qual, enum ds_field field) +static inline unsigned long +ds_get(const unsigned char *base, enum ds_qualifier qual, enum ds_field field) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); return *(unsigned long *)base; } -static inline void ds_set(unsigned char *base, enum ds_qualifier qual, - enum ds_field field, unsigned long value) +static inline void +ds_set(unsigned char *base, enum ds_qualifier qual, enum ds_field field, + unsigned long value) { base += (ds_cfg.sizeof_ptr_field * (field + (4 * qual))); (*(unsigned long *)base) = value; @@ -157,7 +172,6 @@ static inline void ds_set(unsigned char *base, enum ds_qualifier qual, */ static DEFINE_SPINLOCK(ds_lock); - /* * We either support (system-wide) per-cpu or per-thread allocation. * We distinguish the two based on the task_struct pointer, where a @@ -211,17 +225,21 @@ static inline int check_tracer(struct task_struct *task) * deallocated when the last user puts the context. */ struct ds_context { - /* The DS configuration; goes into MSR_IA32_DS_AREA. */ - unsigned char ds[MAX_SIZEOF_DS]; - /* The owner of the BTS and PEBS configuration, respectively. */ - struct bts_tracer *bts_master; - struct pebs_tracer *pebs_master; - /* Use count. */ + /* The DS configuration; goes into MSR_IA32_DS_AREA: */ + unsigned char ds[MAX_SIZEOF_DS]; + + /* The owner of the BTS and PEBS configuration, respectively: */ + struct bts_tracer *bts_master; + struct pebs_tracer *pebs_master; + + /* Use count: */ unsigned long count; - /* Pointer to the context pointer field. */ - struct ds_context **this; - /* The traced task; NULL for current cpu. */ - struct task_struct *task; + + /* Pointer to the context pointer field: */ + struct ds_context **this; + + /* The traced task; NULL for current cpu: */ + struct task_struct *task; }; static DEFINE_PER_CPU(struct ds_context *, system_context_array); @@ -328,9 +346,9 @@ static void ds_overflow(struct ds_context *context, enum ds_qualifier qual) * The remainder of any partially written record is zeroed out. * * context: the DS context - * qual: the buffer type - * record: the data to write - * size: the size of the data + * qual: the buffer type + * record: the data to write + * size: the size of the data */ static int ds_write(struct ds_context *context, enum ds_qualifier qual, const void *record, size_t size) @@ -429,12 +447,12 @@ enum bts_field { bts_to, bts_flags, - bts_qual = bts_from, - bts_jiffies = bts_to, - bts_pid = bts_flags, + bts_qual = bts_from, + bts_jiffies = bts_to, + bts_pid = bts_flags, - bts_qual_mask = (bts_qual_max - 1), - bts_escape = ((unsigned long)-1 & ~bts_qual_mask) + bts_qual_mask = (bts_qual_max - 1), + bts_escape = ((unsigned long)-1 & ~bts_qual_mask) }; static inline unsigned long bts_get(const char *base, enum bts_field field) @@ -461,8 +479,8 @@ static inline void bts_set(char *base, enum bts_field field, unsigned long val) * * return: bytes read/written on success; -Eerrno, otherwise */ -static int bts_read(struct bts_tracer *tracer, const void *at, - struct bts_struct *out) +static int +bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) { if (!tracer) return -EINVAL; diff --git a/arch/x86/kernel/ds_selftest.h b/arch/x86/kernel/ds_selftest.h index 0e6e19d4c7d..2ba8745c666 100644 --- a/arch/x86/kernel/ds_selftest.h +++ b/arch/x86/kernel/ds_selftest.h @@ -12,4 +12,4 @@ extern int ds_selftest_pebs(void); #else static inline int ds_selftest_bts(void) { return 0; } static inline int ds_selftest_pebs(void) { return 0; } -#endif /* CONFIG_X86_DS_SELFTEST */ +#endif diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 4ca82700c04..8b2109a6c61 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -1,5 +1,5 @@ /* - * h/w branch tracer for x86 based on bts + * h/w branch tracer for x86 based on BTS * * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 @@ -15,8 +15,8 @@ #include -#include "trace.h" #include "trace_output.h" +#include "trace.h" #define BTS_BUFFER_SIZE (1 << 13) @@ -197,10 +197,10 @@ static void bts_trace_print_header(struct seq_file *m) static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) { + unsigned long symflags = TRACE_ITER_SYM_OFFSET; struct trace_entry *entry = iter->ent; struct trace_seq *seq = &iter->seq; struct hw_branch_entry *it; - unsigned long symflags = TRACE_ITER_SYM_OFFSET; trace_assign_type(it, entry); diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 3c7b797d0d2..b9109126706 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -189,6 +189,7 @@ int trace_selftest_startup_dynamic_tracing(struct tracer *trace, #else # define trace_selftest_startup_dynamic_tracing(trace, tr, func) ({ 0; }) #endif /* CONFIG_DYNAMIC_FTRACE */ + /* * Simple verification test of ftrace function tracer. * Enable ftrace, sleep 1/10 second, and then read the trace @@ -698,10 +699,10 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - unsigned long count; - int ret; struct trace_iterator iter; struct tracer tracer; + unsigned long count; + int ret; if (!trace->open) { printk(KERN_CONT "missing open function..."); From 79258a354e0c69be94ae2871809a195bf4a647b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 13 Mar 2009 12:02:08 +0100 Subject: [PATCH 0106/2061] x86, bts: detect size of DS fields, fix Impact: build fix One usage site was missed in the sizeof_field -> sizeof_ptr_field rename. Cc: Markus Metzger LKML-Reference: <20090313104218.A30096@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 7363e01ba08..5fd53333c1d 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -983,7 +983,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_field)); + WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) From c78a3956b982418186e40978a51636a2b43221bc Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Wed, 18 Mar 2009 19:27:00 +0100 Subject: [PATCH 0107/2061] x86, bts: use atomic memory allocation Ds_request_bts() needs to allocate memory. It uses GFP_KERNEL. Hw-branch-tracer calls ds_request_bts() within on_each_cpu(). Use atomic memory allocation to allow it to be used in that context. Signed-off-by: Markus Metzger LKML-Reference: <20090318192700.A6038@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5fd53333c1d..b1d6e1f502f 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -255,8 +255,13 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) struct ds_context *new_context = NULL; unsigned long irq; - /* Chances are small that we already have a context. */ - new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); + /* + * Chances are small that we already have a context. + * + * Contexts for per-cpu tracing are allocated using + * smp_call_function(). We must not sleep. + */ + new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); if (!new_context) return NULL; @@ -662,8 +667,12 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; @@ -722,8 +731,12 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + /* + * Per-cpu tracing is typically requested using smp_call_function(). + * We must not sleep. + */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); + tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) goto out; tracer->ovfl = ovfl; From 425480081e936d8725f0d44b8829d699bf088c6b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 13:38:36 -0400 Subject: [PATCH 0108/2061] tracing: add handler to trace_stat Currently, if a trace_stat user wants a handle to some private data, the trace_stat infrastructure does not supply a way to do that. This patch passes the trace_stat structure to the start function of the trace_stat code. Signed-off-by: Steven Rostedt --- kernel/trace/trace_branch.c | 4 ++-- kernel/trace/trace_stat.c | 2 +- kernel/trace/trace_stat.h | 2 +- kernel/trace/trace_workqueue.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index ad8c22efff4..e6e32912ffb 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -263,7 +263,7 @@ static int branch_stat_show(struct seq_file *m, void *v) return 0; } -static void *annotated_branch_stat_start(void) +static void *annotated_branch_stat_start(struct tracer_stat *trace) { return __start_annotated_branch_profile; } @@ -338,7 +338,7 @@ static int all_branch_stat_headers(struct seq_file *m) return 0; } -static void *all_branch_stat_start(void) +static void *all_branch_stat_start(struct tracer_stat *trace) { return __start_branch_profile; } diff --git a/kernel/trace/trace_stat.c b/kernel/trace/trace_stat.c index f71b85b22cf..f8f48d84b2c 100644 --- a/kernel/trace/trace_stat.c +++ b/kernel/trace/trace_stat.c @@ -85,7 +85,7 @@ static int stat_seq_init(struct tracer_stat_session *session) if (!ts->stat_cmp) ts->stat_cmp = dummy_cmp; - stat = ts->stat_start(); + stat = ts->stat_start(ts); if (!stat) goto exit; diff --git a/kernel/trace/trace_stat.h b/kernel/trace/trace_stat.h index 202274cf7f3..f3546a2cd82 100644 --- a/kernel/trace/trace_stat.h +++ b/kernel/trace/trace_stat.h @@ -12,7 +12,7 @@ struct tracer_stat { /* The name of your stat file */ const char *name; /* Iteration over statistic entries */ - void *(*stat_start)(void); + void *(*stat_start)(struct tracer_stat *trace); void *(*stat_next)(void *prev, int idx); /* Compare two entries for stats sorting */ int (*stat_cmp)(void *p1, void *p2); diff --git a/kernel/trace/trace_workqueue.c b/kernel/trace/trace_workqueue.c index 9ab035b58cf..ee533c2e161 100644 --- a/kernel/trace/trace_workqueue.c +++ b/kernel/trace/trace_workqueue.c @@ -152,7 +152,7 @@ static struct cpu_workqueue_stats *workqueue_stat_start_cpu(int cpu) return ret; } -static void *workqueue_stat_start(void) +static void *workqueue_stat_start(struct tracer_stat *trace) { int cpu; void *ret = NULL; From bac429f037f1a51a74d62bad6d1518c3be065df3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 20 Mar 2009 12:50:56 -0400 Subject: [PATCH 0109/2061] tracing: add function profiler Impact: new profiling feature This patch adds a function profiler. In debugfs/tracing/ two new files are created. function_profile_enabled - to enable or disable profiling trace_stat/functions - the profiled functions. For example: echo 1 > /debugfs/tracing/function_profile_enabled ./hackbench 50 echo 0 > /debugfs/tracing/function_profile_enabled yields: cat /debugfs/tracing/trace_stat/functions Function Hit -------- --- _spin_lock 10106442 _spin_unlock 10097492 kfree 6013704 _spin_unlock_irqrestore 4423941 _spin_lock_irqsave 4406825 __phys_addr 4181686 __slab_free 4038222 dput 4030130 path_put 4023387 unroll_tree_refs 4019532 [...] The most hit functions are listed first. Functions that are not hit are not listed. This feature depends on and uses dynamic function tracing. When the function profiling is disabled, no overhead occurs. But it still takes up around 300KB to hold the data, thus it is not recomended to keep it enabled for systems low on memory. When a '1' is echoed into the function_profile_enabled file, the counters for is function is reset back to zero. Thus you can see what functions are hit most by different programs. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 + kernel/trace/Kconfig | 19 +++ kernel/trace/ftrace.c | 313 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 334 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..0456c3a51c6 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,6 +153,10 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; +#ifdef CONFIG_FUNCTION_PROFILER + unsigned long counter; + struct hlist_node node; +#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 8a4d7293104..95e9ad5735d 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -105,6 +105,7 @@ config FUNCTION_GRAPH_TRACER This is done by setting the current return address on the current task structure into a stack of calls. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n @@ -376,6 +377,24 @@ config DYNAMIC_FTRACE were made. If so, it runs stop_machine (stops all CPUS) and modifies the code to jump over the call to ftrace. +config FUNCTION_PROFILER + bool "Kernel function profiler" + depends on DYNAMIC_FTRACE + default n + help + This option enables the kernel function profiler. When the dynamic + function tracing is enabled, a counter is added into the function + records used by the dynamic function tracer. A file is created in + debugfs called function_profile_enabled which defaults to zero. + When a 1 is echoed into this file profiling begins, and when a + zero is entered, profiling stops. A file in the trace_stats + directory called functions, that show the list of functions that + have been hit and their counters. + + This takes up around 320K more memory. + + If in doubt, say N + config FTRACE_MCOUNT_RECORD def_bool y depends on DYNAMIC_FTRACE diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 7b8722baf15..11f364c776d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -34,6 +34,7 @@ #include #include "trace.h" +#include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ do { \ @@ -261,7 +262,6 @@ struct ftrace_func_probe { struct rcu_head rcu; }; - enum { FTRACE_ENABLE_CALLS = (1 << 0), FTRACE_DISABLE_CALLS = (1 << 1), @@ -309,6 +309,307 @@ static struct dyn_ftrace *ftrace_free_records; } \ } +#ifdef CONFIG_FUNCTION_PROFILER +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static void * +function_stat_next(void *v, int idx) +{ + struct dyn_ftrace *rec = v; + struct ftrace_page *pg; + + pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + } + + if (rec->flags & FTRACE_FL_FREE || + rec->flags & FTRACE_FL_FAILED || + !(rec->flags & FTRACE_FL_CONVERTED) || + /* ignore non hit functions */ + !rec->counter) + goto again; + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&ftrace_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct dyn_ftrace *a = p1; + struct dyn_ftrace *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct dyn_ftrace *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_init(int nr_funcs) +{ + unsigned long addr; + int order; + int size; + + /* + * We are profiling all functions, lets make it 1/4th of the + * number of functions that are in core kernel. So we have to + * iterate 4 times. + */ + order = (sizeof(struct hlist_head) * nr_funcs) / 4; + order = get_order(order); + size = 1 << (PAGE_SHIFT + order); + + pr_info("Allocating %d KB for profiler hash\n", size >> 10); + + addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); + if (!addr) { + pr_warning("Could not allocate function profiler hash\n"); + return; + } + + ftrace_profile_hash = (void *)addr; + + /* + * struct hlist_head should be a pointer of 4 or 8 bytes. + * And a simple bit manipulation can be done, but if for + * some reason struct hlist_head is not a mulitple of 2, + * then we play it safe, and simply count. This function + * is done once at boot up, so it is not that critical in + * performance. + */ + + size--; + size /= sizeof(struct hlist_head); + + for (; size; size >>= 1) + ftrace_profile_bits++; + + pr_info("Function profiler has %d hash buckets\n", + 1 << ftrace_profile_bits); + + return; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static void ftrace_profile_reset(void) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + + do_for_each_ftrace_rec(pg, rec) { + rec->counter = 0; + } while_for_each_ftrace_rec(); +} + +static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) +{ + struct dyn_ftrace *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long flags; + unsigned long key; + + if (!ftrace_profile_hash) + return NULL; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + local_irq_save(flags); + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + goto out; + } + rec = NULL; + out: + local_irq_restore(flags); + + return rec; +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct dyn_ftrace *rec; + unsigned long flags; + + if (!ftrace_profile_enabled) + return; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) + goto out; + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (!ftrace_profile_hash) { + pr_info("Can not enable hash due to earlier problems\n"); + return -ENODEV; + } + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ftrace_profile_reset(); + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ + unsigned long key; + + if (!ftrace_profile_hash) + return; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ + mutex_lock(&ftrace_profile_lock); + hlist_del(&rec->node); + mutex_unlock(&ftrace_profile_lock); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_init(int nr_funcs) +{ +} +static void ftrace_add_profile(struct dyn_ftrace *rec) +{ +} +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +static void ftrace_profile_release(struct dyn_ftrace *rec) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -359,8 +660,10 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) + !(rec->flags & FTRACE_FL_FREE)) { ftrace_free_rec(rec); + ftrace_profile_release(rec); + } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -414,6 +717,8 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; + ftrace_add_profile(rec); + return rec; } @@ -2157,6 +2462,8 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + ftrace_profile_debugfs(d_tracer); + return 0; } @@ -2225,6 +2532,8 @@ void __init ftrace_init(void) if (ret) goto failed; + ftrace_profile_init(count); + last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, From 493762fc534c71d11d489f872c4b4a2c61173668 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 17:12:36 -0400 Subject: [PATCH 0110/2061] tracing: move function profiler data out of function struct Impact: reduce size of memory in function profiler The function profiler originally introduces its counters into the function records itself. There is 20 thousand different functions on a normal system, and that is adding 20 thousand counters for profiling event when not needed. A normal run of the profiler yields only a couple of thousand functions executed, depending on what is being profiled. This means we have around 18 thousand useless counters. This patch rectifies this by moving the data out of the function records used by dynamic ftrace. Data is preallocated to hold the functions when the profiling begins. Checks are made during profiling to see if more recorcds should be allocated, and they are allocated if it is safe to do so. This also removes the dependency from using dynamic ftrace, and also removes the overhead by having it enabled. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 4 - kernel/trace/Kconfig | 10 +- kernel/trace/ftrace.c | 700 +++++++++++++++++++++++------------------ 3 files changed, 393 insertions(+), 321 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 0456c3a51c6..015a3d22cf7 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -153,10 +153,6 @@ struct dyn_ftrace { unsigned long flags; struct dyn_ftrace *newlist; }; -#ifdef CONFIG_FUNCTION_PROFILER - unsigned long counter; - struct hlist_node node; -#endif struct dyn_arch_ftrace arch; }; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 95e9ad5735d..8a4136096d7 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -379,20 +379,16 @@ config DYNAMIC_FTRACE config FUNCTION_PROFILER bool "Kernel function profiler" - depends on DYNAMIC_FTRACE + depends on FUNCTION_TRACER default n help - This option enables the kernel function profiler. When the dynamic - function tracing is enabled, a counter is added into the function - records used by the dynamic function tracer. A file is created in - debugfs called function_profile_enabled which defaults to zero. + This option enables the kernel function profiler. A file is created + in debugfs called function_profile_enabled which defaults to zero. When a 1 is echoed into this file profiling begins, and when a zero is entered, profiling stops. A file in the trace_stats directory called functions, that show the list of functions that have been hit and their counters. - This takes up around 320K more memory. - If in doubt, say N config FTRACE_MCOUNT_RECORD diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 11f364c776d..24dac448cdc 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -241,6 +241,392 @@ static void ftrace_update_pid_func(void) #endif } +#ifdef CONFIG_FUNCTION_PROFILER +struct ftrace_profile { + struct hlist_node node; + unsigned long ip; + unsigned long counter; +}; + +struct ftrace_profile_page { + struct ftrace_profile_page *next; + unsigned long index; + struct ftrace_profile records[]; +}; + +#define PROFILE_RECORDS_SIZE \ + (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) + +#define PROFILES_PER_PAGE \ + (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) + +/* TODO: make these percpu, to prevent cache line bouncing */ +static struct ftrace_profile_page *profile_pages_start; +static struct ftrace_profile_page *profile_pages; + +static struct hlist_head *ftrace_profile_hash; +static int ftrace_profile_bits; +static int ftrace_profile_enabled; +static DEFINE_MUTEX(ftrace_profile_lock); + +static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); + +#define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ + +static raw_spinlock_t ftrace_profile_rec_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; + +static void * +function_stat_next(void *v, int idx) +{ + struct ftrace_profile *rec = v; + struct ftrace_profile_page *pg; + + pg = (struct ftrace_profile_page *)((unsigned long)rec & PAGE_MASK); + + again: + rec++; + if ((void *)rec >= (void *)&pg->records[pg->index]) { + pg = pg->next; + if (!pg) + return NULL; + rec = &pg->records[0]; + if (!rec->counter) + goto again; + } + + return rec; +} + +static void *function_stat_start(struct tracer_stat *trace) +{ + return function_stat_next(&profile_pages_start->records[0], 0); +} + +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->counter < b->counter) + return -1; + if (a->counter > b->counter) + return 1; + else + return 0; +} + +static int function_stat_headers(struct seq_file *m) +{ + seq_printf(m, " Function Hit\n" + " -------- ---\n"); + return 0; +} + +static int function_stat_show(struct seq_file *m, void *v) +{ + struct ftrace_profile *rec = v; + char str[KSYM_SYMBOL_LEN]; + + kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + + seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); + return 0; +} + +static struct tracer_stat function_stats = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + +static void ftrace_profile_reset(void) +{ + struct ftrace_profile_page *pg; + + pg = profile_pages = profile_pages_start; + + while (pg) { + memset(pg->records, 0, PROFILE_RECORDS_SIZE); + pg->index = 0; + pg = pg->next; + } + + memset(ftrace_profile_hash, 0, + FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); +} + +int ftrace_profile_pages_init(void) +{ + struct ftrace_profile_page *pg; + int i; + + /* If we already allocated, do nothing */ + if (profile_pages) + return 0; + + profile_pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!profile_pages) + return -ENOMEM; + + pg = profile_pages_start = profile_pages; + + /* allocate 10 more pages to start */ + for (i = 0; i < 10; i++) { + pg->next = (void *)get_zeroed_page(GFP_KERNEL); + /* + * We only care about allocating profile_pages, if + * we failed to allocate here, hopefully we will allocate + * later. + */ + if (!pg->next) + break; + pg = pg->next; + } + + return 0; +} + +static int ftrace_profile_init(void) +{ + int size; + + if (ftrace_profile_hash) { + /* If the profile is already created, simply reset it */ + ftrace_profile_reset(); + return 0; + } + + /* + * We are profiling all functions, but usually only a few thousand + * functions are hit. We'll make a hash of 1024 items. + */ + size = FTRACE_PROFILE_HASH_SIZE; + + ftrace_profile_hash = + kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + + if (!ftrace_profile_hash) + return -ENOMEM; + + size--; + + for (; size; size >>= 1) + ftrace_profile_bits++; + + /* Preallocate a few pages */ + if (ftrace_profile_pages_init() < 0) { + kfree(ftrace_profile_hash); + ftrace_profile_hash = NULL; + return -ENOMEM; + } + + return 0; +} + +/* interrupts must be disabled */ +static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +{ + struct ftrace_profile *rec; + struct hlist_head *hhd; + struct hlist_node *n; + unsigned long key; + + key = hash_long(ip, ftrace_profile_bits); + hhd = &ftrace_profile_hash[key]; + + if (hlist_empty(hhd)) + return NULL; + + hlist_for_each_entry_rcu(rec, n, hhd, node) { + if (rec->ip == ip) + return rec; + } + + return NULL; +} + +static void ftrace_add_profile(struct ftrace_profile *rec) +{ + unsigned long key; + + key = hash_long(rec->ip, ftrace_profile_bits); + hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); +} + +/* Interrupts must be disabled calling this */ +static struct ftrace_profile * +ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +{ + struct ftrace_profile *rec = NULL; + + /* prevent recursion */ + if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + goto out; + + __raw_spin_lock(&ftrace_profile_rec_lock); + + /* Try to always keep another page available */ + if (!profile_pages->next && alloc_safe) + profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + + /* + * Try to find the function again since another + * task on another CPU could have added it + */ + rec = ftrace_find_profiled_func(ip); + if (rec) + goto out_unlock; + + if (profile_pages->index == PROFILES_PER_PAGE) { + if (!profile_pages->next) + goto out_unlock; + profile_pages = profile_pages->next; + } + + rec = &profile_pages->records[profile_pages->index++]; + rec->ip = ip; + ftrace_add_profile(rec); + + out_unlock: + __raw_spin_unlock(&ftrace_profile_rec_lock); + out: + atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + + return rec; +} + +/* + * If we are not in an interrupt, or softirq and + * and interrupts are disabled and preemption is not enabled + * (not in a spinlock) then it should be safe to allocate memory. + */ +static bool ftrace_safe_to_allocate(void) +{ + return !in_interrupt() && irqs_disabled() && !preempt_count(); +} + +static void +function_profile_call(unsigned long ip, unsigned long parent_ip) +{ + struct ftrace_profile *rec; + unsigned long flags; + bool alloc_safe; + + if (!ftrace_profile_enabled) + return; + + alloc_safe = ftrace_safe_to_allocate(); + + local_irq_save(flags); + rec = ftrace_find_profiled_func(ip); + if (!rec) { + rec = ftrace_profile_alloc(ip, alloc_safe); + if (!rec) + goto out; + } + + rec->counter++; + out: + local_irq_restore(flags); +} + +static struct ftrace_ops ftrace_profile_ops __read_mostly = +{ + .func = function_profile_call, +}; + +static ssize_t +ftrace_profile_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + unsigned long val; + char buf[64]; + int ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + val = !!val; + + mutex_lock(&ftrace_profile_lock); + if (ftrace_profile_enabled ^ val) { + if (val) { + ret = ftrace_profile_init(); + if (ret < 0) { + cnt = ret; + goto out; + } + + register_ftrace_function(&ftrace_profile_ops); + ftrace_profile_enabled = 1; + } else { + ftrace_profile_enabled = 0; + unregister_ftrace_function(&ftrace_profile_ops); + } + } + out: + mutex_unlock(&ftrace_profile_lock); + + filp->f_pos += cnt; + + return cnt; +} + +static ssize_t +ftrace_profile_read(struct file *filp, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char buf[64]; + int r; + + r = sprintf(buf, "%u\n", ftrace_profile_enabled); + return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); +} + +static const struct file_operations ftrace_profile_fops = { + .open = tracing_open_generic, + .read = ftrace_profile_read, + .write = ftrace_profile_write, +}; + +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ + struct dentry *entry; + int ret; + + ret = register_stat_tracer(&function_stats); + if (ret) { + pr_warning("Warning: could not register " + "function stats\n"); + return; + } + + entry = debugfs_create_file("function_profile_enabled", 0644, + d_tracer, NULL, &ftrace_profile_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'function_profile_enabled' entry\n"); +} + +#else /* CONFIG_FUNCTION_PROFILER */ +static void ftrace_profile_debugfs(struct dentry *d_tracer) +{ +} +#endif /* CONFIG_FUNCTION_PROFILER */ + /* set when tracing only a pid */ struct pid *ftrace_pid_trace; static struct pid * const ftrace_swapper_pid = &init_struct_pid; @@ -309,307 +695,6 @@ static struct dyn_ftrace *ftrace_free_records; } \ } -#ifdef CONFIG_FUNCTION_PROFILER -static struct hlist_head *ftrace_profile_hash; -static int ftrace_profile_bits; -static int ftrace_profile_enabled; -static DEFINE_MUTEX(ftrace_profile_lock); - -static void * -function_stat_next(void *v, int idx) -{ - struct dyn_ftrace *rec = v; - struct ftrace_page *pg; - - pg = (struct ftrace_page *)((unsigned long)rec & PAGE_MASK); - - again: - rec++; - if ((void *)rec >= (void *)&pg->records[pg->index]) { - pg = pg->next; - if (!pg) - return NULL; - rec = &pg->records[0]; - } - - if (rec->flags & FTRACE_FL_FREE || - rec->flags & FTRACE_FL_FAILED || - !(rec->flags & FTRACE_FL_CONVERTED) || - /* ignore non hit functions */ - !rec->counter) - goto again; - - return rec; -} - -static void *function_stat_start(struct tracer_stat *trace) -{ - return function_stat_next(&ftrace_pages_start->records[0], 0); -} - -static int function_stat_cmp(void *p1, void *p2) -{ - struct dyn_ftrace *a = p1; - struct dyn_ftrace *b = p2; - - if (a->counter < b->counter) - return -1; - if (a->counter > b->counter) - return 1; - else - return 0; -} - -static int function_stat_headers(struct seq_file *m) -{ - seq_printf(m, " Function Hit\n" - " -------- ---\n"); - return 0; -} - -static int function_stat_show(struct seq_file *m, void *v) -{ - struct dyn_ftrace *rec = v; - char str[KSYM_SYMBOL_LEN]; - - kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); - - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); - return 0; -} - -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_init(int nr_funcs) -{ - unsigned long addr; - int order; - int size; - - /* - * We are profiling all functions, lets make it 1/4th of the - * number of functions that are in core kernel. So we have to - * iterate 4 times. - */ - order = (sizeof(struct hlist_head) * nr_funcs) / 4; - order = get_order(order); - size = 1 << (PAGE_SHIFT + order); - - pr_info("Allocating %d KB for profiler hash\n", size >> 10); - - addr = __get_free_pages(GFP_KERNEL | __GFP_ZERO, order); - if (!addr) { - pr_warning("Could not allocate function profiler hash\n"); - return; - } - - ftrace_profile_hash = (void *)addr; - - /* - * struct hlist_head should be a pointer of 4 or 8 bytes. - * And a simple bit manipulation can be done, but if for - * some reason struct hlist_head is not a mulitple of 2, - * then we play it safe, and simply count. This function - * is done once at boot up, so it is not that critical in - * performance. - */ - - size--; - size /= sizeof(struct hlist_head); - - for (; size; size >>= 1) - ftrace_profile_bits++; - - pr_info("Function profiler has %d hash buckets\n", - 1 << ftrace_profile_bits); - - return; -} - -static ssize_t -ftrace_profile_read(struct file *filp, char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - char buf[64]; - int r; - - r = sprintf(buf, "%u\n", ftrace_profile_enabled); - return simple_read_from_buffer(ubuf, cnt, ppos, buf, r); -} - -static void ftrace_profile_reset(void) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - - do_for_each_ftrace_rec(pg, rec) { - rec->counter = 0; - } while_for_each_ftrace_rec(); -} - -static struct dyn_ftrace *ftrace_find_profiled_func(unsigned long ip) -{ - struct dyn_ftrace *rec; - struct hlist_head *hhd; - struct hlist_node *n; - unsigned long flags; - unsigned long key; - - if (!ftrace_profile_hash) - return NULL; - - key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; - - if (hlist_empty(hhd)) - return NULL; - - local_irq_save(flags); - hlist_for_each_entry_rcu(rec, n, hhd, node) { - if (rec->ip == ip) - goto out; - } - rec = NULL; - out: - local_irq_restore(flags); - - return rec; -} - -static void -function_profile_call(unsigned long ip, unsigned long parent_ip) -{ - struct dyn_ftrace *rec; - unsigned long flags; - - if (!ftrace_profile_enabled) - return; - - local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); - if (!rec) - goto out; - - rec->counter++; - out: - local_irq_restore(flags); -} - -static struct ftrace_ops ftrace_profile_ops __read_mostly = -{ - .func = function_profile_call, -}; - -static ssize_t -ftrace_profile_write(struct file *filp, const char __user *ubuf, - size_t cnt, loff_t *ppos) -{ - unsigned long val; - char buf[64]; - int ret; - - if (!ftrace_profile_hash) { - pr_info("Can not enable hash due to earlier problems\n"); - return -ENODEV; - } - - if (cnt >= sizeof(buf)) - return -EINVAL; - - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - - buf[cnt] = 0; - - ret = strict_strtoul(buf, 10, &val); - if (ret < 0) - return ret; - - val = !!val; - - mutex_lock(&ftrace_profile_lock); - if (ftrace_profile_enabled ^ val) { - if (val) { - ftrace_profile_reset(); - register_ftrace_function(&ftrace_profile_ops); - ftrace_profile_enabled = 1; - } else { - ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); - } - } - mutex_unlock(&ftrace_profile_lock); - - filp->f_pos += cnt; - - return cnt; -} - -static const struct file_operations ftrace_profile_fops = { - .open = tracing_open_generic, - .read = ftrace_profile_read, - .write = ftrace_profile_write, -}; - -static void ftrace_profile_debugfs(struct dentry *d_tracer) -{ - struct dentry *entry; - int ret; - - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; - } - - entry = debugfs_create_file("function_profile_enabled", 0644, - d_tracer, NULL, &ftrace_profile_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'function_profile_enabled' entry\n"); -} - -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ - unsigned long key; - - if (!ftrace_profile_hash) - return; - - key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); -} - -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ - mutex_lock(&ftrace_profile_lock); - hlist_del(&rec->node); - mutex_unlock(&ftrace_profile_lock); -} - -#else /* CONFIG_FUNCTION_PROFILER */ -static void ftrace_profile_init(int nr_funcs) -{ -} -static void ftrace_add_profile(struct dyn_ftrace *rec) -{ -} -static void ftrace_profile_debugfs(struct dentry *d_tracer) -{ -} -static void ftrace_profile_release(struct dyn_ftrace *rec) -{ -} -#endif /* CONFIG_FUNCTION_PROFILER */ - #ifdef CONFIG_KPROBES static int frozen_record_count; @@ -660,10 +745,8 @@ void ftrace_release(void *start, unsigned long size) mutex_lock(&ftrace_lock); do_for_each_ftrace_rec(pg, rec) { if ((rec->ip >= s) && (rec->ip < e) && - !(rec->flags & FTRACE_FL_FREE)) { + !(rec->flags & FTRACE_FL_FREE)) ftrace_free_rec(rec); - ftrace_profile_release(rec); - } } while_for_each_ftrace_rec(); mutex_unlock(&ftrace_lock); } @@ -717,8 +800,6 @@ ftrace_record_ip(unsigned long ip) rec->newlist = ftrace_new_addrs; ftrace_new_addrs = rec; - ftrace_add_profile(rec); - return rec; } @@ -2462,8 +2543,6 @@ static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ - ftrace_profile_debugfs(d_tracer); - return 0; } @@ -2532,8 +2611,6 @@ void __init ftrace_init(void) if (ret) goto failed; - ftrace_profile_init(count); - last_ftrace_enabled = ftrace_enabled = 1; ret = ftrace_convert_nops(NULL, @@ -2734,6 +2811,9 @@ static __init int ftrace_init_debugfs(void) if (!entry) pr_warning("Could not create debugfs " "'set_ftrace_pid' entry\n"); + + ftrace_profile_debugfs(d_tracer); + return 0; } fs_initcall(ftrace_init_debugfs); From 0706f1c48ca8a7ab478090b4e38f2e578ae2bfe0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 23 Mar 2009 23:12:58 -0400 Subject: [PATCH 0111/2061] tracing: adding function timings to function profiler If the function graph trace is enabled, the function profiler will use it to take the timing of the functions. cat /debug/tracing/trace_stat/functions Function Hit Time -------- --- ---- mwait_idle 127 183028.4 us schedule 26 151997.7 us __schedule 31 151975.1 us sys_wait4 2 74080.53 us do_wait 2 74077.80 us sys_newlstat 138 39929.16 us do_path_lookup 179 39845.79 us vfs_lstat_fd 138 39761.97 us user_path_at 153 39469.58 us path_walk 179 39435.76 us __link_path_walk 189 39143.73 us [...] Note the times are skewed due to the function graph tracer not taking into account schedules. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 93 ++++++++++++++++++++++++++-- kernel/trace/trace.c | 11 ---- kernel/trace/trace.h | 3 +- kernel/trace/trace_functions_graph.c | 17 ++++- kernel/trace/trace_output.c | 10 +++ kernel/trace/trace_output.h | 2 + 6 files changed, 117 insertions(+), 19 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 24dac448cdc..a9ccd71fc92 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,7 +33,7 @@ #include -#include "trace.h" +#include "trace_output.h" #include "trace_stat.h" #define FTRACE_WARN_ON(cond) \ @@ -246,6 +246,9 @@ struct ftrace_profile { struct hlist_node node; unsigned long ip; unsigned long counter; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + unsigned long long time; +#endif }; struct ftrace_profile_page { @@ -303,6 +306,22 @@ static void *function_stat_start(struct tracer_stat *trace) return function_stat_next(&profile_pages_start->records[0], 0); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +/* function graph compares on total time */ +static int function_stat_cmp(void *p1, void *p2) +{ + struct ftrace_profile *a = p1; + struct ftrace_profile *b = p2; + + if (a->time < b->time) + return -1; + if (a->time > b->time) + return 1; + else + return 0; +} +#else +/* not function graph compares against hits */ static int function_stat_cmp(void *p1, void *p2) { struct ftrace_profile *a = p1; @@ -315,11 +334,17 @@ static int function_stat_cmp(void *p1, void *p2) else return 0; } +#endif static int function_stat_headers(struct seq_file *m) { +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " Function Hit Time\n" + " -------- --- ----\n"); +#else seq_printf(m, " Function Hit\n" " -------- ---\n"); +#endif return 0; } @@ -327,10 +352,25 @@ static int function_stat_show(struct seq_file *m, void *v) { struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + static struct trace_seq s; + static DEFINE_MUTEX(mutex); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); +#endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); + seq_printf(m, " %-30.30s %10lu", str, rec->counter); + +#ifdef CONFIG_FUNCTION_GRAPH_TRACER + seq_printf(m, " "); + trace_print_seq(m, &s); + mutex_unlock(&mutex); +#endif + seq_putc(m, '\n'); - seq_printf(m, " %-30.30s %10lu\n", str, rec->counter); return 0; } @@ -534,11 +574,52 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) local_irq_restore(flags); } +#ifdef CONFIG_FUNCTION_GRAPH_TRACER +static int profile_graph_entry(struct ftrace_graph_ent *trace) +{ + function_profile_call(trace->func, 0); + return 1; +} + +static void profile_graph_return(struct ftrace_graph_ret *trace) +{ + unsigned long flags; + struct ftrace_profile *rec; + + local_irq_save(flags); + rec = ftrace_find_profiled_func(trace->func); + if (rec) + rec->time += trace->rettime - trace->calltime; + local_irq_restore(flags); +} + +static int register_ftrace_profiler(void) +{ + return register_ftrace_graph(&profile_graph_return, + &profile_graph_entry); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_graph(); +} +#else static struct ftrace_ops ftrace_profile_ops __read_mostly = { .func = function_profile_call, }; +static int register_ftrace_profiler(void) +{ + return register_ftrace_function(&ftrace_profile_ops); +} + +static void unregister_ftrace_profiler(void) +{ + unregister_ftrace_function(&ftrace_profile_ops); +} +#endif /* CONFIG_FUNCTION_GRAPH_TRACER */ + static ssize_t ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) @@ -570,11 +651,15 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, goto out; } - register_ftrace_function(&ftrace_profile_ops); + ret = register_ftrace_profiler(); + if (ret < 0) { + cnt = ret; + goto out; + } ftrace_profile_enabled = 1; } else { ftrace_profile_enabled = 0; - unregister_ftrace_function(&ftrace_profile_ops); + unregister_ftrace_profiler(); } } out: diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 67c6a21dd42..821bf49771d 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -402,17 +402,6 @@ static ssize_t trace_seq_to_buffer(struct trace_seq *s, void *buf, size_t cnt) return cnt; } -static void -trace_print_seq(struct seq_file *m, struct trace_seq *s) -{ - int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; - - s->buffer[len] = 0; - seq_puts(m, s->buffer); - - trace_seq_init(s); -} - /** * update_max_tr - snapshot all trace buffers from global_trace to max_tr * @tr: tracer diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index d7410bbb9a8..c66ca3b6605 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -605,6 +605,8 @@ extern unsigned long trace_flags; /* Standard output formatting function used for function return traces */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER extern enum print_line_t print_graph_function(struct trace_iterator *iter); +extern enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s); #ifdef CONFIG_DYNAMIC_FTRACE /* TODO: make this variable */ @@ -636,7 +638,6 @@ static inline int ftrace_graph_addr(unsigned long addr) return 1; } #endif /* CONFIG_DYNAMIC_FTRACE */ - #else /* CONFIG_FUNCTION_GRAPH_TRACER */ static inline enum print_line_t print_graph_function(struct trace_iterator *iter) diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index d28687e7b3a..85bba0f018b 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -426,8 +426,8 @@ print_graph_irq(struct trace_iterator *iter, unsigned long addr, return TRACE_TYPE_HANDLED; } -static enum print_line_t -print_graph_duration(unsigned long long duration, struct trace_seq *s) +enum print_line_t +trace_print_graph_duration(unsigned long long duration, struct trace_seq *s) { unsigned long nsecs_rem = do_div(duration, 1000); /* log10(ULONG_MAX) + '\0' */ @@ -464,12 +464,23 @@ print_graph_duration(unsigned long long duration, struct trace_seq *s) if (!ret) return TRACE_TYPE_PARTIAL_LINE; } + return TRACE_TYPE_HANDLED; +} + +static enum print_line_t +print_graph_duration(unsigned long long duration, struct trace_seq *s) +{ + int ret; + + ret = trace_print_graph_duration(duration, s); + if (ret != TRACE_TYPE_HANDLED) + return ret; ret = trace_seq_printf(s, "| "); if (!ret) return TRACE_TYPE_PARTIAL_LINE; - return TRACE_TYPE_HANDLED; + return TRACE_TYPE_HANDLED; } /* Case of a leaf function on its call entry */ diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 19261fdd245..a3b6e3fd704 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -19,6 +19,16 @@ static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; +void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ + int len = s->len >= PAGE_SIZE ? PAGE_SIZE - 1 : s->len; + + s->buffer[len] = 0; + seq_puts(m, s->buffer); + + trace_seq_init(s); +} + enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter) { struct trace_seq *s = &iter->seq; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 35c422fb51a..1eac2973374 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); + extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) __attribute__ ((format (printf, 2, 3))); extern int From cafb168a1c92e4c9e1731fe3d666c39611762c49 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 20:50:39 -0400 Subject: [PATCH 0112/2061] tracing: make the function profiler per cpu Impact: speed enhancement By making the function profiler record in per cpu data we not only get better readings, avoid races, we also do not have to take any locks. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 199 +++++++++++++++++++++++++++--------------- 1 file changed, 130 insertions(+), 69 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a9ccd71fc92..ed1fc5021d4 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -257,28 +257,28 @@ struct ftrace_profile_page { struct ftrace_profile records[]; }; +struct ftrace_profile_stat { + atomic_t disabled; + struct hlist_head *hash; + struct ftrace_profile_page *pages; + struct ftrace_profile_page *start; + struct tracer_stat stat; +}; + #define PROFILE_RECORDS_SIZE \ (PAGE_SIZE - offsetof(struct ftrace_profile_page, records)) #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -/* TODO: make these percpu, to prevent cache line bouncing */ -static struct ftrace_profile_page *profile_pages_start; -static struct ftrace_profile_page *profile_pages; - -static struct hlist_head *ftrace_profile_hash; static int ftrace_profile_bits; static int ftrace_profile_enabled; static DEFINE_MUTEX(ftrace_profile_lock); -static DEFINE_PER_CPU(atomic_t, ftrace_profile_disable); +static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); #define FTRACE_PROFILE_HASH_SIZE 1024 /* must be power of 2 */ -static raw_spinlock_t ftrace_profile_rec_lock = - (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; - static void * function_stat_next(void *v, int idx) { @@ -303,7 +303,13 @@ function_stat_next(void *v, int idx) static void *function_stat_start(struct tracer_stat *trace) { - return function_stat_next(&profile_pages_start->records[0], 0); + struct ftrace_profile_stat *stat = + container_of(trace, struct ftrace_profile_stat, stat); + + if (!stat || !stat->start) + return NULL; + + return function_stat_next(&stat->start->records[0], 0); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -374,20 +380,11 @@ static int function_stat_show(struct seq_file *m, void *v) return 0; } -static struct tracer_stat function_stats = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show -}; - -static void ftrace_profile_reset(void) +static void ftrace_profile_reset(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; - pg = profile_pages = profile_pages_start; + pg = stat->pages = stat->start; while (pg) { memset(pg->records, 0, PROFILE_RECORDS_SIZE); @@ -395,24 +392,24 @@ static void ftrace_profile_reset(void) pg = pg->next; } - memset(ftrace_profile_hash, 0, + memset(stat->hash, 0, FTRACE_PROFILE_HASH_SIZE * sizeof(struct hlist_head)); } -int ftrace_profile_pages_init(void) +int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; int i; /* If we already allocated, do nothing */ - if (profile_pages) + if (stat->pages) return 0; - profile_pages = (void *)get_zeroed_page(GFP_KERNEL); - if (!profile_pages) + stat->pages = (void *)get_zeroed_page(GFP_KERNEL); + if (!stat->pages) return -ENOMEM; - pg = profile_pages_start = profile_pages; + pg = stat->start = stat->pages; /* allocate 10 more pages to start */ for (i = 0; i < 10; i++) { @@ -430,13 +427,16 @@ int ftrace_profile_pages_init(void) return 0; } -static int ftrace_profile_init(void) +static int ftrace_profile_init_cpu(int cpu) { + struct ftrace_profile_stat *stat; int size; - if (ftrace_profile_hash) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + if (stat->hash) { /* If the profile is already created, simply reset it */ - ftrace_profile_reset(); + ftrace_profile_reset(stat); return 0; } @@ -446,29 +446,45 @@ static int ftrace_profile_init(void) */ size = FTRACE_PROFILE_HASH_SIZE; - ftrace_profile_hash = - kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); + stat->hash = kzalloc(sizeof(struct hlist_head) * size, GFP_KERNEL); - if (!ftrace_profile_hash) + if (!stat->hash) return -ENOMEM; - size--; + if (!ftrace_profile_bits) { + size--; - for (; size; size >>= 1) - ftrace_profile_bits++; + for (; size; size >>= 1) + ftrace_profile_bits++; + } /* Preallocate a few pages */ - if (ftrace_profile_pages_init() < 0) { - kfree(ftrace_profile_hash); - ftrace_profile_hash = NULL; + if (ftrace_profile_pages_init(stat) < 0) { + kfree(stat->hash); + stat->hash = NULL; return -ENOMEM; } return 0; } +static int ftrace_profile_init(void) +{ + int cpu; + int ret = 0; + + for_each_online_cpu(cpu) { + ret = ftrace_profile_init_cpu(cpu); + if (ret) + break; + } + + return ret; +} + /* interrupts must be disabled */ -static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) +static struct ftrace_profile * +ftrace_find_profiled_func(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec; struct hlist_head *hhd; @@ -476,7 +492,7 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) unsigned long key; key = hash_long(ip, ftrace_profile_bits); - hhd = &ftrace_profile_hash[key]; + hhd = &stat->hash[key]; if (hlist_empty(hhd)) return NULL; @@ -489,52 +505,50 @@ static struct ftrace_profile *ftrace_find_profiled_func(unsigned long ip) return NULL; } -static void ftrace_add_profile(struct ftrace_profile *rec) +static void ftrace_add_profile(struct ftrace_profile_stat *stat, + struct ftrace_profile *rec) { unsigned long key; key = hash_long(rec->ip, ftrace_profile_bits); - hlist_add_head_rcu(&rec->node, &ftrace_profile_hash[key]); + hlist_add_head_rcu(&rec->node, &stat->hash[key]); } /* Interrupts must be disabled calling this */ static struct ftrace_profile * -ftrace_profile_alloc(unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, + unsigned long ip, bool alloc_safe) { struct ftrace_profile *rec = NULL; /* prevent recursion */ - if (atomic_inc_return(&__get_cpu_var(ftrace_profile_disable)) != 1) + if (atomic_inc_return(&stat->disabled) != 1) goto out; - __raw_spin_lock(&ftrace_profile_rec_lock); - /* Try to always keep another page available */ - if (!profile_pages->next && alloc_safe) - profile_pages->next = (void *)get_zeroed_page(GFP_ATOMIC); + if (!stat->pages->next && alloc_safe) + stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); /* * Try to find the function again since another * task on another CPU could have added it */ - rec = ftrace_find_profiled_func(ip); + rec = ftrace_find_profiled_func(stat, ip); if (rec) - goto out_unlock; + goto out; - if (profile_pages->index == PROFILES_PER_PAGE) { - if (!profile_pages->next) - goto out_unlock; - profile_pages = profile_pages->next; + if (stat->pages->index == PROFILES_PER_PAGE) { + if (!stat->pages->next) + goto out; + stat->pages = stat->pages->next; } - rec = &profile_pages->records[profile_pages->index++]; + rec = &stat->pages->records[stat->pages->index++]; rec->ip = ip; - ftrace_add_profile(rec); + ftrace_add_profile(stat, rec); - out_unlock: - __raw_spin_unlock(&ftrace_profile_rec_lock); out: - atomic_dec(&__get_cpu_var(ftrace_profile_disable)); + atomic_dec(&stat->disabled); return rec; } @@ -552,6 +566,7 @@ static bool ftrace_safe_to_allocate(void) static void function_profile_call(unsigned long ip, unsigned long parent_ip) { + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; bool alloc_safe; @@ -562,9 +577,14 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) alloc_safe = ftrace_safe_to_allocate(); local_irq_save(flags); - rec = ftrace_find_profiled_func(ip); + + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip, alloc_safe); if (!rec) goto out; } @@ -583,13 +603,19 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { - unsigned long flags; + struct ftrace_profile_stat *stat; struct ftrace_profile *rec; + unsigned long flags; local_irq_save(flags); - rec = ftrace_find_profiled_func(trace->func); + stat = &__get_cpu_var(ftrace_profile_stats); + if (!stat->hash) + goto out; + + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) rec->time += trace->rettime - trace->calltime; + out: local_irq_restore(flags); } @@ -687,16 +713,51 @@ static const struct file_operations ftrace_profile_fops = { .write = ftrace_profile_write, }; +/* used to initialize the real stat files */ +static struct tracer_stat function_stats __initdata = { + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show +}; + static void ftrace_profile_debugfs(struct dentry *d_tracer) { + struct ftrace_profile_stat *stat; struct dentry *entry; + char *name; int ret; + int cpu; - ret = register_stat_tracer(&function_stats); - if (ret) { - pr_warning("Warning: could not register " - "function stats\n"); - return; + for_each_possible_cpu(cpu) { + stat = &per_cpu(ftrace_profile_stats, cpu); + + /* allocate enough for function name + cpu number */ + name = kmalloc(32, GFP_KERNEL); + if (!name) { + /* + * The files created are permanent, if something happens + * we still do not free memory. + */ + kfree(stat); + WARN(1, + "Could not allocate stat file for cpu %d\n", + cpu); + return; + } + stat->stat = function_stats; + snprintf(name, 32, "function%d", cpu); + stat->stat.name = name; + ret = register_stat_tracer(&stat->stat); + if (ret) { + WARN(1, + "Could not register function stat for cpu %d\n", + cpu); + kfree(name); + return; + } } entry = debugfs_create_file("function_profile_enabled", 0644, From a2a16d6a3156ef7309ca7328a20c35df9418e670 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 24 Mar 2009 23:17:58 -0400 Subject: [PATCH 0113/2061] function-graph: add option to calculate graph time or not graph time is the time that a function is executing another function. Thus if function A calls B, if graph-time is set, then the time for A includes B. This is the default behavior. But if graph-time is off, then the time spent executing B is subtracted from A. Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 3 +-- kernel/trace/ftrace.c | 21 ++++++++++++++++++++- kernel/trace/trace.c | 4 +++- kernel/trace/trace.h | 1 + kernel/trace/trace_functions_graph.c | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 015a3d22cf7..9e0a8d245e5 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -365,6 +365,7 @@ struct ftrace_ret_stack { unsigned long ret; unsigned long func; unsigned long long calltime; + unsigned long long subtime; }; /* @@ -376,8 +377,6 @@ extern void return_to_handler(void); extern int ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth); -extern void -ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret); /* * Sometimes we don't want to trace a function with the function diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index ed1fc5021d4..71e5faef12a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -604,6 +604,7 @@ static int profile_graph_entry(struct ftrace_graph_ent *trace) static void profile_graph_return(struct ftrace_graph_ret *trace) { struct ftrace_profile_stat *stat; + unsigned long long calltime; struct ftrace_profile *rec; unsigned long flags; @@ -612,9 +613,27 @@ static void profile_graph_return(struct ftrace_graph_ret *trace) if (!stat->hash) goto out; + calltime = trace->rettime - trace->calltime; + + if (!(trace_flags & TRACE_ITER_GRAPH_TIME)) { + int index; + + index = trace->depth; + + /* Append this call time to the parent time to subtract */ + if (index) + current->ret_stack[index - 1].subtime += calltime; + + if (current->ret_stack[index].subtime < calltime) + calltime -= current->ret_stack[index].subtime; + else + calltime = 0; + } + rec = ftrace_find_profiled_func(stat, trace->func); if (rec) - rec->time += trace->rettime - trace->calltime; + rec->time += calltime; + out: local_irq_restore(flags); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 821bf49771d..5d1a16cae37 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -255,7 +255,8 @@ static DECLARE_WAIT_QUEUE_HEAD(trace_wait); /* trace_flags holds trace_options default values */ unsigned long trace_flags = TRACE_ITER_PRINT_PARENT | TRACE_ITER_PRINTK | - TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME; + TRACE_ITER_ANNOTATE | TRACE_ITER_CONTEXT_INFO | TRACE_ITER_SLEEP_TIME | + TRACE_ITER_GRAPH_TIME; /** * trace_wake_up - wake up tasks waiting for trace input @@ -317,6 +318,7 @@ static const char *trace_options[] = { "latency-format", "global-clock", "sleep-time", + "graph-time", NULL }; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c66ca3b6605..e3429a8ab05 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -685,6 +685,7 @@ enum trace_iterator_flags { TRACE_ITER_LATENCY_FMT = 0x40000, TRACE_ITER_GLOBAL_CLK = 0x80000, TRACE_ITER_SLEEP_TIME = 0x100000, + TRACE_ITER_GRAPH_TIME = 0x200000, }; /* diff --git a/kernel/trace/trace_functions_graph.c b/kernel/trace/trace_functions_graph.c index 85bba0f018b..10f6ad7d85f 100644 --- a/kernel/trace/trace_functions_graph.c +++ b/kernel/trace/trace_functions_graph.c @@ -78,13 +78,14 @@ ftrace_push_return_trace(unsigned long ret, unsigned long func, int *depth) current->ret_stack[index].ret = ret; current->ret_stack[index].func = func; current->ret_stack[index].calltime = calltime; + current->ret_stack[index].subtime = 0; *depth = index; return 0; } /* Retrieve a function return address to the trace stack on thread info.*/ -void +static void ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) { int index; @@ -104,9 +105,6 @@ ftrace_pop_return_trace(struct ftrace_graph_ret *trace, unsigned long *ret) trace->calltime = current->ret_stack[index].calltime; trace->overrun = atomic_read(¤t->trace_overrun); trace->depth = index; - barrier(); - current->curr_ret_stack--; - } /* @@ -121,6 +119,8 @@ unsigned long ftrace_return_to_handler(void) ftrace_pop_return_trace(&trace, &ret); trace.rettime = trace_clock_local(); ftrace_graph_return(&trace); + barrier(); + current->curr_ret_stack--; if (unlikely(!ret)) { ftrace_graph_stop(); From fb9fb015e92123fa3a8e0c2e2fff491d4a56b470 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 13:26:41 -0400 Subject: [PATCH 0114/2061] tracing: clean up tracing profiler Ingo Molnar suggested clean ups for the profiling code. This patch makes those updates. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 28 +++++++++++++++------------- 1 file changed, 15 insertions(+), 13 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 71e5faef12a..a141d8499ab 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -69,7 +69,7 @@ static DEFINE_MUTEX(ftrace_lock); static struct ftrace_ops ftrace_list_end __read_mostly = { - .func = ftrace_stub, + .func = ftrace_stub, }; static struct ftrace_ops *ftrace_list __read_mostly = &ftrace_list_end; @@ -271,8 +271,10 @@ struct ftrace_profile_stat { #define PROFILES_PER_PAGE \ (PROFILE_RECORDS_SIZE / sizeof(struct ftrace_profile)) -static int ftrace_profile_bits; -static int ftrace_profile_enabled; +static int ftrace_profile_bits __read_mostly; +static int ftrace_profile_enabled __read_mostly; + +/* ftrace_profile_lock - synchronize the enable and disable of the profiler */ static DEFINE_MUTEX(ftrace_profile_lock); static DEFINE_PER_CPU(struct ftrace_profile_stat, ftrace_profile_stats); @@ -651,7 +653,7 @@ static void unregister_ftrace_profiler(void) #else static struct ftrace_ops ftrace_profile_ops __read_mostly = { - .func = function_profile_call, + .func = function_profile_call, }; static int register_ftrace_profiler(void) @@ -670,7 +672,7 @@ ftrace_profile_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { unsigned long val; - char buf[64]; + char buf[64]; /* big enough to hold a number */ int ret; if (cnt >= sizeof(buf)) @@ -719,7 +721,7 @@ static ssize_t ftrace_profile_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { - char buf[64]; + char buf[64]; /* big enough to hold a number */ int r; r = sprintf(buf, "%u\n", ftrace_profile_enabled); @@ -734,12 +736,12 @@ static const struct file_operations ftrace_profile_fops = { /* used to initialize the real stat files */ static struct tracer_stat function_stats __initdata = { - .name = "functions", - .stat_start = function_stat_start, - .stat_next = function_stat_next, - .stat_cmp = function_stat_cmp, - .stat_headers = function_stat_headers, - .stat_show = function_stat_show + .name = "functions", + .stat_start = function_stat_start, + .stat_next = function_stat_next, + .stat_cmp = function_stat_cmp, + .stat_headers = function_stat_headers, + .stat_show = function_stat_show }; static void ftrace_profile_debugfs(struct dentry *d_tracer) @@ -1954,7 +1956,7 @@ function_trace_probe_call(unsigned long ip, unsigned long parent_ip) static struct ftrace_ops trace_probe_ops __read_mostly = { - .func = function_trace_probe_call, + .func = function_trace_probe_call, }; static int ftrace_probe_registered; From 318e0a73c9e41b9a17241829bcd0605a39b87cb9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 20:06:34 -0400 Subject: [PATCH 0115/2061] tracing: remove on the fly allocator from function profiler Impact: safer code The on the fly allocator for the function profiler was to save memory. But at the expense of stability. Although it survived several tests, allocating from the function tracer is just too risky, just to save space. This patch removes the allocator and simply allocates enough entries at start up. Each function gets a profiling structure of 40 bytes. With an average of 20K functions, and this is for each CPU, we have 800K per online CPU. This is not too bad, at least for non-embedded. Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 76 ++++++++++++++++++++++++------------------- 1 file changed, 43 insertions(+), 33 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a141d8499ab..4d90c916b2b 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -401,6 +401,8 @@ static void ftrace_profile_reset(struct ftrace_profile_stat *stat) int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) { struct ftrace_profile_page *pg; + int functions; + int pages; int i; /* If we already allocated, do nothing */ @@ -411,22 +413,46 @@ int ftrace_profile_pages_init(struct ftrace_profile_stat *stat) if (!stat->pages) return -ENOMEM; +#ifdef CONFIG_DYNAMIC_FTRACE + functions = ftrace_update_tot_cnt; +#else + /* + * We do not know the number of functions that exist because + * dynamic tracing is what counts them. With past experience + * we have around 20K functions. That should be more than enough. + * It is highly unlikely we will execute every function in + * the kernel. + */ + functions = 20000; +#endif + pg = stat->start = stat->pages; - /* allocate 10 more pages to start */ - for (i = 0; i < 10; i++) { + pages = DIV_ROUND_UP(functions, PROFILES_PER_PAGE); + + for (i = 0; i < pages; i++) { pg->next = (void *)get_zeroed_page(GFP_KERNEL); - /* - * We only care about allocating profile_pages, if - * we failed to allocate here, hopefully we will allocate - * later. - */ if (!pg->next) - break; + goto out_free; pg = pg->next; } return 0; + + out_free: + pg = stat->start; + while (pg) { + unsigned long tmp = (unsigned long)pg; + + pg = pg->next; + free_page(tmp); + } + + free_page((unsigned long)stat->pages); + stat->pages = NULL; + stat->start = NULL; + + return -ENOMEM; } static int ftrace_profile_init_cpu(int cpu) @@ -460,7 +486,7 @@ static int ftrace_profile_init_cpu(int cpu) ftrace_profile_bits++; } - /* Preallocate a few pages */ + /* Preallocate the function profiling pages */ if (ftrace_profile_pages_init(stat) < 0) { kfree(stat->hash); stat->hash = NULL; @@ -516,24 +542,21 @@ static void ftrace_add_profile(struct ftrace_profile_stat *stat, hlist_add_head_rcu(&rec->node, &stat->hash[key]); } -/* Interrupts must be disabled calling this */ +/* + * The memory is already allocated, this simply finds a new record to use. + */ static struct ftrace_profile * -ftrace_profile_alloc(struct ftrace_profile_stat *stat, - unsigned long ip, bool alloc_safe) +ftrace_profile_alloc(struct ftrace_profile_stat *stat, unsigned long ip) { struct ftrace_profile *rec = NULL; - /* prevent recursion */ + /* prevent recursion (from NMIs) */ if (atomic_inc_return(&stat->disabled) != 1) goto out; - /* Try to always keep another page available */ - if (!stat->pages->next && alloc_safe) - stat->pages->next = (void *)get_zeroed_page(GFP_ATOMIC); - /* - * Try to find the function again since another - * task on another CPU could have added it + * Try to find the function again since an NMI + * could have added it */ rec = ftrace_find_profiled_func(stat, ip); if (rec) @@ -555,29 +578,16 @@ ftrace_profile_alloc(struct ftrace_profile_stat *stat, return rec; } -/* - * If we are not in an interrupt, or softirq and - * and interrupts are disabled and preemption is not enabled - * (not in a spinlock) then it should be safe to allocate memory. - */ -static bool ftrace_safe_to_allocate(void) -{ - return !in_interrupt() && irqs_disabled() && !preempt_count(); -} - static void function_profile_call(unsigned long ip, unsigned long parent_ip) { struct ftrace_profile_stat *stat; struct ftrace_profile *rec; unsigned long flags; - bool alloc_safe; if (!ftrace_profile_enabled) return; - alloc_safe = ftrace_safe_to_allocate(); - local_irq_save(flags); stat = &__get_cpu_var(ftrace_profile_stats); @@ -586,7 +596,7 @@ function_profile_call(unsigned long ip, unsigned long parent_ip) rec = ftrace_find_profiled_func(stat, ip); if (!rec) { - rec = ftrace_profile_alloc(stat, ip, alloc_safe); + rec = ftrace_profile_alloc(stat, ip); if (!rec) goto out; } From 34886c8bc590f078d4c0b88f50d061326639198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 21:00:47 -0400 Subject: [PATCH 0116/2061] tracing: add average time in function to function profiler Show the average time in the function (Time / Hit) Function Hit Time Avg -------- --- ---- --- mwait_idle 51 140326.6 us 2751.503 us smp_apic_timer_interrupt 47 3517.735 us 74.845 us schedule 10 2738.754 us 273.875 us __schedule 10 2732.857 us 273.285 us hrtimer_interrupt 47 1896.104 us 40.342 us irq_exit 56 1711.833 us 30.568 us __run_hrtimer 47 1315.589 us 27.991 us tick_sched_timer 47 1138.690 us 24.227 us do_softirq 56 1116.829 us 19.943 us __do_softirq 56 1066.932 us 19.052 us do_IRQ 9 926.153 us 102.905 us Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 4d90c916b2b..c7f4a4be05d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -347,8 +347,10 @@ static int function_stat_cmp(void *p1, void *p2) static int function_stat_headers(struct seq_file *m) { #ifdef CONFIG_FUNCTION_GRAPH_TRACER - seq_printf(m, " Function Hit Time\n" - " -------- --- ----\n"); + seq_printf(m, " Function " + "Hit Time Avg\n" + " -------- " + "--- ---- ---\n"); #else seq_printf(m, " Function Hit\n" " -------- ---\n"); @@ -361,12 +363,9 @@ static int function_stat_show(struct seq_file *m, void *v) struct ftrace_profile *rec = v; char str[KSYM_SYMBOL_LEN]; #ifdef CONFIG_FUNCTION_GRAPH_TRACER - static struct trace_seq s; static DEFINE_MUTEX(mutex); - - mutex_lock(&mutex); - trace_seq_init(&s); - trace_print_graph_duration(rec->time, &s); + static struct trace_seq s; + unsigned long long avg; #endif kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); @@ -374,6 +373,14 @@ static int function_stat_show(struct seq_file *m, void *v) #ifdef CONFIG_FUNCTION_GRAPH_TRACER seq_printf(m, " "); + avg = rec->time; + do_div(avg, rec->counter); + + mutex_lock(&mutex); + trace_seq_init(&s); + trace_print_graph_duration(rec->time, &s); + trace_seq_puts(&s, " "); + trace_print_graph_duration(avg, &s); trace_print_seq(m, &s); mutex_unlock(&mutex); #endif From a8a93f3f03b7a8008d720e8d91798efe599d416c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 13:45:34 -0800 Subject: [PATCH 0117/2061] mm: disable preemption in apply_to_pte_range Impact: bugfix Lazy mmu mode needs preemption disabled, so if we're apply to init_mm (which doesn't require any pte locks), then explicitly disable preemption. (Do it unconditionally after checking we've successfully done the allocation to simplify the error handling.) Signed-off-by: Jeremy Fitzhardinge --- mm/memory.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/memory.c b/mm/memory.c index baa999e87cd..b80cc31292b 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1718,6 +1718,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); + preempt_disable(); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); @@ -1729,6 +1730,7 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); + preempt_enable(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); From b8bcfe997e46150fedcc3f5b26b846400122fdd9 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:05:19 -0800 Subject: [PATCH 0118/2061] x86/paravirt: remove lazy mode in interrupts Impact: simplification, robustness Make paravirt_lazy_mode() always return PARAVIRT_LAZY_NONE when in an interrupt. This prevents interrupt code from accidentally inheriting an outer lazy state, and instead does everything synchronously. Outer batched operations are left deferred. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra Cc: Thomas Gleixner --- arch/x86/kernel/paravirt.c | 3 +++ arch/x86/mm/fault.c | 6 ++---- arch/x86/mm/highmem_32.c | 2 -- arch/x86/mm/iomap_32.c | 1 - arch/x86/mm/pageattr.c | 14 -------------- 5 files changed, 5 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 63dd358d8ee..8ab250ac498 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -282,6 +282,9 @@ void paravirt_leave_lazy_cpu(void) enum paravirt_lazy_mode paravirt_get_lazy_mode(void) { + if (in_interrupt()) + return PARAVIRT_LAZY_NONE; + return __get_cpu_var(paravirt_lazy_mode); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..cfbb4a73801 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -225,12 +225,10 @@ static inline pmd_t *vmalloc_sync_one(pgd_t *pgd, unsigned long address) if (!pmd_present(*pmd_k)) return NULL; - if (!pmd_present(*pmd)) { + if (!pmd_present(*pmd)) set_pmd(pmd, *pmd_k); - arch_flush_lazy_mmu_mode(); - } else { + else BUG_ON(pmd_page(*pmd) != pmd_page(*pmd_k)); - } return pmd_k; } diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 00f127c80b0..e81dfa40815 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -87,7 +87,6 @@ void *kmap_atomic_prot(struct page *page, enum km_type type, pgprot_t prot) vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); BUG_ON(!pte_none(*(kmap_pte-idx))); set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); return (void *)vaddr; } @@ -117,7 +116,6 @@ void kunmap_atomic(void *kvaddr, enum km_type type) #endif } - arch_flush_lazy_mmu_mode(); pagefault_enable(); } diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index 04102d42ff4..b6a61f3d7ef 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -74,7 +74,6 @@ iounmap_atomic(void *kvaddr, enum km_type type) if (vaddr == __fix_to_virt(FIX_KMAP_BEGIN+idx)) kpte_clear_flush(kmap_pte-idx, vaddr); - arch_flush_lazy_mmu_mode(); pagefault_enable(); } EXPORT_SYMBOL_GPL(iounmap_atomic); diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 9c4294986af..9015e5e412b 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -824,13 +824,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, vm_unmap_aliases(); - /* - * If we're called with lazy mmu updates enabled, the - * in-memory pte state may be stale. Flush pending updates to - * bring them up to date. - */ - arch_flush_lazy_mmu_mode(); - cpa.vaddr = addr; cpa.numpages = numpages; cpa.mask_set = mask_set; @@ -873,13 +866,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, } else cpa_flush_all(cache); - /* - * If we've been called with lazy mmu updates enabled, then - * make sure that everything gets flushed out before we - * return. - */ - arch_flush_lazy_mmu_mode(); - out: return ret; } From 7fd7d83d49914f03aefffba6aee09032fcd54cce Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:24:03 -0800 Subject: [PATCH 0119/2061] x86/pvops: replace arch_enter_lazy_cpu_mode with arch_start_context_switch Impact: simplification, prepare for later changes Make lazy cpu mode more specific to context switching, so that it makes sense to do more context-switch specific things in the callbacks. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 8 +++----- arch/x86/kernel/paravirt.c | 13 ------------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/xen/mmu.c | 5 +---- include/asm-frv/pgtable.h | 4 ++-- include/asm-generic/pgtable.h | 21 +++++++++++---------- kernel/sched.c | 2 +- 8 files changed, 20 insertions(+), 37 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 0617d5cc971..7b28abac323 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1420,19 +1420,17 @@ void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); void paravirt_leave_lazy(enum paravirt_lazy_mode mode); -#define __HAVE_ARCH_ENTER_LAZY_CPU_MODE -static inline void arch_enter_lazy_cpu_mode(void) +#define __HAVE_ARCH_START_CONTEXT_SWITCH +static inline void arch_start_context_switch(void) { PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); } -static inline void arch_leave_lazy_cpu_mode(void) +static inline void arch_end_context_switch(void) { PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); } -void arch_flush_lazy_cpu_mode(void); - #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE static inline void arch_enter_lazy_mmu_mode(void) { diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 8ab250ac498..5eea9548216 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -301,19 +301,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_enable(); } -void arch_flush_lazy_cpu_mode(void) -{ - preempt_disable(); - - if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_CPU) { - WARN_ON(preempt_count() == 1); - arch_leave_lazy_cpu_mode(); - arch_enter_lazy_cpu_mode(); - } - - preempt_enable(); -} - struct pv_info pv_info = { .name = "bare hardware", .paravirt_enabled = 0, diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 14014d766ca..57e49a8278a 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index abb7e6a7f0c..7115e608532 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_leave_lazy_cpu_mode(); + arch_end_context_switch(); /* * Switch FS and GS. diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index cb6afa4ec95..6b98f87232a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1119,10 +1119,8 @@ static void drop_other_mm_ref(void *info) /* If this cpu still has a stale cr3 reference, then make sure it has been flushed. */ - if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) { + if (percpu_read(xen_current_cr3) == __pa(mm->pgd)) load_cr3(swapper_pg_dir); - arch_flush_lazy_cpu_mode(); - } } static void xen_drop_mm_ref(struct mm_struct *mm) @@ -1135,7 +1133,6 @@ static void xen_drop_mm_ref(struct mm_struct *mm) load_cr3(swapper_pg_dir); else leave_mm(smp_processor_id()); - arch_flush_lazy_cpu_mode(); } /* Get the "official" set of cpus referring to our pagetable. */ diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index e16fdb1f4f4..235e34a7a34 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -73,8 +73,8 @@ static inline int pte_file(pte_t pte) { return 0; } #define pgtable_cache_init() do {} while (0) #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_enter_lazy_cpu_mode() do {} while (0) -#define arch_leave_lazy_cpu_mode() do {} while (0) + +#define arch_start_context_switch() do {} while (0) #else /* !CONFIG_MMU */ /*****************************************************************************/ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 8e6d0ca70ab..922f03671dd 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -280,17 +280,18 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, #endif /* - * A facility to provide batching of the reload of page tables with the - * actual context switch code for paravirtualized guests. By convention, - * only one of the lazy modes (CPU, MMU) should be active at any given - * time, entry should never be nested, and entry and exits should always - * be paired. This is for sanity of maintaining and reasoning about the - * kernel code. + * A facility to provide batching of the reload of page tables and + * other process state with the actual context switch code for + * paravirtualized guests. By convention, only one of the batched + * update (lazy) modes (CPU, MMU) should be active at any given time, + * entry should never be nested, and entry and exits should always be + * paired. This is for sanity of maintaining and reasoning about the + * kernel code. In this case, the exit (end of the context switch) is + * in architecture-specific code, and so doesn't need a generic + * definition. */ -#ifndef __HAVE_ARCH_ENTER_LAZY_CPU_MODE -#define arch_enter_lazy_cpu_mode() do {} while (0) -#define arch_leave_lazy_cpu_mode() do {} while (0) -#define arch_flush_lazy_cpu_mode() do {} while (0) +#ifndef __HAVE_ARCH_START_CONTEXT_SWITCH +#define arch_start_context_switch() do {} while (0) #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/kernel/sched.c b/kernel/sched.c index 5757e03cfac..7530fdd7c98 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_enter_lazy_cpu_mode(); + arch_start_context_switch(); if (unlikely(!mm)) { next->active_mm = oldmm; From b407fc57b815b2016186220baabc76cc8264206e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:46:21 -0800 Subject: [PATCH 0120/2061] x86/paravirt: flush pending mmu updates on context switch Impact: allow preemption during lazy mmu updates If we're in lazy mmu mode when context switching, leave lazy mmu mode, but remember the task's state in TIF_LAZY_MMU_UPDATES. When we resume the task, check this flag and re-enter lazy mmu mode if its set. This sets things up for allowing lazy mmu mode while preemptible, though that won't actually be active until the next change. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 1 - arch/x86/include/asm/thread_info.h | 2 ++ arch/x86/kernel/kvm.c | 2 +- arch/x86/kernel/paravirt.c | 13 ++++++++++--- arch/x86/kernel/vmi_32.c | 14 ++++++++++---- arch/x86/lguest/boot.c | 14 ++++++++++---- arch/x86/xen/enlighten.c | 6 +++--- arch/x86/xen/mmu.c | 7 ++++++- arch/x86/xen/xen-ops.h | 1 - 9 files changed, 42 insertions(+), 18 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 7b28abac323..58d2481b01a 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -1418,7 +1418,6 @@ void paravirt_enter_lazy_cpu(void); void paravirt_leave_lazy_cpu(void); void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); -void paravirt_leave_lazy(enum paravirt_lazy_mode mode); #define __HAVE_ARCH_START_CONTEXT_SWITCH static inline void arch_start_context_switch(void) diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index df9d5f78385..2f34d643b56 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -94,6 +94,7 @@ struct thread_info { #define TIF_FORCED_TF 24 /* true if TF in eflags artificially */ #define TIF_DEBUGCTLMSR 25 /* uses thread_struct.debugctlmsr */ #define TIF_DS_AREA_MSR 26 /* uses thread_struct.ds_area_msr */ +#define TIF_LAZY_MMU_UPDATES 27 /* task is updating the mmu lazily */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -115,6 +116,7 @@ struct thread_info { #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) #define _TIF_DEBUGCTLMSR (1 << TIF_DEBUGCTLMSR) #define _TIF_DS_AREA_MSR (1 << TIF_DS_AREA_MSR) +#define _TIF_LAZY_MMU_UPDATES (1 << TIF_LAZY_MMU_UPDATES) /* work to do in syscall_trace_enter() */ #define _TIF_WORK_SYSCALL_ENTRY \ diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c index 478bca986ec..5d7f6e76b5d 100644 --- a/arch/x86/kernel/kvm.c +++ b/arch/x86/kernel/kvm.c @@ -201,7 +201,7 @@ static void kvm_leave_lazy_mmu(void) struct kvm_para_state *state = kvm_para_state(); mmu_queue_flush(state); - paravirt_leave_lazy(paravirt_get_lazy_mode()); + paravirt_leave_lazy_mmu(); state->mode = paravirt_get_lazy_mode(); } diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 5eea9548216..430a0e30577 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -252,7 +252,7 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) __get_cpu_var(paravirt_lazy_mode) = mode; } -void paravirt_leave_lazy(enum paravirt_lazy_mode mode) +static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); BUG_ON(preemptible()); @@ -267,17 +267,24 @@ void paravirt_enter_lazy_mmu(void) void paravirt_leave_lazy_mmu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_MMU); + leave_lazy(PARAVIRT_LAZY_MMU); } void paravirt_enter_lazy_cpu(void) { + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { + arch_leave_lazy_mmu_mode(); + set_thread_flag(TIF_LAZY_MMU_UPDATES); + } enter_lazy(PARAVIRT_LAZY_CPU); } void paravirt_leave_lazy_cpu(void) { - paravirt_leave_lazy(PARAVIRT_LAZY_CPU); + leave_lazy(PARAVIRT_LAZY_CPU); + + if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + arch_enter_lazy_mmu_mode(); } enum paravirt_lazy_mode paravirt_get_lazy_mode(void) diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 2cc4a90e2cb..950929c607d 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -473,16 +473,22 @@ static void vmi_enter_lazy_cpu(void) vmi_ops.set_lazy_mode(2); } +static void vmi_leave_lazy_cpu(void) +{ + vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_cpu(); +} + static void vmi_enter_lazy_mmu(void) { paravirt_enter_lazy_mmu(); vmi_ops.set_lazy_mode(1); } -static void vmi_leave_lazy(void) +static void vmi_leave_lazy_mmu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); vmi_ops.set_lazy_mode(0); + paravirt_leave_lazy_mmu(); } static inline int __init check_vmi_rom(struct vrom_header *rom) @@ -718,12 +724,12 @@ static inline int __init activate_vmi(void) para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, set_lazy_mode, SetLazyMode); - para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy, + para_wrap(pv_mmu_ops.lazy_mode.leave, vmi_leave_lazy_mmu, set_lazy_mode, SetLazyMode); /* user and kernel flush are just handled with different flags to FlushTLB */ diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 9fe4ddaa8f6..41a5562e710 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -147,10 +147,16 @@ static void lazy_hcall(unsigned long call, /* When lazy mode is turned off reset the per-cpu lazy mode variable and then * issue the do-nothing hypercall to flush any stored calls. */ -static void lguest_leave_lazy_mode(void) +static void lguest_leave_lazy_mmu_mode(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + paravirt_leave_lazy_mmu(); +} + +static void lguest_leave_lazy_cpu_mode(void) +{ + hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + paravirt_leave_lazy_cpu(); } /*G:033 @@ -1026,7 +1032,7 @@ __init void lguest_init(void) pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; @@ -1039,7 +1045,7 @@ __init void lguest_init(void) pv_mmu_ops.read_cr2 = lguest_read_cr2; pv_mmu_ops.read_cr3 = lguest_read_cr3; pv_mmu_ops.lazy_mode.enter = paravirt_enter_lazy_mmu; - pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mode; + pv_mmu_ops.lazy_mode.leave = lguest_leave_lazy_mmu_mode; #ifdef CONFIG_X86_LOCAL_APIC /* apic read/write intercepts */ diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 82cd39a6cbd..f586e63b9a6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -void xen_leave_lazy(void) +static void xen_leave_lazy_cpu(void) { - paravirt_leave_lazy(paravirt_get_lazy_mode()); xen_mc_flush(); + paravirt_leave_lazy_cpu(); } static unsigned long xen_store_tr(void) @@ -819,7 +819,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_cpu, }, }; diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 6b98f87232a..f5f8faa4f76 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1816,6 +1816,11 @@ __init void xen_post_allocator_init(void) xen_mark_init_mm_pinned(); } +static void xen_leave_lazy_mmu(void) +{ + xen_mc_flush(); + paravirt_leave_lazy_mmu(); +} const struct pv_mmu_ops xen_mmu_ops __initdata = { .pagetable_setup_start = xen_pagetable_setup_start, @@ -1891,7 +1896,7 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .lazy_mode = { .enter = paravirt_enter_lazy_mmu, - .leave = xen_leave_lazy, + .leave = xen_leave_lazy_mmu, }, .set_fixmap = xen_set_fixmap, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index 2f5ef2632ea..f897cdffccb 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -30,7 +30,6 @@ pgd_t *xen_setup_kernel_pagetable(pgd_t *pgd, unsigned long max_pfn); void xen_ident_map_ISA(void); void xen_reserve_top(void); -void xen_leave_lazy(void); void xen_post_allocator_init(void); char * __init xen_memory_setup(void); From 224101ed69d3fbb486868e0f6e0f9fa37302efb4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 11:18:57 -0800 Subject: [PATCH 0121/2061] x86/paravirt: finish change from lazy cpu to context switch start/end Impact: fix lazy context switch API Pass the previous and next tasks into the context switch start end calls, so that the called functions can properly access the task state (esp in end_context_switch, in which the next task is not yet completely current). Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/include/asm/paravirt.h | 17 ++++++++++------- arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/kernel/paravirt.c | 14 ++++++-------- arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- arch/x86/kernel/vmi_32.c | 12 ++++++------ arch/x86/lguest/boot.c | 8 ++++---- arch/x86/xen/enlighten.c | 10 ++++------ include/asm-frv/pgtable.h | 2 +- include/asm-generic/pgtable.h | 2 +- kernel/sched.c | 2 +- 11 files changed, 37 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index 58d2481b01a..dfdee0ca57d 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -56,6 +56,7 @@ struct desc_ptr; struct tss_struct; struct mm_struct; struct desc_struct; +struct task_struct; /* * Wrapper type for pointers to code which uses the non-standard @@ -203,7 +204,8 @@ struct pv_cpu_ops { void (*swapgs)(void); - struct pv_lazy_ops lazy_mode; + void (*start_context_switch)(struct task_struct *prev); + void (*end_context_switch)(struct task_struct *next); }; struct pv_irq_ops { @@ -1414,20 +1416,21 @@ enum paravirt_lazy_mode { }; enum paravirt_lazy_mode paravirt_get_lazy_mode(void); -void paravirt_enter_lazy_cpu(void); -void paravirt_leave_lazy_cpu(void); +void paravirt_start_context_switch(struct task_struct *prev); +void paravirt_end_context_switch(struct task_struct *next); + void paravirt_enter_lazy_mmu(void); void paravirt_leave_lazy_mmu(void); #define __HAVE_ARCH_START_CONTEXT_SWITCH -static inline void arch_start_context_switch(void) +static inline void arch_start_context_switch(struct task_struct *prev) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.enter); + PVOP_VCALL1(pv_cpu_ops.start_context_switch, prev); } -static inline void arch_end_context_switch(void) +static inline void arch_end_context_switch(struct task_struct *next) { - PVOP_VCALL0(pv_cpu_ops.lazy_mode.leave); + PVOP_VCALL1(pv_cpu_ops.end_context_switch, next); } #define __HAVE_ARCH_ENTER_LAZY_MMU_MODE diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index d0812e155f1..24e42836e92 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -83,6 +83,8 @@ static inline void __init paravirt_pagetable_setup_done(pgd_t *base) #define pte_val(x) native_pte_val(x) #define __pte(x) native_make_pte(x) +#define arch_end_context_switch(prev) do {} while(0) + #endif /* CONFIG_PARAVIRT */ /* diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index 430a0e30577..cf1437503ba 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -270,20 +270,20 @@ void paravirt_leave_lazy_mmu(void) leave_lazy(PARAVIRT_LAZY_MMU); } -void paravirt_enter_lazy_cpu(void) +void paravirt_start_context_switch(struct task_struct *prev) { if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); - set_thread_flag(TIF_LAZY_MMU_UPDATES); + set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); } enter_lazy(PARAVIRT_LAZY_CPU); } -void paravirt_leave_lazy_cpu(void) +void paravirt_end_context_switch(struct task_struct *next) { leave_lazy(PARAVIRT_LAZY_CPU); - if (test_and_clear_thread_flag(TIF_LAZY_MMU_UPDATES)) + if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) arch_enter_lazy_mmu_mode(); } @@ -399,10 +399,8 @@ struct pv_cpu_ops pv_cpu_ops = { .set_iopl_mask = native_set_iopl_mask, .io_delay = native_io_delay, - .lazy_mode = { - .enter = paravirt_nop, - .leave = paravirt_nop, - }, + .start_context_switch = paravirt_nop, + .end_context_switch = paravirt_nop, }; struct pv_apic_ops pv_apic_ops = { diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 57e49a8278a..d766c7616fd 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -407,7 +407,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* If the task has used fpu the last 5 timeslices, just do a full * restore of the math state immediately to avoid the trap; the diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 7115e608532..e8a9aaf9df8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -428,7 +428,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) * done before math_state_restore, so the TS bit is up * to date. */ - arch_end_context_switch(); + arch_end_context_switch(next_p); /* * Switch FS and GS. diff --git a/arch/x86/kernel/vmi_32.c b/arch/x86/kernel/vmi_32.c index 950929c607d..55a5d6938e5 100644 --- a/arch/x86/kernel/vmi_32.c +++ b/arch/x86/kernel/vmi_32.c @@ -467,16 +467,16 @@ vmi_startup_ipi_hook(int phys_apicid, unsigned long start_eip, } #endif -static void vmi_enter_lazy_cpu(void) +static void vmi_start_context_switch(struct task_struct *prev) { - paravirt_enter_lazy_cpu(); + paravirt_start_context_switch(prev); vmi_ops.set_lazy_mode(2); } -static void vmi_leave_lazy_cpu(void) +static void vmi_end_context_switch(struct task_struct *next) { vmi_ops.set_lazy_mode(0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static void vmi_enter_lazy_mmu(void) @@ -722,9 +722,9 @@ static inline int __init activate_vmi(void) para_fill(pv_cpu_ops.set_iopl_mask, SetIOPLMask); para_fill(pv_cpu_ops.io_delay, IODelay); - para_wrap(pv_cpu_ops.lazy_mode.enter, vmi_enter_lazy_cpu, + para_wrap(pv_cpu_ops.start_context_switch, vmi_start_context_switch, set_lazy_mode, SetLazyMode); - para_wrap(pv_cpu_ops.lazy_mode.leave, vmi_leave_lazy_cpu, + para_wrap(pv_cpu_ops.end_context_switch, vmi_end_context_switch, set_lazy_mode, SetLazyMode); para_wrap(pv_mmu_ops.lazy_mode.enter, vmi_enter_lazy_mmu, diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 41a5562e710..5287081b356 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -153,10 +153,10 @@ static void lguest_leave_lazy_mmu_mode(void) paravirt_leave_lazy_mmu(); } -static void lguest_leave_lazy_cpu_mode(void) +static void lguest_end_context_switch(struct task_struct *next) { hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } /*G:033 @@ -1031,8 +1031,8 @@ __init void lguest_init(void) pv_cpu_ops.write_gdt_entry = lguest_write_gdt_entry; pv_cpu_ops.write_idt_entry = lguest_write_idt_entry; pv_cpu_ops.wbinvd = lguest_wbinvd; - pv_cpu_ops.lazy_mode.enter = paravirt_enter_lazy_cpu; - pv_cpu_ops.lazy_mode.leave = lguest_leave_lazy_cpu_mode; + pv_cpu_ops.start_context_switch = paravirt_start_context_switch; + pv_cpu_ops.end_context_switch = lguest_end_context_switch; /* pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index f586e63b9a6..70b355d3a86 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -203,10 +203,10 @@ static unsigned long xen_get_debugreg(int reg) return HYPERVISOR_get_debugreg(reg); } -static void xen_leave_lazy_cpu(void) +static void xen_end_context_switch(struct task_struct *next) { xen_mc_flush(); - paravirt_leave_lazy_cpu(); + paravirt_end_context_switch(next); } static unsigned long xen_store_tr(void) @@ -817,10 +817,8 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { /* Xen takes care of %gs when switching to usermode for us */ .swapgs = paravirt_nop, - .lazy_mode = { - .enter = paravirt_enter_lazy_cpu, - .leave = xen_leave_lazy_cpu, - }, + .start_context_switch = paravirt_start_context_switch, + .end_context_switch = xen_end_context_switch, }; static const struct pv_apic_ops xen_apic_ops __initdata = { diff --git a/include/asm-frv/pgtable.h b/include/asm-frv/pgtable.h index 235e34a7a34..09887045d03 100644 --- a/include/asm-frv/pgtable.h +++ b/include/asm-frv/pgtable.h @@ -74,7 +74,7 @@ static inline int pte_file(pte_t pte) { return 0; } #define arch_enter_lazy_mmu_mode() do {} while (0) #define arch_leave_lazy_mmu_mode() do {} while (0) -#define arch_start_context_switch() do {} while (0) +#define arch_start_context_switch(prev) do {} while (0) #else /* !CONFIG_MMU */ /*****************************************************************************/ diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 922f03671dd..e410f602cab 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -291,7 +291,7 @@ static inline void ptep_modify_prot_commit(struct mm_struct *mm, * definition. */ #ifndef __HAVE_ARCH_START_CONTEXT_SWITCH -#define arch_start_context_switch() do {} while (0) +#define arch_start_context_switch(prev) do {} while (0) #endif #ifndef __HAVE_PFNMAP_TRACKING diff --git a/kernel/sched.c b/kernel/sched.c index 7530fdd7c98..133762aece5 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2746,7 +2746,7 @@ context_switch(struct rq *rq, struct task_struct *prev, * combine the page table reload and the switch backend into * one hypercall. */ - arch_start_context_switch(); + arch_start_context_switch(prev); if (unlikely(!mm)) { next->active_mm = oldmm; From 2829b449276aed45f3d649efb21e3418e39dd5d1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Feb 2009 23:53:19 -0800 Subject: [PATCH 0122/2061] x86/paravirt: allow preemption with lazy mmu mode Impact: remove obsolete checks, simplification Lift restrictions on preemption with lazy mmu mode, as it is now allowed. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- arch/x86/kernel/paravirt.c | 7 ++++--- arch/x86/xen/mmu.c | 8 +------- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index cf1437503ba..bf2e86eee80 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -247,7 +247,6 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = mode; } @@ -255,7 +254,6 @@ static inline void enter_lazy(enum paravirt_lazy_mode mode) static void leave_lazy(enum paravirt_lazy_mode mode) { BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); - BUG_ON(preemptible()); __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; } @@ -272,6 +270,8 @@ void paravirt_leave_lazy_mmu(void) void paravirt_start_context_switch(struct task_struct *prev) { + BUG_ON(preemptible()); + if (percpu_read(paravirt_lazy_mode) == PARAVIRT_LAZY_MMU) { arch_leave_lazy_mmu_mode(); set_ti_thread_flag(task_thread_info(prev), TIF_LAZY_MMU_UPDATES); @@ -281,6 +281,8 @@ void paravirt_start_context_switch(struct task_struct *prev) void paravirt_end_context_switch(struct task_struct *next) { + BUG_ON(preemptible()); + leave_lazy(PARAVIRT_LAZY_CPU); if (test_and_clear_ti_thread_flag(task_thread_info(next), TIF_LAZY_MMU_UPDATES)) @@ -300,7 +302,6 @@ void arch_flush_lazy_mmu_mode(void) preempt_disable(); if (paravirt_get_lazy_mode() == PARAVIRT_LAZY_MMU) { - WARN_ON(preempt_count() == 1); arch_leave_lazy_mmu_mode(); arch_enter_lazy_mmu_mode(); } diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index f5f8faa4f76..3f2d0fe5e6a 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -419,10 +419,6 @@ void set_pte_mfn(unsigned long vaddr, unsigned long mfn, pgprot_t flags) void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { - /* updates to init_mm may be done without lock */ - if (mm == &init_mm) - preempt_disable(); - ADD_STATS(set_pte_at, 1); // ADD_STATS(set_pte_at_pinned, xen_page_pinned(ptep)); ADD_STATS(set_pte_at_current, mm == current->mm); @@ -443,9 +439,7 @@ void xen_set_pte_at(struct mm_struct *mm, unsigned long addr, } xen_set_pte(ptep, pteval); -out: - if (mm == &init_mm) - preempt_enable(); +out: return; } pte_t xen_ptep_modify_prot_start(struct mm_struct *mm, From 252a6bf2a3a7e7add56b17d48aecf3f3ef213103 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:11:28 -0800 Subject: [PATCH 0123/2061] mm: allow preemption in apply_to_pte_range Impact: allow preemption in apply_to_pte_range updates to init_mm Preemption is now allowed for lazy mmu mode, so don't disable it for the inner loop of apply_to_pte_range. This only applies when doing updates to init_mm; user pagetables are still modified under the pte lock, so preemption is disabled anyway. Signed-off-by: Jeremy Fitzhardinge Acked-by: Peter Zijlstra --- mm/memory.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index b80cc31292b..baa999e87cd 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1718,7 +1718,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, BUG_ON(pmd_huge(*pmd)); - preempt_disable(); arch_enter_lazy_mmu_mode(); token = pmd_pgtable(*pmd); @@ -1730,7 +1729,6 @@ static int apply_to_pte_range(struct mm_struct *mm, pmd_t *pmd, } while (pte++, addr += PAGE_SIZE, addr != end); arch_leave_lazy_mmu_mode(); - preempt_enable(); if (mm != &init_mm) pte_unmap_unlock(pte-1, ptl); From ab2f75f0b760d2b0c9a875b669a1b51dce02c85a Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 18 Feb 2009 00:18:50 -0800 Subject: [PATCH 0124/2061] x86/paravirt: use percpu_ rather than __get_cpu_var Impact: minor optimisation percpu_read/write is a slightly more direct way of getting to percpu data. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/kernel/paravirt.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c index bf2e86eee80..254e8aa8bfd 100644 --- a/arch/x86/kernel/paravirt.c +++ b/arch/x86/kernel/paravirt.c @@ -246,16 +246,16 @@ static DEFINE_PER_CPU(enum paravirt_lazy_mode, paravirt_lazy_mode) = PARAVIRT_LA static inline void enter_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); + BUG_ON(percpu_read(paravirt_lazy_mode) != PARAVIRT_LAZY_NONE); - __get_cpu_var(paravirt_lazy_mode) = mode; + percpu_write(paravirt_lazy_mode, mode); } static void leave_lazy(enum paravirt_lazy_mode mode) { - BUG_ON(__get_cpu_var(paravirt_lazy_mode) != mode); + BUG_ON(percpu_read(paravirt_lazy_mode) != mode); - __get_cpu_var(paravirt_lazy_mode) = PARAVIRT_LAZY_NONE; + percpu_write(paravirt_lazy_mode, PARAVIRT_LAZY_NONE); } void paravirt_enter_lazy_mmu(void) @@ -294,7 +294,7 @@ enum paravirt_lazy_mode paravirt_get_lazy_mode(void) if (in_interrupt()) return PARAVIRT_LAZY_NONE; - return __get_cpu_var(paravirt_lazy_mode); + return percpu_read(paravirt_lazy_mode); } void arch_flush_lazy_mmu_mode(void) From 5caecb9432428241d0c641897f07ff4003f1b55f Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 20 Feb 2009 23:01:26 -0800 Subject: [PATCH 0125/2061] xen: disable preempt for leave_lazy_mmu xen_mc_flush() requires preemption to be disabled for its own sanity, so disable it while we're flushing. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 3f2d0fe5e6a..0e572380413 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1812,8 +1812,10 @@ __init void xen_post_allocator_init(void) static void xen_leave_lazy_mmu(void) { + preempt_disable(); xen_mc_flush(); paravirt_leave_lazy_mmu(); + preempt_enable(); } const struct pv_mmu_ops xen_mmu_ops __initdata = { From 59d7187142bbe9b404a403ed0f874d3227305f26 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 26 Feb 2009 15:48:33 -0800 Subject: [PATCH 0126/2061] xen: separate p2m allocation from setting When doing very early p2m setting, we need to separate setting from allocation, so split things up accordingly. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 71 ++++++++++++++++++++++++++++++++-------------- arch/x86/xen/mmu.h | 3 ++ 2 files changed, 52 insertions(+), 22 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 0e572380413..e0a55b7a6ce 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -233,47 +233,74 @@ unsigned long get_phys_to_machine(unsigned long pfn) } EXPORT_SYMBOL_GPL(get_phys_to_machine); -static void alloc_p2m(unsigned long **pp, unsigned long *mfnp) +/* install a new p2m_top page */ +bool install_p2mtop_page(unsigned long pfn, unsigned long *p) { - unsigned long *p; + unsigned topidx = p2m_top_index(pfn); + unsigned long **pfnp, *mfnp; unsigned i; - p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); - BUG_ON(p == NULL); + pfnp = &p2m_top[topidx]; + mfnp = &p2m_top_mfn[topidx]; for (i = 0; i < P2M_ENTRIES_PER_PAGE; i++) p[i] = INVALID_P2M_ENTRY; - if (cmpxchg(pp, p2m_missing, p) != p2m_missing) - free_page((unsigned long)p); - else + if (cmpxchg(pfnp, p2m_missing, p) == p2m_missing) { *mfnp = virt_to_mfn(p); + return true; + } + + return false; +} + +static void alloc_p2m(unsigned long pfn) +{ + unsigned long *p; + + p = (void *)__get_free_page(GFP_KERNEL | __GFP_NOFAIL); + BUG_ON(p == NULL); + + if (!install_p2mtop_page(pfn, p)) + free_page((unsigned long)p); +} + +/* Try to install p2m mapping; fail if intermediate bits missing */ +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn) +{ + unsigned topidx, idx; + + if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { + BUG_ON(mfn != INVALID_P2M_ENTRY); + return true; + } + + topidx = p2m_top_index(pfn); + if (p2m_top[topidx] == p2m_missing) { + if (mfn == INVALID_P2M_ENTRY) + return true; + return false; + } + + idx = p2m_index(pfn); + p2m_top[topidx][idx] = mfn; + + return true; } void set_phys_to_machine(unsigned long pfn, unsigned long mfn) { - unsigned topidx, idx; - if (unlikely(xen_feature(XENFEAT_auto_translated_physmap))) { BUG_ON(pfn != mfn && mfn != INVALID_P2M_ENTRY); return; } - if (unlikely(pfn >= MAX_DOMAIN_PAGES)) { - BUG_ON(mfn != INVALID_P2M_ENTRY); - return; - } + if (unlikely(!__set_phys_to_machine(pfn, mfn))) { + alloc_p2m(pfn); - topidx = p2m_top_index(pfn); - if (p2m_top[topidx] == p2m_missing) { - /* no need to allocate a page to store an invalid entry */ - if (mfn == INVALID_P2M_ENTRY) - return; - alloc_p2m(&p2m_top[topidx], &p2m_top_mfn[topidx]); + if (!__set_phys_to_machine(pfn, mfn)) + BUG(); } - - idx = p2m_index(pfn); - p2m_top[topidx][idx] = mfn; } unsigned long arbitrary_virt_to_mfn(void *vaddr) diff --git a/arch/x86/xen/mmu.h b/arch/x86/xen/mmu.h index 24d1b44a337..da730262489 100644 --- a/arch/x86/xen/mmu.h +++ b/arch/x86/xen/mmu.h @@ -11,6 +11,9 @@ enum pt_level { }; +bool __set_phys_to_machine(unsigned long pfn, unsigned long mfn); +bool install_p2mtop_page(unsigned long pfn, unsigned long *p); + void set_pte_mfn(unsigned long vaddr, unsigned long pfn, pgprot_t flags); From a2bcd4731f77cb77ae4b5e4a3d7f5471cf346c33 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 29 Mar 2009 23:47:48 +0200 Subject: [PATCH 0127/2061] x86/mm: further cleanups of fault.c's include file section Impact: cleanup Eliminate more than 20 unnecessary #include lines in fault.c Also fix include file dependency bug in asm/traps.h. (this was masked before, by implicit inclusion) Signed-off-by: Ingo Molnar LKML-Reference: Acked-by: H. Peter Anvin --- arch/x86/include/asm/traps.h | 1 + arch/x86/mm/fault.c | 42 ++++++++---------------------------- 2 files changed, 10 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..37fb07a9cda 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -2,6 +2,7 @@ #define _ASM_X86_TRAPS_H #include +#include /* TRAP_TRACE, ... */ #ifdef CONFIG_X86_32 #define dotraplinkage diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..24a36a6426a 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -3,40 +3,16 @@ * Copyright (C) 2001, 2002 Andi Kleen, SuSE Labs. * Copyright (C) 2008-2009, Red Hat Inc., Ingo Molnar */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include /* STACK_END_MAGIC */ +#include /* test_thread_flag(), ... */ +#include /* oops_begin/end, ... */ +#include /* search_exception_table */ +#include /* max_low_pfn */ +#include /* __kprobes, ... */ +#include /* kmmio_handler, ... */ -#include - -#include -#include -#include -#include -#include -#include -#include +#include /* dotraplinkage, ... */ +#include /* pgd_*(), ... */ /* * Page fault error code bits: From 7571a60446030d2576d881438447e86a0755a83b Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Feb 2009 15:34:59 -0800 Subject: [PATCH 0128/2061] xen: split construction of p2m mfn tables from registration Build the p2m_mfn_list_list early with the rest of the p2m table, but register it later when the real shared_info structure is in place. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index e0a55b7a6ce..67d2ab45cd9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -184,7 +184,7 @@ static inline unsigned p2m_index(unsigned long pfn) } /* Build the parallel p2m_top_mfn structures */ -void xen_setup_mfn_list_list(void) +static void __init xen_build_mfn_list_list(void) { unsigned pfn, idx; @@ -198,7 +198,10 @@ void xen_setup_mfn_list_list(void) unsigned topidx = idx * P2M_ENTRIES_PER_PAGE; p2m_top_mfn_list[idx] = virt_to_mfn(&p2m_top_mfn[topidx]); } +} +void xen_setup_mfn_list_list(void) +{ BUG_ON(HYPERVISOR_shared_info == &xen_dummy_shared_info); HYPERVISOR_shared_info->arch.pfn_to_mfn_frame_list_list = @@ -218,6 +221,8 @@ void __init xen_build_dynamic_phys_to_machine(void) p2m_top[topidx] = &mfn_list[pfn]; } + + xen_build_mfn_list_list(); } unsigned long get_phys_to_machine(unsigned long pfn) From 6ed6bf428aff64fe37cdc54b239d598fee6016f1 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 13:02:18 -0800 Subject: [PATCH 0129/2061] xen: clean up xen_load_gdt Makes the logic a bit clearer. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 70b355d3a86..5776dc27029 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -301,10 +301,21 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames = mcs.args; for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { - frames[f] = arbitrary_virt_to_mfn((void *)va); + int level; + pte_t *ptep = lookup_address(va, &level); + unsigned long pfn, mfn; + void *virt; + + BUG_ON(ptep == NULL); + + pfn = pte_pfn(*ptep); + mfn = pfn_to_mfn(pfn); + virt = __va(PFN_PHYS(pfn)); + + frames[f] = mfn; make_lowmem_page_readonly((void *)va); - make_lowmem_page_readonly(mfn_to_virt(frames[f])); + make_lowmem_page_readonly(virt); } MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); From 3ce5fa7ebff74b6a4dc5fdcdc22e6979f5a4ff85 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 15:26:00 -0800 Subject: [PATCH 0130/2061] xen: make xen_load_gdt simpler Remove use of multicall machinery which is unused (gdt loading is never performance critical). This removes the implicit use of percpu variables, which simplifies understanding how the percpu code's use of load_gdt interacts with this code. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5776dc27029..48b399bd6e0 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -284,12 +284,11 @@ static void xen_set_ldt(const void *addr, unsigned entries) static void xen_load_gdt(const struct desc_ptr *dtr) { - unsigned long *frames; unsigned long va = dtr->address; unsigned int size = dtr->size + 1; unsigned pages = (size + PAGE_SIZE - 1) / PAGE_SIZE; + unsigned long frames[pages]; int f; - struct multicall_space mcs; /* A GDT can be up to 64k in size, which corresponds to 8192 8-byte entries, or 16 4k pages.. */ @@ -297,9 +296,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr) BUG_ON(size > 65536); BUG_ON(va & ~PAGE_MASK); - mcs = xen_mc_entry(sizeof(*frames) * pages); - frames = mcs.args; - for (f = 0; va < dtr->address + size; va += PAGE_SIZE, f++) { int level; pte_t *ptep = lookup_address(va, &level); @@ -314,13 +310,15 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames[f] = mfn; + printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n", + f, (void *)va, mfn, pfn, virt); + make_lowmem_page_readonly((void *)va); make_lowmem_page_readonly(virt); } - MULTI_set_gdt(mcs.mc, frames, size / sizeof(struct desc_struct)); - - xen_mc_issue(PARAVIRT_LAZY_CPU); + if (HYPERVISOR_set_gdt(frames, size / sizeof(struct desc_struct))) + BUG(); } static void load_TLS_descriptor(struct thread_struct *t, From b4b7e58590d0e94ed78bd6be1aa163caba7b6c74 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 16:34:27 -0800 Subject: [PATCH 0131/2061] xen: remove xen_load_gdt debug Don't need the noise. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 48b399bd6e0..75b7a0f9038 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -310,9 +310,6 @@ static void xen_load_gdt(const struct desc_ptr *dtr) frames[f] = mfn; - printk("xen_load_gdt: %d va=%p mfn=%lx pfn=%lx va'=%p\n", - f, (void *)va, mfn, pfn, virt); - make_lowmem_page_readonly((void *)va); make_lowmem_page_readonly(virt); } From e9e2d1ffcfdb38bed11a3064aa74bea9ee38ed80 Mon Sep 17 00:00:00 2001 From: Hannes Eder Date: Thu, 5 Mar 2009 20:13:57 +0100 Subject: [PATCH 0132/2061] NULL noise: arch/x86/xen/smp.c Fix this sparse warnings: arch/x86/xen/smp.c:316:52: warning: Using plain integer as NULL pointer arch/x86/xen/smp.c:421:60: warning: Using plain integer as NULL pointer Signed-off-by: Hannes Eder Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/smp.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c index 8d470562ffc..304d832710c 100644 --- a/arch/x86/xen/smp.c +++ b/arch/x86/xen/smp.c @@ -317,7 +317,7 @@ static int __cpuinit xen_cpu_up(unsigned int cpu) BUG_ON(rc); while(per_cpu(cpu_state, cpu) != CPU_ONLINE) { - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); barrier(); } @@ -422,7 +422,7 @@ static void xen_smp_send_call_function_ipi(const struct cpumask *mask) /* Make sure other vcpus get a chance to run if they need to. */ for_each_cpu(cpu, mask) { if (xen_vcpu_stolen(cpu)) { - HYPERVISOR_sched_op(SCHEDOP_yield, 0); + HYPERVISOR_sched_op(SCHEDOP_yield, NULL); break; } } From e826fe1ba1563a9272345da8e3279a930ac160a7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 7 Mar 2009 17:09:27 -0800 Subject: [PATCH 0133/2061] xen: mask XSAVE from cpuid Xen leaves XSAVE set in cpuid, but doesn't allow cr4.OSXSAVE to be set. This confuses the kernel and it ends up crashing on an xsetbv instruction. At boot time, try to set cr4.OSXSAVE, and mask XSAVE out of cpuid it we can't. This will produce a spurious error from Xen, but allows us to support XSAVE if/when Xen does. This also factors out the cpuid mask decisions to boot time. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 50 +++++++++++++++++++++++++++++++++++----- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 75b7a0f9038..da33e0c5870 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -168,21 +168,23 @@ static void __init xen_banner(void) xen_feature(XENFEAT_mmu_pt_update_preserve_ad) ? " (preserve-AD)" : ""); } +static __read_mostly unsigned int cpuid_leaf1_edx_mask = ~0; +static __read_mostly unsigned int cpuid_leaf1_ecx_mask = ~0; + static void xen_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { + unsigned maskecx = ~0; unsigned maskedx = ~0; /* * Mask out inconvenient features, to try and disable as many * unsupported kernel subsystems as possible. */ - if (*ax == 1) - maskedx = ~((1 << X86_FEATURE_APIC) | /* disable APIC */ - (1 << X86_FEATURE_ACPI) | /* disable ACPI */ - (1 << X86_FEATURE_MCE) | /* disable MCE */ - (1 << X86_FEATURE_MCA) | /* disable MCA */ - (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + if (*ax == 1) { + maskecx = cpuid_leaf1_ecx_mask; + maskedx = cpuid_leaf1_edx_mask; + } asm(XEN_EMULATE_PREFIX "cpuid" : "=a" (*ax), @@ -190,9 +192,43 @@ static void xen_cpuid(unsigned int *ax, unsigned int *bx, "=c" (*cx), "=d" (*dx) : "0" (*ax), "2" (*cx)); + + *cx &= maskecx; *dx &= maskedx; } +static __init void xen_init_cpuid_mask(void) +{ + unsigned int ax, bx, cx, dx; + + cpuid_leaf1_edx_mask = + ~((1 << X86_FEATURE_MCE) | /* disable MCE */ + (1 << X86_FEATURE_MCA) | /* disable MCA */ + (1 << X86_FEATURE_ACC)); /* thermal monitoring */ + + if (!xen_initial_domain()) + cpuid_leaf1_edx_mask &= + ~((1 << X86_FEATURE_APIC) | /* disable local APIC */ + (1 << X86_FEATURE_ACPI)); /* disable ACPI */ + + ax = 1; + xen_cpuid(&ax, &bx, &cx, &dx); + + /* cpuid claims we support xsave; try enabling it to see what happens */ + if (cx & (1 << (X86_FEATURE_XSAVE % 32))) { + unsigned long cr4; + + set_in_cr4(X86_CR4_OSXSAVE); + + cr4 = read_cr4(); + + if ((cr4 & X86_CR4_OSXSAVE) == 0) + cpuid_leaf1_ecx_mask &= ~(1 << (X86_FEATURE_XSAVE % 32)); + + clear_in_cr4(X86_CR4_OSXSAVE); + } +} + static void xen_set_debugreg(int reg, unsigned long val) { HYPERVISOR_set_debugreg(reg, val); @@ -901,6 +937,8 @@ asmlinkage void __init xen_start_kernel(void) xen_init_irq_ops(); + xen_init_cpuid_mask(); + #ifdef CONFIG_X86_LOCAL_APIC /* * set up the basic apic ops. From 68509cdcde6583ee1a9542899d1270449c7d5903 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 8 Mar 2009 03:59:04 -0700 Subject: [PATCH 0134/2061] x86-64: remove PGE from must-have feature list PGE may not be available when running paravirtualized, so test the cpuid bit before using it. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/required-features.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index d5cd6c58688..a4737dddfd5 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -50,7 +50,7 @@ #ifdef CONFIG_X86_64 #define NEED_PSE 0 #define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) -#define NEED_PGE (1<<(X86_FEATURE_PGE & 31)) +#define NEED_PGE 0 #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) From 1e7449730853e7c9ae9a2458b2ced7ba12559a0e Mon Sep 17 00:00:00 2001 From: Alex Nixon Date: Mon, 9 Feb 2009 12:05:46 -0800 Subject: [PATCH 0135/2061] Xen: Add virt_to_pfn helper function Signed-off-by: Alex Nixon --- arch/x86/include/asm/xen/page.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/xen/page.h b/arch/x86/include/asm/xen/page.h index 1a918dde46b..018a0a40079 100644 --- a/arch/x86/include/asm/xen/page.h +++ b/arch/x86/include/asm/xen/page.h @@ -124,7 +124,8 @@ static inline unsigned long mfn_to_local_pfn(unsigned long mfn) /* VIRT <-> MACHINE conversion */ #define virt_to_machine(v) (phys_to_machine(XPADDR(__pa(v)))) -#define virt_to_mfn(v) (pfn_to_mfn(PFN_DOWN(__pa(v)))) +#define virt_to_pfn(v) (PFN_DOWN(__pa(v))) +#define virt_to_mfn(v) (pfn_to_mfn(virt_to_pfn(v))) #define mfn_to_virt(m) (__va(mfn_to_pfn(m) << PAGE_SHIFT)) static inline unsigned long pte_mfn(pte_t pte) From 5f241e65f2be4661a33e1937e1c829252a80b2b8 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 16 Mar 2009 17:08:48 -0700 Subject: [PATCH 0136/2061] x86-64: non-paravirt systems always has PSE and PGE A paravirtualized system may not have PSE or PGE available to guests, so they are not required features. However, without paravirt we can assume that any x86-64 implementation will have them available. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/required-features.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/required-features.h b/arch/x86/include/asm/required-features.h index a4737dddfd5..64cf2d24fad 100644 --- a/arch/x86/include/asm/required-features.h +++ b/arch/x86/include/asm/required-features.h @@ -48,9 +48,15 @@ #endif #ifdef CONFIG_X86_64 +#ifdef CONFIG_PARAVIRT +/* Paravirtualized systems may not have PSE or PGE available */ #define NEED_PSE 0 -#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_PGE 0 +#else +#define NEED_PSE (1<<(X86_FEATURE_PSE) & 31) +#define NEED_PGE (1<<(X86_FEATURE_PGE) & 31) +#endif +#define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) #define NEED_FXSR (1<<(X86_FEATURE_FXSR & 31)) #define NEED_XMM (1<<(X86_FEATURE_XMM & 31)) #define NEED_XMM2 (1<<(X86_FEATURE_XMM2 & 31)) From 4185f35404dc96f8525298c7c548aee419f3b3f4 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 17 Mar 2009 13:30:55 -0700 Subject: [PATCH 0137/2061] xen/mmu: some early pagetable cleanups 1. make sure early-allocated ptes are pinned, so they can be later unpinned 2. don't pin pmd+pud, just make them RO 3. scatter some __inits around Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 40 ++++++++++++++++++++++++++++------------ arch/x86/xen/xen-ops.h | 2 -- 2 files changed, 28 insertions(+), 14 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 67d2ab45cd9..df87c803cec 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1013,7 +1013,7 @@ static __init int xen_mark_pinned(struct mm_struct *mm, struct page *page, return 0; } -void __init xen_mark_init_mm_pinned(void) +static void __init xen_mark_init_mm_pinned(void) { xen_pgd_walk(&init_mm, xen_mark_pinned, FIXADDR_TOP); } @@ -1461,10 +1461,29 @@ static __init void xen_set_pte_init(pte_t *ptep, pte_t pte) } #endif +static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +{ + struct mmuext_op op; + op.cmd = cmd; + op.arg1.mfn = pfn_to_mfn(pfn); + if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) + BUG(); +} + /* Early in boot, while setting up the initial pagetable, assume everything is pinned. */ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) { +#ifdef CONFIG_FLATMEM + BUG_ON(mem_map); /* should only be used early */ +#endif + make_lowmem_page_readonly(__va(PFN_PHYS(pfn))); + pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE, pfn); +} + +/* Used for pmd and pud */ +static __init void xen_alloc_pmd_init(struct mm_struct *mm, unsigned long pfn) +{ #ifdef CONFIG_FLATMEM BUG_ON(mem_map); /* should only be used early */ #endif @@ -1473,18 +1492,15 @@ static __init void xen_alloc_pte_init(struct mm_struct *mm, unsigned long pfn) /* Early release_pte assumes that all pts are pinned, since there's only init_mm and anything attached to that is pinned. */ -static void xen_release_pte_init(unsigned long pfn) +static __init void xen_release_pte_init(unsigned long pfn) { + pin_pagetable_pfn(MMUEXT_UNPIN_TABLE, pfn); make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } -static void pin_pagetable_pfn(unsigned cmd, unsigned long pfn) +static __init void xen_release_pmd_init(unsigned long pfn) { - struct mmuext_op op; - op.cmd = cmd; - op.arg1.mfn = pfn_to_mfn(pfn); - if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF)) - BUG(); + make_lowmem_page_readwrite(__va(PFN_PHYS(pfn))); } /* This needs to make sure the new pte page is pinned iff its being @@ -1873,9 +1889,9 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .alloc_pte = xen_alloc_pte_init, .release_pte = xen_release_pte_init, - .alloc_pmd = xen_alloc_pte_init, + .alloc_pmd = xen_alloc_pmd_init, .alloc_pmd_clone = paravirt_nop, - .release_pmd = xen_release_pte_init, + .release_pmd = xen_release_pmd_init, #ifdef CONFIG_HIGHPTE .kmap_atomic_pte = xen_kmap_atomic_pte, @@ -1914,8 +1930,8 @@ const struct pv_mmu_ops xen_mmu_ops __initdata = { .make_pud = PV_CALLEE_SAVE(xen_make_pud), .set_pgd = xen_set_pgd_hyper, - .alloc_pud = xen_alloc_pte_init, - .release_pud = xen_release_pte_init, + .alloc_pud = xen_alloc_pmd_init, + .release_pud = xen_release_pmd_init, #endif /* PAGETABLE_LEVELS == 4 */ .activate_mm = xen_activate_mm, diff --git a/arch/x86/xen/xen-ops.h b/arch/x86/xen/xen-ops.h index f897cdffccb..5c50a1017a3 100644 --- a/arch/x86/xen/xen-ops.h +++ b/arch/x86/xen/xen-ops.h @@ -56,8 +56,6 @@ irqreturn_t xen_debug_interrupt(int irq, void *dev_id); bool xen_vcpu_stolen(int vcpu); -void xen_mark_init_mm_pinned(void); - void xen_setup_vcpu_info_placement(void); #ifdef CONFIG_SMP From 8de07bbdede03598801cf33ab23dcbcd28a918d2 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Wed, 4 Mar 2009 17:36:57 -0800 Subject: [PATCH 0138/2061] xen/mmu: weaken flush_tlb_other test Impact: fixes crashing bug There's no particular problem with getting an empty cpu mask, so just shortcut-return if we get one. Avoids crash reported by Christophe Saout Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index df87c803cec..e425a32e0a9 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1293,8 +1293,8 @@ static void xen_flush_tlb_others(const struct cpumask *cpus, } *args; struct multicall_space mcs; - BUG_ON(cpumask_empty(cpus)); - BUG_ON(!mm); + if (cpumask_empty(cpus)) + return; /* nothing to do */ mcs = xen_mc_entry(sizeof(*args)); args = mcs.args; From 1e6fcf840e11ceff8a656a678c6e4b0560a98e08 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 25 Mar 2009 17:46:42 +0000 Subject: [PATCH 0139/2061] xen: resume interrupts before system devices. Impact: bugfix Xen domain restore Otherwise the first timer interrupt after resume is missed and we never get another. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/manage.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index 3ccd348d112..b703dd2c9f1 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -68,15 +68,15 @@ static int xen_suspend(void *data) gnttab_resume(); xen_mm_unpin_all(); - sysdev_resume(); - device_power_up(PMSG_RESUME); - if (!*cancelled) { xen_irq_resume(); xen_console_resume(); xen_timer_resume(); } + sysdev_resume(); + device_power_up(PMSG_RESUME); + return 0; } From 707ebbc81c61eb480d8a51ca61e355e240df1d32 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 27 Mar 2009 11:29:02 -0700 Subject: [PATCH 0140/2061] xen: set _PAGE_NX in __supported_pte_mask before pagetable construction Some 64-bit machines don't support the NX flag in ptes. Check for NX before constructing the kernel pagetables. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index da33e0c5870..80f4c534349 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -912,7 +913,6 @@ static const struct machine_ops __initdata xen_machine_ops = { .emergency_restart = xen_emergency_restart, }; - /* First C function to be called on Xen boot */ asmlinkage void __init xen_start_kernel(void) { @@ -980,6 +980,11 @@ asmlinkage void __init xen_start_kernel(void) if (!xen_initial_domain()) __supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD); +#ifdef CONFIG_X86_64 + /* Work out if we support NX */ + check_efer(); +#endif + /* Don't do the full vcpu_info placement stuff until we have a possible map and a non-dummy shared_info. */ per_cpu(xen_vcpu, 0) = &HYPERVISOR_shared_info->vcpu_info[0]; From 6d02c42698f99eccb290ac53d4f10ca883b9f90c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 22:57:15 -0700 Subject: [PATCH 0141/2061] xen: clean up gate trap/interrupt constants Use GATE_INTERRUPT/TRAP rather than 0xe/f. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 80f4c534349..12a3159333b 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -428,7 +428,7 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { - if (val->type != 0xf && val->type != 0xe) + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; @@ -436,8 +436,8 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ - if (val->type == 0xe) - info->flags |= 4; + if (val->type == GATE_INTERRUPT) + info->flags |= 1 << 2; return 1; } From d4c045364d3107603187f21a56ec231e74d26441 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:20:31 -0800 Subject: [PATCH 0142/2061] xen: add irq_from_evtchn Given an evtchn, return the corresponding irq. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/events.c | 6 ++++++ include/xen/events.h | 3 +++ 2 files changed, 9 insertions(+) diff --git a/drivers/xen/events.c b/drivers/xen/events.c index 30963af5dba..1cd2a0e15ae 100644 --- a/drivers/xen/events.c +++ b/drivers/xen/events.c @@ -151,6 +151,12 @@ static unsigned int evtchn_from_irq(unsigned irq) return info_for_irq(irq)->evtchn; } +unsigned irq_from_evtchn(unsigned int evtchn) +{ + return evtchn_to_irq[evtchn]; +} +EXPORT_SYMBOL_GPL(irq_from_evtchn); + static enum ipi_vector ipi_from_irq(unsigned irq) { struct irq_info *info = info_for_irq(irq); diff --git a/include/xen/events.h b/include/xen/events.h index 0d5f1adc036..e68d59a90ca 100644 --- a/include/xen/events.h +++ b/include/xen/events.h @@ -53,4 +53,7 @@ bool xen_test_irq_pending(int irq); irq will be disabled so it won't deliver an interrupt. */ void xen_poll_irq(int irq); +/* Determine the IRQ which is bound to an event channel */ +unsigned irq_from_evtchn(unsigned int evtchn); + #endif /* _XEN_EVENTS_H */ From f7116284c734f3a47180cd9c907944a1837ccb3c Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:21:19 -0800 Subject: [PATCH 0143/2061] xen: add /dev/xen/evtchn driver This driver is used by application which wish to receive notifications from the hypervisor or other guests via Xen's event channel mechanism. In particular it is used by the xenstore daemon in domain 0. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/Kconfig | 10 + drivers/xen/Makefile | 3 +- drivers/xen/evtchn.c | 494 +++++++++++++++++++++++++++++++++++++++++++ include/xen/evtchn.h | 88 ++++++++ 4 files changed, 594 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/evtchn.c create mode 100644 include/xen/evtchn.h diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 526187c8a12..1bbb9108f31 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -18,6 +18,16 @@ config XEN_SCRUB_PAGES secure, but slightly less efficient. If in doubt, say yes. +config XEN_DEV_EVTCHN + tristate "Xen /dev/xen/evtchn device" + depends on XEN + default y + help + The evtchn driver allows a userspace process to triger event + channels and to receive notification of an event channel + firing. + If in doubt, say yes. + config XENFS tristate "Xen filesystem" depends on XEN diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ff8accc9e10..1567639847e 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,4 +4,5 @@ obj-y += xenbus/ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file +obj-$(CONFIG_XEN_DEV_EVTCHN) += evtchn.o +obj-$(CONFIG_XENFS) += xenfs/ diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c new file mode 100644 index 00000000000..517b9ee63e1 --- /dev/null +++ b/drivers/xen/evtchn.c @@ -0,0 +1,494 @@ +/****************************************************************************** + * evtchn.c + * + * Driver for receiving and demuxing event-channel signals. + * + * Copyright (c) 2004-2005, K A Fraser + * Multi-process extensions Copyright (c) 2004, Steven Smith + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct per_user_data { + /* Notification ring, accessed via /dev/xen/evtchn. */ +#define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) +#define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) + evtchn_port_t *ring; + unsigned int ring_cons, ring_prod, ring_overflow; + struct mutex ring_cons_mutex; /* protect against concurrent readers */ + + /* Processes wait on this queue when ring is empty. */ + wait_queue_head_t evtchn_wait; + struct fasync_struct *evtchn_async_queue; + const char *name; +}; + +/* Who's bound to each port? */ +static struct per_user_data *port_user[NR_EVENT_CHANNELS]; +static DEFINE_SPINLOCK(port_user_lock); + +irqreturn_t evtchn_interrupt(int irq, void *data) +{ + unsigned int port = (unsigned long)data; + struct per_user_data *u; + + spin_lock(&port_user_lock); + + u = port_user[port]; + + disable_irq_nosync(irq); + + if ((u->ring_prod - u->ring_cons) < EVTCHN_RING_SIZE) { + u->ring[EVTCHN_RING_MASK(u->ring_prod)] = port; + wmb(); /* Ensure ring contents visible */ + if (u->ring_cons == u->ring_prod++) { + wake_up_interruptible(&u->evtchn_wait); + kill_fasync(&u->evtchn_async_queue, + SIGIO, POLL_IN); + } + } else { + u->ring_overflow = 1; + } + + spin_unlock(&port_user_lock); + + return IRQ_HANDLED; +} + +static ssize_t evtchn_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + int rc; + unsigned int c, p, bytes1 = 0, bytes2 = 0; + struct per_user_data *u = file->private_data; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + if (count == 0) + return 0; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + for (;;) { + mutex_lock(&u->ring_cons_mutex); + + rc = -EFBIG; + if (u->ring_overflow) + goto unlock_out; + + c = u->ring_cons; + p = u->ring_prod; + if (c != p) + break; + + mutex_unlock(&u->ring_cons_mutex); + + if (file->f_flags & O_NONBLOCK) + return -EAGAIN; + + rc = wait_event_interruptible(u->evtchn_wait, + u->ring_cons != u->ring_prod); + if (rc) + return rc; + } + + /* Byte lengths of two chunks. Chunk split (if any) is at ring wrap. */ + if (((c ^ p) & EVTCHN_RING_SIZE) != 0) { + bytes1 = (EVTCHN_RING_SIZE - EVTCHN_RING_MASK(c)) * + sizeof(evtchn_port_t); + bytes2 = EVTCHN_RING_MASK(p) * sizeof(evtchn_port_t); + } else { + bytes1 = (p - c) * sizeof(evtchn_port_t); + bytes2 = 0; + } + + /* Truncate chunks according to caller's maximum byte count. */ + if (bytes1 > count) { + bytes1 = count; + bytes2 = 0; + } else if ((bytes1 + bytes2) > count) { + bytes2 = count - bytes1; + } + + rc = -EFAULT; + rmb(); /* Ensure that we see the port before we copy it. */ + if (copy_to_user(buf, &u->ring[EVTCHN_RING_MASK(c)], bytes1) || + ((bytes2 != 0) && + copy_to_user(&buf[bytes1], &u->ring[0], bytes2))) + goto unlock_out; + + u->ring_cons += (bytes1 + bytes2) / sizeof(evtchn_port_t); + rc = bytes1 + bytes2; + + unlock_out: + mutex_unlock(&u->ring_cons_mutex); + return rc; +} + +static ssize_t evtchn_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + int rc, i; + evtchn_port_t *kbuf = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + struct per_user_data *u = file->private_data; + + if (kbuf == NULL) + return -ENOMEM; + + /* Whole number of ports. */ + count &= ~(sizeof(evtchn_port_t)-1); + + rc = 0; + if (count == 0) + goto out; + + if (count > PAGE_SIZE) + count = PAGE_SIZE; + + rc = -EFAULT; + if (copy_from_user(kbuf, buf, count) != 0) + goto out; + + spin_lock_irq(&port_user_lock); + for (i = 0; i < (count/sizeof(evtchn_port_t)); i++) + if ((kbuf[i] < NR_EVENT_CHANNELS) && (port_user[kbuf[i]] == u)) + enable_irq(irq_from_evtchn(kbuf[i])); + spin_unlock_irq(&port_user_lock); + + rc = count; + + out: + free_page((unsigned long)kbuf); + return rc; +} + +static int evtchn_bind_to_user(struct per_user_data *u, int port) +{ + int irq; + int rc = 0; + + spin_lock_irq(&port_user_lock); + + BUG_ON(port_user[port] != NULL); + + irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); + if (rc < 0) + goto fail; + + port_user[port] = u; + +fail: + spin_unlock_irq(&port_user_lock); + return rc; +} + +static void evtchn_unbind_from_user(struct per_user_data *u, int port) +{ + int irq = irq_from_evtchn(port); + + unbind_from_irqhandler(irq, (void *)(unsigned long)port); + port_user[port] = NULL; +} + +static long evtchn_ioctl(struct file *file, + unsigned int cmd, unsigned long arg) +{ + int rc; + struct per_user_data *u = file->private_data; + void __user *uarg = (void __user *) arg; + + switch (cmd) { + case IOCTL_EVTCHN_BIND_VIRQ: { + struct ioctl_evtchn_bind_virq bind; + struct evtchn_bind_virq bind_virq; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_virq.virq = bind.virq; + bind_virq.vcpu = 0; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_virq, + &bind_virq); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_virq.port); + if (rc == 0) + rc = bind_virq.port; + break; + } + + case IOCTL_EVTCHN_BIND_INTERDOMAIN: { + struct ioctl_evtchn_bind_interdomain bind; + struct evtchn_bind_interdomain bind_interdomain; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + bind_interdomain.remote_dom = bind.remote_domain; + bind_interdomain.remote_port = bind.remote_port; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_bind_interdomain, + &bind_interdomain); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, bind_interdomain.local_port); + if (rc == 0) + rc = bind_interdomain.local_port; + break; + } + + case IOCTL_EVTCHN_BIND_UNBOUND_PORT: { + struct ioctl_evtchn_bind_unbound_port bind; + struct evtchn_alloc_unbound alloc_unbound; + + rc = -EFAULT; + if (copy_from_user(&bind, uarg, sizeof(bind))) + break; + + alloc_unbound.dom = DOMID_SELF; + alloc_unbound.remote_dom = bind.remote_domain; + rc = HYPERVISOR_event_channel_op(EVTCHNOP_alloc_unbound, + &alloc_unbound); + if (rc != 0) + break; + + rc = evtchn_bind_to_user(u, alloc_unbound.port); + if (rc == 0) + rc = alloc_unbound.port; + break; + } + + case IOCTL_EVTCHN_UNBIND: { + struct ioctl_evtchn_unbind unbind; + + rc = -EFAULT; + if (copy_from_user(&unbind, uarg, sizeof(unbind))) + break; + + rc = -EINVAL; + if (unbind.port >= NR_EVENT_CHANNELS) + break; + + spin_lock_irq(&port_user_lock); + + rc = -ENOTCONN; + if (port_user[unbind.port] != u) { + spin_unlock_irq(&port_user_lock); + break; + } + + evtchn_unbind_from_user(u, unbind.port); + + spin_unlock_irq(&port_user_lock); + + rc = 0; + break; + } + + case IOCTL_EVTCHN_NOTIFY: { + struct ioctl_evtchn_notify notify; + + rc = -EFAULT; + if (copy_from_user(¬ify, uarg, sizeof(notify))) + break; + + if (notify.port >= NR_EVENT_CHANNELS) { + rc = -EINVAL; + } else if (port_user[notify.port] != u) { + rc = -ENOTCONN; + } else { + notify_remote_via_evtchn(notify.port); + rc = 0; + } + break; + } + + case IOCTL_EVTCHN_RESET: { + /* Initialise the ring to empty. Clear errors. */ + mutex_lock(&u->ring_cons_mutex); + spin_lock_irq(&port_user_lock); + u->ring_cons = u->ring_prod = u->ring_overflow = 0; + spin_unlock_irq(&port_user_lock); + mutex_unlock(&u->ring_cons_mutex); + rc = 0; + break; + } + + default: + rc = -ENOSYS; + break; + } + + return rc; +} + +static unsigned int evtchn_poll(struct file *file, poll_table *wait) +{ + unsigned int mask = POLLOUT | POLLWRNORM; + struct per_user_data *u = file->private_data; + + poll_wait(file, &u->evtchn_wait, wait); + if (u->ring_cons != u->ring_prod) + mask |= POLLIN | POLLRDNORM; + if (u->ring_overflow) + mask = POLLERR; + return mask; +} + +static int evtchn_fasync(int fd, struct file *filp, int on) +{ + struct per_user_data *u = filp->private_data; + return fasync_helper(fd, filp, on, &u->evtchn_async_queue); +} + +static int evtchn_open(struct inode *inode, struct file *filp) +{ + struct per_user_data *u; + + u = kzalloc(sizeof(*u), GFP_KERNEL); + if (u == NULL) + return -ENOMEM; + + u->name = kasprintf(GFP_KERNEL, "evtchn:%s", current->comm); + if (u->name == NULL) { + kfree(u); + return -ENOMEM; + } + + init_waitqueue_head(&u->evtchn_wait); + + u->ring = (evtchn_port_t *)__get_free_page(GFP_KERNEL); + if (u->ring == NULL) { + kfree(u->name); + kfree(u); + return -ENOMEM; + } + + mutex_init(&u->ring_cons_mutex); + + filp->private_data = u; + + return 0; +} + +static int evtchn_release(struct inode *inode, struct file *filp) +{ + int i; + struct per_user_data *u = filp->private_data; + + spin_lock_irq(&port_user_lock); + + free_page((unsigned long)u->ring); + + for (i = 0; i < NR_EVENT_CHANNELS; i++) { + if (port_user[i] != u) + continue; + + evtchn_unbind_from_user(port_user[i], i); + } + + spin_unlock_irq(&port_user_lock); + + kfree(u->name); + kfree(u); + + return 0; +} + +static const struct file_operations evtchn_fops = { + .owner = THIS_MODULE, + .read = evtchn_read, + .write = evtchn_write, + .unlocked_ioctl = evtchn_ioctl, + .poll = evtchn_poll, + .fasync = evtchn_fasync, + .open = evtchn_open, + .release = evtchn_release, +}; + +static struct miscdevice evtchn_miscdev = { + .minor = MISC_DYNAMIC_MINOR, + .name = "evtchn", + .fops = &evtchn_fops, +}; +static int __init evtchn_init(void) +{ + int err; + + if (!xen_domain()) + return -ENODEV; + + spin_lock_init(&port_user_lock); + memset(port_user, 0, sizeof(port_user)); + + /* Create '/dev/misc/evtchn'. */ + err = misc_register(&evtchn_miscdev); + if (err != 0) { + printk(KERN_ALERT "Could not register /dev/misc/evtchn\n"); + return err; + } + + printk(KERN_INFO "Event-channel device installed.\n"); + + return 0; +} + +static void __exit evtchn_cleanup(void) +{ + misc_deregister(&evtchn_miscdev); +} + +module_init(evtchn_init); +module_exit(evtchn_cleanup); + +MODULE_LICENSE("GPL"); diff --git a/include/xen/evtchn.h b/include/xen/evtchn.h new file mode 100644 index 00000000000..14e833ee4e0 --- /dev/null +++ b/include/xen/evtchn.h @@ -0,0 +1,88 @@ +/****************************************************************************** + * evtchn.h + * + * Interface to /dev/xen/evtchn. + * + * Copyright (c) 2003-2005, K A Fraser + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version 2 + * as published by the Free Software Foundation; or, when distributed + * separately from the Linux kernel or incorporated into other + * software packages, subject to the following license: + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this source file (the "Software"), to deal in the Software without + * restriction, including without limitation the rights to use, copy, modify, + * merge, publish, distribute, sublicense, and/or sell copies of the Software, + * and to permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#ifndef __LINUX_PUBLIC_EVTCHN_H__ +#define __LINUX_PUBLIC_EVTCHN_H__ + +/* + * Bind a fresh port to VIRQ @virq. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_VIRQ \ + _IOC(_IOC_NONE, 'E', 0, sizeof(struct ioctl_evtchn_bind_virq)) +struct ioctl_evtchn_bind_virq { + unsigned int virq; +}; + +/* + * Bind a fresh port to remote <@remote_domain, @remote_port>. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_INTERDOMAIN \ + _IOC(_IOC_NONE, 'E', 1, sizeof(struct ioctl_evtchn_bind_interdomain)) +struct ioctl_evtchn_bind_interdomain { + unsigned int remote_domain, remote_port; +}; + +/* + * Allocate a fresh port for binding to @remote_domain. + * Return allocated port. + */ +#define IOCTL_EVTCHN_BIND_UNBOUND_PORT \ + _IOC(_IOC_NONE, 'E', 2, sizeof(struct ioctl_evtchn_bind_unbound_port)) +struct ioctl_evtchn_bind_unbound_port { + unsigned int remote_domain; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_UNBIND \ + _IOC(_IOC_NONE, 'E', 3, sizeof(struct ioctl_evtchn_unbind)) +struct ioctl_evtchn_unbind { + unsigned int port; +}; + +/* + * Unbind previously allocated @port. + */ +#define IOCTL_EVTCHN_NOTIFY \ + _IOC(_IOC_NONE, 'E', 4, sizeof(struct ioctl_evtchn_notify)) +struct ioctl_evtchn_notify { + unsigned int port; +}; + +/* Clear and reinitialise the event buffer. Clear error condition. */ +#define IOCTL_EVTCHN_RESET \ + _IOC(_IOC_NONE, 'E', 5, 0) + +#endif /* __LINUX_PUBLIC_EVTCHN_H__ */ From c5cfef0f79cacc3aa438fc28f4747f0d10c54d0d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Fri, 6 Feb 2009 19:21:19 -0800 Subject: [PATCH 0144/2061] xen: export ioctl headers to userspace Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- include/Kbuild | 1 + include/xen/Kbuild | 1 + 2 files changed, 2 insertions(+) create mode 100644 include/xen/Kbuild diff --git a/include/Kbuild b/include/Kbuild index d8c3e3cbf41..fe36accd432 100644 --- a/include/Kbuild +++ b/include/Kbuild @@ -8,3 +8,4 @@ header-y += mtd/ header-y += rdma/ header-y += video/ header-y += drm/ +header-y += xen/ diff --git a/include/xen/Kbuild b/include/xen/Kbuild new file mode 100644 index 00000000000..4e65c16a445 --- /dev/null +++ b/include/xen/Kbuild @@ -0,0 +1 @@ +header-y += evtchn.h From 0a4666b539a0e896ec4e8396a034a479e3573125 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 12 Feb 2009 13:03:24 -0800 Subject: [PATCH 0145/2061] xen/dev-evtchn: clean up locking in evtchn Define a new per_user_data mutex to serialize bind/unbind operations to prevent them from racing with each other. Fix error returns and don't do a bind while holding a spinlock. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/evtchn.c | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/drivers/xen/evtchn.c b/drivers/xen/evtchn.c index 517b9ee63e1..af031950f9b 100644 --- a/drivers/xen/evtchn.c +++ b/drivers/xen/evtchn.c @@ -54,6 +54,8 @@ #include struct per_user_data { + struct mutex bind_mutex; /* serialize bind/unbind operations */ + /* Notification ring, accessed via /dev/xen/evtchn. */ #define EVTCHN_RING_SIZE (PAGE_SIZE / sizeof(evtchn_port_t)) #define EVTCHN_RING_MASK(_i) ((_i)&(EVTCHN_RING_SIZE-1)) @@ -69,7 +71,7 @@ struct per_user_data { /* Who's bound to each port? */ static struct per_user_data *port_user[NR_EVENT_CHANNELS]; -static DEFINE_SPINLOCK(port_user_lock); +static DEFINE_SPINLOCK(port_user_lock); /* protects port_user[] and ring_prod */ irqreturn_t evtchn_interrupt(int irq, void *data) { @@ -210,22 +212,24 @@ static ssize_t evtchn_write(struct file *file, const char __user *buf, static int evtchn_bind_to_user(struct per_user_data *u, int port) { - int irq; int rc = 0; - spin_lock_irq(&port_user_lock); - + /* + * Ports are never reused, so every caller should pass in a + * unique port. + * + * (Locking not necessary because we haven't registered the + * interrupt handler yet, and our caller has already + * serialized bind operations.) + */ BUG_ON(port_user[port] != NULL); - - irq = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, - u->name, (void *)(unsigned long)port); - if (rc < 0) - goto fail; - port_user[port] = u; -fail: - spin_unlock_irq(&port_user_lock); + rc = bind_evtchn_to_irqhandler(port, evtchn_interrupt, IRQF_DISABLED, + u->name, (void *)(unsigned long)port); + if (rc >= 0) + rc = 0; + return rc; } @@ -234,6 +238,10 @@ static void evtchn_unbind_from_user(struct per_user_data *u, int port) int irq = irq_from_evtchn(port); unbind_from_irqhandler(irq, (void *)(unsigned long)port); + + /* make sure we unbind the irq handler before clearing the port */ + barrier(); + port_user[port] = NULL; } @@ -244,6 +252,9 @@ static long evtchn_ioctl(struct file *file, struct per_user_data *u = file->private_data; void __user *uarg = (void __user *) arg; + /* Prevent bind from racing with unbind */ + mutex_lock(&u->bind_mutex); + switch (cmd) { case IOCTL_EVTCHN_BIND_VIRQ: { struct ioctl_evtchn_bind_virq bind; @@ -368,6 +379,7 @@ static long evtchn_ioctl(struct file *file, rc = -ENOSYS; break; } + mutex_unlock(&u->bind_mutex); return rc; } @@ -414,6 +426,7 @@ static int evtchn_open(struct inode *inode, struct file *filp) return -ENOMEM; } + mutex_init(&u->bind_mutex); mutex_init(&u->ring_cons_mutex); filp->private_data = u; From a1ce1be578365a4da7e7d7db4812539d2d5da763 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:50 -0800 Subject: [PATCH 0146/2061] xen: remove suspend_cancel hook Remove suspend_cancel hook from xenbus_driver, in preparation for using the device model for suspending. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenbus/xenbus_probe.c | 23 ----------------------- include/xen/xenbus.h | 1 - 2 files changed, 24 deletions(-) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 773d1cf2328..bd20361fb09 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -689,27 +689,6 @@ static int suspend_dev(struct device *dev, void *data) return 0; } -static int suspend_cancel_dev(struct device *dev, void *data) -{ - int err = 0; - struct xenbus_driver *drv; - struct xenbus_device *xdev; - - DPRINTK(""); - - if (dev->driver == NULL) - return 0; - drv = to_xenbus_driver(dev->driver); - xdev = container_of(dev, struct xenbus_device, dev); - if (drv->suspend_cancel) - err = drv->suspend_cancel(xdev); - if (err) - printk(KERN_WARNING - "xenbus: suspend_cancel %s failed: %i\n", - dev_name(dev), err); - return 0; -} - static int resume_dev(struct device *dev, void *data) { int err; @@ -777,8 +756,6 @@ EXPORT_SYMBOL_GPL(xenbus_resume); void xenbus_suspend_cancel(void) { xs_suspend_cancel(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_cancel_dev); - xenbus_backend_resume(suspend_cancel_dev); } EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index f87f9614844..0836772b968 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -92,7 +92,6 @@ struct xenbus_driver { enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); int (*suspend)(struct xenbus_device *dev); - int (*suspend_cancel)(struct xenbus_device *dev); int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; From de5b31bd47de7e6f41be2e271318dbc8f1af354d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Mon, 9 Feb 2009 12:05:50 -0800 Subject: [PATCH 0147/2061] xen: use device model for suspending xenbus devices Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/manage.c | 9 ++++---- drivers/xen/xenbus/xenbus_probe.c | 37 ++++++++----------------------- drivers/xen/xenbus/xenbus_xs.c | 2 ++ include/xen/xenbus.h | 2 +- 4 files changed, 16 insertions(+), 34 deletions(-) diff --git a/drivers/xen/manage.c b/drivers/xen/manage.c index b703dd2c9f1..5269bb4d249 100644 --- a/drivers/xen/manage.c +++ b/drivers/xen/manage.c @@ -104,9 +104,8 @@ static void do_suspend(void) goto out; } - printk("suspending xenbus...\n"); - /* XXX use normal device tree? */ - xenbus_suspend(); + printk(KERN_DEBUG "suspending xenstore...\n"); + xs_suspend(); err = stop_machine(xen_suspend, &cancelled, cpumask_of(0)); if (err) { @@ -116,9 +115,9 @@ static void do_suspend(void) if (!cancelled) { xen_arch_resume(); - xenbus_resume(); + xs_resume(); } else - xenbus_suspend_cancel(); + xs_suspend_cancel(); device_resume(PMSG_RESUME); diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index bd20361fb09..4649213bed9 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -71,6 +71,9 @@ static int xenbus_probe_frontend(const char *type, const char *name); static void xenbus_dev_shutdown(struct device *_dev); +static int xenbus_dev_suspend(struct device *dev, pm_message_t state); +static int xenbus_dev_resume(struct device *dev); + /* If something in array of ids matches this device, return it. */ static const struct xenbus_device_id * match_device(const struct xenbus_device_id *arr, struct xenbus_device *dev) @@ -188,6 +191,9 @@ static struct xen_bus_type xenbus_frontend = { .remove = xenbus_dev_remove, .shutdown = xenbus_dev_shutdown, .dev_attrs = xenbus_dev_attrs, + + .suspend = xenbus_dev_suspend, + .resume = xenbus_dev_resume, }, }; @@ -669,7 +675,7 @@ static struct xenbus_watch fe_watch = { .callback = frontend_changed, }; -static int suspend_dev(struct device *dev, void *data) +static int xenbus_dev_suspend(struct device *dev, pm_message_t state) { int err = 0; struct xenbus_driver *drv; @@ -682,14 +688,14 @@ static int suspend_dev(struct device *dev, void *data) drv = to_xenbus_driver(dev->driver); xdev = container_of(dev, struct xenbus_device, dev); if (drv->suspend) - err = drv->suspend(xdev); + err = drv->suspend(xdev, state); if (err) printk(KERN_WARNING "xenbus: suspend %s failed: %i\n", dev_name(dev), err); return 0; } -static int resume_dev(struct device *dev, void *data) +static int xenbus_dev_resume(struct device *dev) { int err; struct xenbus_driver *drv; @@ -734,31 +740,6 @@ static int resume_dev(struct device *dev, void *data) return 0; } -void xenbus_suspend(void) -{ - DPRINTK(""); - - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, suspend_dev); - xenbus_backend_suspend(suspend_dev); - xs_suspend(); -} -EXPORT_SYMBOL_GPL(xenbus_suspend); - -void xenbus_resume(void) -{ - xb_init_comms(); - xs_resume(); - bus_for_each_dev(&xenbus_frontend.bus, NULL, NULL, resume_dev); - xenbus_backend_resume(resume_dev); -} -EXPORT_SYMBOL_GPL(xenbus_resume); - -void xenbus_suspend_cancel(void) -{ - xs_suspend_cancel(); -} -EXPORT_SYMBOL_GPL(xenbus_suspend_cancel); - /* A flag to determine if xenstored is 'ready' (i.e. has started) */ int xenstored_ready = 0; diff --git a/drivers/xen/xenbus/xenbus_xs.c b/drivers/xen/xenbus/xenbus_xs.c index e325eab4724..eab33f1dbdf 100644 --- a/drivers/xen/xenbus/xenbus_xs.c +++ b/drivers/xen/xenbus/xenbus_xs.c @@ -673,6 +673,8 @@ void xs_resume(void) struct xenbus_watch *watch; char token[sizeof(watch) * 2 + 1]; + xb_init_comms(); + mutex_unlock(&xs_state.response_mutex); mutex_unlock(&xs_state.request_mutex); up_write(&xs_state.transaction_mutex); diff --git a/include/xen/xenbus.h b/include/xen/xenbus.h index 0836772b968..b9763badbd7 100644 --- a/include/xen/xenbus.h +++ b/include/xen/xenbus.h @@ -91,7 +91,7 @@ struct xenbus_driver { void (*otherend_changed)(struct xenbus_device *dev, enum xenbus_state backend_state); int (*remove)(struct xenbus_device *dev); - int (*suspend)(struct xenbus_device *dev); + int (*suspend)(struct xenbus_device *dev, pm_message_t state); int (*resume)(struct xenbus_device *dev); int (*uevent)(struct xenbus_device *, char **, int, char *, int); struct device_driver driver; From c6a960ce8858f20036cc3afc3b9422670d0d9021 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 9 Feb 2009 12:05:53 -0800 Subject: [PATCH 0148/2061] xen/xenbus: export xenbus_dev_changed Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenbus/xenbus_probe.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/xen/xenbus/xenbus_probe.c b/drivers/xen/xenbus/xenbus_probe.c index 4649213bed9..d42e25d5968 100644 --- a/drivers/xen/xenbus/xenbus_probe.c +++ b/drivers/xen/xenbus/xenbus_probe.c @@ -660,6 +660,7 @@ void xenbus_dev_changed(const char *node, struct xen_bus_type *bus) kfree(root); } +EXPORT_SYMBOL_GPL(xenbus_dev_changed); static void frontend_changed(struct xenbus_watch *watch, const char **vec, unsigned int len) From cff7e81b3dd7c25cd2248cd7a04c5764552d5d55 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 14:39:59 -0700 Subject: [PATCH 0149/2061] xen: add /sys/hypervisor support Adds support for Xen info under /sys/hypervisor. Taken from Novell 2.6.27 backport tree. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/Kconfig | 10 + drivers/xen/Makefile | 3 +- drivers/xen/sys-hypervisor.c | 475 ++++++++++++++++++++++++++++++++ include/xen/interface/version.h | 3 + 4 files changed, 490 insertions(+), 1 deletion(-) create mode 100644 drivers/xen/sys-hypervisor.c diff --git a/drivers/xen/Kconfig b/drivers/xen/Kconfig index 526187c8a12..88bca1c42db 100644 --- a/drivers/xen/Kconfig +++ b/drivers/xen/Kconfig @@ -41,3 +41,13 @@ config XEN_COMPAT_XENFS a xen platform. If in doubt, say yes. +config XEN_SYS_HYPERVISOR + bool "Create xen entries under /sys/hypervisor" + depends on XEN && SYSFS + select SYS_HYPERVISOR + default y + help + Create entries under /sys/hypervisor describing the Xen + hypervisor environment. When running native or in another + virtual environment, /sys/hypervisor will still be present, + but will have no xen contents. \ No newline at end of file diff --git a/drivers/xen/Makefile b/drivers/xen/Makefile index ff8accc9e10..f3603a39db5 100644 --- a/drivers/xen/Makefile +++ b/drivers/xen/Makefile @@ -4,4 +4,5 @@ obj-y += xenbus/ obj-$(CONFIG_HOTPLUG_CPU) += cpu_hotplug.o obj-$(CONFIG_XEN_XENCOMM) += xencomm.o obj-$(CONFIG_XEN_BALLOON) += balloon.o -obj-$(CONFIG_XENFS) += xenfs/ \ No newline at end of file +obj-$(CONFIG_XENFS) += xenfs/ +obj-$(CONFIG_XEN_SYS_HYPERVISOR) += sys-hypervisor.o diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c new file mode 100644 index 00000000000..cb29d1ccf0d --- /dev/null +++ b/drivers/xen/sys-hypervisor.c @@ -0,0 +1,475 @@ +/* + * copyright (c) 2006 IBM Corporation + * Authored by: Mike D. Day + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +#include +#include + +#include +#include +#include + +#define HYPERVISOR_ATTR_RO(_name) \ +static struct hyp_sysfs_attr _name##_attr = __ATTR_RO(_name) + +#define HYPERVISOR_ATTR_RW(_name) \ +static struct hyp_sysfs_attr _name##_attr = \ + __ATTR(_name, 0644, _name##_show, _name##_store) + +struct hyp_sysfs_attr { + struct attribute attr; + ssize_t (*show)(struct hyp_sysfs_attr *, char *); + ssize_t (*store)(struct hyp_sysfs_attr *, const char *, size_t); + void *hyp_attr_data; +}; + +static ssize_t type_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return sprintf(buffer, "xen\n"); +} + +HYPERVISOR_ATTR_RO(type); + +static int __init xen_sysfs_type_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &type_attr.attr); +} + +static void xen_sysfs_type_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &type_attr.attr); +} + +/* xen version attributes */ +static ssize_t major_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version >> 16); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(major); + +static ssize_t minor_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int version = HYPERVISOR_xen_version(XENVER_version, NULL); + if (version) + return sprintf(buffer, "%d\n", version & 0xff); + return -ENODEV; +} + +HYPERVISOR_ATTR_RO(minor); + +static ssize_t extra_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *extra; + + extra = kmalloc(XEN_EXTRAVERSION_LEN, GFP_KERNEL); + if (extra) { + ret = HYPERVISOR_xen_version(XENVER_extraversion, extra); + if (!ret) + ret = sprintf(buffer, "%s\n", extra); + kfree(extra); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(extra); + +static struct attribute *version_attrs[] = { + &major_attr.attr, + &minor_attr.attr, + &extra_attr.attr, + NULL +}; + +static struct attribute_group version_group = { + .name = "version", + .attrs = version_attrs, +}; + +static int __init xen_sysfs_version_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &version_group); +} + +static void xen_sysfs_version_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &version_group); +} + +/* UUID */ + +static ssize_t uuid_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + char *vm, *val; + int ret; + extern int xenstored_ready; + + if (!xenstored_ready) + return -EBUSY; + + vm = xenbus_read(XBT_NIL, "vm", "", NULL); + if (IS_ERR(vm)) + return PTR_ERR(vm); + val = xenbus_read(XBT_NIL, vm, "uuid", NULL); + kfree(vm); + if (IS_ERR(val)) + return PTR_ERR(val); + ret = sprintf(buffer, "%s\n", val); + kfree(val); + return ret; +} + +HYPERVISOR_ATTR_RO(uuid); + +static int __init xen_sysfs_uuid_init(void) +{ + return sysfs_create_file(hypervisor_kobj, &uuid_attr.attr); +} + +static void xen_sysfs_uuid_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &uuid_attr.attr); +} + +/* xen compilation attributes */ + +static ssize_t compiler_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compiler); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiler); + +static ssize_t compiled_by_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_by); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compiled_by); + +static ssize_t compile_date_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_compile_info *info; + + info = kmalloc(sizeof(struct xen_compile_info), GFP_KERNEL); + if (info) { + ret = HYPERVISOR_xen_version(XENVER_compile_info, info); + if (!ret) + ret = sprintf(buffer, "%s\n", info->compile_date); + kfree(info); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(compile_date); + +static struct attribute *xen_compile_attrs[] = { + &compiler_attr.attr, + &compiled_by_attr.attr, + &compile_date_attr.attr, + NULL +}; + +static struct attribute_group xen_compilation_group = { + .name = "compilation", + .attrs = xen_compile_attrs, +}; + +int __init static xen_compilation_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_compilation_group); +} + +static void xen_compilation_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_compilation_group); +} + +/* xen properties info */ + +static ssize_t capabilities_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *caps; + + caps = kmalloc(XEN_CAPABILITIES_INFO_LEN, GFP_KERNEL); + if (caps) { + ret = HYPERVISOR_xen_version(XENVER_capabilities, caps); + if (!ret) + ret = sprintf(buffer, "%s\n", caps); + kfree(caps); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(capabilities); + +static ssize_t changeset_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + char *cset; + + cset = kmalloc(XEN_CHANGESET_INFO_LEN, GFP_KERNEL); + if (cset) { + ret = HYPERVISOR_xen_version(XENVER_changeset, cset); + if (!ret) + ret = sprintf(buffer, "%s\n", cset); + kfree(cset); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(changeset); + +static ssize_t virtual_start_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret = -ENOMEM; + struct xen_platform_parameters *parms; + + parms = kmalloc(sizeof(struct xen_platform_parameters), GFP_KERNEL); + if (parms) { + ret = HYPERVISOR_xen_version(XENVER_platform_parameters, + parms); + if (!ret) + ret = sprintf(buffer, "%lx\n", parms->virt_start); + kfree(parms); + } + + return ret; +} + +HYPERVISOR_ATTR_RO(virtual_start); + +static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + int ret; + + ret = HYPERVISOR_xen_version(XENVER_pagesize, NULL); + if (ret > 0) + ret = sprintf(buffer, "%x\n", ret); + + return ret; +} + +HYPERVISOR_ATTR_RO(pagesize); + +/* eventually there will be several more features to export */ +static ssize_t xen_feature_show(int index, char *buffer) +{ + int ret = -ENOMEM; + struct xen_feature_info *info; + + info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL); + if (info) { + info->submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, info); + if (!ret) + ret = sprintf(buffer, "%d\n", info->submap); + kfree(info); + } + + return ret; +} + +static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer) +{ + return xen_feature_show(XENFEAT_writable_page_tables, buffer); +} + +HYPERVISOR_ATTR_RO(writable_pt); + +static struct attribute *xen_properties_attrs[] = { + &capabilities_attr.attr, + &changeset_attr.attr, + &virtual_start_attr.attr, + &pagesize_attr.attr, + &writable_pt_attr.attr, + NULL +}; + +static struct attribute_group xen_properties_group = { + .name = "properties", + .attrs = xen_properties_attrs, +}; + +static int __init xen_properties_init(void) +{ + return sysfs_create_group(hypervisor_kobj, &xen_properties_group); +} + +static void xen_properties_destroy(void) +{ + sysfs_remove_group(hypervisor_kobj, &xen_properties_group); +} + +#ifdef CONFIG_KEXEC + +extern size_t vmcoreinfo_size_xen; +extern unsigned long paddr_vmcoreinfo_xen; + +static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page) +{ + return sprintf(page, "%lx %zx\n", + paddr_vmcoreinfo_xen, vmcoreinfo_size_xen); +} + +HYPERVISOR_ATTR_RO(vmcoreinfo); + +static int __init xen_sysfs_vmcoreinfo_init(void) +{ + return sysfs_create_file(hypervisor_kobj, + &vmcoreinfo_attr.attr); +} + +static void xen_sysfs_vmcoreinfo_destroy(void) +{ + sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); +} + +#endif + +static int __init hyper_sysfs_init(void) +{ + int ret; + + if (!xen_domain()) + return -ENODEV; + + ret = xen_sysfs_type_init(); + if (ret) + goto out; + ret = xen_sysfs_version_init(); + if (ret) + goto version_out; + ret = xen_compilation_init(); + if (ret) + goto comp_out; + ret = xen_sysfs_uuid_init(); + if (ret) + goto uuid_out; + ret = xen_properties_init(); + if (ret) + goto prop_out; +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) { + ret = xen_sysfs_vmcoreinfo_init(); + if (ret) + goto vmcoreinfo_out; + } +#endif + + goto out; + +#ifdef CONFIG_KEXEC +vmcoreinfo_out: +#endif + xen_properties_destroy(); +prop_out: + xen_sysfs_uuid_destroy(); +uuid_out: + xen_compilation_destroy(); +comp_out: + xen_sysfs_version_destroy(); +version_out: + xen_sysfs_type_destroy(); +out: + return ret; +} + +static void __exit hyper_sysfs_exit(void) +{ +#ifdef CONFIG_KEXEC + if (vmcoreinfo_size_xen != 0) + xen_sysfs_vmcoreinfo_destroy(); +#endif + xen_properties_destroy(); + xen_compilation_destroy(); + xen_sysfs_uuid_destroy(); + xen_sysfs_version_destroy(); + xen_sysfs_type_destroy(); + +} +module_init(hyper_sysfs_init); +module_exit(hyper_sysfs_exit); + +static ssize_t hyp_sysfs_show(struct kobject *kobj, + struct attribute *attr, + char *buffer) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->show) + return hyp_attr->show(hyp_attr, buffer); + return 0; +} + +static ssize_t hyp_sysfs_store(struct kobject *kobj, + struct attribute *attr, + const char *buffer, + size_t len) +{ + struct hyp_sysfs_attr *hyp_attr; + hyp_attr = container_of(attr, struct hyp_sysfs_attr, attr); + if (hyp_attr->store) + return hyp_attr->store(hyp_attr, buffer, len); + return 0; +} + +static struct sysfs_ops hyp_sysfs_ops = { + .show = hyp_sysfs_show, + .store = hyp_sysfs_store, +}; + +static struct kobj_type hyp_sysfs_kobj_type = { + .sysfs_ops = &hyp_sysfs_ops, +}; + +static int __init hypervisor_subsys_init(void) +{ + if (!xen_domain()) + return -ENODEV; + + hypervisor_kobj->ktype = &hyp_sysfs_kobj_type; + return 0; +} +device_initcall(hypervisor_subsys_init); diff --git a/include/xen/interface/version.h b/include/xen/interface/version.h index 453235e923f..e8b6519d47e 100644 --- a/include/xen/interface/version.h +++ b/include/xen/interface/version.h @@ -57,4 +57,7 @@ struct xen_feature_info { /* Declares the features reported by XENVER_get_features. */ #include "features.h" +/* arg == NULL; returns host memory page size. */ +#define XENVER_pagesize 7 + #endif /* __XEN_PUBLIC_VERSION_H__ */ From a649b720614d5675dc402bef75a92576143fede7 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 10 Mar 2009 17:17:41 -0700 Subject: [PATCH 0150/2061] xen/sys/hypervisor: change writable_pt to features /sys/hypervisor/properties/writable_pt was misnamed. Rename to features, expressed as a bit array in hex. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/sys-hypervisor.c | 41 +++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 15 deletions(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index cb29d1ccf0d..1267d6fcc0c 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -293,37 +293,48 @@ static ssize_t pagesize_show(struct hyp_sysfs_attr *attr, char *buffer) HYPERVISOR_ATTR_RO(pagesize); -/* eventually there will be several more features to export */ static ssize_t xen_feature_show(int index, char *buffer) { - int ret = -ENOMEM; - struct xen_feature_info *info; + ssize_t ret; + struct xen_feature_info info; - info = kmalloc(sizeof(struct xen_feature_info), GFP_KERNEL); - if (info) { - info->submap_idx = index; - ret = HYPERVISOR_xen_version(XENVER_get_features, info); - if (!ret) - ret = sprintf(buffer, "%d\n", info->submap); - kfree(info); - } + info.submap_idx = index; + ret = HYPERVISOR_xen_version(XENVER_get_features, &info); + if (!ret) + ret = sprintf(buffer, "%08x", info.submap); return ret; } -static ssize_t writable_pt_show(struct hyp_sysfs_attr *attr, char *buffer) +static ssize_t features_show(struct hyp_sysfs_attr *attr, char *buffer) { - return xen_feature_show(XENFEAT_writable_page_tables, buffer); + ssize_t len; + int i; + + len = 0; + for (i = XENFEAT_NR_SUBMAPS-1; i >= 0; i--) { + int ret = xen_feature_show(i, buffer + len); + if (ret < 0) { + if (len == 0) + len = ret; + break; + } + len += ret; + } + if (len > 0) + buffer[len++] = '\n'; + + return len; } -HYPERVISOR_ATTR_RO(writable_pt); +HYPERVISOR_ATTR_RO(features); static struct attribute *xen_properties_attrs[] = { &capabilities_attr.attr, &changeset_attr.attr, &virtual_start_attr.attr, &pagesize_attr.attr, - &writable_pt_attr.attr, + &features_attr.attr, NULL }; From f0783708bf63a2827863cf2be57c08a24843e6bd Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Wed, 11 Mar 2009 10:19:54 +0000 Subject: [PATCH 0151/2061] xen: drop kexec bits from /sys/hypervisor since kexec isn't implemented yet I needed this to compile since there is no kexec yet in pvops kernel CC drivers/xen/sys-hypervisor.o drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_init': drivers/xen/sys-hypervisor.c:405: error: 'vmcoreinfo_size_xen' undeclared (first use in this function) drivers/xen/sys-hypervisor.c:405: error: (Each undeclared identifier is reported only once drivers/xen/sys-hypervisor.c:405: error: for each function it appears in.) drivers/xen/sys-hypervisor.c:406: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_init' drivers/xen/sys-hypervisor.c: In function 'hyper_sysfs_exit': drivers/xen/sys-hypervisor.c:433: error: 'vmcoreinfo_size_xen' undeclared (first use in this function) drivers/xen/sys-hypervisor.c:434: error: implicit declaration of function 'xen_sysfs_vmcoreinfo_destroy' Signed-off-by: Ian Campbell --- drivers/xen/sys-hypervisor.c | 41 ------------------------------------ 1 file changed, 41 deletions(-) diff --git a/drivers/xen/sys-hypervisor.c b/drivers/xen/sys-hypervisor.c index 1267d6fcc0c..88a60e03ccf 100644 --- a/drivers/xen/sys-hypervisor.c +++ b/drivers/xen/sys-hypervisor.c @@ -353,32 +353,6 @@ static void xen_properties_destroy(void) sysfs_remove_group(hypervisor_kobj, &xen_properties_group); } -#ifdef CONFIG_KEXEC - -extern size_t vmcoreinfo_size_xen; -extern unsigned long paddr_vmcoreinfo_xen; - -static ssize_t vmcoreinfo_show(struct hyp_sysfs_attr *attr, char *page) -{ - return sprintf(page, "%lx %zx\n", - paddr_vmcoreinfo_xen, vmcoreinfo_size_xen); -} - -HYPERVISOR_ATTR_RO(vmcoreinfo); - -static int __init xen_sysfs_vmcoreinfo_init(void) -{ - return sysfs_create_file(hypervisor_kobj, - &vmcoreinfo_attr.attr); -} - -static void xen_sysfs_vmcoreinfo_destroy(void) -{ - sysfs_remove_file(hypervisor_kobj, &vmcoreinfo_attr.attr); -} - -#endif - static int __init hyper_sysfs_init(void) { int ret; @@ -401,20 +375,9 @@ static int __init hyper_sysfs_init(void) ret = xen_properties_init(); if (ret) goto prop_out; -#ifdef CONFIG_KEXEC - if (vmcoreinfo_size_xen != 0) { - ret = xen_sysfs_vmcoreinfo_init(); - if (ret) - goto vmcoreinfo_out; - } -#endif goto out; -#ifdef CONFIG_KEXEC -vmcoreinfo_out: -#endif - xen_properties_destroy(); prop_out: xen_sysfs_uuid_destroy(); uuid_out: @@ -429,10 +392,6 @@ out: static void __exit hyper_sysfs_exit(void) { -#ifdef CONFIG_KEXEC - if (vmcoreinfo_size_xen != 0) - xen_sysfs_vmcoreinfo_destroy(); -#endif xen_properties_destroy(); xen_compilation_destroy(); xen_sysfs_uuid_destroy(); From 818fd20673df82031e604bb784d836f1fc2e2451 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 6 Feb 2009 18:46:47 -0800 Subject: [PATCH 0152/2061] xen: add "capabilities" file The xenfs capabilities file allows usermode to determine what capabilities the domain has. The only one at present is "control_d" in a privileged domain. Signed-off-by: Jeremy Fitzhardinge --- drivers/xen/xenfs/super.c | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/drivers/xen/xenfs/super.c b/drivers/xen/xenfs/super.c index 515741a8e6b..6559e0c752c 100644 --- a/drivers/xen/xenfs/super.c +++ b/drivers/xen/xenfs/super.c @@ -20,10 +20,27 @@ MODULE_DESCRIPTION("Xen filesystem"); MODULE_LICENSE("GPL"); +static ssize_t capabilities_read(struct file *file, char __user *buf, + size_t size, loff_t *off) +{ + char *tmp = ""; + + if (xen_initial_domain()) + tmp = "control_d\n"; + + return simple_read_from_buffer(buf, size, off, tmp, strlen(tmp)); +} + +static const struct file_operations capabilities_file_ops = { + .read = capabilities_read, +}; + static int xenfs_fill_super(struct super_block *sb, void *data, int silent) { static struct tree_descr xenfs_files[] = { - [2] = {"xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR}, + [1] = {}, + { "xenbus", &xenbus_file_ops, S_IRUSR|S_IWUSR }, + { "capabilities", &capabilities_file_ops, S_IRUGO }, {""}, }; From 8a6f83afd0c5355db6d11394a798e94950306239 Mon Sep 17 00:00:00 2001 From: KaiGai Kohei Date: Wed, 1 Apr 2009 10:07:57 +0900 Subject: [PATCH 0153/2061] Permissive domain in userspace object manager This patch enables applications to handle permissive domain correctly. Since the v2.6.26 kernel, SELinux has supported an idea of permissive domain which allows certain processes to work as if permissive mode, even if the global setting is enforcing mode. However, we don't have an application program interface to inform what domains are permissive one, and what domains are not. It means applications focuses on SELinux (XACE/SELinux, SE-PostgreSQL and so on) cannot handle permissive domain correctly. This patch add the sixth field (flags) on the reply of the /selinux/access interface which is used to make an access control decision from userspace. If the first bit of the flags field is positive, it means the required access control decision is on permissive domain, so application should allow any required actions, as the kernel doing. This patch also has a side benefit. The av_decision.flags is set at context_struct_compute_av(). It enables to check required permissions without read_lock(&policy_rwlock). Signed-off-by: KaiGai Kohei Acked-by: Stephen Smalley Acked-by: Eric Paris -- security/selinux/avc.c | 2 +- security/selinux/include/security.h | 4 +++- security/selinux/selinuxfs.c | 4 ++-- security/selinux/ss/services.c | 30 +++++------------------------- 4 files changed, 11 insertions(+), 29 deletions(-) Signed-off-by: James Morris --- security/selinux/avc.c | 2 +- security/selinux/include/security.h | 4 +++- security/selinux/selinuxfs.c | 4 ++-- security/selinux/ss/services.c | 30 +++++------------------------ 4 files changed, 11 insertions(+), 29 deletions(-) diff --git a/security/selinux/avc.c b/security/selinux/avc.c index 7f9b5fac877..b2ab6085983 100644 --- a/security/selinux/avc.c +++ b/security/selinux/avc.c @@ -927,7 +927,7 @@ int avc_has_perm_noaudit(u32 ssid, u32 tsid, if (denied) { if (flags & AVC_STRICT) rc = -EACCES; - else if (!selinux_enforcing || security_permissive_sid(ssid)) + else if (!selinux_enforcing || (avd->flags & AVD_FLAGS_PERMISSIVE)) avc_update_node(AVC_CALLBACK_GRANT, requested, ssid, tsid, tclass, avd->seqno); else diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index 5c3434f7626..a7be3f01fb0 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -91,9 +91,11 @@ struct av_decision { u32 auditallow; u32 auditdeny; u32 seqno; + u32 flags; }; -int security_permissive_sid(u32 sid); +/* definitions of av_decision.flags */ +#define AVD_FLAGS_PERMISSIVE 0x0001 int security_compute_av(u32 ssid, u32 tsid, u16 tclass, u32 requested, diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 2d5136ec3d5..8d4007fbe0e 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -527,10 +527,10 @@ static ssize_t sel_write_access(struct file *file, char *buf, size_t size) goto out2; length = scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, - "%x %x %x %x %u", + "%x %x %x %x %u %x", avd.allowed, 0xffffffff, avd.auditallow, avd.auditdeny, - avd.seqno); + avd.seqno, avd.flags); out2: kfree(tcon); out: diff --git a/security/selinux/ss/services.c b/security/selinux/ss/services.c index deeec6c013a..500e6f78e11 100644 --- a/security/selinux/ss/services.c +++ b/security/selinux/ss/services.c @@ -410,6 +410,7 @@ static int context_struct_compute_av(struct context *scontext, avd->auditallow = 0; avd->auditdeny = 0xffffffff; avd->seqno = latest_granting; + avd->flags = 0; /* * Check for all the invalid cases. @@ -528,31 +529,6 @@ inval_class: return 0; } -/* - * Given a sid find if the type has the permissive flag set - */ -int security_permissive_sid(u32 sid) -{ - struct context *context; - u32 type; - int rc; - - read_lock(&policy_rwlock); - - context = sidtab_search(&sidtab, sid); - BUG_ON(!context); - - type = context->type; - /* - * we are intentionally using type here, not type-1, the 0th bit may - * someday indicate that we are globally setting permissive in policy. - */ - rc = ebitmap_get_bit(&policydb.permissive_map, type); - - read_unlock(&policy_rwlock); - return rc; -} - static int security_validtrans_handle_fail(struct context *ocontext, struct context *ncontext, struct context *tcontext, @@ -767,6 +743,10 @@ int security_compute_av(u32 ssid, rc = context_struct_compute_av(scontext, tcontext, tclass, requested, avd); + + /* permissive domain? */ + if (ebitmap_get_bit(&policydb.permissive_map, scontext->type)) + avd->flags |= AVD_FLAGS_PERMISSIVE; out: read_unlock(&policy_rwlock); return rc; From 53152f957d4a5dfd537d17c823afeb1a2c03753e Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Thu, 2 Apr 2009 13:24:28 +0100 Subject: [PATCH 0154/2061] xen: honour VCPU availability on boot If a VM is booted with offline VCPUs then unplug them during boot. Determining the availability of a VCPU requires access to XenStore which is not available at the point smp_prepare_cpus() is called, therefore we bring up all VCPUS initially and unplug the offline ones as soon as XenStore becomes available. Signed-off-by: Ian Campbell --- drivers/xen/cpu_hotplug.c | 40 +++++++++++++++++++++++++++++---------- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/drivers/xen/cpu_hotplug.c b/drivers/xen/cpu_hotplug.c index 974f56d1ebe..411cb1fc927 100644 --- a/drivers/xen/cpu_hotplug.c +++ b/drivers/xen/cpu_hotplug.c @@ -21,29 +21,41 @@ static void disable_hotplug_cpu(int cpu) cpu_clear(cpu, cpu_present_map); } -static void vcpu_hotplug(unsigned int cpu) +static int vcpu_online(unsigned int cpu) { int err; char dir[32], state[32]; - if (!cpu_possible(cpu)) - return; - sprintf(dir, "cpu/%u", cpu); err = xenbus_scanf(XBT_NIL, dir, "availability", "%s", state); if (err != 1) { printk(KERN_ERR "XENBUS: Unable to read cpu state\n"); - return; + return err; } - if (strcmp(state, "online") == 0) { + if (strcmp(state, "online") == 0) + return 1; + else if (strcmp(state, "offline") == 0) + return 0; + + printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", state, cpu); + return -EINVAL; +} +static void vcpu_hotplug(unsigned int cpu) +{ + if (!cpu_possible(cpu)) + return; + + switch (vcpu_online(cpu)) { + case 1: enable_hotplug_cpu(cpu); - } else if (strcmp(state, "offline") == 0) { + break; + case 0: (void)cpu_down(cpu); disable_hotplug_cpu(cpu); - } else { - printk(KERN_ERR "XENBUS: unknown state(%s) on CPU%d\n", - state, cpu); + break; + default: + break; } } @@ -64,12 +76,20 @@ static void handle_vcpu_hotplug_event(struct xenbus_watch *watch, static int setup_cpu_watcher(struct notifier_block *notifier, unsigned long event, void *data) { + int cpu; static struct xenbus_watch cpu_watch = { .node = "cpu", .callback = handle_vcpu_hotplug_event}; (void)register_xenbus_watch(&cpu_watch); + for_each_possible_cpu(cpu) { + if (vcpu_online(cpu) == 0) { + (void)cpu_down(cpu); + cpu_clear(cpu, cpu_present_map); + } + } + return NOTIFY_DONE; } From 3d43321b7015387cfebbe26436d0e9d299162ea1 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Thu, 2 Apr 2009 15:49:29 -0700 Subject: [PATCH 0155/2061] modules: sysctl to block module loading Implement a sysctl file that disables module-loading system-wide since there is no longer a viable way to remove CAP_SYS_MODULE after the system bounding capability set was removed in 2.6.25. Value can only be set to "1", and is tested only if standard capability checks allow CAP_SYS_MODULE. Given existing /dev/mem protections, this should allow administrators a one-way method to block module loading after initial boot-time module loading has finished. Signed-off-by: Kees Cook Acked-by: Serge Hallyn Signed-off-by: James Morris --- Documentation/sysctl/kernel.txt | 11 +++++++++++ kernel/module.c | 7 +++++-- kernel/sysctl.c | 12 ++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index a4ccdd1981c..02b13495627 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -30,6 +30,7 @@ show up in /proc/sys/kernel: - kstack_depth_to_print [ X86 only ] - l2cr [ PPC only ] - modprobe ==> Documentation/debugging-modules.txt +- modules_disabled - msgmax - msgmnb - msgmni @@ -179,6 +180,16 @@ kernel stack. ============================================================== +modules_disabled: + +A toggle value indicating if modules are allowed to be loaded +in an otherwise modular kernel. This toggle defaults to off +(0), but can be set true (1). Once true, modules can be +neither loaded nor unloaded, and the toggle cannot be set back +to false. + +============================================================== + osrelease, ostype & version: # cat osrelease diff --git a/kernel/module.c b/kernel/module.c index f77ac320d0b..eeb3f7b1383 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -778,6 +778,9 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } +/* Block module loading/unloading? */ +int modules_disabled = 0; + SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { @@ -785,7 +788,7 @@ SYSCALL_DEFINE2(delete_module, const char __user *, name_user, char name[MODULE_NAME_LEN]; int ret, forced = 0; - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; if (strncpy_from_user(name, name_user, MODULE_NAME_LEN-1) < 0) @@ -2349,7 +2352,7 @@ SYSCALL_DEFINE3(init_module, void __user *, umod, int ret = 0; /* Must have permission */ - if (!capable(CAP_SYS_MODULE)) + if (!capable(CAP_SYS_MODULE) || modules_disabled) return -EPERM; /* Only one module load at a time, please */ diff --git a/kernel/sysctl.c b/kernel/sysctl.c index c5ef44ff850..2fb4246d27d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -113,6 +113,7 @@ static int ngroups_max = NGROUPS_MAX; #ifdef CONFIG_MODULES extern char modprobe_path[]; +extern int modules_disabled; #endif #ifdef CONFIG_CHR_DEV_SG extern int sg_big_buff; @@ -533,6 +534,17 @@ static struct ctl_table kern_table[] = { .proc_handler = &proc_dostring, .strategy = &sysctl_string, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "modules_disabled", + .data = &modules_disabled, + .maxlen = sizeof(int), + .mode = 0644, + /* only handle a transition from default "0" to "1" */ + .proc_handler = &proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &one, + }, #endif #if defined(CONFIG_HOTPLUG) && defined(CONFIG_NET) { From b5f22a59c0356655a501190959db9f7f5dd07e3f Mon Sep 17 00:00:00 2001 From: "Serge E. Hallyn" Date: Thu, 2 Apr 2009 18:47:14 -0500 Subject: [PATCH 0156/2061] don't raise all privs on setuid-root file with fE set (v2) Distributions face a backward compatibility problem with starting to use file capabilities. For instance, removing setuid root from ping and doing setcap cap_net_raw=pe means that booting with an older kernel or one compiled without file capabilities means ping won't work for non-root users. In order to replace the setuid root bit on a capability-unaware program, one has to set the effective, or legacy, file capability, which makes the capability effective immediately. This patch uses the legacy bit as a queue to not automatically add full privilege to a setuid-root program. So, with this patch, an ordinary setuid-root program will run with privilege. But if /bin/ping has both setuid-root and cap_net_raw in fP and fE, then ping (when run by non-root user) will not run with only cap_net_raw. Changelog: Apr 2 2009: Print a message once when such a binary is loaded, as per James Morris' suggestion. Apr 2 2009: Fix the condition to only catch uid!=0 && euid==0. Signed-off-by: Serge E. Hallyn Acked-by: Casey Schaufler Signed-off-by: James Morris --- security/commoncap.c | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/security/commoncap.c b/security/commoncap.c index 7cd61a5f520..97ac1f16771 100644 --- a/security/commoncap.c +++ b/security/commoncap.c @@ -28,6 +28,28 @@ #include #include +/* + * If a non-root user executes a setuid-root binary in + * !secure(SECURE_NOROOT) mode, then we raise capabilities. + * However if fE is also set, then the intent is for only + * the file capabilities to be applied, and the setuid-root + * bit is left on either to change the uid (plausible) or + * to get full privilege on a kernel without file capabilities + * support. So in that case we do not raise capabilities. + * + * Warn if that happens, once per boot. + */ +static void warn_setuid_and_fcaps_mixed(char *fname) +{ + static int warned; + if (!warned) { + printk(KERN_INFO "warning: `%s' has both setuid-root and" + " effective capabilities. Therefore not raising all" + " capabilities.\n", fname); + warned = 1; + } +} + int cap_netlink_send(struct sock *sk, struct sk_buff *skb) { NETLINK_CB(skb).eff_cap = current_cap(); @@ -463,6 +485,15 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) return ret; if (!issecure(SECURE_NOROOT)) { + /* + * If the legacy file capability is set, then don't set privs + * for a setuid root binary run by a non-root user. Do set it + * for a root user just to cause least surprise to an admin. + */ + if (effective && new->uid != 0 && new->euid == 0) { + warn_setuid_and_fcaps_mixed(bprm->filename); + goto skip; + } /* * To support inheritance of root-permissions and suid-root * executables under compatibility mode, we override the @@ -478,6 +509,7 @@ int cap_bprm_set_creds(struct linux_binprm *bprm) if (new->euid == 0) effective = true; } +skip: /* Don't let someone trace a set[ug]id/setpcap binary with the revised * credentials unless they have the appropriate permit From 31c9a24ec82926fcae49483e53566d231e705057 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 2 Apr 2009 21:06:25 -0700 Subject: [PATCH 0157/2061] RCU: make treercu be default Impact: switch default config from CLASSIC_RCU to TREE_RCU Given that I have not gotten any complaints or bug reports on treercu recently, this patch makes it be the default. There are a number of other defconfig files that explicitly call out CLASSIC_RCU, but which have comment headers saying not to edit them. Probably holdovers from one of the flavors of "make config", but... Signed-off-by: Paul E. McKenney Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: niv@us.ibm.com Cc: manfred@colorfullife.com Cc: peterz@infradead.org LKML-Reference: <20090403040625.GA9473@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- init/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/init/Kconfig b/init/Kconfig index 14c483d2b7c..26ac8772464 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -302,7 +302,7 @@ menu "RCU Subsystem" choice prompt "RCU Implementation" - default CLASSIC_RCU + default TREE_RCU config CLASSIC_RCU bool "Classic RCU" From 595258aaeac4cc6e187b98b1bf29bb176febe763 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:28 +0100 Subject: [PATCH 0158/2061] perf_counter: x86: fix 32-bit irq_period assumption No need to assume the irq_period is 32bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 155bc3c239b..1cedc3468ce 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -449,7 +449,7 @@ __hw_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s32 period = hwc->irq_period; + s64 period = hwc->irq_period; int err; /* From 755642322aa66fbc5421a35fd3e1733f73e20083 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:29 +0100 Subject: [PATCH 0159/2061] perf_counter: use list_move_tail() Instead of del/add use a move list-op. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b2e838959f3..0fe22c916e2 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -89,8 +89,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_for_each_entry_safe(sibling, tmp, &counter->sibling_list, list_entry) { - list_del_init(&sibling->list_entry); - list_add_tail(&sibling->list_entry, &ctx->counter_list); + list_move_tail(&sibling->list_entry, &ctx->counter_list); sibling->group_leader = sibling; } } @@ -959,8 +958,7 @@ static void rotate_ctx(struct perf_counter_context *ctx) */ perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - list_del(&counter->list_entry); - list_add_tail(&counter->list_entry, &ctx->counter_list); + list_move_tail(&counter->list_entry, &ctx->counter_list); break; } hw_perf_restore(perf_flags); From 60b3df9c1e24a18aabb412da9905208c5f04ebea Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:30 +0100 Subject: [PATCH 0160/2061] perf_counter: add comment to barrier We need to ensure the enabled=0 write happens before we start disabling the actual counters, so that a pcm_amd_enable() will not enable one underneath us. I think the race is impossible anyway, we always balance the ops within any one context and perform enable() with IRQs disabled. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1cedc3468ce..a2e3b76bfdc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -247,6 +247,10 @@ static u64 pmc_amd_save_disable_all(void) enabled = cpuc->enabled; cpuc->enabled = 0; + /* + * ensure we write the disable before we start disabling the + * counters proper, so that pcm_amd_enable() does the right thing. + */ barrier(); for (idx = 0; idx < nr_counters_generic; idx++) { From 82bae4f8c2fd64a2bb1e2e72c508853ed2b4a299 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:31 +0100 Subject: [PATCH 0161/2061] perf_counter: x86: use ULL postfix for 64bit constants Fix a build warning on 32bit machines by explicitly marking the constants as 64-bit. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a2e3b76bfdc..22dab06c08a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -84,9 +84,9 @@ static u64 pmc_intel_event_map(int event) static u64 pmc_intel_raw_event(u64 event) { -#define CORE_EVNTSEL_EVENT_MASK 0x000000FF -#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00 -#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000 +#define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL +#define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ @@ -116,9 +116,9 @@ static u64 pmc_amd_event_map(int event) static u64 pmc_amd_raw_event(u64 event) { -#define K7_EVNTSEL_EVENT_MASK 0x7000000FF -#define K7_EVNTSEL_UNIT_MASK 0x00000FF00 -#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000 +#define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL +#define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ From 15dbf27cc18559a14e99609f78678aa86b9c6ff1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:32 +0100 Subject: [PATCH 0162/2061] perf_counter: software counter event infrastructure Provide generic software counter infrastructure that supports software events. This will be used to allow sample based profiling based on software events such as pagefaults. The current infrastructure can only provide a count of such events, no place information. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 +- kernel/perf_counter.c | 201 +++++++++++++++++++++++++++++++++++ 2 files changed, 208 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dde564517b6..3fefc3b8150 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -126,6 +126,7 @@ struct hw_perf_counter { unsigned long counter_base; int nmi; unsigned int idx; + atomic64_t count; /* software */ atomic64_t prev_count; u64 irq_period; atomic64_t period_left; @@ -283,6 +284,8 @@ static inline int is_software_counter(struct perf_counter *counter) return !counter->hw_event.raw && counter->hw_event.type < 0; } +extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -295,10 +298,13 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_notify(struct pt_regs *regs) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } +static inline void hw_perf_restore(u64 ctrl) { } static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } + +static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0fe22c916e2..eeb1b46cf70 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1328,6 +1328,185 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Generic software counter infrastructure + */ + +static void perf_swcounter_update(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + u64 prev, now; + s64 delta; + +again: + prev = atomic64_read(&hwc->prev_count); + now = atomic64_read(&hwc->count); + if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + goto again; + + delta = now - prev; + + atomic64_add(delta, &counter->count); + atomic64_sub(delta, &hwc->period_left); +} + +static void perf_swcounter_set_period(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + s64 left = atomic64_read(&hwc->period_left); + s64 period = hwc->irq_period; + + if (unlikely(left <= -period)) { + left = period; + atomic64_set(&hwc->period_left, left); + } + + if (unlikely(left <= 0)) { + left += period; + atomic64_add(period, &hwc->period_left); + } + + atomic64_set(&hwc->prev_count, -left); + atomic64_set(&hwc->count, -left); +} + +static void perf_swcounter_save_and_restart(struct perf_counter *counter) +{ + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); +} + +static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_swcounter_handle_group(struct perf_counter *sibling) +{ + struct perf_counter *counter, *group_leader = sibling->group_leader; + + list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { + perf_swcounter_update(counter); + perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); + } +} + +static void perf_swcounter_interrupt(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + break; + + case PERF_RECORD_IRQ: + perf_swcounter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_swcounter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + } else + wake_up(&counter->waitq); +} + +static int perf_swcounter_match(struct perf_counter *counter, + enum hw_event_types event, + struct pt_regs *regs) +{ + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + return 0; + + if (counter->hw_event.raw) + return 0; + + if (counter->hw_event.type != event) + return 0; + + if (counter->hw_event.exclude_user && user_mode(regs)) + return 0; + + if (counter->hw_event.exclude_kernel && !user_mode(regs)) + return 0; + + return 1; +} + +static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, + enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_counter *counter; + unsigned long flags; + int neg; + + if (list_empty(&ctx->counter_list)) + return; + + spin_lock_irqsave(&ctx->lock, flags); + + /* + * XXX: make counter_list RCU safe + */ + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (perf_swcounter_match(counter, event, regs)) { + neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_interrupt(counter, nmi, regs); + } + } + + spin_unlock_irqrestore(&ctx->lock, flags); +} + +void perf_swcounter_event(enum hw_event_types event, u64 nr, + int nmi, struct pt_regs *regs) +{ + struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + + perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); + if (cpuctx->task_ctx) + perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + + put_cpu_var(perf_cpu_context); +} + +static void perf_swcounter_read(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +static int perf_swcounter_enable(struct perf_counter *counter) +{ + perf_swcounter_set_period(counter); + return 0; +} + +static void perf_swcounter_disable(struct perf_counter *counter) +{ + perf_swcounter_update(counter); +} + +/* + * Software counter: cpu wall time clock + */ + static int cpu_clock_perf_counter_enable(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1364,6 +1543,10 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { .read = cpu_clock_perf_counter_read, }; +/* + * Software counter: task time clock + */ + /* * Called from within the scheduler: */ @@ -1420,6 +1603,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; +/* + * Software counter: page faults + */ + #ifdef CONFIG_VM_EVENT_COUNTERS #define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] #else @@ -1473,6 +1660,10 @@ static const struct hw_perf_counter_ops perf_ops_page_faults = { .read = page_faults_perf_counter_read, }; +/* + * Software counter: context switches + */ + static u64 get_context_switches(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1521,6 +1712,10 @@ static const struct hw_perf_counter_ops perf_ops_context_switches = { .read = context_switches_perf_counter_read, }; +/* + * Software counter: cpu migrations + */ + static inline u64 get_cpu_migrations(struct perf_counter *counter) { struct task_struct *curr = counter->ctx->task; @@ -1572,7 +1767,9 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { + struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct hw_perf_counter_ops *hw_ops = NULL; + struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -1618,6 +1815,10 @@ sw_perf_counter_init(struct perf_counter *counter) default: break; } + + if (hw_ops) + hwc->irq_period = hw_event->irq_period; + return hw_ops; } From 7dd1fcc258b65da718f01e4684a7b9244501a9fb Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:33 +0100 Subject: [PATCH 0163/2061] perf_counter: provide pagefault software events We use the generic software counter infrastructure to provide page fault events. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 3 +++ arch/x86/mm/fault.c | 3 +++ kernel/perf_counter.c | 53 +++-------------------------------------- 3 files changed, 9 insertions(+), 50 deletions(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 76993941cac..eda5b0ca4af 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -170,6 +171,8 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the * kernel and should generate an OOPS. Unfortunately, in the case of an diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index a03b7279efa..c8725752b6c 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -27,6 +27,7 @@ #include #include #include +#include #include @@ -1044,6 +1045,8 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + /* * If we're in an interrupt, have no user context or are running * in an atomic region then we must not take the fault: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index eeb1b46cf70..1773c5d7427 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1607,57 +1607,10 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { * Software counter: page faults */ -#ifdef CONFIG_VM_EVENT_COUNTERS -#define cpu_page_faults() __get_cpu_var(vm_event_states).event[PGFAULT] -#else -#define cpu_page_faults() 0 -#endif - -static u64 get_page_faults(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->maj_flt + curr->min_flt; - return cpu_page_faults(); -} - -static void page_faults_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_page_faults(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void page_faults_perf_counter_read(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - -static int page_faults_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, get_page_faults(counter)); - return 0; -} - -static void page_faults_perf_counter_disable(struct perf_counter *counter) -{ - page_faults_perf_counter_update(counter); -} - static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = page_faults_perf_counter_enable, - .disable = page_faults_perf_counter_disable, - .read = page_faults_perf_counter_read, + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, }; /* From ac17dc8e58f3069ea895cfff963adf98ff3cf6b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:34 +0100 Subject: [PATCH 0164/2061] perf_counter: provide major/minor page fault software events Provide separate sw counters for major and minor page faults. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/powerpc/mm/fault.c | 5 ++++- arch/x86/mm/fault.c | 7 +++++-- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 22 +++++++++------------- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index eda5b0ca4af..17bbf6f91fb 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -312,6 +312,7 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -319,8 +320,10 @@ good_area: preempt_enable(); } #endif - } else + } else { current->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index c8725752b6c..f2d3324d921 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1140,10 +1140,13 @@ good_area: return; } - if (fault & VM_FAULT_MAJOR) + if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - else + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + } else { tsk->min_flt++; + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 3fefc3b8150..4b14a8e9dbf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -49,8 +49,10 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS = -3, PERF_COUNT_CONTEXT_SWITCHES = -4, PERF_COUNT_CPU_MIGRATIONS = -5, + PERF_COUNT_PAGE_FAULTS_MIN = -6, + PERF_COUNT_PAGE_FAULTS_MAJ = -7, - PERF_SW_EVENTS_MIN = -6, + PERF_SW_EVENTS_MIN = -8, }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 1773c5d7427..68950a3a52b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1503,6 +1503,12 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } +static const struct hw_perf_counter_ops perf_ops_generic = { + .enable = perf_swcounter_enable, + .disable = perf_swcounter_disable, + .read = perf_swcounter_read, +}; + /* * Software counter: cpu wall time clock */ @@ -1603,16 +1609,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: page faults - */ - -static const struct hw_perf_counter_ops perf_ops_page_faults = { - .enable = perf_swcounter_enable, - .disable = perf_swcounter_disable, - .read = perf_swcounter_read, -}; - /* * Software counter: context switches */ @@ -1753,9 +1749,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_clock; break; case PERF_COUNT_PAGE_FAULTS: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel)) - hw_ops = &perf_ops_page_faults; + case PERF_COUNT_PAGE_FAULTS_MIN: + case PERF_COUNT_PAGE_FAULTS_MAJ: + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CONTEXT_SWITCHES: if (!counter->hw_event.exclude_kernel) From d6d020e9957745c61285ef3da9f294c5e6801f0f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:35 +0100 Subject: [PATCH 0165/2061] perf_counter: hrtimer based sampling for software time events Use hrtimers to profile timer based sampling for the software time counters. This allows platforms without hardware counter support to still perform sample based profiling. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 20 ++++-- kernel/perf_counter.c | 123 ++++++++++++++++++++++++----------- 2 files changed, 100 insertions(+), 43 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4b14a8e9dbf..dfb4c7ce18b 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -114,6 +114,7 @@ struct perf_counter_hw_event { #include #include #include +#include #include struct task_struct; @@ -123,12 +124,19 @@ struct task_struct; */ struct hw_perf_counter { #ifdef CONFIG_PERF_COUNTERS - u64 config; - unsigned long config_base; - unsigned long counter_base; - int nmi; - unsigned int idx; - atomic64_t count; /* software */ + union { + struct { /* hardware */ + u64 config; + unsigned long config_base; + unsigned long counter_base; + int nmi; + unsigned int idx; + }; + union { /* software */ + atomic64_t count; + struct hrtimer hrtimer; + }; + }; atomic64_t prev_count; u64 irq_period; atomic64_t period_left; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68950a3a52b..f9330d5827c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,7 +1395,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) struct perf_counter *counter, *group_leader = sibling->group_leader; list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - perf_swcounter_update(counter); + counter->hw_ops->read(counter); perf_swcounter_store_irq(sibling, counter->hw_event.type); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } @@ -1404,8 +1404,6 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) static void perf_swcounter_interrupt(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); - switch (counter->hw_event.record_type) { case PERF_RECORD_SIMPLE: break; @@ -1426,6 +1424,38 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, wake_up(&counter->waitq); } +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + struct perf_counter *counter; + struct pt_regs *regs; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->hw_ops->read(counter); + + regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. + */ + if ((counter->hw_event.exclude_kernel || !regs) && + !counter->hw_event.exclude_user) + regs = task_pt_regs(current); + + if (regs) + perf_swcounter_interrupt(counter, 0, regs); + + hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + + return HRTIMER_RESTART; +} + +static void perf_swcounter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_swcounter_save_and_restart(counter); + perf_swcounter_interrupt(counter, nmi, regs); +} + static int perf_swcounter_match(struct perf_counter *counter, enum hw_event_types event, struct pt_regs *regs) @@ -1448,13 +1478,20 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct pt_regs *regs) +{ + int neg = atomic64_add_negative(nr, &counter->hw.count); + if (counter->hw.irq_period && !neg) + perf_swcounter_overflow(counter, nmi, regs); +} + static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum hw_event_types event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; unsigned long flags; - int neg; if (list_empty(&ctx->counter_list)) return; @@ -1465,11 +1502,8 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, * XXX: make counter_list RCU safe */ list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (perf_swcounter_match(counter, event, regs)) { - neg = atomic64_add_negative(nr, &counter->hw.count); - if (counter->hw.irq_period && !neg) - perf_swcounter_interrupt(counter, nmi, regs); - } + if (perf_swcounter_match(counter, event, regs)) + perf_swcounter_add(counter, nr, nmi, regs); } spin_unlock_irqrestore(&ctx->lock, flags); @@ -1513,14 +1547,6 @@ static const struct hw_perf_counter_ops perf_ops_generic = { * Software counter: cpu wall time clock */ -static int cpu_clock_perf_counter_enable(struct perf_counter *counter) -{ - int cpu = raw_smp_processor_id(); - - atomic64_set(&counter->hw.prev_count, cpu_clock(cpu)); - return 0; -} - static void cpu_clock_perf_counter_update(struct perf_counter *counter) { int cpu = raw_smp_processor_id(); @@ -1533,8 +1559,26 @@ static void cpu_clock_perf_counter_update(struct perf_counter *counter) atomic64_add(now - prev, &counter->count); } +static int cpu_clock_perf_counter_enable(struct perf_counter *counter) +{ + struct hw_perf_counter *hwc = &counter->hw; + int cpu = raw_smp_processor_id(); + + atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } + + return 0; +} + static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -1580,27 +1624,33 @@ static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now atomic64_add(delta, &counter->count); } -static void task_clock_perf_counter_read(struct perf_counter *counter) -{ - u64 now = task_clock_perf_counter_val(counter, 1); - - task_clock_perf_counter_update(counter, now); -} - static int task_clock_perf_counter_enable(struct perf_counter *counter) { - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - task_clock_perf_counter_val(counter, 0)); + struct hw_perf_counter *hwc = &counter->hw; + + atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + if (hwc->irq_period) { + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; + __hrtimer_start_range_ns(&hwc->hrtimer, + ns_to_ktime(hwc->irq_period), 0, + HRTIMER_MODE_REL, 0); + } return 0; } static void task_clock_perf_counter_disable(struct perf_counter *counter) { - u64 now = task_clock_perf_counter_val(counter, 0); + hrtimer_cancel(&counter->hw.hrtimer); + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 0)); +} - task_clock_perf_counter_update(counter, now); +static void task_clock_perf_counter_read(struct perf_counter *counter) +{ + task_clock_perf_counter_update(counter, + task_clock_perf_counter_val(counter, 1)); } static const struct hw_perf_counter_ops perf_ops_task_clock = { @@ -1729,16 +1779,12 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (counter->hw_event.type) { case PERF_COUNT_CPU_CLOCK: - if (!(counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv)) - hw_ops = &perf_ops_cpu_clock; + hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: - if (counter->hw_event.exclude_user || - counter->hw_event.exclude_kernel || - counter->hw_event.exclude_hv) - break; /* * If the user instantiates this as a per-cpu counter, * use the cpu_clock counter instead. @@ -1747,6 +1793,9 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_task_clock; else hw_ops = &perf_ops_cpu_clock; + + if (hw_event->irq_period && hw_event->irq_period < 10000) + hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: From 592903cdcbf606a838056bae6d03fc557806c914 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:36 +0100 Subject: [PATCH 0166/2061] perf_counter: add an event_list I noticed that the counter_list only includes top-level counters, thus perf_swcounter_event() will miss sw-counters in groups. Since perf_swcounter_event() also wants an RCU safe list, create a new event_list that includes all counters and uses RCU list ops and use call_rcu to free the counter structure. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++++ kernel/perf_counter.c | 30 +++++++++++++++++++----------- 2 files changed, 23 insertions(+), 11 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index dfb4c7ce18b..08c11a6afeb 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -187,6 +187,7 @@ struct file; struct perf_counter { #ifdef CONFIG_PERF_COUNTERS struct list_head list_entry; + struct list_head event_entry; struct list_head sibling_list; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; @@ -220,6 +221,8 @@ struct perf_counter { struct perf_data *irqdata; struct perf_data *usrdata; struct perf_data data[2]; + + struct rcu_head rcu_head; #endif }; @@ -243,6 +246,7 @@ struct perf_counter_context { struct mutex mutex; struct list_head counter_list; + struct list_head event_list; int nr_counters; int nr_active; int is_active; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f9330d5827c..8d6ecfa64c0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Each CPU has a list of per CPU counters: @@ -72,6 +73,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_tail(&counter->list_entry, &ctx->counter_list); else list_add_tail(&counter->list_entry, &group_leader->sibling_list); + + list_add_rcu(&counter->event_entry, &ctx->event_list); } static void @@ -80,6 +83,7 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) struct perf_counter *sibling, *tmp; list_del_init(&counter->list_entry); + list_del_rcu(&counter->event_entry); /* * If this was a group counter with sibling counters then @@ -1133,6 +1137,14 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) return ctx; } +static void free_counter_rcu(struct rcu_head *head) +{ + struct perf_counter *counter; + + counter = container_of(head, struct perf_counter, rcu_head); + kfree(counter); +} + /* * Called when the last reference to the file is gone. */ @@ -1151,7 +1163,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - kfree(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); put_context(ctx); return 0; @@ -1491,22 +1503,16 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, int nmi, struct pt_regs *regs) { struct perf_counter *counter; - unsigned long flags; - if (list_empty(&ctx->counter_list)) + if (list_empty(&ctx->event_list)) return; - spin_lock_irqsave(&ctx->lock, flags); - - /* - * XXX: make counter_list RCU safe - */ - list_for_each_entry(counter, &ctx->counter_list, list_entry) { + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } - - spin_unlock_irqrestore(&ctx->lock, flags); + rcu_read_unlock(); } void perf_swcounter_event(enum hw_event_types event, u64 nr, @@ -1846,6 +1852,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mutex); INIT_LIST_HEAD(&counter->list_entry); + INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); @@ -1992,6 +1999,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); ctx->task = task; } From 039fc91e064b81c2820ff16c304be5aba35fd126 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 16:43:47 +0100 Subject: [PATCH 0167/2061] perf_counter: fix hrtimer sampling Impact: fix deadlock with perfstat Fix for the perfstat fubar.. We cannot unconditionally call hrtimer_cancel() without ever having done hrtimer_init() on the thing. Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <1236959027.22447.149.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8d6ecfa64c0..d6cc22271ef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1571,9 +1571,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) int cpu = raw_smp_processor_id(); atomic64_set(&hwc->prev_count, cpu_clock(cpu)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(hwc->irq_period), 0, HRTIMER_MODE_REL, 0); @@ -1635,9 +1635,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { - hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - hwc->hrtimer.function = perf_swcounter_hrtimer; __hrtimer_start_range_ns(&hwc->hrtimer, ns_to_ktime(hwc->irq_period), 0, HRTIMER_MODE_REL, 0); From 4e193bd4dfdc983d12969b51439b4a1fbaf2daad Mon Sep 17 00:00:00 2001 From: Tim Blechmann Date: Sat, 14 Mar 2009 14:29:25 +0100 Subject: [PATCH 0168/2061] perf_counter: include missing header Impact: build fix In order to compile a kernel with performance counter patches, has to be included to provide the declaration of struct pt_regs *get_irq_regs(void); [ This bug was masked by unrelated x86 header file changes in the x86 tree, but occurs in the tip:perfcounters/core standalone tree. ] Signed-off-by: Tim Blechmann Orig-LKML-Reference: <20090314142925.49c29c17@thinkpad> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d6cc22271ef..0018c5e8124 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -24,6 +24,8 @@ #include #include +#include + /* * Each CPU has a list of per CPU counters: */ From 7bb497bd885eedd0f56dfe3cc1b5ff20710d33b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Mar 2009 08:59:21 +0100 Subject: [PATCH 0169/2061] perf_counter: fix crash on perfmon v1 systems Impact: fix boot crash on Intel Perfmon Version 1 systems Intel Perfmon v1 does not support the global MSRs, nor does it offer the generalized MSR ranges. So support v2 and later CPUs only. Also mark pmc_ops as read-mostly - to avoid false cacheline sharing. Cc: Paul Mackerras Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 22dab06c08a..6cba9d47b71 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -57,12 +57,14 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops; +static struct pmc_x86_ops *pmc_ops __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; +static __read_mostly int intel_perfmon_version; + /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -613,7 +615,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) { + if (intel_perfmon_version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -930,10 +932,10 @@ static struct pmc_x86_ops pmc_amd_ops = { static struct pmc_x86_ops *pmc_intel_init(void) { - union cpuid10_eax eax; - unsigned int ebx; - unsigned int unused; union cpuid10_edx edx; + union cpuid10_eax eax; + unsigned int unused; + unsigned int ebx; /* * Check whether the Architectural PerfMon supports @@ -943,8 +945,12 @@ static struct pmc_x86_ops *pmc_intel_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return NULL; + intel_perfmon_version = eax.split.version_id; + if (intel_perfmon_version < 2) + return NULL; + pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", eax.split.version_id); + pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); From b6c5a71da1477d261bc36254fe1f20d32b57598d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 16 Mar 2009 21:00:00 +1100 Subject: [PATCH 0170/2061] perf_counter: abstract wakeup flag setting in core to fix powerpc build Impact: build fix for powerpc Commit bd753921015e7905 ("perf_counter: software counter event infrastructure") introduced a use of TIF_PERF_COUNTERS into the core perfcounter code. This breaks the build on powerpc because we use a flag in a per-cpu area to signal wakeups on powerpc rather than a thread_info flag, because the thread_info flags have to be manipulated with atomic operations and are thus slower than per-cpu flags. This fixes the by changing the core to use an abstracted set_perf_counter_pending() function, which is defined on x86 to set the TIF_PERF_COUNTERS flag and on powerpc to set the per-cpu flag (paca->perf_counter_pending). It changes the previous powerpc definition of set_perf_counter_pending to not take an argument and adds a clear_perf_counter_pending, so as to simplify the definition on x86. On x86, set_perf_counter_pending() is defined as a macro. Defining it as a static inline in arch/x86/include/asm/perf_counters.h causes compile failures because gets included early in , and the definitions of set_tsk_thread_flag etc. are therefore not available in . (On powerpc this problem is avoided by defining set_perf_counter_pending etc. in .) Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/hw_irq.h | 14 +++++++++++--- arch/powerpc/kernel/irq.c | 11 +++-------- arch/powerpc/kernel/perf_counter.c | 3 +-- arch/x86/include/asm/perf_counter.h | 3 +++ kernel/perf_counter.c | 2 +- 5 files changed, 19 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index b43076ff92c..cb32d571c9c 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -142,10 +142,17 @@ static inline unsigned long get_perf_counter_pending(void) return x; } -static inline void set_perf_counter_pending(int x) +static inline void set_perf_counter_pending(void) { asm volatile("stb %0,%1(13)" : : - "r" (x), + "r" (1), + "i" (offsetof(struct paca_struct, perf_counter_pending))); +} + +static inline void clear_perf_counter_pending(void) +{ + asm volatile("stb %0,%1(13)" : : + "r" (0), "i" (offsetof(struct paca_struct, perf_counter_pending))); } @@ -158,7 +165,8 @@ static inline unsigned long get_perf_counter_pending(void) return 0; } -static inline void set_perf_counter_pending(int x) {} +static inline void set_perf_counter_pending(void) {} +static inline void clear_perf_counter_pending(void) {} static inline void perf_counter_do_pending(void) {} #endif /* CONFIG_PERF_COUNTERS */ diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 0d2e37c5773..469e9635ff0 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -104,13 +104,6 @@ static inline notrace void set_soft_enabled(unsigned long enable) : : "r" (enable), "i" (offsetof(struct paca_struct, soft_enabled))); } -#ifdef CONFIG_PERF_COUNTERS -notrace void __weak perf_counter_do_pending(void) -{ - set_perf_counter_pending(0); -} -#endif - notrace void raw_local_irq_restore(unsigned long en) { /* @@ -142,8 +135,10 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) + if (get_perf_counter_pending()) { + clear_perf_counter_pending(); perf_counter_do_pending(); + } /* * if (get_paca()->hard_enabled) return; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0e33d27cd46..5008762e8bf 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -653,7 +653,6 @@ void perf_counter_do_pending(void) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; - set_perf_counter_pending(0); for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter && counter->wakeup_pending) { @@ -811,7 +810,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) perf_counter_do_pending(); irq_exit(); } else { - set_perf_counter_pending(1); + set_perf_counter_pending(); } } } diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 2e08ed73664..1662043b340 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,6 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) +#define set_perf_counter_pending() \ + set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); extern void perf_counters_lapic_init(int nmi); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0018c5e8124..b39456ad74a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1433,7 +1433,7 @@ static void perf_swcounter_interrupt(struct perf_counter *counter, if (nmi) { counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); + set_perf_counter_pending(); } else wake_up(&counter->waitq); } From 01ef09d9ffb5ce9f8d62d1e5206da3d5ca612acc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:11 +0100 Subject: [PATCH 0171/2061] perf_counter: fix uninitialized usage of event_list Impact: fix boot crash When doing the generic context switch event I ran into some early boot hangs, which were caused by inf func recursion (event, fault, event, fault). I eventually tracked it down to event_list not being initialized at the time of the first event. Fix this. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.195392657@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 2 ++ kernel/perf_counter.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 219748d0026..ca226a91abe 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -124,6 +124,8 @@ extern struct cred init_cred; # define INIT_PERF_COUNTERS(tsk) \ .perf_counter_ctx.counter_list = \ LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ + .perf_counter_ctx.event_list = \ + LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ .perf_counter_ctx.lock = \ __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), #else diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b39456ad74a..4c4e9eb37ab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1506,7 +1506,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, { struct perf_counter *counter; - if (list_empty(&ctx->event_list)) + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) return; rcu_read_lock(); From 4a0deca657f3dbb8a707b5dc8f173beec01e7ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:12 +0100 Subject: [PATCH 0172/2061] perf_counter: generic context switch event Impact: cleanup Use the generic software events for context switches. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.283522645@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 1 - kernel/perf_counter.c | 60 +++---------------------------------------- kernel/sched.c | 6 ----- 3 files changed, 4 insertions(+), 63 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 75b2fc5306d..7ed41f7c5ac 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -138,7 +138,6 @@ extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); -extern u64 cpu_nr_switches(int cpu); extern u64 cpu_nr_migrations(int cpu); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c4e9eb37ab..99d5930f0a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -710,10 +710,13 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct pt_regs *regs; if (likely(!cpuctx->task_ctx)) return; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1667,58 +1670,6 @@ static const struct hw_perf_counter_ops perf_ops_task_clock = { .read = task_clock_perf_counter_read, }; -/* - * Software counter: context switches - */ - -static u64 get_context_switches(struct perf_counter *counter) -{ - struct task_struct *curr = counter->ctx->task; - - if (curr) - return curr->nvcsw + curr->nivcsw; - return cpu_nr_switches(smp_processor_id()); -} - -static void context_switches_perf_counter_update(struct perf_counter *counter) -{ - u64 prev, now; - s64 delta; - - prev = atomic64_read(&counter->hw.prev_count); - now = get_context_switches(counter); - - atomic64_set(&counter->hw.prev_count, now); - - delta = now - prev; - - atomic64_add(delta, &counter->count); -} - -static void context_switches_perf_counter_read(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static int context_switches_perf_counter_enable(struct perf_counter *counter) -{ - if (counter->prev_state <= PERF_COUNTER_STATE_OFF) - atomic64_set(&counter->hw.prev_count, - get_context_switches(counter)); - return 0; -} - -static void context_switches_perf_counter_disable(struct perf_counter *counter) -{ - context_switches_perf_counter_update(counter); -} - -static const struct hw_perf_counter_ops perf_ops_context_switches = { - .enable = context_switches_perf_counter_enable, - .disable = context_switches_perf_counter_disable, - .read = context_switches_perf_counter_read, -}; - /* * Software counter: cpu migrations */ @@ -1808,11 +1759,8 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: - hw_ops = &perf_ops_generic; - break; case PERF_COUNT_CONTEXT_SWITCHES: - if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_context_switches; + hw_ops = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) diff --git a/kernel/sched.c b/kernel/sched.c index 39e70860216..f76e3c0188a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2900,14 +2900,8 @@ unsigned long nr_active(void) /* * Externally visible per-cpu scheduler statistics: - * cpu_nr_switches(cpu) - number of context switches on that cpu * cpu_nr_migrations(cpu) - number of migrations into that cpu */ -u64 cpu_nr_switches(int cpu) -{ - return cpu_rq(cpu)->nr_switches; -} - u64 cpu_nr_migrations(int cpu) { return cpu_rq(cpu)->nr_migrations_in; From f16009527595ee562308653bc3d0039166d2ab15 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:16 +0100 Subject: [PATCH 0173/2061] perf_counter: fix up counter free paths Impact: fix crash during perfcounters use I found another counter free path, create a free_counter() call to accomodate generic tear-down. Fixes an RCU bug. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.652078652@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 99d5930f0a5..97f891ffeb4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1150,6 +1150,11 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void free_counter(struct perf_counter *counter) +{ + call_rcu(&counter->rcu_head, free_counter_rcu); +} + /* * Called when the last reference to the file is gone. */ @@ -1168,7 +1173,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - call_rcu(&counter->rcu_head, free_counter_rcu); + free_counter(counter); put_context(ctx); return 0; @@ -2128,10 +2133,10 @@ __perf_counter_exit_task(struct task_struct *child, list_entry) { if (sub->parent) { sync_child_counter(sub, sub->parent); - kfree(sub); + free_counter(sub); } } - kfree(child_counter); + free_counter(child_counter); } } From e077df4f439681e43f0db8255b2d215b342ebdc6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:17 +0100 Subject: [PATCH 0174/2061] perf_counter: hook up the tracepoint events Impact: new perfcounters feature Enable usage of tracepoints as perf counter events. tracepoint event ids can be found in /debug/tracing/event/*/*/id and (for now) are represented as -65536+id in the type field. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.744044174@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 +++ init/Kconfig | 5 +++++ kernel/perf_counter.c | 43 ++++++++++++++++++++++++++++++++++++ 3 files changed, 51 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 08c11a6afeb..065984c1ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -53,6 +53,8 @@ enum hw_event_types { PERF_COUNT_PAGE_FAULTS_MAJ = -7, PERF_SW_EVENTS_MIN = -8, + + PERF_TP_EVENTS_MIN = -65536 }; /* @@ -222,6 +224,7 @@ struct perf_counter { struct perf_data *usrdata; struct perf_data data[2]; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif }; diff --git a/init/Kconfig b/init/Kconfig index 38a2ecd47c3..4f647142f2e 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -947,6 +947,11 @@ config PERF_COUNTERS Say Y if unsure. +config EVENT_PROFILE + bool "Tracepoint profile sources" + depends on PERF_COUNTERS && EVENT_TRACER + default y + endmenu config VM_EVENT_COUNTERS diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 97f891ffeb4..0bbe3e45ba0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1152,6 +1152,9 @@ static void free_counter_rcu(struct rcu_head *head) static void free_counter(struct perf_counter *counter) { + if (counter->destroy) + counter->destroy(counter); + call_rcu(&counter->rcu_head, free_counter_rcu); } @@ -1727,6 +1730,45 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { .read = cpu_migrations_perf_counter_read, }; +#ifdef CONFIG_EVENT_PROFILE +void perf_tpcounter_event(int event_id) +{ + perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, + task_pt_regs(current)); +} + +extern int ftrace_profile_enable(int); +extern void ftrace_profile_disable(int); + +static void tp_perf_counter_destroy(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + + ftrace_profile_disable(event_id); +} + +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int ret; + + ret = ftrace_profile_enable(event_id); + if (ret) + return NULL; + + counter->destroy = tp_perf_counter_destroy; + + return &perf_ops_generic; +} +#else +static const struct hw_perf_counter_ops * +tp_perf_counter_init(struct perf_counter *counter) +{ + return NULL; +} +#endif + static const struct hw_perf_counter_ops * sw_perf_counter_init(struct perf_counter *counter) { @@ -1772,6 +1814,7 @@ sw_perf_counter_init(struct perf_counter *counter) hw_ops = &perf_ops_cpu_migrations; break; default: + hw_ops = tp_perf_counter_init(counter); break; } From b8e83514b64577b48bfb794fe85fcde40a9343ca Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:18 +0100 Subject: [PATCH 0175/2061] perf_counter: revamp syscall input ABI Impact: modify ABI The hardware/software classification in hw_event->type became a little strained due to the addition of tracepoint tracing. Instead split up the field and provide a type field to explicitly specify the counter type, while using the event_id field to specify which event to use. Raw counters still work as before, only the raw config now goes into raw_event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.836807573@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 4 +- arch/x86/kernel/cpu/perf_counter.c | 10 ++-- include/linux/perf_counter.h | 95 +++++++++++++++++++----------- kernel/perf_counter.c | 83 +++++++++++++++----------- 4 files changed, 117 insertions(+), 75 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 5008762e8bf..26f69dc7130 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,7 +602,7 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.type; + ev = counter->hw_event.event_id; if (!counter->hw_event.raw) { if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -692,7 +692,7 @@ static void perf_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.type); + perf_store_irq_data(counter, sub->hw_event.event_config); perf_store_irq_data(counter, atomic64_read(&sub->count)); } } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cba9d47b71..d844ae41d5a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw) { - hwc->config |= pmc_ops->raw_event(hw_event->type); + if (hw_event->raw_type) { + hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); } else { - if (hw_event->type >= pmc_ops->max_events) + if (hw_event->event_id >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->type); + hwc->config |= pmc_ops->event_map(hw_event->event_id); } counter->wakeup_pending = 0; @@ -715,7 +715,7 @@ perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.type); + perf_store_irq_data(sibling, counter->hw_event.event_config); perf_store_irq_data(sibling, atomic64_read(&counter->count)); } } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 065984c1ff5..8f939490550 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -21,56 +21,81 @@ */ /* - * Generalized performance counter event types, used by the hw_event.type + * hw_event.type + */ +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, + + /* + * available TYPE space, raw is the max value. + */ + + PERF_TYPE_RAW = 128, +}; + +/* + * Generalized performance counter event types, used by the hw_event.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_types { +enum hw_event_ids { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, - PERF_HW_EVENTS_MAX = 7, + PERF_HW_EVENTS_MAX = 7, +}; - /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): - */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - PERF_COUNT_PAGE_FAULTS_MIN = -6, - PERF_COUNT_PAGE_FAULTS_MAJ = -7, +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, - PERF_SW_EVENTS_MIN = -8, - - PERF_TP_EVENTS_MIN = -65536 + PERF_SW_EVENTS_MAX = 7, }; /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - __s64 type; + union { + struct { + __u64 event_id : 56, + type : 8; + }; + struct { + __u64 raw_event_id : 63, + raw_type : 1; + }; + __u64 event_config; + }; __u64 irq_period; __u64 record_type; @@ -78,7 +103,6 @@ struct perf_counter_hw_event { __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ inherit : 1, /* children inherit it */ pinned : 1, /* must always be on PMU */ exclusive : 1, /* only group on PMU */ @@ -87,7 +111,7 @@ struct perf_counter_hw_event { exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - __reserved_1 : 54; + __reserved_1 : 55; __u32 extra_config_len; __u32 __reserved_4; @@ -298,10 +322,11 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw && counter->hw_event.type < 0; + return !counter->hw_event.raw_type && + counter->hw_event.type != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(enum hw_event_types, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); #else static inline void @@ -320,7 +345,7 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(enum hw_event_types event, u64 nr, +static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0bbe3e45ba0..68a56a68bc7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1395,12 +1395,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_save_and_restart(struct perf_counter *counter) -{ - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); -} - static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) { struct perf_data *irqdata = counter->irqdata; @@ -1421,7 +1415,7 @@ static void perf_swcounter_handle_group(struct perf_counter *sibling) list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.type); + perf_swcounter_store_irq(sibling, counter->hw_event.event_config); perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); } } @@ -1477,21 +1471,25 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - perf_swcounter_save_and_restart(counter); + perf_swcounter_update(counter); + perf_swcounter_set_period(counter); perf_swcounter_interrupt(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, - enum hw_event_types event, - struct pt_regs *regs) + enum perf_event_types type, + u32 event, struct pt_regs *regs) { if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw) + if (counter->hw_event.raw_type) return 0; - if (counter->hw_event.type != event) + if (counter->hw_event.type != type) + return 0; + + if (counter->hw_event.event_id != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1512,8 +1510,8 @@ static void perf_swcounter_add(struct perf_counter *counter, u64 nr, } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, - enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) + enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_counter *counter; @@ -1522,24 +1520,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { - if (perf_swcounter_match(counter, event, regs)) + if (perf_swcounter_match(counter, type, event, regs)) perf_swcounter_add(counter, nr, nmi, regs); } rcu_read_unlock(); } -void perf_swcounter_event(enum hw_event_types event, u64 nr, - int nmi, struct pt_regs *regs) +static void __perf_swcounter_event(enum perf_event_types type, u32 event, + u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); - perf_swcounter_ctx_event(&cpuctx->ctx, event, nr, nmi, regs); - if (cpuctx->task_ctx) - perf_swcounter_ctx_event(cpuctx->task_ctx, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + if (cpuctx->task_ctx) { + perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, + nr, nmi, regs); + } put_cpu_var(perf_cpu_context); } +void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +{ + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); +} + static void perf_swcounter_read(struct perf_counter *counter) { perf_swcounter_update(counter); @@ -1733,8 +1738,12 @@ static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { #ifdef CONFIG_EVENT_PROFILE void perf_tpcounter_event(int event_id) { - perf_swcounter_event(PERF_TP_EVENTS_MIN + event_id, 1, 1, - task_pt_regs(current)); + struct pt_regs *regs = get_irq_regs(); + + if (!regs) + regs = task_pt_regs(current); + + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); } extern int ftrace_profile_enable(int); @@ -1742,15 +1751,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; - - ftrace_profile_disable(event_id); + ftrace_profile_disable(counter->hw_event.event_id); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.type - PERF_TP_EVENTS_MIN; + int event_id = counter->hw_event.event_id; int ret; ret = ftrace_profile_enable(event_id); @@ -1758,6 +1765,7 @@ tp_perf_counter_init(struct perf_counter *counter) return NULL; counter->destroy = tp_perf_counter_destroy; + counter->hw.irq_period = counter->hw_event.irq_period; return &perf_ops_generic; } @@ -1783,7 +1791,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.type) { + switch (counter->hw_event.event_id) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1813,9 +1821,6 @@ sw_perf_counter_init(struct perf_counter *counter) if (!counter->hw_event.exclude_kernel) hw_ops = &perf_ops_cpu_migrations; break; - default: - hw_ops = tp_perf_counter_init(counter); - break; } if (hw_ops) @@ -1870,10 +1875,22 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->state = PERF_COUNTER_STATE_OFF; hw_ops = NULL; - if (!hw_event->raw && hw_event->type < 0) - hw_ops = sw_perf_counter_init(counter); - else + + if (hw_event->raw_type) hw_ops = hw_perf_counter_init(counter); + else switch (hw_event->type) { + case PERF_TYPE_HARDWARE: + hw_ops = hw_perf_counter_init(counter); + break; + + case PERF_TYPE_SOFTWARE: + hw_ops = sw_perf_counter_init(counter); + break; + + case PERF_TYPE_TRACEPOINT: + hw_ops = tp_perf_counter_init(counter); + break; + } if (!hw_ops) { kfree(counter); From 0322cd6ec504b0bf08ca7b2c3d7f43bda37d79c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 19 Mar 2009 20:26:19 +0100 Subject: [PATCH 0176/2061] perf_counter: unify irq output code Impact: cleanup Having 3 slightly different copies of the same code around does nobody any good. First step in revamping the output format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Steven Rostedt Orig-LKML-Reference: <20090319194233.929962222@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 51 +------------- arch/x86/kernel/cpu/perf_counter.c | 53 +-------------- include/linux/perf_counter.h | 2 + kernel/perf_counter.c | 106 +++++++++++++++-------------- 4 files changed, 61 insertions(+), 151 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 26f69dc7130..88b72eb4af1 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -662,41 +662,6 @@ void perf_counter_do_pending(void) } } -/* - * Record data for an irq counter. - * This function was lifted from the x86 code; maybe it should - * go in the core? - */ -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -/* - * Record all the values of the counters in a group - */ -static void perf_handle_group(struct perf_counter *counter) -{ - struct perf_counter *leader, *sub; - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - perf_store_irq_data(counter, sub->hw_event.event_config); - perf_store_irq_data(counter, atomic64_read(&sub->count)); - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -736,20 +701,8 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. */ - if (record) { - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - counter->wakeup_pending = 1; - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter); - counter->wakeup_pending = 1; - break; - } - } + if (record) + perf_counter_output(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d844ae41d5a..902282d68b0 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -674,20 +674,6 @@ static void pmc_generic_disable(struct perf_counter *counter) x86_perf_counter_update(counter, hwc, idx); } -static void perf_store_irq_data(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - /* * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: @@ -704,22 +690,6 @@ static void perf_save_and_restart(struct perf_counter *counter) __pmc_generic_enable(counter, hwc, idx); } -static void -perf_handle_group(struct perf_counter *sibling, u64 *status, u64 *overflown) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - /* - * Store sibling timestamps (if any): - */ - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - - x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); - perf_store_irq_data(sibling, counter->hw_event.event_config); - perf_store_irq_data(sibling, atomic64_read(&counter->count)); - } -} - /* * Maximum interrupt frequency of 100KHz per CPU */ @@ -754,28 +724,7 @@ again: continue; perf_save_and_restart(counter); - - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - continue; - case PERF_RECORD_IRQ: - perf_store_irq_data(counter, instruction_pointer(regs)); - break; - case PERF_RECORD_GROUP: - perf_handle_group(counter, &status, &ack); - break; - } - /* - * From NMI context we cannot call into the scheduler to - * do a task wakeup - but we mark these generic as - * wakeup_pending and initate a wakeup callback: - */ - if (nmi) { - counter->wakeup_pending = 1; - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); - } else { - wake_up(&counter->waitq); - } + perf_counter_output(counter, nmi, regs); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8f939490550..a4b76c0175f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -317,6 +317,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 68a56a68bc7..f054b8c9bf9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1353,6 +1353,60 @@ static const struct file_operations perf_fops = { .compat_ioctl = perf_ioctl, }; +/* + * Output + */ + +static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +{ + struct perf_data *irqdata = counter->irqdata; + + if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { + irqdata->overrun++; + } else { + u64 *p = (u64 *) &irqdata->data[irqdata->len]; + + *p = data; + irqdata->len += sizeof(u64); + } +} + +static void perf_counter_handle_group(struct perf_counter *counter) +{ + struct perf_counter *leader, *sub; + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, atomic64_read(&sub->count)); + } +} + +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + switch (counter->hw_event.record_type) { + case PERF_RECORD_SIMPLE: + return; + + case PERF_RECORD_IRQ: + perf_counter_store_irq(counter, instruction_pointer(regs)); + break; + + case PERF_RECORD_GROUP: + perf_counter_handle_group(counter); + break; + } + + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); +} + /* * Generic software counter infrastructure */ @@ -1395,54 +1449,6 @@ static void perf_swcounter_set_period(struct perf_counter *counter) atomic64_set(&hwc->count, -left); } -static void perf_swcounter_store_irq(struct perf_counter *counter, u64 data) -{ - struct perf_data *irqdata = counter->irqdata; - - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; - - *p = data; - irqdata->len += sizeof(u64); - } -} - -static void perf_swcounter_handle_group(struct perf_counter *sibling) -{ - struct perf_counter *counter, *group_leader = sibling->group_leader; - - list_for_each_entry(counter, &group_leader->sibling_list, list_entry) { - counter->hw_ops->read(counter); - perf_swcounter_store_irq(sibling, counter->hw_event.event_config); - perf_swcounter_store_irq(sibling, atomic64_read(&counter->count)); - } -} - -static void perf_swcounter_interrupt(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - break; - - case PERF_RECORD_IRQ: - perf_swcounter_store_irq(counter, instruction_pointer(regs)); - break; - - case PERF_RECORD_GROUP: - perf_swcounter_handle_group(counter); - break; - } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); -} - static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { struct perf_counter *counter; @@ -1461,7 +1467,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) - perf_swcounter_interrupt(counter, 0, regs); + perf_counter_output(counter, 0, regs); hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); @@ -1473,7 +1479,7 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_swcounter_interrupt(counter, nmi, regs); + perf_counter_output(counter, nmi, regs); } static int perf_swcounter_match(struct perf_counter *counter, From db4fb5acf20295063d1d5105e67724eb51440207 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 19 Mar 2009 20:26:20 +0100 Subject: [PATCH 0177/2061] perf_counter: powerpc: clean up perc_counter_interrupt Impact: cleanup This updates the powerpc perf_counter_interrupt following on from the "perf_counter: unify irq output code" patch. Since we now use the generic perf_counter_output code, which sets the perf_counter_pending flag directly, we no longer need the need_wakeup variable. This removes need_wakeup and makes perf_counter_interrupt use get_perf_counter_pending() instead. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Steven Rostedt Orig-LKML-Reference: <20090319194234.024464535@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 19 +++++++------------ 1 file changed, 7 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 88b72eb4af1..830ca9c4494 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -723,8 +723,6 @@ static void perf_counter_interrupt(struct pt_regs *regs) /* counter has overflowed */ found = 1; record_and_restart(counter, val, regs); - if (counter->wakeup_pending) - need_wakeup = 1; } } @@ -754,17 +752,14 @@ static void perf_counter_interrupt(struct pt_regs *regs) /* * If we need a wakeup, check whether interrupts were soft-enabled * when we took the interrupt. If they were, we can wake stuff up - * immediately; otherwise we'll have to set a flag and do the - * wakeup when interrupts get soft-enabled. + * immediately; otherwise we'll have do the wakeup when interrupts + * get soft-enabled. */ - if (need_wakeup) { - if (regs->softe) { - irq_enter(); - perf_counter_do_pending(); - irq_exit(); - } else { - set_perf_counter_pending(); - } + if (get_perf_counter_pending() && regs->softe) { + irq_enter(); + clear_perf_counter_pending(); + perf_counter_do_pending(); + irq_exit(); } } From 9aaa131a279834dff75c290c91f0058f62d72d46 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Sat, 21 Mar 2009 15:31:47 +1100 Subject: [PATCH 0178/2061] perf_counter: fix type/event_id layout on big-endian systems Impact: build fix for powerpc Commit db3a944aca35ae61 ("perf_counter: revamp syscall input ABI") expanded the hw_event.type field into a union of structs containing bitfields. In particular it introduced a type field and a raw_type field, with the intention that the 1-bit raw_type field should overlay the most-significant bit of the 8-bit type field, and in fact perf_counter_alloc() now assumes that (or at least, assumes that raw_type doesn't overlay any of the bits that are 1 in the values of PERF_TYPE_{HARDWARE,SOFTWARE,TRACEPOINT}). Unfortunately this is not true on big-endian systems such as PowerPC, where bitfields are laid out from left to right, i.e. from most significant bit to least significant. This means that setting hw_event.type = PERF_TYPE_SOFTWARE will set hw_event.raw_type to 1. This fixes it by making the layout depend on whether or not __BIG_ENDIAN_BITFIELD is defined. It's a bit ugly, but that's what we get for using bitfields in a user/kernel ABI. Also, that commit didn't fix up some places in arch/powerpc/kernel/ perf_counter.c where hw_event.raw and hw_event.event_id were used. This fixes them too. Signed-off-by: Paul Mackerras --- arch/powerpc/kernel/perf_counter.c | 9 +++++---- include/linux/perf_counter.h | 12 ++++++++++++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 830ca9c4494..6413d9c0313 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,12 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - ev = counter->hw_event.event_id; - if (!counter->hw_event.raw) { - if (ev >= ppmu->n_generic || - ppmu->generic_events[ev] == 0) + if (!counter->hw_event.raw_type) { + ev = counter->hw_event.event_id; + if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; + } else { + ev = counter->hw_event.raw_event_id; } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a4b76c0175f..98f5990be1e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -15,6 +15,7 @@ #include #include +#include /* * User-space ABI bits: @@ -86,6 +87,7 @@ enum perf_counter_record_type { */ struct perf_counter_hw_event { union { +#ifndef __BIG_ENDIAN_BITFIELD struct { __u64 event_id : 56, type : 8; @@ -94,6 +96,16 @@ struct perf_counter_hw_event { __u64 raw_event_id : 63, raw_type : 1; }; +#else + struct { + __u64 type : 8, + event_id : 56; + }; + struct { + __u64 raw_type : 1, + raw_event_id : 63; + }; +#endif /* __BIT_ENDIAN_BITFIELD */ __u64 event_config; }; From 6f9f791eb53b56097cd311a1a5517a8e89bdaf35 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 21:26:52 +0100 Subject: [PATCH 0179/2061] perf_counter: create Documentation/perf_counter/ and move perfcounters.txt there We'll have more files in that directory, prepare for that. Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/{perf-counters.txt => perf_counter/design.txt} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename Documentation/{perf-counters.txt => perf_counter/design.txt} (100%) diff --git a/Documentation/perf-counters.txt b/Documentation/perf_counter/design.txt similarity index 100% rename from Documentation/perf-counters.txt rename to Documentation/perf_counter/design.txt From e0143bad9dbf2a8fad4c5430562bceba196b66ea Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 21:29:59 +0100 Subject: [PATCH 0180/2061] perf_counter: add sample user-space to Documentation/perf_counter/ Initial version of kerneltop.c and perfstat.c. Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 12 + Documentation/perf_counter/kerneltop.c | 956 +++++++++++++++++++++++++ Documentation/perf_counter/perfstat.c | 521 ++++++++++++++ 3 files changed, 1489 insertions(+) create mode 100644 Documentation/perf_counter/Makefile create mode 100644 Documentation/perf_counter/kerneltop.c create mode 100644 Documentation/perf_counter/perfstat.c diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile new file mode 100644 index 00000000000..b45749753fc --- /dev/null +++ b/Documentation/perf_counter/Makefile @@ -0,0 +1,12 @@ +BINS = kerneltop perfstat + +all: $(BINS) + +kerneltop: kerneltop.c perfcounters.h + cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $< + +perfstat: kerneltop + ln -sf kerneltop perfstat + +clean: + rm $(BINS) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c new file mode 100644 index 00000000000..cf0e30bab5d --- /dev/null +++ b/Documentation/perf_counter/kerneltop.c @@ -0,0 +1,956 @@ +/* + * kerneltop.c: show top kernel functions - performance counters showcase + + Build with: + + cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c + + Sample output: + +------------------------------------------------------------------------------ + KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) +------------------------------------------------------------------------------ + + weight RIP kernel function + ______ ________________ _______________ + + 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev + 33.00 - ffffffff804cb740 : sock_alloc_send_skb + 31.26 - ffffffff804ce808 : skb_push + 22.43 - ffffffff80510004 : tcp_established_options + 19.00 - ffffffff8027d250 : find_get_page + 15.76 - ffffffff804e4fc9 : eth_type_trans + 15.20 - ffffffff804d8baa : dst_release + 14.86 - ffffffff804cf5d8 : skb_release_head_state + 14.00 - ffffffff802217d5 : read_hpet + 12.00 - ffffffff804ffb7f : __ip_local_out + 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish + 8.54 - ffffffff805001a3 : ip_queue_xmit + + Started by Ingo Molnar + + Improvements and fixes by: + + Arjan van de Ven + Yanmin Zhang + Mike Galbraith + + Released under the GPL v2. (and only v2, not any later version) + + */ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __x86_64__ +# define __NR_perf_counter_open 295 +#endif + +#ifdef __i386__ +# define __NR_perf_counter_open 333 +#endif + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +typedef unsigned int __u32; +typedef unsigned long long __u64; +typedef long long __s64; + +/* + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, +}; + +/* + * IRQ-notification data record type: + */ +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, +}; + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + __s64 type; + + __u64 irq_period; + __u64 record_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + + __reserved_1 : 54; + + __u32 extra_config_len; + __u32 __reserved_4; + + __u64 __reserved_2; + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + int ret; + + ret = syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +#if defined(__x86_64__) || defined(__i386__) + if (ret < 0 && ret > -4096) { + errno = -ret; + ret = -1; + } +#endif + return ret; +} + +const char *event_types [] = { + "CPU cycles", + "instructions", + "cache-refs", + "cache-misses", + "branches", + "branch-misses", + "bus cycles" +}; + +const unsigned int default_count[] = { + 1000000, + 1000000, + 10000, + 10000, + 1000000, + 10000, +}; + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define MAX_COUNTERS 8 + +static int nr_counters = -1; + +static __u64 count_filter = 100; + +#define MAX_NR_CPUS 256 + +static int event_count[MAX_COUNTERS]; +static unsigned long event_id[MAX_COUNTERS]; +static int event_raw[MAX_COUNTERS]; + +static int tid = -1; +static int profile_cpu = -1; +static int nr_cpus = 0; +static int nmi = 1; +static int group = 0; + +static char *vmlinux; + +static char *sym_filter; +static unsigned long filter_start; +static unsigned long filter_end; + +static int delay_secs = 2; +static int zero; +static int dump_symtab; + +struct source_line { + uint64_t EIP; + unsigned long count; + char *line; +}; + +static GList *lines; + +static void display_help(void) +{ + printf( + "Usage: kerneltop []\n\n" + "KernelTop Options (up to %d event types can be specified at once):\n\n", + MAX_COUNTERS); + printf( + " -e EID --event_id=EID # event type ID [default: 0]\n" + " 0: CPU cycles\n" + " 1: instructions\n" + " 2: cache accesses\n" + " 3: cache misses\n" + " 4: branch instructions\n" + " 5: branch prediction misses\n" + " 6: bus cycles\n\n" + " rNNN: raw PMU events (eventsel+umask)\n\n" + " -c CNT --count=CNT # event period to sample\n\n" + " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" + " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" + " -d delay --delay= # sampling/display delay [default: 2]\n" + " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" + " -s symbol --symbol= # function to be showed annotated one-shot\n" + " -x path --vmlinux= # the vmlinux binary, required for -s use:\n" + " -z --zero # zero counts after display\n" + " -D --dump_symtab # dump symbol table to stderr on startup\n" + "\n"); + + exit(0); +} + +static void process_options(int argc, char *argv[]) +{ + int error = 0, counter; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"count", required_argument, NULL, 'c'}, + {"cpu", required_argument, NULL, 'C'}, + {"delay", required_argument, NULL, 'd'}, + {"dump_symtab", no_argument, NULL, 'D'}, + {"event_id", required_argument, NULL, 'e'}, + {"filter", required_argument, NULL, 'f'}, + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + {"nmi", required_argument, NULL, 'n'}, + {"pid", required_argument, NULL, 'p'}, + {"vmlinux", required_argument, NULL, 'x'}, + {"symbol", required_argument, NULL, 's'}, + {"zero", no_argument, NULL, 'z'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'c': + if (nr_counters == -1) + nr_counters = 0; + event_count[nr_counters] = atoi(optarg); break; + case 'C': + /* CPU and PID are mutually exclusive */ + if (tid != -1) { + printf("WARNING: CPU switch overriding PID\n"); + sleep(1); + tid = -1; + } + profile_cpu = atoi(optarg); break; + case 'd': delay_secs = atoi(optarg); break; + case 'D': dump_symtab = 1; break; + + case 'e': + nr_counters++; + if (nr_counters == MAX_COUNTERS) { + error = 1; + break; + } + if (*optarg == 'r') { + event_raw[nr_counters] = 1; + ++optarg; + } + event_id[nr_counters] = strtol(optarg, NULL, 16); + break; + + case 'f': count_filter = atoi(optarg); break; + case 'g': group = atoi(optarg); break; + case 'h': display_help(); break; + case 'n': nmi = atoi(optarg); break; + case 'p': + /* CPU and PID are mutually exclusive */ + if (profile_cpu != -1) { + printf("WARNING: PID switch overriding CPU\n"); + sleep(1); + profile_cpu = -1; + } + tid = atoi(optarg); break; + case 's': sym_filter = strdup(optarg); break; + case 'x': vmlinux = strdup(optarg); break; + case 'z': zero = 1; break; + default: error = 1; break; + } + } + if (error) + display_help(); + + nr_counters++; + if (nr_counters < 1) + nr_counters = 1; + + for (counter = 0; counter < nr_counters; counter++) { + if (event_count[counter]) + continue; + + if (event_id[counter] < PERF_HW_EVENTS_MAX) + event_count[counter] = default_count[event_id[counter]]; + else + event_count[counter] = 100000; + } +} + +static uint64_t min_ip; +static uint64_t max_ip = -1ll; + +struct sym_entry { + unsigned long long addr; + char *sym; + unsigned long count[MAX_COUNTERS]; + int skip; + GList *source; +}; + +#define MAX_SYMS 100000 + +static int sym_table_count; + +struct sym_entry *sym_filter_entry; + +static struct sym_entry sym_table[MAX_SYMS]; + +static void show_details(struct sym_entry *sym); + +/* + * Ordering weight: count-1 * count-1 * ... / count-n + */ +static double sym_weight(const struct sym_entry *sym) +{ + double weight; + int counter; + + weight = sym->count[0]; + + for (counter = 1; counter < nr_counters-1; counter++) + weight *= sym->count[counter]; + + weight /= (sym->count[counter] + 1); + + return weight; +} + +static int compare(const void *__sym1, const void *__sym2) +{ + const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; + + return sym_weight(sym1) < sym_weight(sym2); +} + +static time_t last_refresh; +static long events; +static long userspace_events; +static const char CONSOLE_CLEAR[] = ""; + +static struct sym_entry tmp[MAX_SYMS]; + +static void print_sym_table(void) +{ + int i, printed; + int counter; + float events_per_sec = events/delay_secs; + float kevents_per_sec = (events-userspace_events)/delay_secs; + + memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); + qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); + + write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); + + printf( +"------------------------------------------------------------------------------\n"); + printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ", + events_per_sec, + 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), + nmi ? "NMI" : "IRQ"); + + if (nr_counters == 1) + printf("%d ", event_count[0]); + + for (counter = 0; counter < nr_counters; counter++) { + if (counter) + printf("/"); + + if (event_id[counter] < PERF_HW_EVENTS_MAX) + printf( "%s", event_types[event_id[counter]]); + else + printf( "raw:%04lx", event_id[counter]); + } + + printf( "], "); + + if (tid != -1) + printf(" (tid: %d", tid); + else + printf(" (all"); + + if (profile_cpu != -1) + printf(", cpu: %d)\n", profile_cpu); + else { + if (tid != -1) + printf(")\n"); + else + printf(", %d CPUs)\n", nr_cpus); + } + + printf("------------------------------------------------------------------------------\n\n"); + + if (nr_counters == 1) + printf(" events"); + else + printf(" weight events"); + + printf(" RIP kernel function\n" + " ______ ______ ________________ _______________\n\n" + ); + + printed = 0; + for (i = 0; i < sym_table_count; i++) { + int count; + + if (nr_counters == 1) { + if (printed <= 18 && + tmp[i].count[0] >= count_filter) { + printf("%19.2f - %016llx : %s\n", + sym_weight(tmp + i), tmp[i].addr, tmp[i].sym); + printed++; + } + } else { + if (printed <= 18 && + tmp[i].count[0] >= count_filter) { + printf("%8.1f %10ld - %016llx : %s\n", + sym_weight(tmp + i), + tmp[i].count[0], + tmp[i].addr, tmp[i].sym); + printed++; + } + } + /* + * Add decay to the counts: + */ + for (count = 0; count < nr_counters; count++) + sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; + } + + if (sym_filter_entry) + show_details(sym_filter_entry); + + last_refresh = time(NULL); + + { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + + if (poll(&stdin_poll, 1, 0) == 1) { + printf("key pressed - exiting.\n"); + exit(0); + } + } +} + +static int read_symbol(FILE *in, struct sym_entry *s) +{ + static int filter_match = 0; + char *sym, stype; + char str[500]; + int rc, pos; + + rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str); + if (rc == EOF) + return -1; + + assert(rc == 3); + + /* skip until end of line: */ + pos = strlen(str); + do { + rc = fgetc(in); + if (rc == '\n' || rc == EOF || pos >= 499) + break; + str[pos] = rc; + pos++; + } while (1); + str[pos] = 0; + + sym = str; + + /* Filter out known duplicates and non-text symbols. */ + if (!strcmp(sym, "_text")) + return 1; + if (!min_ip && !strcmp(sym, "_stext")) + return 1; + if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext")) + return 1; + if (stype != 'T' && stype != 't') + return 1; + if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14)) + return 1; + if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) + return 1; + + s->sym = malloc(strlen(str)); + assert(s->sym); + + strcpy((char *)s->sym, str); + s->skip = 0; + + /* Tag events to be skipped. */ + if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) + s->skip = 1; + if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) + s->skip = 1; + + if (filter_match == 1) { + filter_end = s->addr; + filter_match = -1; + if (filter_end - filter_start > 10000) { + printf("hm, too large filter symbol <%s> - skipping.\n", + sym_filter); + printf("symbol filter start: %016lx\n", filter_start); + printf(" end: %016lx\n", filter_end); + filter_end = filter_start = 0; + sym_filter = NULL; + sleep(1); + } + } + if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) { + filter_match = 1; + filter_start = s->addr; + } + + return 0; +} + +int compare_addr(const void *__sym1, const void *__sym2) +{ + const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; + + return sym1->addr > sym2->addr; +} + +static void sort_symbol_table(void) +{ + int i, dups; + + do { + qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr); + for (i = 0, dups = 0; i < sym_table_count; i++) { + if (sym_table[i].addr == sym_table[i+1].addr) { + sym_table[i+1].addr = -1ll; + dups++; + } + } + sym_table_count -= dups; + } while(dups); +} + +static void parse_symbols(void) +{ + struct sym_entry *last; + + FILE *kallsyms = fopen("/proc/kallsyms", "r"); + + if (!kallsyms) { + printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n"); + exit(-1); + } + + while (!feof(kallsyms)) { + if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) { + sym_table_count++; + assert(sym_table_count <= MAX_SYMS); + } + } + + sort_symbol_table(); + min_ip = sym_table[0].addr; + max_ip = sym_table[sym_table_count-1].addr; + last = sym_table + sym_table_count++; + + last->addr = -1ll; + last->sym = ""; + + if (filter_end) { + int count; + for (count=0; count < sym_table_count; count ++) { + if (!strcmp(sym_table[count].sym, sym_filter)) { + sym_filter_entry = &sym_table[count]; + break; + } + } + } + if (dump_symtab) { + int i; + + for (i = 0; i < sym_table_count; i++) + fprintf(stderr, "%llx %s\n", + sym_table[i].addr, sym_table[i].sym); + } +} + + +static void parse_vmlinux(char *filename) +{ + FILE *file; + char command[PATH_MAX*2]; + if (!filename) + return; + + sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename); + + file = popen(command, "r"); + if (!file) + return; + + while (!feof(file)) { + struct source_line *src; + size_t dummy = 0; + char *c; + + src = malloc(sizeof(struct source_line)); + assert(src != NULL); + memset(src, 0, sizeof(struct source_line)); + + if (getline(&src->line, &dummy, file) < 0) + break; + if (!src->line) + break; + + c = strchr(src->line, '\n'); + if (c) + *c = 0; + + lines = g_list_prepend(lines, src); + + if (strlen(src->line)>8 && src->line[8] == ':') + src->EIP = strtoull(src->line, NULL, 16); + if (strlen(src->line)>8 && src->line[16] == ':') + src->EIP = strtoull(src->line, NULL, 16); + } + pclose(file); + lines = g_list_reverse(lines); +} + +static void record_precise_ip(uint64_t ip) +{ + struct source_line *line; + GList *item; + + item = g_list_first(lines); + while (item) { + line = item->data; + if (line->EIP == ip) + line->count++; + if (line->EIP > ip) + break; + item = g_list_next(item); + } +} + +static void lookup_sym_in_vmlinux(struct sym_entry *sym) +{ + struct source_line *line; + GList *item; + char pattern[PATH_MAX]; + sprintf(pattern, "<%s>:", sym->sym); + + item = g_list_first(lines); + while (item) { + line = item->data; + if (strstr(line->line, pattern)) { + sym->source = item; + break; + } + item = g_list_next(item); + } +} + +void show_lines(GList *item_queue, int item_queue_count) +{ + int i; + struct source_line *line; + + for (i = 0; i < item_queue_count; i++) { + line = item_queue->data; + printf("%8li\t%s\n", line->count, line->line); + item_queue = g_list_next(item_queue); + } +} + +#define TRACE_COUNT 3 + +static void show_details(struct sym_entry *sym) +{ + struct source_line *line; + GList *item; + int displayed = 0; + GList *item_queue = NULL; + int item_queue_count = 0; + + if (!sym->source) + lookup_sym_in_vmlinux(sym); + if (!sym->source) + return; + + printf("Showing details for %s\n", sym->sym); + + item = sym->source; + while (item) { + line = item->data; + if (displayed && strstr(line->line, ">:")) + break; + + if (!item_queue_count) + item_queue = item; + item_queue_count ++; + + if (line->count >= count_filter) { + show_lines(item_queue, item_queue_count); + item_queue_count = 0; + item_queue = NULL; + } else if (item_queue_count > TRACE_COUNT) { + item_queue = g_list_next(item_queue); + item_queue_count --; + } + + line->count = 0; + displayed++; + if (displayed > 300) + break; + item = g_list_next(item); + } +} + +/* + * Binary search in the histogram table and record the hit: + */ +static void record_ip(uint64_t ip, int counter) +{ + int left_idx, middle_idx, right_idx, idx; + unsigned long left, middle, right; + + record_precise_ip(ip); + + left_idx = 0; + right_idx = sym_table_count-1; + assert(ip <= max_ip && ip >= min_ip); + + while (left_idx + 1 < right_idx) { + middle_idx = (left_idx + right_idx) / 2; + + left = sym_table[ left_idx].addr; + middle = sym_table[middle_idx].addr; + right = sym_table[ right_idx].addr; + + if (!(left <= middle && middle <= right)) { + printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right); + printf("%d %d %d\n", left_idx, middle_idx, right_idx); + } + assert(left <= middle && middle <= right); + if (!(left <= ip && ip <= right)) { + printf(" left: %016lx\n", left); + printf(" ip: %016lx\n", ip); + printf("right: %016lx\n", right); + } + assert(left <= ip && ip <= right); + /* + * [ left .... target .... middle .... right ] + * => right := middle + */ + if (ip < middle) { + right_idx = middle_idx; + continue; + } + /* + * [ left .... middle ... target ... right ] + * => left := middle + */ + left_idx = middle_idx; + } + + idx = left_idx; + + if (!sym_table[idx].skip) + sym_table[idx].count[counter]++; + else events--; +} + +static void process_event(uint64_t ip, int counter) +{ + events++; + + if (ip < min_ip || ip > max_ip) { + userspace_events++; + return; + } + + record_ip(ip, counter); +} + +int main(int argc, char *argv[]) +{ + struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS]; + struct perf_counter_hw_event hw_event; + int fd[MAX_NR_CPUS][MAX_COUNTERS]; + int i, counter, group_fd; + unsigned int cpu; + uint64_t ip; + ssize_t res; + int ret; + + process_options(argc, argv); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + if (tid != -1 || profile_cpu != -1) + nr_cpus = 1; + + assert(nr_cpus <= MAX_NR_CPUS); + + for (i = 0; i < nr_cpus; i++) { + group_fd = -1; + for (counter = 0; counter < nr_counters; counter++) { + + cpu = profile_cpu; + if (tid == -1 && profile_cpu == -1) + cpu = i; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.type = event_id[counter]; + hw_event.raw = event_raw[counter]; + hw_event.irq_period = event_count[counter]; + hw_event.record_type = PERF_RECORD_IRQ; + hw_event.nmi = nmi; + + fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); + fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); + if (fd[i][counter] < 0) { + printf("kerneltop error: syscall returned with %d (%s)\n", + fd[i][counter], strerror(-fd[i][counter])); + if (fd[i][counter] == -1) + printf("Are you root?\n"); + exit(-1); + } + assert(fd[i][counter] >= 0); + + /* + * First counter acts as the group leader: + */ + if (group && group_fd == -1) + group_fd = fd[i][counter]; + + event_array[i][counter].fd = fd[i][counter]; + event_array[i][counter].events = POLLIN; + } + } + + parse_symbols(); + if (vmlinux && sym_filter_entry) + parse_vmlinux(vmlinux); + + printf("KernelTop refresh period: %d seconds\n", delay_secs); + last_refresh = time(NULL); + + while (1) { + int hits = events; + + for (i = 0; i < nr_cpus; i++) { + for (counter = 0; counter < nr_counters; counter++) { + res = read(fd[i][counter], (char *) &ip, sizeof(ip)); + if (res > 0) { + assert(res == sizeof(ip)); + + process_event(ip, counter); + } + } + } + + if (time(NULL) >= last_refresh + delay_secs) { + print_sym_table(); + events = userspace_events = 0; + } + + if (hits == events) + ret = poll(event_array[0], nr_cpus, 1000); + hits = events; + } + + return 0; +} diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c new file mode 100644 index 00000000000..9a5808fbcf9 --- /dev/null +++ b/Documentation/perf_counter/perfstat.c @@ -0,0 +1,521 @@ +/* + * perfstat: /usr/bin/time -alike performance counter statistics utility + * + * It summarizes the counter events of all tasks (and child tasks), + * covering all CPUs that the command (or workload) executes on. + * It only counts the per-task events of the workload started, + * independent of how many other tasks run on those CPUs. + * + * Build with: cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c + * + * Sample output: + * + + $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null + + Performance counter stats for 'ls': + + 163516953 instructions + 2295 cache-misses + 2855182 branch-misses + + * + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * + * Released under the GPLv2 (not later). + * + * Percpu counter support by: Yanmin Zhang + * Symbolic event options by: Wu Fengguang + */ +#define _GNU_SOURCE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __x86_64__ +# define __NR_perf_counter_open 295 +#endif + +#ifdef __i386__ +# define __NR_perf_counter_open 333 +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#endif + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +typedef unsigned int __u32; +typedef unsigned long long __u64; +typedef long long __s64; + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, +}; + +/* + * IRQ-notification data record type: + */ +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, +}; + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + __s64 type; + + __u64 irq_period; + __u64 record_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + + __reserved_1 : 54; + + __u32 extra_config_len; + __u32 __reserved_4; + + __u64 __reserved_2; + __u64 __reserved_3; +}; + +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + int ret; + + ret = syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +#if defined(__x86_64__) || defined(__i386__) + if (ret < 0 && ret > -4096) { + errno = -ret; + ret = -1; + } +#endif + return ret; +} + + +static char *hw_event_names [] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names [] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", +}; + +struct event_symbol { + int event; + char *symbol; +}; + +static struct event_symbol event_symbols [] = { + {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, + {PERF_COUNT_CPU_CYCLES, "cycles", }, + {PERF_COUNT_INSTRUCTIONS, "instructions", }, + {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, + {PERF_COUNT_CACHE_MISSES, "cache-misses", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, + {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, + {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, + {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, + {PERF_COUNT_CPU_CLOCK, "ticks", }, + {PERF_COUNT_TASK_CLOCK, "task-ticks", }, + {PERF_COUNT_PAGE_FAULTS, "page-faults", }, + {PERF_COUNT_PAGE_FAULTS, "faults", }, + {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, + {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, + {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, + {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, +}; + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +static int nr_counters = 0; +static int nr_cpus = 0; + +static int event_id[MAX_COUNTERS] = + { -2, -5, -4, -3, 0, 1, 2, 3}; + +static int event_raw[MAX_COUNTERS]; + +static int system_wide = 0; + +static void display_help(void) +{ + unsigned int i; + int e; + + printf( + "Usage: perfstat [] \n\n" + "PerfStat Options (up to %d event types can be specified):\n\n", + MAX_COUNTERS); + printf( + " -e EVENT --event=EVENT # symbolic-name abbreviations"); + + for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) { + if (e != event_symbols[i].event) { + e = event_symbols[i].event; + printf( + "\n %2d: %-20s", e, event_symbols[i].symbol); + } else + printf(" %s", event_symbols[i].symbol); + } + + printf("\n" + " rNNN: raw event type\n\n" + " -s # system-wide collection\n\n" + " -c --command= # command+arguments to be timed.\n" + "\n"); + exit(0); +} + +static int type_valid(int type) +{ + if (type >= PERF_HW_EVENTS_MAX) + return 0; + if (type <= PERF_SW_EVENTS_MIN) + return 0; + + return 1; +} + +static char *event_name(int ctr) +{ + int type = event_id[ctr]; + static char buf[32]; + + if (event_raw[ctr]) { + sprintf(buf, "raw 0x%x", type); + return buf; + } + if (!type_valid(type)) + return "unknown"; + + if (type >= 0) + return hw_event_names[type]; + + return sw_event_names[-type-1]; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static int match_event_symbols(char *str) +{ + unsigned int i; + + if (isdigit(str[0]) || str[0] == '-') + return atoi(str); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return PERF_HW_EVENTS_MAX; +} + +static void parse_events(char *str) +{ + int type, raw; + +again: + nr_counters++; + if (nr_counters == MAX_COUNTERS) + display_help(); + + raw = 0; + if (*str == 'r') { + raw = 1; + ++str; + type = strtol(str, NULL, 16); + } else { + type = match_event_symbols(str); + if (!type_valid(type)) + display_help(); + } + + event_id[nr_counters] = type; + event_raw[nr_counters] = raw; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } +} + +static void process_options(int argc, char *argv[]) +{ + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"event", required_argument, NULL, 'e'}, + {"help", no_argument, NULL, 'h'}, + {"command", no_argument, NULL, 'c'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:e:c:s", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'c': + break; + case 's': + system_wide = 1; + break; + case 'e': + parse_events(optarg); + break; + default: + break; + } + } + if (optind == argc) + goto err; + + if (!nr_counters) + nr_counters = 8; + else + nr_counters++; + return; + +err: + display_help(); +} + +char fault_here[1000000]; + +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static void create_counter(int counter) +{ + struct perf_counter_hw_event hw_event; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.type = event_id[counter]; + hw_event.raw = event_raw[counter]; + hw_event.record_type = PERF_RECORD_SIMPLE; + hw_event.nmi = 0; + + if (system_wide) { + int cpu; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); + if (fd[cpu][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[cpu][counter], strerror(errno)); + exit(-1); + } + + } + } else { + hw_event.inherit = 1; + hw_event.disabled = 1; + + fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); + if (fd[0][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[0][counter], strerror(errno)); + exit(-1); + } + } +} + + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +int main(int argc, char *argv[]) +{ + unsigned long long t0, t1; + int counter; + ssize_t res; + int status; + int pid; + + process_options(argc, argv); + + if (system_wide) { + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + assert(nr_cpus <= MAX_NR_CPUS); + assert(nr_cpus >= 0); + } else + nr_cpus = 1; + + for (counter = 0; counter < nr_counters; counter++) + create_counter(counter); + + argc -= optind; + argv += optind; + + /* + * Enable counters and exec the command: + */ + t0 = rdclock(); + prctl(PR_TASK_PERF_COUNTERS_ENABLE); + + if ((pid = fork()) < 0) + perror("failed to fork"); + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } + } + while (wait(&status) >= 0) + ; + prctl(PR_TASK_PERF_COUNTERS_DISABLE); + t1 = rdclock(); + + fflush(stdout); + + fprintf(stderr, "\n"); + fprintf(stderr, " Performance counter stats for \'%s\':\n", + argv[0]); + fprintf(stderr, "\n"); + + for (counter = 0; counter < nr_counters; counter++) { + int cpu; + __u64 count, single_count; + + count = 0; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + res = read(fd[cpu][counter], + (char *) &single_count, sizeof(single_count)); + assert(res == sizeof(single_count)); + count += single_count; + } + + if (!event_raw[counter] && + (event_id[counter] == PERF_COUNT_CPU_CLOCK || + event_id[counter] == PERF_COUNT_TASK_CLOCK)) { + + double msecs = (double)count / 1000000; + + fprintf(stderr, " %14.6f %-20s (msecs)\n", + msecs, event_name(counter)); + } else { + fprintf(stderr, " %14Ld %-20s (events)\n", + count, event_name(counter)); + } + if (!counter) + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); + fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", + (double)(t1-t0)/1e6); + fprintf(stderr, "\n"); + + return 0; +} From cea92ce5b07078cd62c4712d51390b09a43dba2e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:02 +0800 Subject: [PATCH 0181/2061] perf_counter tools: Merge common code into perfcounters.h kerneltop's MAX_COUNTERS is increased from 8 to 64(the value used by perfstat). Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 132 +-------------------- Documentation/perf_counter/perfcounters.h | 137 ++++++++++++++++++++++ Documentation/perf_counter/perfstat.c | 134 +-------------------- 3 files changed, 139 insertions(+), 264 deletions(-) create mode 100644 Documentation/perf_counter/perfcounters.h diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index cf0e30bab5d..fe70a2c92a8 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -65,126 +65,7 @@ #include -#ifdef __x86_64__ -# define __NR_perf_counter_open 295 -#endif - -#ifdef __i386__ -# define __NR_perf_counter_open 333 -#endif - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -typedef unsigned int __u32; -typedef unsigned long long __u64; -typedef long long __s64; - -/* - * User-space ABI bits: - */ - -/* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: - */ -enum hw_event_types { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX = 7, - - /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): - */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - - PERF_SW_EVENTS_MIN = -6, -}; - -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_hw_event { - __s64 type; - - __u64 irq_period; - __u64 record_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - - __reserved_1 : 54; - - __u32 extra_config_len; - __u32 __reserved_4; - - __u64 __reserved_2; - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) - -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) -{ - int ret; - - ret = syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -#if defined(__x86_64__) || defined(__i386__) - if (ret < 0 && ret > -4096) { - errno = -ret; - ret = -1; - } -#endif - return ret; -} +#include "perfcounters.h" const char *event_types [] = { "CPU cycles", @@ -205,21 +86,10 @@ const unsigned int default_count[] = { 10000, }; -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define MAX_COUNTERS 8 - static int nr_counters = -1; static __u64 count_filter = 100; -#define MAX_NR_CPUS 256 - static int event_count[MAX_COUNTERS]; static unsigned long event_id[MAX_COUNTERS]; static int event_raw[MAX_COUNTERS]; diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h new file mode 100644 index 00000000000..8c1559b25f1 --- /dev/null +++ b/Documentation/perf_counter/perfcounters.h @@ -0,0 +1,137 @@ +/* + * Ioctls that can be done on a perf counter fd: + */ +#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +typedef unsigned int __u32; +typedef unsigned long long __u64; +typedef long long __s64; + +/* + * User-space ABI bits: + */ + +/* + * Generalized performance counter event types, used by the hw_event.type + * parameter of the sys_perf_counter_open() syscall: + */ +enum hw_event_types { + /* + * Common hardware events, generalized by the kernel: + */ + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, + + PERF_HW_EVENTS_MAX = 7, + + /* + * Special "software" counters provided by the kernel, even if + * the hardware does not support performance counters. These + * counters measure various physical and sw events of the + * kernel (and allow the profiling of them as well): + */ + PERF_COUNT_CPU_CLOCK = -1, + PERF_COUNT_TASK_CLOCK = -2, + PERF_COUNT_PAGE_FAULTS = -3, + PERF_COUNT_CONTEXT_SWITCHES = -4, + PERF_COUNT_CPU_MIGRATIONS = -5, + + PERF_SW_EVENTS_MIN = -6, +}; + +/* + * IRQ-notification data record type: + */ +enum perf_counter_record_type { + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, +}; + +/* + * Hardware event to monitor via a performance monitoring counter: + */ +struct perf_counter_hw_event { + __s64 type; + + __u64 irq_period; + __u64 record_type; + __u64 read_format; + + __u64 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + raw : 1, /* raw event type */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + + __reserved_1 : 54; + + __u32 extra_config_len; + __u32 __reserved_4; + + __u64 __reserved_2; + __u64 __reserved_3; +}; + +#ifdef __x86_64__ +# define __NR_perf_counter_open 295 +#endif + +#ifdef __i386__ +# define __NR_perf_counter_open 333 +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#endif + +asmlinkage int sys_perf_counter_open( + + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + int ret; + + ret = syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +#if defined(__x86_64__) || defined(__i386__) + if (ret < 0 && ret > -4096) { + errno = -ret; + ret = -1; + } +#endif + return ret; +} + diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c index 9a5808fbcf9..a3d4a7a602f 100644 --- a/Documentation/perf_counter/perfstat.c +++ b/Documentation/perf_counter/perfstat.c @@ -52,133 +52,7 @@ #include -#ifdef __x86_64__ -# define __NR_perf_counter_open 295 -#endif - -#ifdef __i386__ -# define __NR_perf_counter_open 333 -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#endif - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -typedef unsigned int __u32; -typedef unsigned long long __u64; -typedef long long __s64; - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -/* - * User-space ABI bits: - */ - -/* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: - */ -enum hw_event_types { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX = 7, - - /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): - */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - - PERF_SW_EVENTS_MIN = -6, -}; - -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_hw_event { - __s64 type; - - __u64 irq_period; - __u64 record_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - - __reserved_1 : 54; - - __u32 extra_config_len; - __u32 __reserved_4; - - __u64 __reserved_2; - __u64 __reserved_3; -}; - -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) - -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) -{ - int ret; - - ret = syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -#if defined(__x86_64__) || defined(__i386__) - if (ret < 0 && ret > -4096) { - errno = -ret; - ret = -1; - } -#endif - return ret; -} - +#include "perfcounters.h" static char *hw_event_names [] = { "CPU cycles", @@ -224,9 +98,6 @@ static struct event_symbol event_symbols [] = { {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, }; -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - static int nr_counters = 0; static int nr_cpus = 0; @@ -388,9 +259,6 @@ err: char fault_here[1000000]; -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - static int fd[MAX_NR_CPUS][MAX_COUNTERS]; static void create_counter(int counter) From f49012fad4ed2231c7380c0b1901122242b3eab0 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:03 +0800 Subject: [PATCH 0182/2061] perf_counter tools: Move perfstat supporting code into perfcounters.h Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perfcounters.h | 130 ++++++++++++++++++++++ Documentation/perf_counter/perfstat.c | 130 ---------------------- 2 files changed, 130 insertions(+), 130 deletions(-) diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h index 8c1559b25f1..0f3764aa52a 100644 --- a/Documentation/perf_counter/perfcounters.h +++ b/Documentation/perf_counter/perfcounters.h @@ -16,6 +16,14 @@ #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + /* * Pick up some kernel type conventions: */ @@ -135,3 +143,125 @@ asmlinkage int sys_perf_counter_open( return ret; } +static char *hw_event_names [] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names [] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", +}; + +struct event_symbol { + int event; + char *symbol; +}; + +static struct event_symbol event_symbols [] = { + {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, + {PERF_COUNT_CPU_CYCLES, "cycles", }, + {PERF_COUNT_INSTRUCTIONS, "instructions", }, + {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, + {PERF_COUNT_CACHE_MISSES, "cache-misses", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, + {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, + {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, + {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, + {PERF_COUNT_CPU_CLOCK, "ticks", }, + {PERF_COUNT_TASK_CLOCK, "task-ticks", }, + {PERF_COUNT_PAGE_FAULTS, "page-faults", }, + {PERF_COUNT_PAGE_FAULTS, "faults", }, + {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, + {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, + {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, + {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, +}; + +static int type_valid(int type) +{ + if (type >= PERF_HW_EVENTS_MAX) + return 0; + if (type <= PERF_SW_EVENTS_MIN) + return 0; + + return 1; +} + +static char *event_name(int ctr) +{ + int type = event_id[ctr]; + static char buf[32]; + + if (event_raw[ctr]) { + sprintf(buf, "raw 0x%x", type); + return buf; + } + if (!type_valid(type)) + return "unknown"; + + if (type >= 0) + return hw_event_names[type]; + + return sw_event_names[-type-1]; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static int match_event_symbols(char *str) +{ + unsigned int i; + + if (isdigit(str[0]) || str[0] == '-') + return atoi(str); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return PERF_HW_EVENTS_MAX; +} + +static void parse_events(char *str) +{ + int type, raw; + +again: + nr_counters++; + if (nr_counters == MAX_COUNTERS) + display_help(); + + raw = 0; + if (*str == 'r') { + raw = 1; + ++str; + type = strtol(str, NULL, 16); + } else { + type = match_event_symbols(str); + if (!type_valid(type)) + display_help(); + } + + event_id[nr_counters] = type; + event_raw[nr_counters] = raw; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } +} + diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c index a3d4a7a602f..3364dcb9dd9 100644 --- a/Documentation/perf_counter/perfstat.c +++ b/Documentation/perf_counter/perfstat.c @@ -54,50 +54,6 @@ #include "perfcounters.h" -static char *hw_event_names [] = { - "CPU cycles", - "instructions", - "cache references", - "cache misses", - "branches", - "branch misses", - "bus cycles", -}; - -static char *sw_event_names [] = { - "cpu clock ticks", - "task clock ticks", - "pagefaults", - "context switches", - "CPU migrations", -}; - -struct event_symbol { - int event; - char *symbol; -}; - -static struct event_symbol event_symbols [] = { - {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, - {PERF_COUNT_CPU_CYCLES, "cycles", }, - {PERF_COUNT_INSTRUCTIONS, "instructions", }, - {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, - {PERF_COUNT_CACHE_MISSES, "cache-misses", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, - {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, - {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, - {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, - {PERF_COUNT_CPU_CLOCK, "ticks", }, - {PERF_COUNT_TASK_CLOCK, "task-ticks", }, - {PERF_COUNT_PAGE_FAULTS, "page-faults", }, - {PERF_COUNT_PAGE_FAULTS, "faults", }, - {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, - {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, - {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, - {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, -}; - static int nr_counters = 0; static int nr_cpus = 0; @@ -137,84 +93,6 @@ static void display_help(void) exit(0); } -static int type_valid(int type) -{ - if (type >= PERF_HW_EVENTS_MAX) - return 0; - if (type <= PERF_SW_EVENTS_MIN) - return 0; - - return 1; -} - -static char *event_name(int ctr) -{ - int type = event_id[ctr]; - static char buf[32]; - - if (event_raw[ctr]) { - sprintf(buf, "raw 0x%x", type); - return buf; - } - if (!type_valid(type)) - return "unknown"; - - if (type >= 0) - return hw_event_names[type]; - - return sw_event_names[-type-1]; -} - -/* - * Each event can have multiple symbolic names. - * Symbolic names are (almost) exactly matched. - */ -static int match_event_symbols(char *str) -{ - unsigned int i; - - if (isdigit(str[0]) || str[0] == '-') - return atoi(str); - - for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { - if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; - } - - return PERF_HW_EVENTS_MAX; -} - -static void parse_events(char *str) -{ - int type, raw; - -again: - nr_counters++; - if (nr_counters == MAX_COUNTERS) - display_help(); - - raw = 0; - if (*str == 'r') { - raw = 1; - ++str; - type = strtol(str, NULL, 16); - } else { - type = match_event_symbols(str); - if (!type_valid(type)) - display_help(); - } - - event_id[nr_counters] = type; - event_raw[nr_counters] = raw; - - str = strstr(str, ","); - if (str) { - str++; - goto again; - } -} - static void process_options(int argc, char *argv[]) { for (;;) { @@ -296,14 +174,6 @@ static void create_counter(int counter) } -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - int main(int argc, char *argv[]) { unsigned long long t0, t1; From 95bb3be1b3ca4a71cc168787b675d5b7852fc6be Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:04 +0800 Subject: [PATCH 0183/2061] perf_counter tools: support symbolic event names in kerneltop - kerneltop: --event_id => --event - kerneltop: can accept SW event types now - perfstat: it used to implicitly add event -2(task-clock), the new code no longer does this. Shall we? Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 28 +++++------------------ Documentation/perf_counter/perfcounters.h | 15 ++++++++---- Documentation/perf_counter/perfstat.c | 8 ------- 3 files changed, 16 insertions(+), 35 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index fe70a2c92a8..edc5b09fb58 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -86,13 +86,9 @@ const unsigned int default_count[] = { 10000, }; -static int nr_counters = -1; - static __u64 count_filter = 100; static int event_count[MAX_COUNTERS]; -static unsigned long event_id[MAX_COUNTERS]; -static int event_raw[MAX_COUNTERS]; static int tid = -1; static int profile_cpu = -1; @@ -125,7 +121,7 @@ static void display_help(void) "KernelTop Options (up to %d event types can be specified at once):\n\n", MAX_COUNTERS); printf( - " -e EID --event_id=EID # event type ID [default: 0]\n" + " -e EID --event=EID # event type ID [default: 0]\n" " 0: CPU cycles\n" " 1: instructions\n" " 2: cache accesses\n" @@ -160,7 +156,7 @@ static void process_options(int argc, char *argv[]) {"cpu", required_argument, NULL, 'C'}, {"delay", required_argument, NULL, 'd'}, {"dump_symtab", no_argument, NULL, 'D'}, - {"event_id", required_argument, NULL, 'e'}, + {"event", required_argument, NULL, 'e'}, {"filter", required_argument, NULL, 'f'}, {"group", required_argument, NULL, 'g'}, {"help", no_argument, NULL, 'h'}, @@ -178,8 +174,6 @@ static void process_options(int argc, char *argv[]) switch (c) { case 'c': - if (nr_counters == -1) - nr_counters = 0; event_count[nr_counters] = atoi(optarg); break; case 'C': /* CPU and PID are mutually exclusive */ @@ -192,18 +186,7 @@ static void process_options(int argc, char *argv[]) case 'd': delay_secs = atoi(optarg); break; case 'D': dump_symtab = 1; break; - case 'e': - nr_counters++; - if (nr_counters == MAX_COUNTERS) { - error = 1; - break; - } - if (*optarg == 'r') { - event_raw[nr_counters] = 1; - ++optarg; - } - event_id[nr_counters] = strtol(optarg, NULL, 16); - break; + case 'e': error = parse_events(optarg); break; case 'f': count_filter = atoi(optarg); break; case 'g': group = atoi(optarg); break; @@ -226,9 +209,10 @@ static void process_options(int argc, char *argv[]) if (error) display_help(); - nr_counters++; - if (nr_counters < 1) + if (!nr_counters) { nr_counters = 1; + event_id[0] = 0; + } for (counter = 0; counter < nr_counters; counter++) { if (event_count[counter]) diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h index 0f3764aa52a..99a90d833e1 100644 --- a/Documentation/perf_counter/perfcounters.h +++ b/Documentation/perf_counter/perfcounters.h @@ -143,6 +143,10 @@ asmlinkage int sys_perf_counter_open( return ret; } +static int nr_counters = 0; +static long event_id[MAX_COUNTERS] = { -2, -5, -4, -3, 0, 1, 2, 3}; +static int event_raw[MAX_COUNTERS]; + static char *hw_event_names [] = { "CPU cycles", "instructions", @@ -235,14 +239,13 @@ static int match_event_symbols(char *str) return PERF_HW_EVENTS_MAX; } -static void parse_events(char *str) +static int parse_events(char *str) { int type, raw; again: - nr_counters++; if (nr_counters == MAX_COUNTERS) - display_help(); + return -1; raw = 0; if (*str == 'r') { @@ -252,16 +255,18 @@ again: } else { type = match_event_symbols(str); if (!type_valid(type)) - display_help(); + return -1; } event_id[nr_counters] = type; event_raw[nr_counters] = raw; + nr_counters++; str = strstr(str, ","); if (str) { str++; goto again; } -} + return 0; +} diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c index 3364dcb9dd9..fd594468e65 100644 --- a/Documentation/perf_counter/perfstat.c +++ b/Documentation/perf_counter/perfstat.c @@ -54,14 +54,8 @@ #include "perfcounters.h" -static int nr_counters = 0; static int nr_cpus = 0; -static int event_id[MAX_COUNTERS] = - { -2, -5, -4, -3, 0, 1, 2, 3}; - -static int event_raw[MAX_COUNTERS]; - static int system_wide = 0; static void display_help(void) @@ -127,8 +121,6 @@ static void process_options(int argc, char *argv[]) if (!nr_counters) nr_counters = 8; - else - nr_counters++; return; err: From e3908612d93dff9d7746d85d37c42593254bf282 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:05 +0800 Subject: [PATCH 0184/2061] perf_counter tools: Reuse event_name() in kerneltop - can handle sw counters now - the outputs will look slightly different Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index edc5b09fb58..cba5cb0a97f 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -67,16 +67,6 @@ #include "perfcounters.h" -const char *event_types [] = { - "CPU cycles", - "instructions", - "cache-refs", - "cache-misses", - "branches", - "branch-misses", - "bus cycles" -}; - const unsigned int default_count[] = { 1000000, 1000000, @@ -304,10 +294,7 @@ static void print_sym_table(void) if (counter) printf("/"); - if (event_id[counter] < PERF_HW_EVENTS_MAX) - printf( "%s", event_types[event_id[counter]]); - else - printf( "raw:%04lx", event_id[counter]); + printf("%s", event_name(counter)); } printf( "], "); From f7524bda8be8be98db356d6a83ac1da451ecdb2e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:06 +0800 Subject: [PATCH 0185/2061] perf_counter tools: move remaining code into kerneltop.c - perfstat.c can be safely removed now - perfstat: -s => -a for system wide accounting - kerneltop: add -S/--stat for perfstat mode - minor adjustments to kerneltop --help, perfstat --help Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 544 +++++++++++++++++----- Documentation/perf_counter/perfcounters.h | 132 +----- 2 files changed, 439 insertions(+), 237 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index cba5cb0a97f..9db65a4f104 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -3,7 +3,7 @@ Build with: - cc -O6 -Wall `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c + cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c Sample output: @@ -26,18 +26,40 @@ 12.00 - ffffffff804ffb7f : __ip_local_out 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish 8.54 - ffffffff805001a3 : ip_queue_xmit - - Started by Ingo Molnar - - Improvements and fixes by: - - Arjan van de Ven - Yanmin Zhang - Mike Galbraith - - Released under the GPL v2. (and only v2, not any later version) - */ + +/* + * perfstat: /usr/bin/time -alike performance counter statistics utility + + It summarizes the counter events of all tasks (and child tasks), + covering all CPUs that the command (or workload) executes on. + It only counts the per-task events of the workload started, + independent of how many other tasks run on those CPUs. + + Sample output: + + $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null + + Performance counter stats for 'ls': + + 163516953 instructions + 2295 cache-misses + 2855182 branch-misses + */ + + /* + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * + * Improvements and fixes by: + * + * Arjan van de Ven + * Yanmin Zhang + * Wu Fengguang + * Mike Galbraith + * + * Released under the GPL v2. (and only v2, not any later version) + */ + #define _GNU_SOURCE #include #include @@ -67,19 +89,23 @@ #include "perfcounters.h" -const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, -}; + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3} + +static int run_perfstat = 0; +static int system_wide = 0; + +static int nr_counters = 0; +static long event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS; +static int event_raw[MAX_COUNTERS]; +static int event_count[MAX_COUNTERS]; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; static __u64 count_filter = 100; -static int event_count[MAX_COUNTERS]; - static int tid = -1; static int profile_cpu = -1; static int nr_cpus = 0; @@ -96,125 +122,335 @@ static int delay_secs = 2; static int zero; static int dump_symtab; +static GList *lines; + struct source_line { uint64_t EIP; unsigned long count; char *line; }; -static GList *lines; + +const unsigned int default_count[] = { + 1000000, + 1000000, + 10000, + 10000, + 1000000, + 10000, +}; + +static char *hw_event_names[] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names[] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", +}; + +struct event_symbol { + int event; + char *symbol; +}; + +static struct event_symbol event_symbols[] = { + {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, + {PERF_COUNT_CPU_CYCLES, "cycles", }, + {PERF_COUNT_INSTRUCTIONS, "instructions", }, + {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, + {PERF_COUNT_CACHE_MISSES, "cache-misses", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, + {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, + {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, + {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, + {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, + {PERF_COUNT_CPU_CLOCK, "ticks", }, + {PERF_COUNT_TASK_CLOCK, "task-ticks", }, + {PERF_COUNT_PAGE_FAULTS, "page-faults", }, + {PERF_COUNT_PAGE_FAULTS, "faults", }, + {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, + {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, + {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, + {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, +}; + +static void display_events_help(void) +{ + unsigned int i; + int e; + + printf( + " -e EVENT --event=EVENT # symbolic-name abbreviations"); + + for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) { + if (e != event_symbols[i].event) { + e = event_symbols[i].event; + printf( + "\n %2d: %-20s", e, event_symbols[i].symbol); + } else + printf(" %s", event_symbols[i].symbol); + } + + printf("\n" + " rNNN: raw PMU events (eventsel+umask)\n\n"); +} + +static void display_perfstat_help(void) +{ + printf( + "Usage: perfstat [] \n\n" + "PerfStat Options (up to %d event types can be specified):\n\n", + MAX_COUNTERS); + + display_events_help(); + + printf( + " -a # system-wide collection\n"); + exit(0); +} static void display_help(void) { + if (run_perfstat) + return display_perfstat_help(); + printf( - "Usage: kerneltop []\n\n" + "Usage: kerneltop []\n" + " Or: kerneltop -S [] COMMAND [ARGS]\n\n" "KernelTop Options (up to %d event types can be specified at once):\n\n", MAX_COUNTERS); + + display_events_help(); + printf( - " -e EID --event=EID # event type ID [default: 0]\n" - " 0: CPU cycles\n" - " 1: instructions\n" - " 2: cache accesses\n" - " 3: cache misses\n" - " 4: branch instructions\n" - " 5: branch prediction misses\n" - " 6: bus cycles\n\n" - " rNNN: raw PMU events (eventsel+umask)\n\n" + " -S --stat # perfstat COMMAND\n" + " -a # system-wide collection (for perfstat)\n\n" " -c CNT --count=CNT # event period to sample\n\n" " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" " -d delay --delay= # sampling/display delay [default: 2]\n" - " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" + " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" " -s symbol --symbol= # function to be showed annotated one-shot\n" - " -x path --vmlinux= # the vmlinux binary, required for -s use:\n" + " -x path --vmlinux= # the vmlinux binary, required for -s use\n" " -z --zero # zero counts after display\n" " -D --dump_symtab # dump symbol table to stderr on startup\n" - "\n"); + ); exit(0); } -static void process_options(int argc, char *argv[]) +static int type_valid(int type) { - int error = 0, counter; + if (type >= PERF_HW_EVENTS_MAX) + return 0; + if (type <= PERF_SW_EVENTS_MIN) + return 0; - for (;;) { - int option_index = 0; - /** Options for getopt */ - static struct option long_options[] = { - {"count", required_argument, NULL, 'c'}, - {"cpu", required_argument, NULL, 'C'}, - {"delay", required_argument, NULL, 'd'}, - {"dump_symtab", no_argument, NULL, 'D'}, - {"event", required_argument, NULL, 'e'}, - {"filter", required_argument, NULL, 'f'}, - {"group", required_argument, NULL, 'g'}, - {"help", no_argument, NULL, 'h'}, - {"nmi", required_argument, NULL, 'n'}, - {"pid", required_argument, NULL, 'p'}, - {"vmlinux", required_argument, NULL, 'x'}, - {"symbol", required_argument, NULL, 's'}, - {"zero", no_argument, NULL, 'z'}, - {NULL, 0, NULL, 0 } - }; - int c = getopt_long(argc, argv, "c:C:d:De:f:g:hn:p:s:x:z", - long_options, &option_index); - if (c == -1) - break; + return 1; +} - switch (c) { - case 'c': - event_count[nr_counters] = atoi(optarg); break; - case 'C': - /* CPU and PID are mutually exclusive */ - if (tid != -1) { - printf("WARNING: CPU switch overriding PID\n"); - sleep(1); - tid = -1; +static char *event_name(int ctr) +{ + int type = event_id[ctr]; + static char buf[32]; + + if (event_raw[ctr]) { + sprintf(buf, "raw 0x%x", type); + return buf; + } + if (!type_valid(type)) + return "unknown"; + + if (type >= 0) + return hw_event_names[type]; + + return sw_event_names[-type-1]; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static int match_event_symbols(char *str) +{ + unsigned int i; + + if (isdigit(str[0]) || str[0] == '-') + return atoi(str); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return PERF_HW_EVENTS_MAX; +} + +static int parse_events(char *str) +{ + int type, raw; + +again: + if (nr_counters == MAX_COUNTERS) + return -1; + + raw = 0; + if (*str == 'r') { + raw = 1; + ++str; + type = strtol(str, NULL, 16); + } else { + type = match_event_symbols(str); + if (!type_valid(type)) + return -1; + } + + event_id[nr_counters] = type; + event_raw[nr_counters] = raw; + nr_counters++; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } + + return 0; +} + + +/* + * perfstat + */ + +char fault_here[1000000]; + +static void create_perfstat_counter(int counter) +{ + struct perf_counter_hw_event hw_event; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.type = event_id[counter]; + hw_event.raw = event_raw[counter]; + hw_event.record_type = PERF_RECORD_SIMPLE; + hw_event.nmi = 0; + + if (system_wide) { + int cpu; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); + if (fd[cpu][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[cpu][counter], strerror(errno)); + exit(-1); } - profile_cpu = atoi(optarg); break; - case 'd': delay_secs = atoi(optarg); break; - case 'D': dump_symtab = 1; break; + } + } else { + hw_event.inherit = 1; + hw_event.disabled = 1; - case 'e': error = parse_events(optarg); break; - - case 'f': count_filter = atoi(optarg); break; - case 'g': group = atoi(optarg); break; - case 'h': display_help(); break; - case 'n': nmi = atoi(optarg); break; - case 'p': - /* CPU and PID are mutually exclusive */ - if (profile_cpu != -1) { - printf("WARNING: PID switch overriding CPU\n"); - sleep(1); - profile_cpu = -1; - } - tid = atoi(optarg); break; - case 's': sym_filter = strdup(optarg); break; - case 'x': vmlinux = strdup(optarg); break; - case 'z': zero = 1; break; - default: error = 1; break; + fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); + if (fd[0][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[0][counter], strerror(errno)); + exit(-1); } } - if (error) - display_help(); +} - if (!nr_counters) { - nr_counters = 1; - event_id[0] = 0; +int do_perfstat(int argc, char *argv[]) +{ + unsigned long long t0, t1; + int counter; + ssize_t res; + int status; + int pid; + + if (!system_wide) + nr_cpus = 1; + + for (counter = 0; counter < nr_counters; counter++) + create_perfstat_counter(counter); + + argc -= optind; + argv += optind; + + /* + * Enable counters and exec the command: + */ + t0 = rdclock(); + prctl(PR_TASK_PERF_COUNTERS_ENABLE); + + if ((pid = fork()) < 0) + perror("failed to fork"); + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } } + while (wait(&status) >= 0) + ; + prctl(PR_TASK_PERF_COUNTERS_DISABLE); + t1 = rdclock(); + + fflush(stdout); + + fprintf(stderr, "\n"); + fprintf(stderr, " Performance counter stats for \'%s\':\n", + argv[0]); + fprintf(stderr, "\n"); for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) - continue; + int cpu; + __u64 count, single_count; - if (event_id[counter] < PERF_HW_EVENTS_MAX) - event_count[counter] = default_count[event_id[counter]]; - else - event_count[counter] = 100000; + count = 0; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + res = read(fd[cpu][counter], + (char *) &single_count, sizeof(single_count)); + assert(res == sizeof(single_count)); + count += single_count; + } + + if (!event_raw[counter] && + (event_id[counter] == PERF_COUNT_CPU_CLOCK || + event_id[counter] == PERF_COUNT_TASK_CLOCK)) { + + double msecs = (double)count / 1000000; + + fprintf(stderr, " %14.6f %-20s (msecs)\n", + msecs, event_name(counter)); + } else { + fprintf(stderr, " %14Ld %-20s (events)\n", + count, event_name(counter)); + } + if (!counter) + fprintf(stderr, "\n"); } + fprintf(stderr, "\n"); + fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", + (double)(t1-t0)/1e6); + fprintf(stderr, "\n"); + + return 0; } +/* + * Symbols + */ + static uint64_t min_ip; static uint64_t max_ip = -1ll; @@ -507,6 +743,9 @@ static void parse_symbols(void) } } +/* + * Source lines + */ static void parse_vmlinux(char *filename) { @@ -527,7 +766,7 @@ static void parse_vmlinux(char *filename) char *c; src = malloc(sizeof(struct source_line)); - assert(src != NULL); + assert(src != NULL); memset(src, 0, sizeof(struct source_line)); if (getline(&src->line, &dummy, file) < 0) @@ -706,11 +945,100 @@ static void process_event(uint64_t ip, int counter) record_ip(ip, counter); } +static void process_options(int argc, char *argv[]) +{ + int error = 0, counter; + + if (strstr(argv[0], "perfstat")) + run_perfstat = 1; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"count", required_argument, NULL, 'c'}, + {"cpu", required_argument, NULL, 'C'}, + {"delay", required_argument, NULL, 'd'}, + {"dump_symtab", no_argument, NULL, 'D'}, + {"event", required_argument, NULL, 'e'}, + {"filter", required_argument, NULL, 'f'}, + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + {"nmi", required_argument, NULL, 'n'}, + {"pid", required_argument, NULL, 'p'}, + {"vmlinux", required_argument, NULL, 'x'}, + {"symbol", required_argument, NULL, 's'}, + {"stat", no_argument, NULL, 'S'}, + {"zero", no_argument, NULL, 'z'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'a': system_wide = 1; break; + case 'c': event_count[nr_counters] = atoi(optarg); break; + case 'C': + /* CPU and PID are mutually exclusive */ + if (tid != -1) { + printf("WARNING: CPU switch overriding PID\n"); + sleep(1); + tid = -1; + } + profile_cpu = atoi(optarg); break; + case 'd': delay_secs = atoi(optarg); break; + case 'D': dump_symtab = 1; break; + + case 'e': error = parse_events(optarg); break; + + case 'f': count_filter = atoi(optarg); break; + case 'g': group = atoi(optarg); break; + case 'h': display_help(); break; + case 'n': nmi = atoi(optarg); break; + case 'p': + /* CPU and PID are mutually exclusive */ + if (profile_cpu != -1) { + printf("WARNING: PID switch overriding CPU\n"); + sleep(1); + profile_cpu = -1; + } + tid = atoi(optarg); break; + case 's': sym_filter = strdup(optarg); break; + case 'S': run_perfstat = 1; break; + case 'x': vmlinux = strdup(optarg); break; + case 'z': zero = 1; break; + default: error = 1; break; + } + } + if (error) + display_help(); + + if (!nr_counters) { + if (run_perfstat) + nr_counters = 8; + else { + nr_counters = 1; + event_id[0] = 0; + } + } + + for (counter = 0; counter < nr_counters; counter++) { + if (event_count[counter]) + continue; + + if (event_id[counter] < PERF_HW_EVENTS_MAX) + event_count[counter] = default_count[event_id[counter]]; + else + event_count[counter] = 100000; + } +} + int main(int argc, char *argv[]) { struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; - int fd[MAX_NR_CPUS][MAX_COUNTERS]; int i, counter, group_fd; unsigned int cpu; uint64_t ip; @@ -720,11 +1048,15 @@ int main(int argc, char *argv[]) process_options(argc, argv); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + assert(nr_cpus <= MAX_NR_CPUS); + assert(nr_cpus >= 0); + + if (run_perfstat) + return do_perfstat(argc, argv); + if (tid != -1 || profile_cpu != -1) nr_cpus = 1; - assert(nr_cpus <= MAX_NR_CPUS); - for (i = 0; i < nr_cpus; i++) { group_fd = -1; for (counter = 0; counter < nr_counters; counter++) { diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h index 99a90d833e1..32e24b9154a 100644 --- a/Documentation/perf_counter/perfcounters.h +++ b/Documentation/perf_counter/perfcounters.h @@ -11,9 +11,6 @@ #define PR_TASK_PERF_COUNTERS_DISABLE 31 #define PR_TASK_PERF_COUNTERS_ENABLE 32 -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) #define rdclock() \ @@ -110,6 +107,7 @@ struct perf_counter_hw_event { __u64 __reserved_3; }; + #ifdef __x86_64__ # define __NR_perf_counter_open 295 #endif @@ -142,131 +140,3 @@ asmlinkage int sys_perf_counter_open( #endif return ret; } - -static int nr_counters = 0; -static long event_id[MAX_COUNTERS] = { -2, -5, -4, -3, 0, 1, 2, 3}; -static int event_raw[MAX_COUNTERS]; - -static char *hw_event_names [] = { - "CPU cycles", - "instructions", - "cache references", - "cache misses", - "branches", - "branch misses", - "bus cycles", -}; - -static char *sw_event_names [] = { - "cpu clock ticks", - "task clock ticks", - "pagefaults", - "context switches", - "CPU migrations", -}; - -struct event_symbol { - int event; - char *symbol; -}; - -static struct event_symbol event_symbols [] = { - {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, - {PERF_COUNT_CPU_CYCLES, "cycles", }, - {PERF_COUNT_INSTRUCTIONS, "instructions", }, - {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, - {PERF_COUNT_CACHE_MISSES, "cache-misses", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, - {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, - {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, - {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, - {PERF_COUNT_CPU_CLOCK, "ticks", }, - {PERF_COUNT_TASK_CLOCK, "task-ticks", }, - {PERF_COUNT_PAGE_FAULTS, "page-faults", }, - {PERF_COUNT_PAGE_FAULTS, "faults", }, - {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, - {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, - {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, - {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, -}; - -static int type_valid(int type) -{ - if (type >= PERF_HW_EVENTS_MAX) - return 0; - if (type <= PERF_SW_EVENTS_MIN) - return 0; - - return 1; -} - -static char *event_name(int ctr) -{ - int type = event_id[ctr]; - static char buf[32]; - - if (event_raw[ctr]) { - sprintf(buf, "raw 0x%x", type); - return buf; - } - if (!type_valid(type)) - return "unknown"; - - if (type >= 0) - return hw_event_names[type]; - - return sw_event_names[-type-1]; -} - -/* - * Each event can have multiple symbolic names. - * Symbolic names are (almost) exactly matched. - */ -static int match_event_symbols(char *str) -{ - unsigned int i; - - if (isdigit(str[0]) || str[0] == '-') - return atoi(str); - - for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { - if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; - } - - return PERF_HW_EVENTS_MAX; -} - -static int parse_events(char *str) -{ - int type, raw; - -again: - if (nr_counters == MAX_COUNTERS) - return -1; - - raw = 0; - if (*str == 'r') { - raw = 1; - ++str; - type = strtol(str, NULL, 16); - } else { - type = match_event_symbols(str); - if (!type_valid(type)) - return -1; - } - - event_id[nr_counters] = type; - event_raw[nr_counters] = raw; - nr_counters++; - - str = strstr(str, ","); - if (str) { - str++; - goto again; - } - - return 0; -} From ef45fa9e6c1694d3e8063f39749097a6e496b12c Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:07 +0800 Subject: [PATCH 0186/2061] perf_counter tools: fix comment for sym_weight() Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 9db65a4f104..7bf2a516f18 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -473,7 +473,7 @@ static struct sym_entry sym_table[MAX_SYMS]; static void show_details(struct sym_entry *sym); /* - * Ordering weight: count-1 * count-1 * ... / count-n + * Ordering weight: count-1 * count-2 * ... / count-n */ static double sym_weight(const struct sym_entry *sym) { From 3ab8d792b1348eaabfe550ba60902d781d160dd4 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:08 +0800 Subject: [PATCH 0187/2061] perf_counter tools: fix event_id type Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 7bf2a516f18..7bfb0f0d800 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -99,7 +99,7 @@ static int run_perfstat = 0; static int system_wide = 0; static int nr_counters = 0; -static long event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS; +static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS; static int event_raw[MAX_COUNTERS]; static int event_count[MAX_COUNTERS]; static int fd[MAX_NR_CPUS][MAX_COUNTERS]; @@ -261,11 +261,11 @@ static int type_valid(int type) static char *event_name(int ctr) { - int type = event_id[ctr]; + __s64 type = event_id[ctr]; static char buf[32]; if (event_raw[ctr]) { - sprintf(buf, "raw 0x%x", type); + sprintf(buf, "raw 0x%llx", (long long)type); return buf; } if (!type_valid(type)) @@ -299,7 +299,8 @@ static int match_event_symbols(char *str) static int parse_events(char *str) { - int type, raw; + __s64 type; + int raw; again: if (nr_counters == MAX_COUNTERS) From dda7c02f33833bfa9412ba6f0e410b0a18b42c88 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:09 +0800 Subject: [PATCH 0188/2061] perf_counter tools: cut down default count for cpu-cycles In my system, it takes kerneltop dozens of minutes to show up usable numbers. Make the default count 100 times smaller fixed this long startup latency. I'm not sure if it's the right solution though. Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 7bfb0f0d800..0bd3c13150b 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -132,7 +132,7 @@ struct source_line { const unsigned int default_count[] = { - 1000000, + 10000, 1000000, 10000, 10000, From af9522cf133e9be6da8525a46a9ed7e7659f0e1a Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Fri, 20 Mar 2009 10:08:10 +0800 Subject: [PATCH 0189/2061] perf_counter tools: when no command is feed to perfstat, display help and exit Signed-off-by: Wu Fengguang Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 0bd3c13150b..81a68aac137 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -387,6 +387,9 @@ int do_perfstat(int argc, char *argv[]) argc -= optind; argv += optind; + if (!argc) + display_help(); + /* * Enable counters and exec the command: */ From f4a2deb4860497f4332cf6a1acddab3dd628ddf0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:06 +0100 Subject: [PATCH 0190/2061] perf_counter: remove the event config bitfields Since the bitfields turned into a bit of a mess, remove them and rely on good old masks. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.059499915@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +-- arch/x86/kernel/cpu/perf_counter.c | 8 ++-- include/linux/perf_counter.h | 74 ++++++++++++++++++++---------- kernel/perf_counter.c | 22 +++++---- 4 files changed, 70 insertions(+), 40 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6413d9c0313..d05651584d4 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -602,13 +602,13 @@ hw_perf_counter_init(struct perf_counter *counter) return NULL; if ((s64)counter->hw_event.irq_period < 0) return NULL; - if (!counter->hw_event.raw_type) { - ev = counter->hw_event.event_id; + if (!perf_event_raw(&counter->hw_event)) { + ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) return NULL; ev = ppmu->generic_events[ev]; } else { - ev = counter->hw_event.raw_event_id; + ev = perf_event_config(&counter->hw_event); } counter->hw.config_base = ev; counter->hw.idx = 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 902282d68b0..3f95b0cdc55 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -217,15 +217,15 @@ static int __hw_perf_counter_init(struct perf_counter *counter) /* * Raw event type provide the config in the event structure */ - if (hw_event->raw_type) { - hwc->config |= pmc_ops->raw_event(hw_event->raw_event_id); + if (perf_event_raw(hw_event)) { + hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); } else { - if (hw_event->event_id >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= pmc_ops->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(hw_event->event_id); + hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } counter->wakeup_pending = 0; diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98f5990be1e..56099e52970 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -82,32 +82,37 @@ enum perf_counter_record_type { PERF_RECORD_GROUP = 2, }; +#define __PERF_COUNTER_MASK(name) \ + (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ + PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW_BITS 1 +#define PERF_COUNTER_RAW_SHIFT 63 +#define PERF_COUNTER_RAW_MASK __PERF_COUNTER_MASK(RAW) + +#define PERF_COUNTER_CONFIG_BITS 63 +#define PERF_COUNTER_CONFIG_SHIFT 0 +#define PERF_COUNTER_CONFIG_MASK __PERF_COUNTER_MASK(CONFIG) + +#define PERF_COUNTER_TYPE_BITS 7 +#define PERF_COUNTER_TYPE_SHIFT 56 +#define PERF_COUNTER_TYPE_MASK __PERF_COUNTER_MASK(TYPE) + +#define PERF_COUNTER_EVENT_BITS 56 +#define PERF_COUNTER_EVENT_SHIFT 0 +#define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) + /* * Hardware event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - union { -#ifndef __BIG_ENDIAN_BITFIELD - struct { - __u64 event_id : 56, - type : 8; - }; - struct { - __u64 raw_event_id : 63, - raw_type : 1; - }; -#else - struct { - __u64 type : 8, - event_id : 56; - }; - struct { - __u64 raw_type : 1, - raw_event_id : 63; - }; -#endif /* __BIT_ENDIAN_BITFIELD */ - __u64 event_config; - }; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the bits are the event + * identifier. + */ + __u64 config; __u64 irq_period; __u64 record_type; @@ -157,6 +162,27 @@ struct perf_counter_hw_event { struct task_struct; +static inline u64 perf_event_raw(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_RAW_MASK; +} + +static inline u64 perf_event_config(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_CONFIG_MASK; +} + +static inline u64 perf_event_type(struct perf_counter_hw_event *hw_event) +{ + return (hw_event->config & PERF_COUNTER_TYPE_MASK) >> + PERF_COUNTER_TYPE_SHIFT; +} + +static inline u64 perf_event_id(struct perf_counter_hw_event *hw_event) +{ + return hw_event->config & PERF_COUNTER_EVENT_MASK; +} + /** * struct hw_perf_counter - performance counter hardware details: */ @@ -336,8 +362,8 @@ extern void perf_counter_output(struct perf_counter *counter, */ static inline int is_software_counter(struct perf_counter *counter) { - return !counter->hw_event.raw_type && - counter->hw_event.type != PERF_TYPE_HARDWARE; + return !perf_event_raw(&counter->hw_event) && + perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f054b8c9bf9..ca14fc41ccd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1379,7 +1379,7 @@ static void perf_counter_handle_group(struct perf_counter *counter) list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.event_config); + perf_counter_store_irq(counter, sub->hw_event.config); perf_counter_store_irq(counter, atomic64_read(&sub->count)); } } @@ -1489,13 +1489,13 @@ static int perf_swcounter_match(struct perf_counter *counter, if (counter->state != PERF_COUNTER_STATE_ACTIVE) return 0; - if (counter->hw_event.raw_type) + if (perf_event_raw(&counter->hw_event)) return 0; - if (counter->hw_event.type != type) + if (perf_event_type(&counter->hw_event) != type) return 0; - if (counter->hw_event.event_id != event) + if (perf_event_id(&counter->hw_event) != event) return 0; if (counter->hw_event.exclude_user && user_mode(regs)) @@ -1757,13 +1757,13 @@ extern void ftrace_profile_disable(int); static void tp_perf_counter_destroy(struct perf_counter *counter) { - ftrace_profile_disable(counter->hw_event.event_id); + ftrace_profile_disable(perf_event_id(&counter->hw_event)); } static const struct hw_perf_counter_ops * tp_perf_counter_init(struct perf_counter *counter) { - int event_id = counter->hw_event.event_id; + int event_id = perf_event_id(&counter->hw_event); int ret; ret = ftrace_profile_enable(event_id); @@ -1797,7 +1797,7 @@ sw_perf_counter_init(struct perf_counter *counter) * to be kernel events, and page faults are never hypervisor * events. */ - switch (counter->hw_event.event_id) { + switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: hw_ops = &perf_ops_cpu_clock; @@ -1882,9 +1882,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = NULL; - if (hw_event->raw_type) + if (perf_event_raw(hw_event)) { hw_ops = hw_perf_counter_init(counter); - else switch (hw_event->type) { + goto done; + } + + switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: hw_ops = hw_perf_counter_init(counter); break; @@ -1902,6 +1905,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, kfree(counter); return NULL; } +done: counter->hw_ops = hw_ops; return counter; From 96f6d4444302bb2ea2cf409529eef816462f6ce0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:07 +0100 Subject: [PATCH 0191/2061] perf_counter: avoid recursion Tracepoint events like lock_acquire and software counters like pagefaults can recurse into the perf counter code again, avoid that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.152096433@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++++++ kernel/perf_counter.c | 26 ++++++++++++++++++++++++++ 2 files changed, 33 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 56099e52970..18dc17d0a61 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -328,6 +328,13 @@ struct perf_cpu_context { int active_oncpu; int max_pertask; int exclusive; + + /* + * Recursion avoidance: + * + * task, softirq, irq, nmi context + */ + int recursion[4]; }; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ca14fc41ccd..ce34bff07bd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -23,6 +23,7 @@ #include #include #include +#include #include @@ -1532,10 +1533,31 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_unlock(); } +static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) +{ + if (in_nmi()) + return &cpuctx->recursion[3]; + + if (in_irq()) + return &cpuctx->recursion[2]; + + if (in_softirq()) + return &cpuctx->recursion[1]; + + return &cpuctx->recursion[0]; +} + static void __perf_swcounter_event(enum perf_event_types type, u32 event, u64 nr, int nmi, struct pt_regs *regs) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); + int *recursion = perf_swcounter_recursion_context(cpuctx); + + if (*recursion) + goto out; + + (*recursion)++; + barrier(); perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); if (cpuctx->task_ctx) { @@ -1543,6 +1565,10 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, nr, nmi, regs); } + barrier(); + (*recursion)--; + +out: put_cpu_var(perf_cpu_context); } From 37d81828385f8ff823caaaf1a83e72d065b6cfa1 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 18:22:08 +0100 Subject: [PATCH 0192/2061] perf_counter: add an mmap method to allow userspace to read hardware counters Impact: new feature giving performance improvement This adds the ability for userspace to do an mmap on a hardware counter fd and get access to a read-only page that contains the information needed to translate a hardware counter value to the full 64-bit counter value that would be returned by a read on the fd. This is useful on architectures that allow user programs to read the hardware counters, such as PowerPC. The mmap will only succeed if the counter is a hardware counter monitoring the current process. On my quad 2.5GHz PowerPC 970MP machine, userspace can read a counter and translate it to the full 64-bit value in about 30ns using the mmapped page, compared to about 830ns for the read syscall on the counter, so this does give a significant performance improvement. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.297057964@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 6 +++ include/linux/perf_counter.h | 15 ++++++ kernel/perf_counter.c | 76 ++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d05651584d4..e4349281b07 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,6 +417,8 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); + if (counter->user_page) + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -572,6 +574,8 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; + if (counter->user_page) + perf_counter_update_userpage(counter); break; } } @@ -698,6 +702,8 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); + if (counter->user_page) + perf_counter_update_userpage(counter); /* * Finally record data if requested. diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 18dc17d0a61..40b324e91bf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -143,6 +143,17 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -278,6 +289,9 @@ struct perf_counter { int oncpu; int cpu; + /* pointer to page shared with userspace via mmap */ + unsigned long user_page; + /* read() / irq related data */ wait_queue_head_t waitq; /* optional: for NMIs */ @@ -361,6 +375,7 @@ extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, struct perf_counter_context *ctx, int cpu); +extern void perf_counter_update_userpage(struct perf_counter *counter); extern void perf_counter_output(struct perf_counter *counter, int nmi, struct pt_regs *regs); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ce34bff07bd..d9cfd902140 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1177,6 +1177,7 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); + free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1346,12 +1347,87 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_counter_mmap_page *userpg; + + if (!counter->user_page) + return; + userpg = (struct perf_counter_mmap_page *) counter->user_page; + + ++userpg->lock; + smp_wmb(); + userpg->index = counter->hw.idx; + userpg->offset = atomic64_read(&counter->count); + if (counter->state == PERF_COUNTER_STATE_ACTIVE) + userpg->offset -= atomic64_read(&counter->hw.prev_count); + smp_wmb(); + ++userpg->lock; +} + +static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (!counter->user_page) + return VM_FAULT_SIGBUS; + + vmf->page = virt_to_page(counter->user_page); + get_page(vmf->page); + return 0; +} + +static struct vm_operations_struct perf_mmap_vmops = { + .fault = perf_mmap_fault, +}; + +static int perf_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct perf_counter *counter = file->private_data; + unsigned long userpg; + + if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) + return -EINVAL; + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + /* + * For now, restrict to the case of a hardware counter + * on the current task. + */ + if (is_software_counter(counter) || counter->task != current) + return -EINVAL; + + userpg = counter->user_page; + if (!userpg) { + userpg = get_zeroed_page(GFP_KERNEL); + mutex_lock(&counter->mutex); + if (counter->user_page) { + free_page(userpg); + userpg = counter->user_page; + } else { + counter->user_page = userpg; + } + mutex_unlock(&counter->mutex); + if (!userpg) + return -ENOMEM; + } + + perf_counter_update_userpage(counter); + + vma->vm_flags &= ~VM_MAYWRITE; + vma->vm_flags |= VM_RESERVED; + vma->vm_ops = &perf_mmap_vmops; + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, .poll = perf_poll, .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, + .mmap = perf_mmap, }; /* From 9ab772cd535c4b256a577eae516f9c7462346b2d Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: [PATCH 0193/2061] mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function in which we take an hold a spin_lock if we drop the atomic to 0 this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif From b09d2501ed3d294619cbfbcf828ad39324d0e548 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 17:21:56 -0700 Subject: [PATCH 0194/2061] mutex: drop "inline" from mutex_lock() inside kernel/mutex.c Impact: build fix mutex_lock() is was defined inline in kernel/mutex.c, but wasn't declared so not in . This didn't cause a problem until checkin 3a2d367d9aabac486ac4444c6c7ec7a1dab16267 added the atomic_dec_and_mutex_lock() inline in between declaration and definion. This broke building with CONFIG_ALLOW_WARNINGS=n, e.g. make allnoconfig. Either from the source code nor the allnoconfig binary output I cannot find any internal references to mutex_lock() in kernel/mutex.c, so presumably this "inline" is now-useless legacy. Cc: Eric Paris Cc: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: Signed-off-by: H. Peter Anvin --- kernel/mutex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a..fd95eaa672e 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -89,7 +89,7 @@ __mutex_lock_slowpath(atomic_t *lock_count); * * This function is similar to (but not equivalent to) down(). */ -void inline __sched mutex_lock(struct mutex *lock) +void __sched mutex_lock(struct mutex *lock) { might_sleep(); /* From 7b732a75047738e4f85438ed2f9cd34bf5f2a19a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:10 +0100 Subject: [PATCH 0195/2061] perf_counter: new output ABI - part 1 Impact: Rework the perfcounter output ABI use sys_read() only for instant data and provide mmap() output for all async overflow data. The first mmap() determines the size of the output buffer. The mmap() size must be a PAGE_SIZE multiple of 1+pages, where pages must be a power of 2 or 0. Further mmap()s of the same fd must have the same size. Once all maps are gone, you can again mmap() with a new size. In case of 0 extra pages there is no data output and the first page only contains meta data. When there are data pages, a poll() event will be generated for each full page of data. Furthermore, the output is circular. This means that although 1 page is a valid configuration, its useless, since we'll start overwriting it the instant we report a full page. Future work will focus on the output format (currently maintained) where we'll likey want each entry denoted by a header which includes a type and length. Further future work will allow to splice() the fd, also containing the async overflow data -- splice() would be mutually exclusive with mmap() of the data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.470536358@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 9 +- include/linux/perf_counter.h | 36 +-- kernel/perf_counter.c | 464 +++++++++++++++-------------- 3 files changed, 263 insertions(+), 246 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index e4349281b07..d48596ab655 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -417,8 +417,7 @@ void hw_perf_restore(u64 disable) atomic64_set(&counter->hw.prev_count, val); counter->hw.idx = hwc_index[i] + 1; write_pmc(counter->hw.idx, val); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); } mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; @@ -574,8 +573,7 @@ static void power_perf_disable(struct perf_counter *counter) ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); break; } } @@ -702,8 +700,7 @@ static void record_and_restart(struct perf_counter *counter, long val, write_pmc(counter->hw.idx, val); atomic64_set(&counter->hw.prev_count, val); atomic64_set(&counter->hw.period_left, left); - if (counter->user_page) - perf_counter_update_userpage(counter); + perf_counter_update_userpage(counter); /* * Finally record data if requested. diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 40b324e91bf..2b5e66d5ebd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -152,6 +152,8 @@ struct perf_counter_mmap_page { __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + + __u32 data_head; /* head in the data section */ }; #ifdef __KERNEL__ @@ -218,21 +220,6 @@ struct hw_perf_counter { #endif }; -/* - * Hardcoded buffer length limit for now, for IRQ-fed events: - */ -#define PERF_DATA_BUFLEN 2048 - -/** - * struct perf_data - performance counter IRQ data sampling ... - */ -struct perf_data { - int len; - int rd_idx; - int overrun; - u8 data[PERF_DATA_BUFLEN]; -}; - struct perf_counter; /** @@ -256,6 +243,14 @@ enum perf_counter_active_state { struct file; +struct perf_mmap_data { + struct rcu_head rcu_head; + int nr_pages; + atomic_t head; + struct perf_counter_mmap_page *user_page; + void *data_pages[0]; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -289,16 +284,15 @@ struct perf_counter { int oncpu; int cpu; - /* pointer to page shared with userspace via mmap */ - unsigned long user_page; + /* mmap bits */ + struct mutex mmap_mutex; + atomic_t mmap_count; + struct perf_mmap_data *data; - /* read() / irq related data */ + /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ int wakeup_pending; - struct perf_data *irqdata; - struct perf_data *usrdata; - struct perf_data data[2]; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d9cfd902140..0dfe91094fd 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,7 +4,8 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * - * For licencing details see kernel-base/COPYING + * + * For licensing details see kernel-base/COPYING */ #include @@ -1022,66 +1023,6 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } -/* - * Cross CPU call to switch performance data pointers - */ -static void __perf_switch_irq_data(void *info) -{ - struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); - struct perf_counter *counter = info; - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - - /* - * If this is a task context, we need to check whether it is - * the current task context of this cpu. If not it has been - * scheduled out before the smp call arrived. - */ - if (ctx->task) { - if (cpuctx->task_ctx != ctx) - return; - spin_lock(&ctx->lock); - } - - /* Change the pointer NMI safe */ - atomic_long_set((atomic_long_t *)&counter->irqdata, - (unsigned long) counter->usrdata); - counter->usrdata = oldirqdata; - - if (ctx->task) - spin_unlock(&ctx->lock); -} - -static struct perf_data *perf_switch_irq_data(struct perf_counter *counter) -{ - struct perf_counter_context *ctx = counter->ctx; - struct perf_data *oldirqdata = counter->irqdata; - struct task_struct *task = ctx->task; - - if (!task) { - smp_call_function_single(counter->cpu, - __perf_switch_irq_data, - counter, 1); - return counter->usrdata; - } - -retry: - spin_lock_irq(&ctx->lock); - if (counter->state != PERF_COUNTER_STATE_ACTIVE) { - counter->irqdata = counter->usrdata; - counter->usrdata = oldirqdata; - spin_unlock_irq(&ctx->lock); - return oldirqdata; - } - spin_unlock_irq(&ctx->lock); - task_oncpu_function_call(task, __perf_switch_irq_data, counter); - /* Might have failed, because task was scheduled out */ - if (counter->irqdata == oldirqdata) - goto retry; - - return counter->usrdata; -} - static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1177,7 +1118,6 @@ static int perf_release(struct inode *inode, struct file *file) mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); - free_page(counter->user_page); free_counter(counter); put_context(ctx); @@ -1192,7 +1132,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { u64 cntval; - if (count != sizeof(cntval)) + if (count < sizeof(cntval)) return -EINVAL; /* @@ -1210,122 +1150,21 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); } -static ssize_t -perf_copy_usrdata(struct perf_data *usrdata, char __user *buf, size_t count) -{ - if (!usrdata->len) - return 0; - - count = min(count, (size_t)usrdata->len); - if (copy_to_user(buf, usrdata->data + usrdata->rd_idx, count)) - return -EFAULT; - - /* Adjust the counters */ - usrdata->len -= count; - if (!usrdata->len) - usrdata->rd_idx = 0; - else - usrdata->rd_idx += count; - - return count; -} - -static ssize_t -perf_read_irq_data(struct perf_counter *counter, - char __user *buf, - size_t count, - int nonblocking) -{ - struct perf_data *irqdata, *usrdata; - DECLARE_WAITQUEUE(wait, current); - ssize_t res, res2; - - irqdata = counter->irqdata; - usrdata = counter->usrdata; - - if (usrdata->len + irqdata->len >= count) - goto read_pending; - - if (nonblocking) - return -EAGAIN; - - spin_lock_irq(&counter->waitq.lock); - __add_wait_queue(&counter->waitq, &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - if (usrdata->len + irqdata->len >= count) - break; - - if (signal_pending(current)) - break; - - if (counter->state == PERF_COUNTER_STATE_ERROR) - break; - - spin_unlock_irq(&counter->waitq.lock); - schedule(); - spin_lock_irq(&counter->waitq.lock); - } - __remove_wait_queue(&counter->waitq, &wait); - __set_current_state(TASK_RUNNING); - spin_unlock_irq(&counter->waitq.lock); - - if (usrdata->len + irqdata->len < count && - counter->state != PERF_COUNTER_STATE_ERROR) - return -ERESTARTSYS; -read_pending: - mutex_lock(&counter->mutex); - - /* Drain pending data first: */ - res = perf_copy_usrdata(usrdata, buf, count); - if (res < 0 || res == count) - goto out; - - /* Switch irq buffer: */ - usrdata = perf_switch_irq_data(counter); - res2 = perf_copy_usrdata(usrdata, buf + res, count - res); - if (res2 < 0) { - if (!res) - res = -EFAULT; - } else { - res += res2; - } -out: - mutex_unlock(&counter->mutex); - - return res; -} - static ssize_t perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct perf_counter *counter = file->private_data; - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return perf_read_hw(counter, buf, count); - - case PERF_RECORD_IRQ: - case PERF_RECORD_GROUP: - return perf_read_irq_data(counter, buf, count, - file->f_flags & O_NONBLOCK); - } - return -EINVAL; + return perf_read_hw(counter, buf, count); } static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = 0; - unsigned long flags; + unsigned int events = POLLIN; poll_wait(file, &counter->waitq, wait); - spin_lock_irqsave(&counter->waitq.lock, flags); - if (counter->usrdata->len || counter->irqdata->len) - events |= POLLIN; - spin_unlock_irqrestore(&counter->waitq.lock, flags); - return events; } @@ -1347,78 +1186,207 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -void perf_counter_update_userpage(struct perf_counter *counter) +static void __perf_counter_update_userpage(struct perf_counter *counter, + struct perf_mmap_data *data) { - struct perf_counter_mmap_page *userpg; - - if (!counter->user_page) - return; - userpg = (struct perf_counter_mmap_page *) counter->user_page; + struct perf_counter_mmap_page *userpg = data->user_page; + /* + * Disable preemption so as to not let the corresponding user-space + * spin too long if we get preempted. + */ + preempt_disable(); ++userpg->lock; smp_wmb(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); + + userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; + preempt_enable(); +} + +void perf_counter_update_userpage(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + __perf_counter_update_userpage(counter, data); + rcu_read_unlock(); } static int perf_mmap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { struct perf_counter *counter = vma->vm_file->private_data; + struct perf_mmap_data *data; + int ret = VM_FAULT_SIGBUS; - if (!counter->user_page) - return VM_FAULT_SIGBUS; + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; - vmf->page = virt_to_page(counter->user_page); + if (vmf->pgoff == 0) { + vmf->page = virt_to_page(data->user_page); + } else { + int nr = vmf->pgoff - 1; + + if ((unsigned)nr > data->nr_pages) + goto unlock; + + vmf->page = virt_to_page(data->data_pages[nr]); + } get_page(vmf->page); + ret = 0; +unlock: + rcu_read_unlock(); + + return ret; +} + +static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) +{ + struct perf_mmap_data *data; + unsigned long size; + int i; + + WARN_ON(atomic_read(&counter->mmap_count)); + + size = sizeof(struct perf_mmap_data); + size += nr_pages * sizeof(void *); + + data = kzalloc(size, GFP_KERNEL); + if (!data) + goto fail; + + data->user_page = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->user_page) + goto fail_user_page; + + for (i = 0; i < nr_pages; i++) { + data->data_pages[i] = (void *)get_zeroed_page(GFP_KERNEL); + if (!data->data_pages[i]) + goto fail_data_pages; + } + + data->nr_pages = nr_pages; + + rcu_assign_pointer(counter->data, data); + return 0; + +fail_data_pages: + for (i--; i >= 0; i--) + free_page((unsigned long)data->data_pages[i]); + + free_page((unsigned long)data->user_page); + +fail_user_page: + kfree(data); + +fail: + return -ENOMEM; +} + +static void __perf_mmap_data_free(struct rcu_head *rcu_head) +{ + struct perf_mmap_data *data = container_of(rcu_head, + struct perf_mmap_data, rcu_head); + int i; + + free_page((unsigned long)data->user_page); + for (i = 0; i < data->nr_pages; i++) + free_page((unsigned long)data->data_pages[i]); + kfree(data); +} + +static void perf_mmap_data_free(struct perf_counter *counter) +{ + struct perf_mmap_data *data = counter->data; + + WARN_ON(atomic_read(&counter->mmap_count)); + + rcu_assign_pointer(counter->data, NULL); + call_rcu(&data->rcu_head, __perf_mmap_data_free); +} + +static void perf_mmap_open(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + atomic_inc(&counter->mmap_count); +} + +static void perf_mmap_close(struct vm_area_struct *vma) +{ + struct perf_counter *counter = vma->vm_file->private_data; + + if (atomic_dec_and_mutex_lock(&counter->mmap_count, + &counter->mmap_mutex)) { + perf_mmap_data_free(counter); + mutex_unlock(&counter->mmap_mutex); + } } static struct vm_operations_struct perf_mmap_vmops = { + .open = perf_mmap_open, + .close = perf_mmap_close, .fault = perf_mmap_fault, }; static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; - unsigned long userpg; + unsigned long vma_size; + unsigned long nr_pages; + unsigned long locked, lock_limit; + int ret = 0; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; - if (vma->vm_end - vma->vm_start != PAGE_SIZE) + + vma_size = vma->vm_end - vma->vm_start; + nr_pages = (vma_size / PAGE_SIZE) - 1; + + if (nr_pages == 0 || !is_power_of_2(nr_pages)) return -EINVAL; - /* - * For now, restrict to the case of a hardware counter - * on the current task. - */ - if (is_software_counter(counter) || counter->task != current) + if (vma_size != PAGE_SIZE * (1 + nr_pages)) return -EINVAL; - userpg = counter->user_page; - if (!userpg) { - userpg = get_zeroed_page(GFP_KERNEL); - mutex_lock(&counter->mutex); - if (counter->user_page) { - free_page(userpg); - userpg = counter->user_page; - } else { - counter->user_page = userpg; - } - mutex_unlock(&counter->mutex); - if (!userpg) - return -ENOMEM; - } + if (vma->vm_pgoff != 0) + return -EINVAL; - perf_counter_update_userpage(counter); + locked = vma_size >> PAGE_SHIFT; + locked += vma->vm_mm->locked_vm; + + lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; + lock_limit >>= PAGE_SHIFT; + + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) + return -EPERM; + + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) + goto out; + + WARN_ON(counter->data); + ret = perf_mmap_data_alloc(counter, nr_pages); + if (!ret) + atomic_set(&counter->mmap_count, 1); +out: + mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; vma->vm_flags |= VM_RESERVED; vma->vm_ops = &perf_mmap_vmops; - return 0; + + return ret; } static const struct file_operations perf_fops = { @@ -1434,30 +1402,94 @@ static const struct file_operations perf_fops = { * Output */ -static void perf_counter_store_irq(struct perf_counter *counter, u64 data) +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) { - struct perf_data *irqdata = counter->irqdata; + struct perf_mmap_data *data; + unsigned int offset, head, nr; + unsigned int len; + int ret, wakeup; - if (irqdata->len > PERF_DATA_BUFLEN - sizeof(u64)) { - irqdata->overrun++; - } else { - u64 *p = (u64 *) &irqdata->data[irqdata->len]; + rcu_read_lock(); + ret = -ENOSPC; + data = rcu_dereference(counter->data); + if (!data) + goto out; - *p = data; - irqdata->len += sizeof(u64); + if (!data->nr_pages) + goto out; + + ret = -EINVAL; + if (size > PAGE_SIZE) + goto out; + + do { + offset = head = atomic_read(&data->head); + head += sizeof(u64); + } while (atomic_cmpxchg(&data->head, offset, head) != offset); + + wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); + offset &= PAGE_SIZE - 1; + + len = min_t(unsigned int, PAGE_SIZE - offset, size); + memcpy(data->data_pages[nr] + offset, buf, len); + size -= len; + + if (size) { + nr = (nr + 1) & (data->nr_pages - 1); + memcpy(data->data_pages[nr], buf + len, size); } + + /* + * generate a poll() wakeup for every page boundary crossed + */ + if (wakeup) { + __perf_counter_update_userpage(counter, data); + if (nmi) { + counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&counter->waitq); + } + ret = 0; +out: + rcu_read_unlock(); + + return ret; } -static void perf_counter_handle_group(struct perf_counter *counter) +static void perf_output_simple(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + u64 entry; + + entry = instruction_pointer(regs); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); +} + +struct group_entry { + u64 event; + u64 counter; +}; + +static void perf_output_group(struct perf_counter *counter, int nmi) { struct perf_counter *leader, *sub; leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { + struct group_entry entry; + if (sub != counter) sub->hw_ops->read(sub); - perf_counter_store_irq(counter, sub->hw_event.config); - perf_counter_store_irq(counter, atomic64_read(&sub->count)); + + entry.event = sub->hw_event.config; + entry.counter = atomic64_read(&sub->count); + + perf_output_write(counter, nmi, &entry, sizeof(entry)); } } @@ -1469,19 +1501,13 @@ void perf_counter_output(struct perf_counter *counter, return; case PERF_RECORD_IRQ: - perf_counter_store_irq(counter, instruction_pointer(regs)); + perf_output_simple(counter, nmi, regs); break; case PERF_RECORD_GROUP: - perf_counter_handle_group(counter); + perf_output_group(counter, nmi); break; } - - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); } /* @@ -1967,10 +1993,10 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, INIT_LIST_HEAD(&counter->sibling_list); init_waitqueue_head(&counter->waitq); + mutex_init(&counter->mmap_mutex); + INIT_LIST_HEAD(&counter->child_list); - counter->irqdata = &counter->data[0]; - counter->usrdata = &counter->data[1]; counter->cpu = cpu; counter->hw_event = *hw_event; counter->wakeup_pending = 0; From 803d4f3980f6e220b27311a283aab0a4d68b6709 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:11 +0100 Subject: [PATCH 0196/2061] perf_counter tools: update to new syscall ABI update the kerneltop userspace to work with the latest syscall ABI Signed-off-by: Peter Zijlstra Cc: Wu Fengguang Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.559643732@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 235 +++++++++++++++++-------- 1 file changed, 157 insertions(+), 78 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 81a68aac137..a72c9bd2807 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -87,20 +87,90 @@ #include -#include "perfcounters.h" +#include "include/linux/perf_counter.h" +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +typedef unsigned int __u32; +typedef unsigned long long __u64; +typedef long long __s64; + + +#ifdef __x86_64__ +# define __NR_perf_counter_open 295 +#endif + +#ifdef __i386__ +# define __NR_perf_counter_open 333 +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#endif + +asmlinkage int sys_perf_counter_open( + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + int ret; + + ret = syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +#if defined(__x86_64__) || defined(__i386__) + if (ret < 0 && ret > -4096) { + errno = -ret; + ret = -1; + } +#endif + return ret; +} + #define MAX_COUNTERS 64 #define MAX_NR_CPUS 256 -#define DEF_PERFSTAT_EVENTS { -2, -5, -4, -3, 0, 1, 2, 3} +#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) static int run_perfstat = 0; static int system_wide = 0; static int nr_counters = 0; -static __s64 event_id[MAX_COUNTERS] = DEF_PERFSTAT_EVENTS; -static int event_raw[MAX_COUNTERS]; +static __u64 event_id[MAX_COUNTERS] = { + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), +}; +static int default_interval = 100000; static int event_count[MAX_COUNTERS]; static int fd[MAX_NR_CPUS][MAX_COUNTERS]; @@ -156,49 +226,63 @@ static char *sw_event_names[] = { "pagefaults", "context switches", "CPU migrations", + "minor faults", + "major faults", }; struct event_symbol { - int event; + __u64 event; char *symbol; }; static struct event_symbol event_symbols[] = { - {PERF_COUNT_CPU_CYCLES, "cpu-cycles", }, - {PERF_COUNT_CPU_CYCLES, "cycles", }, - {PERF_COUNT_INSTRUCTIONS, "instructions", }, - {PERF_COUNT_CACHE_REFERENCES, "cache-references", }, - {PERF_COUNT_CACHE_MISSES, "cache-misses", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branch-instructions", }, - {PERF_COUNT_BRANCH_INSTRUCTIONS, "branches", }, - {PERF_COUNT_BRANCH_MISSES, "branch-misses", }, - {PERF_COUNT_BUS_CYCLES, "bus-cycles", }, - {PERF_COUNT_CPU_CLOCK, "cpu-ticks", }, - {PERF_COUNT_CPU_CLOCK, "ticks", }, - {PERF_COUNT_TASK_CLOCK, "task-ticks", }, - {PERF_COUNT_PAGE_FAULTS, "page-faults", }, - {PERF_COUNT_PAGE_FAULTS, "faults", }, - {PERF_COUNT_CONTEXT_SWITCHES, "context-switches", }, - {PERF_COUNT_CONTEXT_SWITCHES, "cs", }, - {PERF_COUNT_CPU_MIGRATIONS, "cpu-migrations", }, - {PERF_COUNT_CPU_MIGRATIONS, "migrations", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, + + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, }; +#define __PERF_COUNTER_FIELD(config, name) \ + ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) +#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) +#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) +#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) + static void display_events_help(void) { unsigned int i; - int e; + __u64 e; printf( " -e EVENT --event=EVENT # symbolic-name abbreviations"); - for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) { - if (e != event_symbols[i].event) { - e = event_symbols[i].event; - printf( - "\n %2d: %-20s", e, event_symbols[i].symbol); - } else - printf(" %s", event_symbols[i].symbol); + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + int type, id; + + e = event_symbols[i].event; + type = PERF_COUNTER_TYPE(e); + id = PERF_COUNTER_ID(e); + + printf("\n %d:%d: %-20s", + type, id, event_symbols[i].symbol); } printf("\n" @@ -249,44 +333,51 @@ static void display_help(void) exit(0); } -static int type_valid(int type) -{ - if (type >= PERF_HW_EVENTS_MAX) - return 0; - if (type <= PERF_SW_EVENTS_MIN) - return 0; - - return 1; -} - static char *event_name(int ctr) { - __s64 type = event_id[ctr]; + __u64 config = event_id[ctr]; + int type = PERF_COUNTER_TYPE(config); + int id = PERF_COUNTER_ID(config); static char buf[32]; - if (event_raw[ctr]) { - sprintf(buf, "raw 0x%llx", (long long)type); + if (PERF_COUNTER_RAW(config)) { + sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); return buf; } - if (!type_valid(type)) - return "unknown"; - if (type >= 0) - return hw_event_names[type]; + switch (type) { + case PERF_TYPE_HARDWARE: + if (id < PERF_HW_EVENTS_MAX) + return hw_event_names[id]; + return "unknown-hardware"; - return sw_event_names[-type-1]; + case PERF_TYPE_SOFTWARE: + if (id < PERF_SW_EVENTS_MAX) + return sw_event_names[id]; + return "unknown-software"; + + default: + break; + } + + return "unknown"; } /* * Each event can have multiple symbolic names. * Symbolic names are (almost) exactly matched. */ -static int match_event_symbols(char *str) +static __u64 match_event_symbols(char *str) { + __u64 config, id; + int type; unsigned int i; - if (isdigit(str[0]) || str[0] == '-') - return atoi(str); + if (sscanf(str, "r%llx", &config) == 1) + return config | PERF_COUNTER_RAW_MASK; + + if (sscanf(str, "%d:%llu", &type, &id) == 2) + return EID(type, id); for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { if (!strncmp(str, event_symbols[i].symbol, @@ -294,31 +385,22 @@ static int match_event_symbols(char *str) return event_symbols[i].event; } - return PERF_HW_EVENTS_MAX; + return ~0ULL; } static int parse_events(char *str) { - __s64 type; - int raw; + __u64 config; again: if (nr_counters == MAX_COUNTERS) return -1; - raw = 0; - if (*str == 'r') { - raw = 1; - ++str; - type = strtol(str, NULL, 16); - } else { - type = match_event_symbols(str); - if (!type_valid(type)) - return -1; - } + config = match_event_symbols(str); + if (config == ~0ULL) + return -1; - event_id[nr_counters] = type; - event_raw[nr_counters] = raw; + event_id[nr_counters] = config; nr_counters++; str = strstr(str, ","); @@ -342,8 +424,7 @@ static void create_perfstat_counter(int counter) struct perf_counter_hw_event hw_event; memset(&hw_event, 0, sizeof(hw_event)); - hw_event.type = event_id[counter]; - hw_event.raw = event_raw[counter]; + hw_event.config = event_id[counter]; hw_event.record_type = PERF_RECORD_SIMPLE; hw_event.nmi = 0; @@ -428,7 +509,7 @@ int do_perfstat(int argc, char *argv[]) count += single_count; } - if (!event_raw[counter] && + if (!PERF_COUNTER_RAW(event_id[counter]) && (event_id[counter] == PERF_COUNT_CPU_CLOCK || event_id[counter] == PERF_COUNT_TASK_CLOCK)) { @@ -911,7 +992,7 @@ static void record_ip(uint64_t ip, int counter) assert(left <= middle && middle <= right); if (!(left <= ip && ip <= right)) { printf(" left: %016lx\n", left); - printf(" ip: %016lx\n", ip); + printf(" ip: %016llx\n", ip); printf("right: %016lx\n", right); } assert(left <= ip && ip <= right); @@ -983,7 +1064,7 @@ static void process_options(int argc, char *argv[]) switch (c) { case 'a': system_wide = 1; break; - case 'c': event_count[nr_counters] = atoi(optarg); break; + case 'c': default_interval = atoi(optarg); break; case 'C': /* CPU and PID are mutually exclusive */ if (tid != -1) { @@ -1032,10 +1113,7 @@ static void process_options(int argc, char *argv[]) if (event_count[counter]) continue; - if (event_id[counter] < PERF_HW_EVENTS_MAX) - event_count[counter] = default_count[event_id[counter]]; - else - event_count[counter] = 100000; + event_count[counter] = default_interval; } } @@ -1070,12 +1148,13 @@ int main(int argc, char *argv[]) cpu = i; memset(&hw_event, 0, sizeof(hw_event)); - hw_event.type = event_id[counter]; - hw_event.raw = event_raw[counter]; + hw_event.config = event_id[counter]; hw_event.irq_period = event_count[counter]; hw_event.record_type = PERF_RECORD_IRQ; hw_event.nmi = nmi; + printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]); + fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); if (fd[i][counter] < 0) { From bcbcb37cdb67d8100acfa66df40c4d636c28c4d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 23 Mar 2009 18:22:12 +0100 Subject: [PATCH 0197/2061] perf_counter tools: use mmap() output update kerneltop to use the mmap() output to gather overflow information Signed-off-by: Peter Zijlstra Cc: Wu Fengguang Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.677932499@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 93 ++++++++++++++++++++++---- 1 file changed, 79 insertions(+), 14 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index a72c9bd2807..80b790553ec 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -84,6 +84,7 @@ #include #include #include +#include #include @@ -119,17 +120,25 @@ typedef long long __s64; #ifdef __x86_64__ -# define __NR_perf_counter_open 295 +#define __NR_perf_counter_open 295 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __i386__ -# define __NR_perf_counter_open 333 +#define __NR_perf_counter_open 333 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __powerpc__ #define __NR_perf_counter_open 319 +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); #endif +#define unlikely(x) __builtin_expect(!!(x), 0) + asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, @@ -181,6 +190,7 @@ static int profile_cpu = -1; static int nr_cpus = 0; static int nmi = 1; static int group = 0; +static unsigned int page_size; static char *vmlinux; @@ -1117,16 +1127,68 @@ static void process_options(int argc, char *argv[]) } } +struct mmap_data { + int counter; + void *base; + unsigned int mask; + unsigned int prev; +}; + +static unsigned int mmap_read_head(struct mmap_data *md) +{ + struct perf_counter_mmap_page *pc = md->base; + unsigned int seq, head; + +repeat: + rmb(); + seq = pc->lock; + + if (unlikely(seq & 1)) { + cpu_relax(); + goto repeat; + } + + head = pc->data_head; + + rmb(); + if (pc->lock != seq) + goto repeat; + + return head; +} + +static void mmap_read(struct mmap_data *md) +{ + unsigned int head = mmap_read_head(md); + unsigned int old = md->prev; + unsigned char *data = md->base + page_size; + + if (head - old > md->mask) { + printf("ERROR: failed to keep up with mmap data\n"); + exit(-1); + } + + for (; old != head;) { + __u64 *ptr = (__u64 *)&data[old & md->mask]; + old += sizeof(__u64); + + process_event(*ptr, md->counter); + } + + md->prev = old; +} + int main(int argc, char *argv[]) { struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS]; + struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; int i, counter, group_fd; unsigned int cpu; - uint64_t ip; - ssize_t res; int ret; + page_size = sysconf(_SC_PAGE_SIZE); + process_options(argc, argv); nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); @@ -1153,8 +1215,6 @@ int main(int argc, char *argv[]) hw_event.record_type = PERF_RECORD_IRQ; hw_event.nmi = nmi; - printf("FOO: %d %llx %llx\n", counter, event_id[counter], event_count[counter]); - fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); if (fd[i][counter] < 0) { @@ -1174,6 +1234,17 @@ int main(int argc, char *argv[]) event_array[i][counter].fd = fd[i][counter]; event_array[i][counter].events = POLLIN; + + mmap_array[i][counter].counter = counter; + mmap_array[i][counter].prev = 0; + mmap_array[i][counter].mask = 2*page_size - 1; + mmap_array[i][counter].base = mmap(NULL, 3*page_size, + PROT_READ, MAP_SHARED, fd[i][counter], 0); + if (mmap_array[i][counter].base == MAP_FAILED) { + printf("kerneltop error: failed to mmap with %d (%s)\n", + errno, strerror(errno)); + exit(-1); + } } } @@ -1188,14 +1259,8 @@ int main(int argc, char *argv[]) int hits = events; for (i = 0; i < nr_cpus; i++) { - for (counter = 0; counter < nr_counters; counter++) { - res = read(fd[i][counter], (char *) &ip, sizeof(ip)); - if (res > 0) { - assert(res == sizeof(ip)); - - process_event(ip, counter); - } - } + for (counter = 0; counter < nr_counters; counter++) + mmap_read(&mmap_array[i][counter]); } if (time(NULL) >= last_refresh + delay_secs) { From 383c5f8cd7d253be0017d8d5a39cbb78aaf70206 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 21:49:25 +0100 Subject: [PATCH 0198/2061] perf_counter tools: tidy up in-kernel dependencies Remove now unified perfstat.c and perf_counter.h, and link to the in-kernel perf_counter.h. Cc: Wu Fengguang Cc: Paul Mackerras Acked-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.677932499@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 2 +- Documentation/perf_counter/kerneltop.c | 2 +- Documentation/perf_counter/perfcounters.h | 142 ------------ Documentation/perf_counter/perfstat.c | 251 ---------------------- 4 files changed, 2 insertions(+), 395 deletions(-) delete mode 100644 Documentation/perf_counter/perfcounters.h delete mode 100644 Documentation/perf_counter/perfstat.c diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index b45749753fc..666da95a787 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -2,7 +2,7 @@ BINS = kerneltop perfstat all: $(BINS) -kerneltop: kerneltop.c perfcounters.h +kerneltop: kerneltop.c ../../include/linux/perf_counter.h cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $< perfstat: kerneltop diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 80b790553ec..25e80bc4455 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -88,7 +88,7 @@ #include -#include "include/linux/perf_counter.h" +#include "../../include/linux/perf_counter.h" /* diff --git a/Documentation/perf_counter/perfcounters.h b/Documentation/perf_counter/perfcounters.h deleted file mode 100644 index 32e24b9154a..00000000000 --- a/Documentation/perf_counter/perfcounters.h +++ /dev/null @@ -1,142 +0,0 @@ -/* - * Ioctls that can be done on a perf counter fd: - */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) - -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -typedef unsigned int __u32; -typedef unsigned long long __u64; -typedef long long __s64; - -/* - * User-space ABI bits: - */ - -/* - * Generalized performance counter event types, used by the hw_event.type - * parameter of the sys_perf_counter_open() syscall: - */ -enum hw_event_types { - /* - * Common hardware events, generalized by the kernel: - */ - PERF_COUNT_CPU_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - PERF_COUNT_BUS_CYCLES = 6, - - PERF_HW_EVENTS_MAX = 7, - - /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): - */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, - PERF_COUNT_CPU_MIGRATIONS = -5, - - PERF_SW_EVENTS_MIN = -6, -}; - -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - -/* - * Hardware event to monitor via a performance monitoring counter: - */ -struct perf_counter_hw_event { - __s64 type; - - __u64 irq_period; - __u64 record_type; - __u64 read_format; - - __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ - - __reserved_1 : 54; - - __u32 extra_config_len; - __u32 __reserved_4; - - __u64 __reserved_2; - __u64 __reserved_3; -}; - - -#ifdef __x86_64__ -# define __NR_perf_counter_open 295 -#endif - -#ifdef __i386__ -# define __NR_perf_counter_open 333 -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#endif - -asmlinkage int sys_perf_counter_open( - - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) -{ - int ret; - - ret = syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -#if defined(__x86_64__) || defined(__i386__) - if (ret < 0 && ret > -4096) { - errno = -ret; - ret = -1; - } -#endif - return ret; -} diff --git a/Documentation/perf_counter/perfstat.c b/Documentation/perf_counter/perfstat.c deleted file mode 100644 index fd594468e65..00000000000 --- a/Documentation/perf_counter/perfstat.c +++ /dev/null @@ -1,251 +0,0 @@ -/* - * perfstat: /usr/bin/time -alike performance counter statistics utility - * - * It summarizes the counter events of all tasks (and child tasks), - * covering all CPUs that the command (or workload) executes on. - * It only counts the per-task events of the workload started, - * independent of how many other tasks run on those CPUs. - * - * Build with: cc -O2 -g -lrt -Wall -W -o perfstat perfstat.c - * - * Sample output: - * - - $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null - - Performance counter stats for 'ls': - - 163516953 instructions - 2295 cache-misses - 2855182 branch-misses - - * - * Copyright (C) 2008, Red Hat Inc, Ingo Molnar - * - * Released under the GPLv2 (not later). - * - * Percpu counter support by: Yanmin Zhang - * Symbolic event options by: Wu Fengguang - */ -#define _GNU_SOURCE - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include - -#include - -#include "perfcounters.h" - -static int nr_cpus = 0; - -static int system_wide = 0; - -static void display_help(void) -{ - unsigned int i; - int e; - - printf( - "Usage: perfstat [] \n\n" - "PerfStat Options (up to %d event types can be specified):\n\n", - MAX_COUNTERS); - printf( - " -e EVENT --event=EVENT # symbolic-name abbreviations"); - - for (i = 0, e = PERF_HW_EVENTS_MAX; i < ARRAY_SIZE(event_symbols); i++) { - if (e != event_symbols[i].event) { - e = event_symbols[i].event; - printf( - "\n %2d: %-20s", e, event_symbols[i].symbol); - } else - printf(" %s", event_symbols[i].symbol); - } - - printf("\n" - " rNNN: raw event type\n\n" - " -s # system-wide collection\n\n" - " -c --command= # command+arguments to be timed.\n" - "\n"); - exit(0); -} - -static void process_options(int argc, char *argv[]) -{ - for (;;) { - int option_index = 0; - /** Options for getopt */ - static struct option long_options[] = { - {"event", required_argument, NULL, 'e'}, - {"help", no_argument, NULL, 'h'}, - {"command", no_argument, NULL, 'c'}, - {NULL, 0, NULL, 0 } - }; - int c = getopt_long(argc, argv, "+:e:c:s", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'c': - break; - case 's': - system_wide = 1; - break; - case 'e': - parse_events(optarg); - break; - default: - break; - } - } - if (optind == argc) - goto err; - - if (!nr_counters) - nr_counters = 8; - return; - -err: - display_help(); -} - -char fault_here[1000000]; - -static int fd[MAX_NR_CPUS][MAX_COUNTERS]; - -static void create_counter(int counter) -{ - struct perf_counter_hw_event hw_event; - - memset(&hw_event, 0, sizeof(hw_event)); - hw_event.type = event_id[counter]; - hw_event.raw = event_raw[counter]; - hw_event.record_type = PERF_RECORD_SIMPLE; - hw_event.nmi = 0; - - if (system_wide) { - int cpu; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); - if (fd[cpu][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[cpu][counter], strerror(errno)); - exit(-1); - } - - } - } else { - hw_event.inherit = 1; - hw_event.disabled = 1; - - fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); - if (fd[0][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[0][counter], strerror(errno)); - exit(-1); - } - } -} - - -int main(int argc, char *argv[]) -{ - unsigned long long t0, t1; - int counter; - ssize_t res; - int status; - int pid; - - process_options(argc, argv); - - if (system_wide) { - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - assert(nr_cpus <= MAX_NR_CPUS); - assert(nr_cpus >= 0); - } else - nr_cpus = 1; - - for (counter = 0; counter < nr_counters; counter++) - create_counter(counter); - - argc -= optind; - argv += optind; - - /* - * Enable counters and exec the command: - */ - t0 = rdclock(); - prctl(PR_TASK_PERF_COUNTERS_ENABLE); - - if ((pid = fork()) < 0) - perror("failed to fork"); - if (!pid) { - if (execvp(argv[0], argv)) { - perror(argv[0]); - exit(-1); - } - } - while (wait(&status) >= 0) - ; - prctl(PR_TASK_PERF_COUNTERS_DISABLE); - t1 = rdclock(); - - fflush(stdout); - - fprintf(stderr, "\n"); - fprintf(stderr, " Performance counter stats for \'%s\':\n", - argv[0]); - fprintf(stderr, "\n"); - - for (counter = 0; counter < nr_counters; counter++) { - int cpu; - __u64 count, single_count; - - count = 0; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - res = read(fd[cpu][counter], - (char *) &single_count, sizeof(single_count)); - assert(res == sizeof(single_count)); - count += single_count; - } - - if (!event_raw[counter] && - (event_id[counter] == PERF_COUNT_CPU_CLOCK || - event_id[counter] == PERF_COUNT_TASK_CLOCK)) { - - double msecs = (double)count / 1000000; - - fprintf(stderr, " %14.6f %-20s (msecs)\n", - msecs, event_name(counter)); - } else { - fprintf(stderr, " %14Ld %-20s (events)\n", - count, event_name(counter)); - } - if (!counter) - fprintf(stderr, "\n"); - } - fprintf(stderr, "\n"); - fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", - (double)(t1-t0)/1e6); - fprintf(stderr, "\n"); - - return 0; -} From 193e8df1b465f206f8a286202680702e1c153367 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 22:23:16 +0100 Subject: [PATCH 0199/2061] perf_counter tools: fix build warning in kerneltop.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix: kerneltop.c: In function ‘record_ip’: kerneltop.c:1005: warning: format ‘%016llx’ expects type ‘long long unsigned int’, but argument 2 has type ‘uint64_t’ Cc: Wu Fengguang Cc: Paul Mackerras Acked-by: Peter Zijlstra Orig-LKML-Reference: <20090323172417.677932499@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 25e80bc4455..8f9a303f28d 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -1002,7 +1002,7 @@ static void record_ip(uint64_t ip, int counter) assert(left <= middle && middle <= right); if (!(left <= ip && ip <= right)) { printf(" left: %016lx\n", left); - printf(" ip: %016llx\n", ip); + printf(" ip: %016lx\n", (unsigned long)ip); printf("right: %016lx\n", right); } assert(left <= ip && ip <= right); From 81cdbe0509542324ad7d3282ab67c2b6716df663 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 23 Mar 2009 22:29:50 +0100 Subject: [PATCH 0200/2061] perf_counter tools: increase cpu-cycles again Commit b7368fdd7d decreased the CPU cycles interval 100-fold, but this is causig kerneltop failures on my Nehalem box: aldebaran:/home/mingo/linux/linux/Documentation/perf_counter> ./kerneltop KernelTop refresh period: 2 seconds ERROR: failed to keep up with mmap data 10,000 cycles is way too short. What we should do instead on mostly-idle systems is some sort of read/poll timeout, so that we display something every 2 seconds for sure. Cc: Wu Fengguang Cc: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 8f9a303f28d..2ab29b5e32e 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -212,7 +212,7 @@ struct source_line { const unsigned int default_count[] = { - 10000, + 1000000, 1000000, 10000, 10000, From cbe46555dc4de6403cd757139d42289b5f21abb9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 24 Mar 2009 16:52:34 +1100 Subject: [PATCH 0201/2061] perf_counter tools: remove glib dependency and fix bugs in kerneltop.c The glib dependency in kerneltop.c is only for a little bit of list manipulation, and I find it inconvenient. This adds a 'next' field to struct source_line, which lets us link them together into a list. The code to do the linking ourselves turns out to be no longer or more difficult than using glib. This also fixes a few other problems: - We need to #include to get PATH_MAX on powerpc. - We need to #include rather than have our own definitions of __u64 and __s64; on powerpc the installed headers define them to be unsigned long and long respectively, and if we have our own, different definition here that causes a compile error. - This takes out the x86 setting of errno from -ret in sys_perf_counter_open. My experiments on x86 indicate that the glibc syscall() does this for us already. - We had two CPU migration counters in the default set, which seems unnecessary; I changed one of them to a context switch counter. - In perfstat mode we were printing CPU cycles and instructions as milliseconds, and the cpu clock and task clock counters as events. This fixes that. - In perfstat mode we were still printing a blank line after the first counter, which was a holdover from when a task clock counter was automatically included as the first counter. This removes the blank line. - On a test machine here, parse_symbols() and parse_vmlinux() were taking long enough (almost 0.5 seconds) for the mmap buffer to overflow before we got to the first mmap_read() call, so this moves them before we open all the counters. - The error message if sys_perf_counter_open fails needs to use errno, not -fd[i][counter]. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Acked-by: Mike Galbraith Cc: Arjan van de Ven Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 2 +- Documentation/perf_counter/kerneltop.c | 112 ++++++++++--------------- 2 files changed, 46 insertions(+), 68 deletions(-) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 666da95a787..194b6621558 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -3,7 +3,7 @@ BINS = kerneltop perfstat all: $(BINS) kerneltop: kerneltop.c ../../include/linux/perf_counter.h - cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o $@ $< + cc -O6 -Wall -lrt -o $@ $< perfstat: kerneltop ln -sf kerneltop perfstat diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 2ab29b5e32e..ea13e4e6722 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -3,7 +3,7 @@ Build with: - cc -O6 -Wall -lrt `pkg-config --cflags --libs glib-2.0` -o kerneltop kerneltop.c + cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt Sample output: @@ -56,6 +56,7 @@ * Yanmin Zhang * Wu Fengguang * Mike Galbraith + * Paul Mackerras * * Released under the GPL v2. (and only v2, not any later version) */ @@ -68,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -76,8 +78,6 @@ #include #include -#include - #include #include #include @@ -87,6 +87,7 @@ #include #include +#include #include "../../include/linux/perf_counter.h" @@ -114,11 +115,6 @@ #define __user #define asmlinkage -typedef unsigned int __u32; -typedef unsigned long long __u64; -typedef long long __s64; - - #ifdef __x86_64__ #define __NR_perf_counter_open 295 #define rmb() asm volatile("lfence" ::: "memory") @@ -146,17 +142,8 @@ asmlinkage int sys_perf_counter_open( int group_fd, unsigned long flags) { - int ret; - - ret = syscall( + return syscall( __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -#if defined(__x86_64__) || defined(__i386__) - if (ret < 0 && ret > -4096) { - errno = -ret; - ret = -1; - } -#endif - return ret; } #define MAX_COUNTERS 64 @@ -170,7 +157,7 @@ static int system_wide = 0; static int nr_counters = 0; static __u64 event_id[MAX_COUNTERS] = { EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), @@ -202,14 +189,15 @@ static int delay_secs = 2; static int zero; static int dump_symtab; -static GList *lines; - struct source_line { uint64_t EIP; unsigned long count; char *line; + struct source_line *next; }; +static struct source_line *lines; +static struct source_line **lines_tail; const unsigned int default_count[] = { 1000000, @@ -519,9 +507,8 @@ int do_perfstat(int argc, char *argv[]) count += single_count; } - if (!PERF_COUNTER_RAW(event_id[counter]) && - (event_id[counter] == PERF_COUNT_CPU_CLOCK || - event_id[counter] == PERF_COUNT_TASK_CLOCK)) { + if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || + event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { double msecs = (double)count / 1000000; @@ -531,8 +518,6 @@ int do_perfstat(int argc, char *argv[]) fprintf(stderr, " %14Ld %-20s (events)\n", count, event_name(counter)); } - if (!counter) - fprintf(stderr, "\n"); } fprintf(stderr, "\n"); fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", @@ -554,7 +539,7 @@ struct sym_entry { char *sym; unsigned long count[MAX_COUNTERS]; int skip; - GList *source; + struct source_line *source; }; #define MAX_SYMS 100000 @@ -855,6 +840,7 @@ static void parse_vmlinux(char *filename) if (!file) return; + lines_tail = &lines; while (!feof(file)) { struct source_line *src; size_t dummy = 0; @@ -873,7 +859,9 @@ static void parse_vmlinux(char *filename) if (c) *c = 0; - lines = g_list_prepend(lines, src); + src->next = NULL; + *lines_tail = src; + lines_tail = &src->next; if (strlen(src->line)>8 && src->line[8] == ':') src->EIP = strtoull(src->line, NULL, 16); @@ -881,52 +869,43 @@ static void parse_vmlinux(char *filename) src->EIP = strtoull(src->line, NULL, 16); } pclose(file); - lines = g_list_reverse(lines); } static void record_precise_ip(uint64_t ip) { struct source_line *line; - GList *item; - item = g_list_first(lines); - while (item) { - line = item->data; + for (line = lines; line; line = line->next) { if (line->EIP == ip) line->count++; if (line->EIP > ip) break; - item = g_list_next(item); } } static void lookup_sym_in_vmlinux(struct sym_entry *sym) { struct source_line *line; - GList *item; char pattern[PATH_MAX]; sprintf(pattern, "<%s>:", sym->sym); - item = g_list_first(lines); - while (item) { - line = item->data; + for (line = lines; line; line = line->next) { if (strstr(line->line, pattern)) { - sym->source = item; + sym->source = line; break; } - item = g_list_next(item); } } -void show_lines(GList *item_queue, int item_queue_count) +static void show_lines(struct source_line *line_queue, int line_queue_count) { int i; struct source_line *line; - for (i = 0; i < item_queue_count; i++) { - line = item_queue->data; + line = line_queue; + for (i = 0; i < line_queue_count; i++) { printf("%8li\t%s\n", line->count, line->line); - item_queue = g_list_next(item_queue); + line = line->next; } } @@ -935,10 +914,9 @@ void show_lines(GList *item_queue, int item_queue_count) static void show_details(struct sym_entry *sym) { struct source_line *line; - GList *item; + struct source_line *line_queue = NULL; int displayed = 0; - GList *item_queue = NULL; - int item_queue_count = 0; + int line_queue_count = 0; if (!sym->source) lookup_sym_in_vmlinux(sym); @@ -947,30 +925,29 @@ static void show_details(struct sym_entry *sym) printf("Showing details for %s\n", sym->sym); - item = sym->source; - while (item) { - line = item->data; + line = sym->source; + while (line) { if (displayed && strstr(line->line, ">:")) break; - if (!item_queue_count) - item_queue = item; - item_queue_count ++; + if (!line_queue_count) + line_queue = line; + line_queue_count ++; if (line->count >= count_filter) { - show_lines(item_queue, item_queue_count); - item_queue_count = 0; - item_queue = NULL; - } else if (item_queue_count > TRACE_COUNT) { - item_queue = g_list_next(item_queue); - item_queue_count --; + show_lines(line_queue, line_queue_count); + line_queue_count = 0; + line_queue = NULL; + } else if (line_queue_count > TRACE_COUNT) { + line_queue = line_queue->next; + line_queue_count --; } line->count = 0; displayed++; if (displayed > 300) break; - item = g_list_next(item); + line = line->next; } } @@ -1201,6 +1178,10 @@ int main(int argc, char *argv[]) if (tid != -1 || profile_cpu != -1) nr_cpus = 1; + parse_symbols(); + if (vmlinux && sym_filter_entry) + parse_vmlinux(vmlinux); + for (i = 0; i < nr_cpus; i++) { group_fd = -1; for (counter = 0; counter < nr_counters; counter++) { @@ -1216,15 +1197,16 @@ int main(int argc, char *argv[]) hw_event.nmi = nmi; fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); - fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); if (fd[i][counter] < 0) { + int err = errno; printf("kerneltop error: syscall returned with %d (%s)\n", - fd[i][counter], strerror(-fd[i][counter])); - if (fd[i][counter] == -1) + fd[i][counter], strerror(err)); + if (err == EPERM) printf("Are you root?\n"); exit(-1); } assert(fd[i][counter] >= 0); + fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); /* * First counter acts as the group leader: @@ -1248,10 +1230,6 @@ int main(int argc, char *argv[]) } } - parse_symbols(); - if (vmlinux && sym_filter_entry) - parse_vmlinux(vmlinux); - printf("KernelTop refresh period: %d seconds\n", delay_secs); last_refresh = time(NULL); From 0fd112e41cd6f6d4779cbe327c3632d087e31476 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 10:50:24 +0100 Subject: [PATCH 0202/2061] perf_counter tools: remove glib dependency and fix bugs in kerneltop.c, fix poll() Paul Mackerras wrote: > I noticed the poll stuff is bogus - we have a 2D array of struct > pollfds (MAX_NR_CPUS x MAX_COUNTERS), we fill in a sub-array (with the > rest being uninitialized, since the array is on the stack) and then > pass the first nr_cpus elements to poll. Not what we really meant, I > suspect. :) Not even if we only have one counter, since it's the > counter dimension that varies fastest. This should fix the most obvious poll fubar.. not enough to fix the full problem though.. Reported-by: Paul Mackerras Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Arjan van de Ven Orig-LKML-Reference: <18888.29986.340328.540512@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index ea13e4e6722..7ebde7a336c 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -1157,10 +1157,10 @@ static void mmap_read(struct mmap_data *md) int main(int argc, char *argv[]) { - struct pollfd event_array[MAX_NR_CPUS][MAX_COUNTERS]; + struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; - int i, counter, group_fd; + int i, counter, group_fd, nr_poll = 0; unsigned int cpu; int ret; @@ -1214,8 +1214,9 @@ int main(int argc, char *argv[]) if (group && group_fd == -1) group_fd = fd[i][counter]; - event_array[i][counter].fd = fd[i][counter]; - event_array[i][counter].events = POLLIN; + event_array[nr_poll].fd = fd[i][counter]; + event_array[nr_poll].events = POLLIN; + nr_poll++; mmap_array[i][counter].counter = counter; mmap_array[i][counter].prev = 0; @@ -1247,7 +1248,7 @@ int main(int argc, char *argv[]) } if (hits == events) - ret = poll(event_array[0], nr_cpus, 1000); + ret = poll(event_array, nr_poll, 1000); hits = events; } From f66c6b2066b44d4ab8e8ac1ee4cae543738fe2ac Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 23 Mar 2009 10:29:36 +1100 Subject: [PATCH 0203/2061] perf_counter: update documentation Impact: documentation fix This updates the perfcounter documentation to reflect recent changes. Signed-off-by: Paul Mackerras --- Documentation/perf_counter/design.txt | 264 +++++++++++++++++++------- 1 file changed, 200 insertions(+), 64 deletions(-) diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt index fddd32189a5..aaf105c02fb 100644 --- a/Documentation/perf_counter/design.txt +++ b/Documentation/perf_counter/design.txt @@ -11,7 +11,9 @@ thus be used to profile the code that runs on that CPU. The Linux Performance Counter subsystem provides an abstraction of these hardware capabilities. It provides per task and per CPU counters, counter -groups, and it provides event capabilities on top of those. +groups, and it provides event capabilities on top of those. It +provides "virtual" 64-bit counters, regardless of the width of the +underlying hardware counters. Performance counters are accessed via special file descriptors. There's one file descriptor per virtual counter used. @@ -20,7 +22,8 @@ The special file descriptor is opened via the perf_counter_open() system call: int sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, - pid_t pid, int cpu, int group_fd); + pid_t pid, int cpu, int group_fd, + unsigned long flags); The syscall returns the new fd. The fd can be used via the normal VFS system calls: read() can be used to read the counter, fcntl() @@ -32,90 +35,180 @@ can be poll()ed. When creating a new counter fd, 'perf_counter_hw_event' is: /* - * Hardware event to monitor via a performance monitoring counter: + * Event to monitor via a performance monitoring counter: */ struct perf_counter_hw_event { - s64 type; + __u64 event_config; - u64 irq_period; - u32 record_type; + __u64 irq_period; + __u64 record_type; + __u64 read_format; - u32 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - raw : 1, /* raw event type */ - __reserved_1 : 29; + __u64 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ - u64 __reserved_2; + __reserved_1 : 55; + + __u32 extra_config_len; + + __u32 __reserved_4; + __u64 __reserved_2; + __u64 __reserved_3; }; +The 'event_config' field specifies what the counter should count. It +is divided into 3 bit-fields: + +raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 +type: 7 bits (next most significant) 0x7f00_0000_0000_0000 +event_id: 56 bits (least significant) 0x00ff_0000_0000_0000 + +If 'raw_type' is 1, then the counter will count a hardware event +specified by the remaining 63 bits of event_config. The encoding is +machine-specific. + +If 'raw_type' is 0, then the 'type' field says what kind of counter +this is, with the following encoding: + +enum perf_event_types { + PERF_TYPE_HARDWARE = 0, + PERF_TYPE_SOFTWARE = 1, + PERF_TYPE_TRACEPOINT = 2, +}; + +A counter of PERF_TYPE_HARDWARE will count the hardware event +specified by 'event_id': + /* - * Generalized performance counter event types, used by the hw_event.type + * Generalized performance counter event types, used by the hw_event.event_id * parameter of the sys_perf_counter_open() syscall: */ -enum hw_event_types { +enum hw_event_ids { /* * Common hardware events, generalized by the kernel: */ - PERF_COUNT_CYCLES = 0, - PERF_COUNT_INSTRUCTIONS = 1, - PERF_COUNT_CACHE_REFERENCES = 2, - PERF_COUNT_CACHE_MISSES = 3, - PERF_COUNT_BRANCH_INSTRUCTIONS = 4, - PERF_COUNT_BRANCH_MISSES = 5, - - /* - * Special "software" counters provided by the kernel, even if - * the hardware does not support performance counters. These - * counters measure various physical and sw events of the - * kernel (and allow the profiling of them as well): - */ - PERF_COUNT_CPU_CLOCK = -1, - PERF_COUNT_TASK_CLOCK = -2, - /* - * Future software events: - */ - /* PERF_COUNT_PAGE_FAULTS = -3, - PERF_COUNT_CONTEXT_SWITCHES = -4, */ + PERF_COUNT_CPU_CYCLES = 0, + PERF_COUNT_INSTRUCTIONS = 1, + PERF_COUNT_CACHE_REFERENCES = 2, + PERF_COUNT_CACHE_MISSES = 3, + PERF_COUNT_BRANCH_INSTRUCTIONS = 4, + PERF_COUNT_BRANCH_MISSES = 5, + PERF_COUNT_BUS_CYCLES = 6, }; -These are standardized types of events that work uniformly on all CPUs -that implements Performance Counters support under Linux. If a CPU is -not able to count branch-misses, then the system call will return --EINVAL. +These are standardized types of events that work relatively uniformly +on all CPUs that implement Performance Counters support under Linux, +although there may be variations (e.g., different CPUs might count +cache references and misses at different levels of the cache hierarchy). +If a CPU is not able to count the selected event, then the system call +will return -EINVAL. -More hw_event_types are supported as well, but they are CPU -specific and are enumerated via /sys on a per CPU basis. Raw hw event -types can be passed in under hw_event.type if hw_event.raw is 1. -For example, to count "External bus cycles while bus lock signal asserted" -events on Intel Core CPUs, pass in a 0x4064 event type value and set -hw_event.raw to 1. +More hw_event_types are supported as well, but they are CPU-specific +and accessed as raw events. For example, to count "External bus +cycles while bus lock signal asserted" events on Intel Core CPUs, pass +in a 0x4064 event_id value and set hw_event.raw_type to 1. -'record_type' is the type of data that a read() will provide for the -counter, and it can be one of: +A counter of type PERF_TYPE_SOFTWARE will count one of the available +software events, selected by 'event_id': + +/* + * Special "software" counters provided by the kernel, even if the hardware + * does not support performance counters. These counters measure various + * physical and sw events of the kernel (and allow the profiling of them as + * well): + */ +enum sw_event_ids { + PERF_COUNT_CPU_CLOCK = 0, + PERF_COUNT_TASK_CLOCK = 1, + PERF_COUNT_PAGE_FAULTS = 2, + PERF_COUNT_CONTEXT_SWITCHES = 3, + PERF_COUNT_CPU_MIGRATIONS = 4, + PERF_COUNT_PAGE_FAULTS_MIN = 5, + PERF_COUNT_PAGE_FAULTS_MAJ = 6, +}; + +Counters come in two flavours: counting counters and sampling +counters. A "counting" counter is one that is used for counting the +number of events that occur, and is characterised by having +irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a +counting counter simply returns the current value of the counter as +an 8-byte number. + +A "sampling" counter is one that is set up to generate an interrupt +every N events, where N is given by 'irq_period'. A sampling counter +has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The +record_type controls what data is recorded on each interrupt, and the +available values are currently: /* * IRQ-notification data record type: */ enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, + PERF_RECORD_SIMPLE = 0, + PERF_RECORD_IRQ = 1, + PERF_RECORD_GROUP = 2, }; -a "simple" counter is one that counts hardware events and allows -them to be read out into a u64 count value. (read() returns 8 on -a successful read of a simple counter.) +A record_type value of PERF_RECORD_IRQ will record the instruction +pointer (IP) at which the interrupt occurred. A record_type value of +PERF_RECORD_GROUP will record the event_config and counter value of +all of the other counters in the group, and should only be used on a +group leader (see below). Currently these two values are mutually +exclusive, but record_type will become a bit-mask in future and +support other values. -An "irq" counter is one that will also provide an IRQ context information: -the IP of the interrupted context. In this case read() will return -the 8-byte counter value, plus the Instruction Pointer address of the -interrupted context. +A sampling counter has an event queue, into which an event is placed +on each interrupt. A read() on a sampling counter will read the next +event from the event queue. If the queue is empty, the read() will +either block or return an EAGAIN error, depending on whether the fd +has been set to non-blocking mode or not. -The parameter 'hw_event_period' is the number of events before waking up -a read() that is blocked on a counter fd. Zero value means a non-blocking -counter. +The 'disabled' bit specifies whether the counter starts out disabled +or enabled. If it is initially disabled, it can be enabled by ioctl +or prctl (see below). -The 'pid' parameter allows the counter to be specific to a task: +The 'nmi' bit specifies, for hardware events, whether the counter +should be set up to request non-maskable interrupts (NMIs) or normal +interrupts. This bit is ignored if the user doesn't have +CAP_SYS_ADMIN privilege (i.e. is not root) or if the CPU doesn't +generate NMIs from hardware counters. + +The 'inherit' bit, if set, specifies that this counter should count +events on descendant tasks as well as the task specified. This only +applies to new descendents, not to any existing descendents at the +time the counter is created (nor to any new descendents of existing +descendents). + +The 'pinned' bit, if set, specifies that the counter should always be +on the CPU if at all possible. It only applies to hardware counters +and only to group leaders. If a pinned counter cannot be put onto the +CPU (e.g. because there are not enough hardware counters or because of +a conflict with some other event), then the counter goes into an +'error' state, where reads return end-of-file (i.e. read() returns 0) +until the counter is subsequently enabled or disabled. + +The 'exclusive' bit, if set, specifies that when this counter's group +is on the CPU, it should be the only group using the CPU's counters. +In future, this will allow sophisticated monitoring programs to supply +extra configuration information via 'extra_config_len' to exploit +advanced features of the CPU's Performance Monitor Unit (PMU) that are +not otherwise accessible and that might disrupt other hardware +counters. + +The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a +way to request that counting of events be restricted to times when the +CPU is in user, kernel and/or hypervisor mode. + + +The 'pid' parameter to the perf_counter_open() system call allows the +counter to be specific to a task: pid == 0: if the pid parameter is zero, the counter is attached to the current task. @@ -125,8 +218,7 @@ The 'pid' parameter allows the counter to be specific to a task: pid < 0: all tasks are counted (per cpu counters) -The 'cpu' parameter allows a counter to be made specific to a full -CPU: +The 'cpu' parameter allows a counter to be made specific to a CPU: cpu >= 0: the counter is restricted to a specific CPU cpu == -1: the counter counts on all CPUs @@ -141,7 +233,51 @@ their own tasks. A 'pid == -1' and 'cpu == x' counter is a per CPU counter that counts all events on CPU-x. Per CPU counters need CAP_SYS_ADMIN privilege. -Group counters are created by passing in a group_fd of another counter. -Groups are scheduled at once and can be used with PERF_RECORD_GROUP -to record multi-dimensional timestamps. +The 'flags' parameter is currently unused and must be zero. + +The 'group_fd' parameter allows counter "groups" to be set up. A +counter group has one counter which is the group "leader". The leader +is created first, with group_fd = -1 in the perf_counter_open call +that creates it. The rest of the group members are created +subsequently, with group_fd giving the fd of the group leader. +(A single counter on its own is created with group_fd = -1 and is +considered to be a group with only 1 member.) + +A counter group is scheduled onto the CPU as a unit, that is, it will +only be put onto the CPU if all of the counters in the group can be +put onto the CPU. This means that the values of the member counters +can be meaningfully compared, added, divided (to get ratios), etc., +with each other, since they have counted events for the same set of +executed instructions. + +Counters can be enabled and disabled in two ways: via ioctl and via +prctl. When a counter is disabled, it doesn't count or generate +events but does continue to exist and maintain its count value. + +An individual counter or counter group can be enabled with + + ioctl(fd, PERF_COUNTER_IOC_ENABLE); + +or disabled with + + ioctl(fd, PERF_COUNTER_IOC_DISABLE); + +Enabling or disabling the leader of a group enables or disables the +whole group; that is, while the group leader is disabled, none of the +counters in the group will count. Enabling or disabling a member of a +group other than the leader only affects that counter - disabling an +non-leader stops that counter from counting but doesn't affect any +other counter. + +A process can enable or disable all the counter groups that are +attached to it, using prctl: + + prctl(PR_TASK_PERF_COUNTERS_ENABLE); + + prctl(PR_TASK_PERF_COUNTERS_DISABLE); + +This applies to all counters on the current process, whether created +by this process or by another, and doesn't affect any counters that +this process has created on other processes. It only enables or +disables the group leaders, not any other members in the groups. From c7138f37f905bb7987b1f9f5a8ee73667db39f25 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 24 Mar 2009 13:18:16 +0100 Subject: [PATCH 0204/2061] perf_counter: fix perf_poll() Impact: fix kerneltop 100% CPU usage Only return a poll event when there's actually been one, poll_wait() doesn't actually wait for the waitq you pass it, it only enqueues you on it. Only once all FDs have been iterated and none of thm returned a poll-event will it schedule(). Also make it return POLL_HUP when there's not mmap() area to read from. Further, fix a silly bug in the write code. Reported-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Arjan van de Ven Orig-LKML-Reference: <1237897096.24918.181.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 14 ++++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2b5e66d5ebd..48212c15b7d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -246,6 +246,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; + atomic_t wakeup; atomic_t head; struct perf_counter_mmap_page *user_page; void *data_pages[0]; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0dfe91094fd..affe227d56a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1161,7 +1161,16 @@ perf_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; - unsigned int events = POLLIN; + struct perf_mmap_data *data; + unsigned int events; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) + events = atomic_xchg(&data->wakeup, 0); + else + events = POLL_HUP; + rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1425,7 +1434,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, do { offset = head = atomic_read(&data->head); - head += sizeof(u64); + head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1446,6 +1455,7 @@ static int perf_output_write(struct perf_counter *counter, int nmi, * generate a poll() wakeup for every page boundary crossed */ if (wakeup) { + atomic_xchg(&data->wakeup, POLL_IN); __perf_counter_update_userpage(counter, data); if (nmi) { counter->wakeup_pending = 1; From b9cacc7bf193df16532bfa7d7ca77fe50fc3c2e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:22 +0100 Subject: [PATCH 0205/2061] perf_counter: more elaborate write API Provide a begin, copy, end interface to the output buffer. begin() reserves the space, copy() copies the data over, considering page boundaries, end() finalizes the event and does the wakeup. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.740550870@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 119 ++++++++++++++++++++++++++++-------------- 1 file changed, 80 insertions(+), 39 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index affe227d56a..0422fd9bf62 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include @@ -16,15 +17,14 @@ #include #include #include +#include +#include +#include #include #include #include #include #include -#include -#include -#include -#include #include @@ -1411,16 +1411,20 @@ static const struct file_operations perf_fops = { * Output */ -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) +struct perf_output_handle { + struct perf_counter *counter; + struct perf_mmap_data *data; + unsigned int offset; + int wakeup; +}; + +static int perf_output_begin(struct perf_output_handle *handle, + struct perf_counter *counter, unsigned int size) { struct perf_mmap_data *data; - unsigned int offset, head, nr; - unsigned int len; - int ret, wakeup; + unsigned int offset, head; rcu_read_lock(); - ret = -ENOSPC; data = rcu_dereference(counter->data); if (!data) goto out; @@ -1428,45 +1432,82 @@ static int perf_output_write(struct perf_counter *counter, int nmi, if (!data->nr_pages) goto out; - ret = -EINVAL; - if (size > PAGE_SIZE) - goto out; - do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + handle->counter = counter; + handle->data = data; + handle->offset = offset; + handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); - nr = (offset >> PAGE_SHIFT) & (data->nr_pages - 1); - offset &= PAGE_SIZE - 1; + return 0; - len = min_t(unsigned int, PAGE_SIZE - offset, size); - memcpy(data->data_pages[nr] + offset, buf, len); - size -= len; - - if (size) { - nr = (nr + 1) & (data->nr_pages - 1); - memcpy(data->data_pages[nr], buf + len, size); - } - - /* - * generate a poll() wakeup for every page boundary crossed - */ - if (wakeup) { - atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); - if (nmi) { - counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&counter->waitq); - } - ret = 0; out: rcu_read_unlock(); + return -ENOSPC; +} + +static void perf_output_copy(struct perf_output_handle *handle, + void *buf, unsigned int len) +{ + unsigned int pages_mask; + unsigned int offset; + unsigned int size; + void **pages; + + offset = handle->offset; + pages_mask = handle->data->nr_pages - 1; + pages = handle->data->data_pages; + + do { + unsigned int page_offset; + int nr; + + nr = (offset >> PAGE_SHIFT) & pages_mask; + page_offset = offset & (PAGE_SIZE - 1); + size = min_t(unsigned int, PAGE_SIZE - page_offset, len); + + memcpy(pages[nr] + page_offset, buf, size); + + len -= size; + buf += size; + offset += size; + } while (len); + + handle->offset = offset; +} + +static void perf_output_end(struct perf_output_handle *handle, int nmi) +{ + if (handle->wakeup) { + (void)atomic_xchg(&handle->data->wakeup, POLL_IN); + __perf_counter_update_userpage(handle->counter, handle->data); + if (nmi) { + handle->counter->wakeup_pending = 1; + set_perf_counter_pending(); + } else + wake_up(&handle->counter->waitq); + } + rcu_read_unlock(); +} + +static int perf_output_write(struct perf_counter *counter, int nmi, + void *buf, ssize_t size) +{ + struct perf_output_handle handle; + int ret; + + ret = perf_output_begin(&handle, counter, size); + if (ret) + goto out; + + perf_output_copy(&handle, buf, size); + perf_output_end(&handle, nmi); + +out: return ret; } From 5c1481943250ab65fa5130e05ec479c93216e9f7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:23 +0100 Subject: [PATCH 0206/2061] perf_counter: output objects Provide a {type,size} header for each output entry. This should provide extensible output, and the ability to mix multiple streams. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.831607932@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++ kernel/perf_counter.c | 53 ++++++++++++++++++++++++++++-------- 2 files changed, 52 insertions(+), 12 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 48212c15b7d..c256635377d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -156,6 +156,16 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +struct perf_event_header { + __u32 type; + __u32 size; +}; + +enum perf_event_type { + PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, +}; + #ifdef __KERNEL__ /* * Kernel-internal data types and definitions: @@ -260,6 +270,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; + int nr_siblings; struct perf_counter *group_leader; const struct hw_perf_counter_ops *hw_ops; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0422fd9bf62..d76e3112d38 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -75,8 +75,10 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) */ if (counter->group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); - else + else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); + group_leader->nr_siblings++; + } list_add_rcu(&counter->event_entry, &ctx->event_list); } @@ -89,6 +91,9 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); + if (counter->group_leader != counter) + counter->group_leader->nr_siblings--; + /* * If this was a group counter with sibling counters then * upgrade the siblings to singleton counters by adding them @@ -381,9 +386,11 @@ static int is_software_only_group(struct perf_counter *leader) if (!is_software_counter(leader)) return 0; + list_for_each_entry(counter, &leader->sibling_list, list_entry) if (!is_software_counter(counter)) return 0; + return 1; } @@ -1480,6 +1487,9 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; } +#define perf_output_put(handle, x) \ + perf_output_copy((handle), &(x), sizeof(x)) + static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { @@ -1514,34 +1524,53 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - u64 entry; + struct { + struct perf_event_header header; + u64 ip; + } event; - entry = instruction_pointer(regs); + event.header.type = PERF_EVENT_IP; + event.header.size = sizeof(event); + event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_write(counter, nmi, &event, sizeof(event)); } -struct group_entry { - u64 event; - u64 counter; -}; - static void perf_output_group(struct perf_counter *counter, int nmi) { + struct perf_output_handle handle; + struct perf_event_header header; struct perf_counter *leader, *sub; + unsigned int size; + struct { + u64 event; + u64 counter; + } entry; + int ret; + + size = sizeof(header) + counter->nr_siblings * sizeof(entry); + + ret = perf_output_begin(&handle, counter, size); + if (ret) + return; + + header.type = PERF_EVENT_GROUP; + header.size = size; + + perf_output_put(&handle, header); leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { - struct group_entry entry; - if (sub != counter) sub->hw_ops->read(sub); entry.event = sub->hw_event.config; entry.counter = atomic64_read(&sub->count); - perf_output_write(counter, nmi, &entry, sizeof(entry)); + perf_output_put(&handle, entry); } + + perf_output_end(&handle, nmi); } void perf_counter_output(struct perf_counter *counter, From 63e35b25d6b5c3136d22ef249dbbf96716aa08bf Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:24 +0100 Subject: [PATCH 0207/2061] perf_counter: sanity check on the output API Ensure we never write more than we said we would. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113316.921433024@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d76e3112d38..7669afe82cc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1422,6 +1422,7 @@ struct perf_output_handle { struct perf_counter *counter; struct perf_mmap_data *data; unsigned int offset; + unsigned int head; int wakeup; }; @@ -1447,6 +1448,7 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->counter = counter; handle->data = data; handle->offset = offset; + handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); return 0; @@ -1485,6 +1487,8 @@ static void perf_output_copy(struct perf_output_handle *handle, } while (len); handle->offset = offset; + + WARN_ON_ONCE(handle->offset > handle->head); } #define perf_output_put(handle, x) \ From ea5d20cf99db5d26d43b6d322d3ace17e08a6552 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:25 +0100 Subject: [PATCH 0208/2061] perf_counter: optionally provide the pid/tid of the sampled task Allow cpu wide counters to profile userspace by providing what process the sample belongs to. This raises the first issue with the output type, lots of these options: group, tid, callchain, etc.. are non-exclusive and could be combined, suggesting a bitfield. However, things like the mmap() data stream doesn't fit in that. How to split the type field... Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.013775235@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 18 ++++++++++++++++-- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c256635377d..7fdbdf8be77 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -127,8 +127,9 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ + include_tid : 1, /* include the tid */ - __reserved_1 : 55; + __reserved_1 : 54; __u32 extra_config_len; __u32 __reserved_4; @@ -164,6 +165,8 @@ struct perf_event_header { enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + + __PERF_EVENT_TID = 0x100, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7669afe82cc..f3e1b27bc1b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1528,16 +1528,30 @@ out: static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + unsigned int size; struct { struct perf_event_header header; u64 ip; + u32 pid, tid; } event; event.header.type = PERF_EVENT_IP; - event.header.size = sizeof(event); event.ip = instruction_pointer(regs); - perf_output_write(counter, nmi, &event, sizeof(event)); + size = sizeof(event); + + if (counter->hw_event.include_tid) { + /* namespace issues */ + event.pid = current->group_leader->pid; + event.tid = current->pid; + + event.header.type |= __PERF_EVENT_TID; + } else + size -= sizeof(u64); + + event.header.size = size; + + perf_output_write(counter, nmi, &event, size); } static void perf_output_group(struct perf_counter *counter, int nmi) From 4c4ba21d2c3659e4c0421533939b58a8fd9f06c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:26 +0100 Subject: [PATCH 0209/2061] perf_counter: kerneltop: mmap_pages argument provide a knob to set the number of mmap data pages. Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.104545398@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 31 +++++++++++++++++++++----- 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 7ebde7a336c..3e45bf6591b 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -178,6 +178,7 @@ static int nr_cpus = 0; static int nmi = 1; static int group = 0; static unsigned int page_size; +static unsigned int mmap_pages = 4; static char *vmlinux; @@ -326,6 +327,7 @@ static void display_help(void) " -x path --vmlinux= # the vmlinux binary, required for -s use\n" " -z --zero # zero counts after display\n" " -D --dump_symtab # dump symbol table to stderr on startup\n" + " -m pages --mmap_pages= # number of mmap data pages\n" ); exit(0); @@ -732,7 +734,9 @@ static int read_symbol(FILE *in, struct sym_entry *s) /* Tag events to be skipped. */ if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) s->skip = 1; - if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) + else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) + s->skip = 1; + else if (!strcmp("mwait_idle", s->sym)) s->skip = 1; if (filter_match == 1) { @@ -1042,9 +1046,10 @@ static void process_options(int argc, char *argv[]) {"symbol", required_argument, NULL, 's'}, {"stat", no_argument, NULL, 'S'}, {"zero", no_argument, NULL, 'z'}, + {"mmap_pages", required_argument, NULL, 'm'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:p:s:Sx:z", + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z", long_options, &option_index); if (c == -1) break; @@ -1081,6 +1086,7 @@ static void process_options(int argc, char *argv[]) case 'S': run_perfstat = 1; break; case 'x': vmlinux = strdup(optarg); break; case 'z': zero = 1; break; + case 'm': mmap_pages = atoi(optarg); break; default: error = 1; break; } } @@ -1134,17 +1140,30 @@ repeat: return head; } +struct timeval last_read, this_read; + static void mmap_read(struct mmap_data *md) { unsigned int head = mmap_read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; + gettimeofday(&this_read, NULL); + if (head - old > md->mask) { - printf("ERROR: failed to keep up with mmap data\n"); - exit(-1); + struct timeval iv; + unsigned long msecs; + + timersub(&this_read, &last_read, &iv); + msecs = iv.tv_sec*1000 + iv.tv_usec/1000; + + fprintf(stderr, "WARNING: failed to keep up with mmap data. Last read %lu msecs ago.\n", msecs); + + old = head; } + last_read = this_read; + for (; old != head;) { __u64 *ptr = (__u64 *)&data[old & md->mask]; old += sizeof(__u64); @@ -1220,8 +1239,8 @@ int main(int argc, char *argv[]) mmap_array[i][counter].counter = counter; mmap_array[i][counter].prev = 0; - mmap_array[i][counter].mask = 2*page_size - 1; - mmap_array[i][counter].base = mmap(NULL, 3*page_size, + mmap_array[i][counter].mask = mmap_pages*page_size - 1; + mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, PROT_READ, MAP_SHARED, fd[i][counter], 0); if (mmap_array[i][counter].base == MAP_FAILED) { printf("kerneltop error: failed to mmap with %d (%s)\n", From 00f0ad73ac90e3fba8b4cbe4cf21b2fb9a56cb72 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:30:27 +0100 Subject: [PATCH 0210/2061] perf_counter: kerneltop: output event support Teach kerneltop about the new output ABI. XXX: anybody fancy integrating the PID/TID data into the output? Bump the mmap_data pages a little because we bloated the output and have to be more careful about overruns with structured data. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Mike Galbraith Cc: Arjan van de Ven Cc: Wu Fengguang Orig-LKML-Reference: <20090325113317.192910290@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 65 +++++++++++++++++++++++--- 1 file changed, 59 insertions(+), 6 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 3e45bf6591b..fda1438365d 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -134,6 +134,11 @@ #endif #define unlikely(x) __builtin_expect(!!(x), 0) +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, @@ -178,7 +183,7 @@ static int nr_cpus = 0; static int nmi = 1; static int group = 0; static unsigned int page_size; -static unsigned int mmap_pages = 4; +static unsigned int mmap_pages = 16; static char *vmlinux; @@ -1147,28 +1152,75 @@ static void mmap_read(struct mmap_data *md) unsigned int head = mmap_read_head(md); unsigned int old = md->prev; unsigned char *data = md->base + page_size; + int diff; gettimeofday(&this_read, NULL); - if (head - old > md->mask) { + /* + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and screw up the events under us. + * + * If we somehow ended up ahead of the head, we got messed up. + * + * In either case, truncate and restart at head. + */ + diff = head - old; + if (diff > md->mask / 2 || diff < 0) { struct timeval iv; unsigned long msecs; timersub(&this_read, &last_read, &iv); msecs = iv.tv_sec*1000 + iv.tv_usec/1000; - fprintf(stderr, "WARNING: failed to keep up with mmap data. Last read %lu msecs ago.\n", msecs); + fprintf(stderr, "WARNING: failed to keep up with mmap data." + " Last read %lu msecs ago.\n", msecs); + /* + * head points to a known good entry, start there. + */ old = head; } last_read = this_read; for (; old != head;) { - __u64 *ptr = (__u64 *)&data[old & md->mask]; - old += sizeof(__u64); + struct event_struct { + struct perf_event_header header; + __u64 ip; + __u32 pid, tid; + } *event = (struct event_struct *)&data[old & md->mask]; + struct event_struct event_copy; - process_event(*ptr, md->counter); + unsigned int size = event->header.size; + + /* + * Event straddles the mmap boundary -- header should always + * be inside due to u64 alignment of output. + */ + if ((old & md->mask) + size != ((old + size) & md->mask)) { + unsigned int offset = old; + unsigned int len = sizeof(*event), cpy; + void *dst = &event_copy; + + do { + cpy = min(md->mask + 1 - (offset & md->mask), len); + memcpy(dst, &data[offset & md->mask], cpy); + offset += cpy; + dst += cpy; + len -= cpy; + } while (len); + + event = &event_copy; + } + + old += size; + + switch (event->header.type) { + case PERF_EVENT_IP: + case PERF_EVENT_IP | __PERF_EVENT_TID: + process_event(event->ip, md->counter); + break; + } } md->prev = old; @@ -1214,6 +1266,7 @@ int main(int argc, char *argv[]) hw_event.irq_period = event_count[counter]; hw_event.record_type = PERF_RECORD_IRQ; hw_event.nmi = nmi; + hw_event.include_tid = 1; fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); if (fd[i][counter] < 0) { From 7730d8655880f41f2ea519aca2ca6a1413dfd2c9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 25 Mar 2009 12:48:31 +0100 Subject: [PATCH 0211/2061] perf_counter: allow and require one-page mmap on counting counters A brainfart stopped single page mmap()s working. The rest of the code should be perfectly fine with not having any data pages. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <1237981712.7972.812.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f3e1b27bc1b..95e02575546 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1369,7 +1369,11 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) vma_size = vma->vm_end - vma->vm_start; nr_pages = (vma_size / PAGE_SIZE) - 1; - if (nr_pages == 0 || !is_power_of_2(nr_pages)) + /* + * If we have data pages ensure they're a power-of-two number, so we + * can do bitmasks instead of modulo. + */ + if (nr_pages != 0 && !is_power_of_2(nr_pages)) return -EINVAL; if (vma_size != PAGE_SIZE * (1 + nr_pages)) From 53cfbf593758916aac41db728f029986a62f1254 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 25 Mar 2009 22:46:58 +1100 Subject: [PATCH 0212/2061] perf_counter: record time running and time enabled for each counter Impact: new functionality Currently, if there are more counters enabled than can fit on the CPU, the kernel will multiplex the counters on to the hardware using round-robin scheduling. That isn't too bad for sampling counters, but for counting counters it means that the value read from a counter represents some unknown fraction of the true count of events that occurred while the counter was enabled. This remedies the situation by keeping track of how long each counter is enabled for, and how long it is actually on the cpu and counting events. These times are recorded in nanoseconds using the task clock for per-task counters and the cpu clock for per-cpu counters. These values can be supplied to userspace on a read from the counter. Userspace requests that they be supplied after the counter value by setting the PERF_FORMAT_TOTAL_TIME_ENABLED and/or PERF_FORMAT_TOTAL_TIME_RUNNING bits in the hw_event.read_format field when creating the counter. (There is no way to change the read format after the counter is created, though it would be possible to add some way to do that.) Using this information it is possible for userspace to scale the count it reads from the counter to get an estimate of the true count: true_count_estimate = count * total_time_enabled / total_time_running This also lets userspace detect the situation where the counter never got to go on the cpu: total_time_running == 0. This functionality has been requested by the PAPI developers, and will be generally needed for interpreting the count values from counting counters correctly. In the implementation, this keeps 5 time values (in nanoseconds) for each counter: total_time_enabled and total_time_running are used when the counter is in state OFF or ERROR and for reporting back to userspace. When the counter is in state INACTIVE or ACTIVE, it is the tstamp_enabled, tstamp_running and tstamp_stopped values that are relevant, and total_time_enabled and total_time_running are determined from them. (tstamp_stopped is only used in INACTIVE state.) The reason for doing it like this is that it means that only counters being enabled or disabled at sched-in and sched-out time need to be updated. There are no new loops that iterate over all counters to update total_time_enabled or total_time_running. This also keeps separate child_total_time_running and child_total_time_enabled fields that get added in when reporting the totals to userspace. They are separate fields so that they can be atomic. We don't want to use atomics for total_time_running, total_time_enabled etc., because then we would have to use atomic sequences to update them, which are slower than regular arithmetic and memory accesses. It is possible to measure total_time_running by adding a task_clock counter to each group of counters, and total_time_enabled can be measured approximately with a top-level task_clock counter (though inaccuracies will creep in if you need to disable and enable groups since it is not possible in general to disable/enable the top-level task_clock counter simultaneously with another group). However, that adds extra overhead - I measured around 15% increase in the context switch latency reported by lat_ctx (from lmbench) when a task_clock counter was added to each of 2 groups, and around 25% increase when a task_clock counter was added to each of 4 groups. (In both cases a top-level task-clock counter was also added.) In contrast, the code added in this commit gives better information with no overhead that I could measure (in fact in some cases I measured lower times with this code, but the differences were all less than one standard deviation). [ v2: address review comments by Andrew Morton. ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Andrew Morton Orig-LKML-Reference: <18890.6578.728637.139402@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 + include/linux/perf_counter.h | 53 ++++++++++ kernel/perf_counter.c | 157 +++++++++++++++++++++++++---- 3 files changed, 191 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d48596ab655..df007fe0cc0 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -455,6 +455,8 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; + counter->tstamp_running += counter->ctx->time_now - + counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7fdbdf8be77..6bf67ce1762 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,16 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1, + PERF_FORMAT_TOTAL_TIME_RUNNING = 2, +}; + /* * Hardware event to monitor via a performance monitoring counter: */ @@ -281,6 +291,32 @@ struct perf_counter { enum perf_counter_active_state prev_state; atomic64_t count; + /* + * These are the total time in nanoseconds that the counter + * has been enabled (i.e. eligible to run, and the task has + * been scheduled in, if this is a per-task counter) + * and running (scheduled onto the CPU), respectively. + * + * They are computed from tstamp_enabled, tstamp_running and + * tstamp_stopped when the counter is in INACTIVE or ACTIVE state. + */ + u64 total_time_enabled; + u64 total_time_running; + + /* + * These are timestamps used for computing total_time_enabled + * and total_time_running when the counter is in INACTIVE or + * ACTIVE state, measured in nanoseconds from an arbitrary point + * in time. + * tstamp_enabled: the notional time when the counter was enabled + * tstamp_running: the notional time when the counter was scheduled on + * tstamp_stopped: in INACTIVE state, the notional time when the + * counter was scheduled off. + */ + u64 tstamp_enabled; + u64 tstamp_running; + u64 tstamp_stopped; + struct perf_counter_hw_event hw_event; struct hw_perf_counter hw; @@ -291,6 +327,13 @@ struct perf_counter { struct perf_counter *parent; struct list_head child_list; + /* + * These accumulate total time (in nanoseconds) that children + * counters have been enabled and running, respectively. + */ + atomic64_t child_total_time_enabled; + atomic64_t child_total_time_running; + /* * Protect attach/detach and child_list: */ @@ -339,6 +382,16 @@ struct perf_counter_context { int nr_active; int is_active; struct task_struct *task; + + /* + * time_now is the current time in nanoseconds since an arbitrary + * point in the past. For per-task counters, this is based on the + * task clock, and for per-cpu counters it is based on the cpu clock. + * time_lost is an offset from the task/cpu clock, used to make it + * appear that time only passes while the context is scheduled in. + */ + u64 time_now; + u64 time_lost; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 95e02575546..3b862a7988c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -116,6 +116,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_stopped = ctx->time_now; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -251,6 +252,60 @@ retry: spin_unlock_irq(&ctx->lock); } +/* + * Get the current time for this context. + * If this is a task context, we use the task's task clock, + * or for a per-cpu context, we use the cpu clock. + */ +static u64 get_context_time(struct perf_counter_context *ctx, int update) +{ + struct task_struct *curr = ctx->task; + + if (!curr) + return cpu_clock(smp_processor_id()); + + return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; +} + +/* + * Update the record of the current time in a context. + */ +static void update_context_time(struct perf_counter_context *ctx, int update) +{ + ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; +} + +/* + * Update the total_time_enabled and total_time_running fields for a counter. + */ +static void update_counter_times(struct perf_counter *counter) +{ + struct perf_counter_context *ctx = counter->ctx; + u64 run_end; + + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + counter->total_time_enabled = ctx->time_now - + counter->tstamp_enabled; + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time_now; + counter->total_time_running = run_end - counter->tstamp_running; + } +} + +/* + * Update total_time_enabled and total_time_running for all counters in a group. + */ +static void update_group_times(struct perf_counter *leader) +{ + struct perf_counter *counter; + + update_counter_times(leader); + list_for_each_entry(counter, &leader->sibling_list, list_entry) + update_counter_times(counter); +} + /* * Cross CPU call to disable a performance counter */ @@ -276,6 +331,8 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { + update_context_time(ctx, 1); + update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); else @@ -320,8 +377,10 @@ static void perf_counter_disable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } spin_unlock_irq(&ctx->lock); } @@ -366,6 +425,8 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } + counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + if (!is_software_counter(counter)) cpuctx->active_oncpu++; ctx->nr_active++; @@ -425,6 +486,17 @@ static int group_can_go_on(struct perf_counter *counter, return can_add_hw; } +static void add_counter_to_ctx(struct perf_counter *counter, + struct perf_counter_context *ctx) +{ + list_add_counter(counter, ctx); + ctx->nr_counters++; + counter->prev_state = PERF_COUNTER_STATE_OFF; + counter->tstamp_enabled = ctx->time_now; + counter->tstamp_running = ctx->time_now; + counter->tstamp_stopped = ctx->time_now; +} + /* * Cross CPU call to install and enable a performance counter */ @@ -449,6 +521,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); /* * Protect the list operation against NMI by disabling the @@ -456,9 +529,7 @@ static void __perf_install_in_context(void *info) */ perf_flags = hw_perf_save_disable(); - list_add_counter(counter, ctx); - ctx->nr_counters++; - counter->prev_state = PERF_COUNTER_STATE_OFF; + add_counter_to_ctx(counter, ctx); /* * Don't put the counter on if it is disabled or if @@ -486,8 +557,10 @@ static void __perf_install_in_context(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } if (!err && !ctx->task && cpuctx->max_pertask) @@ -548,10 +621,8 @@ retry: * can add the counter safely, if it the call above did not * succeed. */ - if (list_empty(&counter->list_entry)) { - list_add_counter(counter, ctx); - ctx->nr_counters++; - } + if (list_empty(&counter->list_entry)) + add_counter_to_ctx(counter, ctx); spin_unlock_irq(&ctx->lock); } @@ -576,11 +647,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); + update_context_time(ctx, 1); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -602,8 +675,10 @@ static void __perf_counter_enable(void *info) */ if (leader != counter) group_sched_out(leader, cpuctx, ctx); - if (leader->hw_event.pinned) + if (leader->hw_event.pinned) { + update_group_times(leader); leader->state = PERF_COUNTER_STATE_ERROR; + } } unlock: @@ -659,8 +734,11 @@ static void perf_counter_enable(struct perf_counter *counter) * Since we have the lock this context can't be scheduled * in, so we can change the state safely. */ - if (counter->state == PERF_COUNTER_STATE_OFF) + if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; + } out: spin_unlock_irq(&ctx->lock); } @@ -693,6 +771,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; + update_context_time(ctx, 0); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -797,6 +876,13 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; + /* + * Add any time since the last sched_out to the lost time + * so it doesn't get included in the total_time_enabled and + * total_time_running measures for counters in the context. + */ + ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + flags = hw_perf_save_disable(); /* @@ -817,8 +903,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, * If this pinned group hasn't been scheduled, * put it in error state. */ - if (counter->state == PERF_COUNTER_STATE_INACTIVE) + if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_ERROR; + } } list_for_each_entry(counter, &ctx->counter_list, list_entry) { @@ -902,8 +990,10 @@ int perf_counter_task_disable(void) perf_flags = hw_perf_save_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) + if (counter->state != PERF_COUNTER_STATE_ERROR) { + update_group_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + } } hw_perf_restore(perf_flags); @@ -946,6 +1036,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; + counter->tstamp_enabled = ctx->time_now - + counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1009,10 +1101,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) static void __read(void *info) { struct perf_counter *counter = info; + struct perf_counter_context *ctx = counter->ctx; unsigned long flags; curr_rq_lock_irq_save(&flags); + if (ctx->is_active) + update_context_time(ctx, 1); counter->hw_ops->read(counter); + update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); } @@ -1025,6 +1121,8 @@ static u64 perf_counter_read(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_ACTIVE) { smp_call_function_single(counter->oncpu, __read, counter, 1); + } else if (counter->state == PERF_COUNTER_STATE_INACTIVE) { + update_counter_times(counter); } return atomic64_read(&counter->count); @@ -1137,10 +1235,8 @@ static int perf_release(struct inode *inode, struct file *file) static ssize_t perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) { - u64 cntval; - - if (count < sizeof(cntval)) - return -EINVAL; + u64 values[3]; + int n; /* * Return end-of-file for a read on a counter that is in @@ -1151,10 +1247,24 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) return 0; mutex_lock(&counter->mutex); - cntval = perf_counter_read(counter); + values[0] = perf_counter_read(counter); + n = 1; + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) + values[n++] = counter->total_time_enabled + + atomic64_read(&counter->child_total_time_enabled); + if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) + values[n++] = counter->total_time_running + + atomic64_read(&counter->child_total_time_running); mutex_unlock(&counter->mutex); - return put_user(cntval, (u64 __user *) buf) ? -EFAULT : sizeof(cntval); + if (count < n * sizeof(u64)) + return -EINVAL; + count = n * sizeof(u64); + + if (copy_to_user(buf, values, count)) + return -EFAULT; + + return count; } static ssize_t @@ -2290,8 +2400,7 @@ inherit_counter(struct perf_counter *parent_counter, * Link it up in the child's context: */ child_counter->task = child; - list_add_counter(child_counter, child_ctx); - child_ctx->nr_counters++; + add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; /* @@ -2361,6 +2470,10 @@ static void sync_child_counter(struct perf_counter *child_counter, * Add back the child's count to the parent's count: */ atomic64_add(child_val, &parent_counter->count); + atomic64_add(child_counter->total_time_enabled, + &parent_counter->child_total_time_enabled); + atomic64_add(child_counter->total_time_running, + &parent_counter->child_total_time_running); /* * Remove this counter from the parent's list @@ -2395,6 +2508,7 @@ __perf_counter_exit_task(struct task_struct *child, if (child != current) { wait_task_inactive(child, 0); list_del_init(&child_counter->list_entry); + update_counter_times(child_counter); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -2412,6 +2526,7 @@ __perf_counter_exit_task(struct task_struct *child, cpuctx = &__get_cpu_var(perf_cpu_context); group_sched_out(child_counter, cpuctx, child_ctx); + update_counter_times(child_counter); list_del_init(&child_counter->list_entry); From 925d519ab82b6dd7aca9420d809ee83819c08db2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:02 +0200 Subject: [PATCH 0213/2061] perf_counter: unify and fix delayed counter wakeup While going over the wakeup code I noticed delayed wakeups only work for hardware counters but basically all software counters rely on them. This patch unifies and generalizes the delayed wakeup to fix this issue. Since we're dealing with NMI context bits here, use a cmpxchg() based single link list implementation to track counters that have pending wakeups. [ This should really be generic code for delayed wakeups, but since we cannot use cmpxchg()/xchg() in generic code, I've let it live in the perf_counter code. -- Eric Dumazet could use it to aggregate the network wakeups. ] Furthermore, the x86 method of using TIF flags was flawed in that its quite possible to end up setting the bit on the idle task, loosing the wakeup. The powerpc method uses per-cpu storage and does appear to be sufficient. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.153932974@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/hw_irq.h | 4 +- arch/powerpc/kernel/irq.c | 2 +- arch/powerpc/kernel/perf_counter.c | 22 +---- arch/x86/include/asm/perf_counter.h | 5 +- arch/x86/include/asm/thread_info.h | 4 +- arch/x86/kernel/cpu/perf_counter.c | 29 ------- arch/x86/kernel/signal.c | 6 -- include/linux/perf_counter.h | 15 ++-- kernel/perf_counter.c | 128 ++++++++++++++++++++++++++-- kernel/timer.c | 3 + 10 files changed, 142 insertions(+), 76 deletions(-) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index cb32d571c9c..20a44d0c9fd 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -132,7 +132,7 @@ static inline int irqs_disabled_flags(unsigned long flags) struct irq_chip; #ifdef CONFIG_PERF_COUNTERS -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { unsigned long x; @@ -160,7 +160,7 @@ extern void perf_counter_do_pending(void); #else -static inline unsigned long get_perf_counter_pending(void) +static inline unsigned long test_perf_counter_pending(void) { return 0; } diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index 469e9635ff0..2cd471f92fe 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -135,7 +135,7 @@ notrace void raw_local_irq_restore(unsigned long en) iseries_handle_interrupts(); } - if (get_perf_counter_pending()) { + if (test_perf_counter_pending()) { clear_perf_counter_pending(); perf_counter_do_pending(); } diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index df007fe0cc0..cde720fc495 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -649,24 +649,6 @@ hw_perf_counter_init(struct perf_counter *counter) return &power_perf_ops; } -/* - * Handle wakeups. - */ -void perf_counter_do_pending(void) -{ - int i; - struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); - struct perf_counter *counter; - - for (i = 0; i < cpuhw->n_counters; ++i) { - counter = cpuhw->counter[i]; - if (counter && counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } -} - /* * A counter has overflowed; update its count and record * things if requested. Note that interrupts are hard-disabled @@ -720,7 +702,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct cpu_hw_counters *cpuhw = &__get_cpu_var(cpu_hw_counters); struct perf_counter *counter; long val; - int need_wakeup = 0, found = 0; + int found = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -761,7 +743,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * immediately; otherwise we'll have do the wakeup when interrupts * get soft-enabled. */ - if (get_perf_counter_pending() && regs->softe) { + if (test_perf_counter_pending() && regs->softe) { irq_enter(); clear_perf_counter_pending(); perf_counter_do_pending(); diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index 1662043b340..e2b0e66b235 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,8 +84,9 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() \ - set_tsk_thread_flag(current, TIF_PERF_COUNTERS); +#define set_perf_counter_pending() do { } while (0) +#define clear_perf_counter_pending() do { } while (0) +#define test_perf_counter_pending() (0) #ifdef CONFIG_PERF_COUNTERS extern void init_hw_perf_counters(void); diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 3ffd5d2a367..8820a73ae09 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -83,7 +83,6 @@ struct thread_info { #define TIF_SYSCALL_AUDIT 7 /* syscall auditing active */ #define TIF_SECCOMP 8 /* secure computing */ #define TIF_MCE_NOTIFY 10 /* notify userspace of an MCE */ -#define TIF_PERF_COUNTERS 11 /* notify perf counter work */ #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* 32bit process */ #define TIF_FORK 18 /* ret_from_fork */ @@ -107,7 +106,6 @@ struct thread_info { #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_MCE_NOTIFY (1 << TIF_MCE_NOTIFY) -#define _TIF_PERF_COUNTERS (1 << TIF_PERF_COUNTERS) #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_FORK (1 << TIF_FORK) @@ -141,7 +139,7 @@ struct thread_info { /* Only used for 64 bit */ #define _TIF_DO_NOTIFY_MASK \ - (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_PERF_COUNTERS|_TIF_NOTIFY_RESUME) + (_TIF_SIGPENDING|_TIF_MCE_NOTIFY|_TIF_NOTIFY_RESUME) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f95b0cdc55..7aab177fb56 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -227,7 +227,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) */ hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } - counter->wakeup_pending = 0; return 0; } @@ -773,34 +772,6 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } -/* - * This handler is triggered by NMI contexts: - */ -void perf_counter_notify(struct pt_regs *regs) -{ - struct cpu_hw_counters *cpuc; - unsigned long flags; - int bit, cpu; - - local_irq_save(flags); - cpu = smp_processor_id(); - cpuc = &per_cpu(cpu_hw_counters, cpu); - - for_each_bit(bit, cpuc->used, X86_PMC_IDX_MAX) { - struct perf_counter *counter = cpuc->counters[bit]; - - if (!counter) - continue; - - if (counter->wakeup_pending) { - counter->wakeup_pending = 0; - wake_up(&counter->waitq); - } - } - - local_irq_restore(flags); -} - void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index 611615a92c9..0a813b17b17 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -6,7 +6,6 @@ * 2000-06-20 Pentium III FXSR, SSE support by Gareth Hughes * 2000-2002 x86-64 support by Andi Kleen */ -#include #include #include #include @@ -872,11 +871,6 @@ do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags) tracehook_notify_resume(regs); } - if (thread_info_flags & _TIF_PERF_COUNTERS) { - clear_thread_flag(TIF_PERF_COUNTERS); - perf_counter_notify(regs); - } - #ifdef CONFIG_X86_32 clear_thread_flag(TIF_IRET); #endif /* CONFIG_X86_32 */ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 6bf67ce1762..0d833228eee 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -275,6 +275,10 @@ struct perf_mmap_data { void *data_pages[0]; }; +struct perf_wakeup_entry { + struct perf_wakeup_entry *next; +}; + /** * struct perf_counter - performance counter kernel representation: */ @@ -350,7 +354,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; /* optional: for NMIs */ - int wakeup_pending; + struct perf_wakeup_entry wakeup; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; @@ -427,7 +431,7 @@ extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); -extern void perf_counter_notify(struct pt_regs *regs); +extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); extern u64 hw_perf_save_disable(void); @@ -461,7 +465,7 @@ static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } -static inline void perf_counter_notify(struct pt_regs *regs) { } +static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } static inline void hw_perf_restore(u64 ctrl) { } @@ -469,8 +473,9 @@ static inline u64 hw_perf_save_disable(void) { return 0; } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } -static inline void perf_swcounter_event(u32 event, u64 nr, - int nmi, struct pt_regs *regs) { } +static inline void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 3b862a7988c..f70ff80e79d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1197,8 +1197,12 @@ static void free_counter_rcu(struct rcu_head *head) kfree(counter); } +static void perf_pending_sync(struct perf_counter *counter); + static void free_counter(struct perf_counter *counter) { + perf_pending_sync(counter); + if (counter->destroy) counter->destroy(counter); @@ -1528,6 +1532,118 @@ static const struct file_operations perf_fops = { .mmap = perf_mmap, }; +/* + * Perf counter wakeup + * + * If there's data, ensure we set the poll() state and publish everything + * to user-space before waking everybody up. + */ + +void perf_counter_wakeup(struct perf_counter *counter) +{ + struct perf_mmap_data *data; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (data) { + (void)atomic_xchg(&data->wakeup, POLL_IN); + __perf_counter_update_userpage(counter, data); + } + rcu_read_unlock(); + + wake_up_all(&counter->waitq); +} + +/* + * Pending wakeups + * + * Handle the case where we need to wakeup up from NMI (or rq->lock) context. + * + * The NMI bit means we cannot possibly take locks. Therefore, maintain a + * single linked list and use cmpxchg() to add entries lockless. + */ + +#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) + +static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { + PENDING_TAIL, +}; + +static void perf_pending_queue(struct perf_counter *counter) +{ + struct perf_wakeup_entry **head; + struct perf_wakeup_entry *prev, *next; + + if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + return; + + head = &get_cpu_var(perf_wakeup_head); + + do { + prev = counter->wakeup.next = *head; + next = &counter->wakeup; + } while (cmpxchg(head, prev, next) != prev); + + set_perf_counter_pending(); + + put_cpu_var(perf_wakeup_head); +} + +static int __perf_pending_run(void) +{ + struct perf_wakeup_entry *list; + int nr = 0; + + list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + while (list != PENDING_TAIL) { + struct perf_counter *counter = container_of(list, + struct perf_counter, wakeup); + + list = list->next; + + counter->wakeup.next = NULL; + /* + * Ensure we observe the unqueue before we issue the wakeup, + * so that we won't be waiting forever. + * -- see perf_not_pending(). + */ + smp_wmb(); + + perf_counter_wakeup(counter); + nr++; + } + + return nr; +} + +static inline int perf_not_pending(struct perf_counter *counter) +{ + /* + * If we flush on whatever cpu we run, there is a chance we don't + * need to wait. + */ + get_cpu(); + __perf_pending_run(); + put_cpu(); + + /* + * Ensure we see the proper queue state before going to sleep + * so that we do not miss the wakeup. -- see perf_pending_handle() + */ + smp_rmb(); + return counter->wakeup.next == NULL; +} + +static void perf_pending_sync(struct perf_counter *counter) +{ + wait_event(counter->waitq, perf_not_pending(counter)); +} + +void perf_counter_do_pending(void) +{ + __perf_pending_run(); +} + /* * Output */ @@ -1611,13 +1727,10 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle, int nmi) { if (handle->wakeup) { - (void)atomic_xchg(&handle->data->wakeup, POLL_IN); - __perf_counter_update_userpage(handle->counter, handle->data); - if (nmi) { - handle->counter->wakeup_pending = 1; - set_perf_counter_pending(); - } else - wake_up(&handle->counter->waitq); + if (nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); } rcu_read_unlock(); } @@ -2211,7 +2324,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; - counter->wakeup_pending = 0; counter->group_leader = group_leader; counter->hw_ops = NULL; counter->ctx = ctx; diff --git a/kernel/timer.c b/kernel/timer.c index b4555568b4e..672ca25fbc4 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -1167,6 +1168,8 @@ static void run_timer_softirq(struct softirq_action *h) { struct tvec_base *base = __get_cpu_var(tvec_bases); + perf_counter_do_pending(); + hrtimer_run_pending(); if (time_after_eq(jiffies, base->timer_jiffies)) From 38ff667b321b00f5e6830e93fb4ab11a653a2920 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:03 +0200 Subject: [PATCH 0214/2061] perf_counter: fix update_userpage() It just occured to me it is possible to have multiple contending updates of the userpage (mmap information vs overflow vs counter). This would break the seqlock logic. It appear the arch code uses this from NMI context, so we cannot possibly serialize its use, therefore separate the data_head update from it and let it return to its original use. The arch code needs to make sure there are no contending callers by disabling the counter before using it -- powerpc appears to do this nicely. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.241410660@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 35 +++++++++++++++++++++++++++++++++ kernel/perf_counter.c | 38 ++++++++++++++++++++++-------------- 2 files changed, 58 insertions(+), 15 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0d833228eee..8ac18852dcf 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,10 +160,45 @@ struct perf_counter_hw_event { struct perf_counter_mmap_page { __u32 version; /* version number of this structure */ __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * The index and offset should be read atomically using the seqlock: + * + * __u32 seq, index; + * __s64 offset; + * + * again: + * rmb(); + * seq = pc->lock; + * + * if (unlikely(seq & 1)) { + * cpu_relax(); + * goto again; + * } + * + * index = pc->index; + * offset = pc->offset; + * + * rmb(); + * if (pc->lock != seq) + * goto again; + * + * After this, index contains architecture specific counter index + 1, + * so that 0 means unavailable, offset contains the value to be added + * to the result of the raw timer read to obtain this counter's value. + */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ __s64 offset; /* add to hardware counter value */ + /* + * Control data for the mmap() data buffer. + * + * User-space reading this value should issue an rmb(), on SMP capable + * platforms, after reading this value -- see perf_counter_wakeup(). + */ __u32 data_head; /* head in the data section */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f70ff80e79d..c95e92329b9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1316,10 +1316,22 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return err; } -static void __perf_counter_update_userpage(struct perf_counter *counter, - struct perf_mmap_data *data) +/* + * Callers need to ensure there can be no nesting of this function, otherwise + * the seqlock logic goes bad. We can not serialize this because the arch + * code calls this from NMI context. + */ +void perf_counter_update_userpage(struct perf_counter *counter) { - struct perf_counter_mmap_page *userpg = data->user_page; + struct perf_mmap_data *data; + struct perf_counter_mmap_page *userpg; + + rcu_read_lock(); + data = rcu_dereference(counter->data); + if (!data) + goto unlock; + + userpg = data->user_page; /* * Disable preemption so as to not let the corresponding user-space @@ -1333,20 +1345,10 @@ static void __perf_counter_update_userpage(struct perf_counter *counter, if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - userpg->data_head = atomic_read(&data->head); smp_wmb(); ++userpg->lock; preempt_enable(); -} - -void perf_counter_update_userpage(struct perf_counter *counter) -{ - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) - __perf_counter_update_userpage(counter, data); +unlock: rcu_read_unlock(); } @@ -1547,7 +1549,13 @@ void perf_counter_wakeup(struct perf_counter *counter) data = rcu_dereference(counter->data); if (data) { (void)atomic_xchg(&data->wakeup, POLL_IN); - __perf_counter_update_userpage(counter, data); + /* + * Ensure all data writes are issued before updating the + * user-space data head information. The matching rmb() + * will be in userspace after reading this value. + */ + smp_wmb(); + data->user_page->data_head = atomic_read(&data->head); } rcu_read_unlock(); From 195564390210977954fe4ef45b39cdee34f41b59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:04 +0200 Subject: [PATCH 0215/2061] perf_counter: kerneltop: simplify data_head read Now that the kernel side changed, match up again. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.327144324@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index fda1438365d..2779c57ad4b 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -1125,22 +1125,10 @@ struct mmap_data { static unsigned int mmap_read_head(struct mmap_data *md) { struct perf_counter_mmap_page *pc = md->base; - unsigned int seq, head; - -repeat: - rmb(); - seq = pc->lock; - - if (unlikely(seq & 1)) { - cpu_relax(); - goto repeat; - } + int head; head = pc->data_head; - rmb(); - if (pc->lock != seq) - goto repeat; return head; } From 0a4a93919bdc5cee48fe4367591e8e0449c1086c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:05 +0200 Subject: [PATCH 0216/2061] perf_counter: executable mmap() information Currently the profiling information returns userspace IPs but no way to correlate them to userspace code. Userspace could look into /proc/$pid/maps but that might not be current or even present anymore at the time of analyzing the IPs. Therefore provide means to track the mmap information and provide it in the output stream. XXX: only covers mmap()/munmap(), mremap() and mprotect() are missing. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Andrew Morton Orig-LKML-Reference: <20090330171023.417259499@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 +++++- kernel/perf_counter.c | 145 +++++++++++++++++++++++++++++++++++ mm/mmap.c | 10 +++ 3 files changed, 177 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8ac18852dcf..037a81145ac 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -137,9 +137,11 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ + include_tid : 1, /* include the tid */ + mmap : 1, /* include mmap data */ + munmap : 1, /* include munmap data */ - __reserved_1 : 54; + __reserved_1 : 52; __u32 extra_config_len; __u32 __reserved_4; @@ -211,6 +213,9 @@ enum perf_event_type { PERF_EVENT_IP = 0, PERF_EVENT_GROUP = 1, + PERF_EVENT_MMAP = 2, + PERF_EVENT_MUNMAP = 3, + __PERF_EVENT_TID = 0x100, }; @@ -491,6 +496,12 @@ static inline int is_software_counter(struct perf_counter *counter) extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + +extern void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -511,6 +522,15 @@ static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } + +static inline void +perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + +static inline void +perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) { } + #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c95e92329b9..f35e89e3d6a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -1843,6 +1844,150 @@ void perf_counter_output(struct perf_counter *counter, } } +/* + * mmap tracking + */ + +struct perf_mmap_event { + struct file *file; + char *file_name; + int file_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + u64 start; + u64 len; + u64 pgoff; + } event; +}; + +static void perf_counter_mmap_output(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + struct perf_output_handle handle; + int size = mmap_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size); + + if (ret) + return; + + perf_output_put(&handle, mmap_event->event); + perf_output_copy(&handle, mmap_event->file_name, + mmap_event->file_size); + perf_output_end(&handle, 0); +} + +static int perf_counter_mmap_match(struct perf_counter *counter, + struct perf_mmap_event *mmap_event) +{ + if (counter->hw_event.mmap && + mmap_event->event.header.type == PERF_EVENT_MMAP) + return 1; + + if (counter->hw_event.munmap && + mmap_event->event.header.type == PERF_EVENT_MUNMAP) + return 1; + + return 0; +} + +static void perf_counter_mmap_ctx(struct perf_counter_context *ctx, + struct perf_mmap_event *mmap_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_mmap_match(counter, mmap_event)) + perf_counter_mmap_output(counter, mmap_event); + } + rcu_read_unlock(); +} + +static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) +{ + struct perf_cpu_context *cpuctx; + struct file *file = mmap_event->file; + unsigned int size; + char tmp[16]; + char *buf = NULL; + char *name; + + if (file) { + buf = kzalloc(PATH_MAX, GFP_KERNEL); + if (!buf) { + name = strncpy(tmp, "//enomem", sizeof(tmp)); + goto got_name; + } + name = dentry_path(file->f_dentry, buf, PATH_MAX); + if (IS_ERR(name)) { + name = strncpy(tmp, "//toolong", sizeof(tmp)); + goto got_name; + } + } else { + name = strncpy(tmp, "//anon", sizeof(tmp)); + goto got_name; + } + +got_name: + size = ALIGN(strlen(name), sizeof(u64)); + + mmap_event->file_name = name; + mmap_event->file_size = size; + + mmap_event->event.header.size = sizeof(mmap_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); + put_cpu_var(perf_cpu_context); + + perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + + kfree(buf); +} + +void perf_counter_mmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + +void perf_counter_munmap(unsigned long addr, unsigned long len, + unsigned long pgoff, struct file *file) +{ + struct perf_mmap_event mmap_event = { + .file = file, + .event = { + .header = { .type = PERF_EVENT_MUNMAP, }, + .pid = current->group_leader->pid, + .tid = current->pid, + .start = addr, + .len = len, + .pgoff = pgoff, + }, + }; + + perf_counter_mmap_event(&mmap_event); +} + /* * Generic software counter infrastructure */ diff --git a/mm/mmap.c b/mm/mmap.c index 4a3841186c1..1df63f614f9 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -1223,6 +1224,9 @@ munmap_back: if (correct_wcount) atomic_inc(&inode->i_writecount); out: + if (vm_flags & VM_EXEC) + perf_counter_mmap(addr, len, pgoff, file); + mm->total_vm += len >> PAGE_SHIFT; vm_stat_account(mm, vm_flags, file, len >> PAGE_SHIFT); if (vm_flags & VM_LOCKED) { @@ -1756,6 +1760,12 @@ static void remove_vma_list(struct mm_struct *mm, struct vm_area_struct *vma) do { long nrpages = vma_pages(vma); + if (vma->vm_flags & VM_EXEC) { + perf_counter_munmap(vma->vm_start, + nrpages << PAGE_SHIFT, + vma->vm_pgoff, vma->vm_file); + } + mm->total_vm -= nrpages; vm_stat_account(mm, vma->vm_flags, vma->vm_file, -nrpages); vma = remove_vma(vma); From 3c1ba6fafecaed295017881f8863a18602f32c1d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:06 +0200 Subject: [PATCH 0217/2061] perf_counter: kerneltop: parse the mmap data stream frob the kerneltop code to print the mmap data in the stream Better use would be collecting the IPs per PID and mapping them onto the provided userspace code.. TODO Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.501902515@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 50 ++++++++++++++++++++++---- 1 file changed, 44 insertions(+), 6 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 2779c57ad4b..995111dee7f 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -184,6 +184,8 @@ static int nmi = 1; static int group = 0; static unsigned int page_size; static unsigned int mmap_pages = 16; +static int use_mmap = 0; +static int use_munmap = 0; static char *vmlinux; @@ -333,6 +335,8 @@ static void display_help(void) " -z --zero # zero counts after display\n" " -D --dump_symtab # dump symbol table to stderr on startup\n" " -m pages --mmap_pages= # number of mmap data pages\n" + " -M --mmap_info # print mmap info stream\n" + " -U --munmap_info # print munmap info stream\n" ); exit(0); @@ -1052,9 +1056,11 @@ static void process_options(int argc, char *argv[]) {"stat", no_argument, NULL, 'S'}, {"zero", no_argument, NULL, 'z'}, {"mmap_pages", required_argument, NULL, 'm'}, + {"mmap_info", no_argument, NULL, 'M'}, + {"munmap_info", no_argument, NULL, 'U'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:z", + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU", long_options, &option_index); if (c == -1) break; @@ -1092,6 +1098,8 @@ static void process_options(int argc, char *argv[]) case 'x': vmlinux = strdup(optarg); break; case 'z': zero = 1; break; case 'm': mmap_pages = atoi(optarg); break; + case 'M': use_mmap = 1; break; + case 'U': use_munmap = 1; break; default: error = 1; break; } } @@ -1172,12 +1180,29 @@ static void mmap_read(struct mmap_data *md) last_read = this_read; for (; old != head;) { - struct event_struct { + struct ip_event { struct perf_event_header header; __u64 ip; __u32 pid, tid; - } *event = (struct event_struct *)&data[old & md->mask]; - struct event_struct event_copy; + }; + struct mmap_event { + struct perf_event_header header; + __u32 pid, tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; + }; + + typedef union event_union { + struct perf_event_header header; + struct ip_event ip; + struct mmap_event mmap; + } event_t; + + event_t *event = (event_t *)&data[old & md->mask]; + + event_t event_copy; unsigned int size = event->header.size; @@ -1187,7 +1212,7 @@ static void mmap_read(struct mmap_data *md) */ if ((old & md->mask) + size != ((old + size) & md->mask)) { unsigned int offset = old; - unsigned int len = sizeof(*event), cpy; + unsigned int len = min(sizeof(*event), size), cpy; void *dst = &event_copy; do { @@ -1206,7 +1231,18 @@ static void mmap_read(struct mmap_data *md) switch (event->header.type) { case PERF_EVENT_IP: case PERF_EVENT_IP | __PERF_EVENT_TID: - process_event(event->ip, md->counter); + process_event(event->ip.ip, md->counter); + break; + + case PERF_EVENT_MMAP: + case PERF_EVENT_MUNMAP: + printf("%s: %Lu %Lu %Lu %s\n", + event->header.type == PERF_EVENT_MMAP + ? "mmap" : "munmap", + event->mmap.start, + event->mmap.len, + event->mmap.pgoff, + event->mmap.filename); break; } } @@ -1255,6 +1291,8 @@ int main(int argc, char *argv[]) hw_event.record_type = PERF_RECORD_IRQ; hw_event.nmi = nmi; hw_event.include_tid = 1; + hw_event.mmap = use_mmap; + hw_event.munmap = use_munmap; fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); if (fd[i][counter] < 0) { From 7595d63b3a9ce65d14c4fbd0e7de448a343d7215 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:07 +0200 Subject: [PATCH 0218/2061] perf_counter: powerpc: only reserve PMU hardware when we need it Impact: cooperate with oprofile At present, on PowerPC, if you have perf_counters compiled in, oprofile doesn't work. There is code to allow the PMU to be shared between competing subsystems, such as perf_counters and oprofile, but currently the perf_counter subsystem reserves the PMU for itself at boot time, and never releases it. This makes perf_counter play nicely with oprofile. Now we keep a count of how many perf_counter instances are counting hardware events, and reserve the PMU when that count becomes non-zero, and release the PMU when that count becomes zero. This means that it is possible to have perf_counters compiled in and still use oprofile, as long as there are no hardware perf_counters active. This also means that if oprofile is active, sys_perf_counter_open will fail if the hw_event specifies a hardware event. To avoid races with other tasks creating and destroying perf_counters, we use a mutex. We use atomic_inc_not_zero and atomic_add_unless to avoid having to take the mutex unless there is a possibility of the count going between 0 and 1. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.627912475@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 47 ++++++++++++++++++++++++++---- 1 file changed, 42 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index cde720fc495..560dd1e7b52 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -41,6 +41,8 @@ struct power_pmu *ppmu; */ static unsigned int freeze_counters_kernel = MMCR0_FCS; +static void perf_counter_interrupt(struct pt_regs *regs); + void perf_counter_print_debug(void) { } @@ -594,6 +596,24 @@ struct hw_perf_counter_ops power_perf_ops = { .read = power_perf_read }; +/* Number of perf_counters counting hardware events */ +static atomic_t num_counters; +/* Used to avoid races in calling reserve/release_pmc_hardware */ +static DEFINE_MUTEX(pmc_reserve_mutex); + +/* + * Release the PMU if this is the last perf_counter. + */ +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (!atomic_add_unless(&num_counters, -1, 1)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_dec_return(&num_counters) == 0) + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter) { @@ -601,6 +621,7 @@ hw_perf_counter_init(struct perf_counter *counter) struct perf_counter *ctrs[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; int n; + int err; if (!ppmu) return NULL; @@ -646,6 +667,27 @@ hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + + /* + * See if we need to reserve the PMU. + * If no counters are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * reserve_pmc_hardware or release_pmc_hardware. + */ + err = 0; + if (!atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && + reserve_pmc_hardware(perf_counter_interrupt)) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + counter->destroy = hw_perf_counter_destroy; + + if (err) + return NULL; return &power_perf_ops; } @@ -769,11 +811,6 @@ static int init_perf_counters(void) { unsigned long pvr; - if (reserve_pmc_hardware(perf_counter_interrupt)) { - printk(KERN_ERR "Couldn't init performance monitor subsystem\n"); - return -EBUSY; - } - /* XXX should get this from cputable */ pvr = mfspr(SPRN_PVR); switch (PVR_VER(pvr)) { From d5d2bc0dd0379deddb9ede66fec90a3083eaec57 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:08 +0200 Subject: [PATCH 0219/2061] perf_counter: make it possible for hw_perf_counter_init to return error codes Impact: better error reporting At present, if hw_perf_counter_init encounters an error, all it can do is return NULL, which causes sys_perf_counter_open to return an EINVAL error to userspace. This isn't very informative for userspace; it means that userspace can't tell the difference between "sorry, oprofile is already using the PMU" and "we don't support this CPU" and "this CPU doesn't support the requested generic hardware event". This commit uses the PTR_ERR/ERR_PTR/IS_ERR set of macros to let hw_perf_counter_init return an error code on error rather than just NULL if it wishes. If it does so, that error code will be returned from sys_perf_counter_open to userspace. If it returns NULL, an EINVAL error will be returned to userspace, as before. This also adapts the powerpc hw_perf_counter_init to make use of this to return ENXIO, EINVAL, EBUSY, or EOPNOTSUPP as appropriate. It would be good to add extra error numbers in future to allow userspace to distinguish the various errors that are currently reported as EINVAL, i.e. irq_period < 0, too many events in a group, conflict between exclude_* settings in a group, and PMU resource conflict in a group. [ v2: fix a bug pointed out by Corey Ashford where error returns from hw_perf_counter_init were not handled correctly in the case of raw hardware events.] Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.682428180@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 14 +++++------ kernel/perf_counter.c | 39 ++++++++++++++++++------------ 2 files changed, 31 insertions(+), 22 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 560dd1e7b52..0a4d14f279a 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -624,13 +624,13 @@ hw_perf_counter_init(struct perf_counter *counter) int err; if (!ppmu) - return NULL; + return ERR_PTR(-ENXIO); if ((s64)counter->hw_event.irq_period < 0) - return NULL; + return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) - return NULL; + return ERR_PTR(-EOPNOTSUPP); ev = ppmu->generic_events[ev]; } else { ev = perf_event_config(&counter->hw_event); @@ -656,14 +656,14 @@ hw_perf_counter_init(struct perf_counter *counter) n = collect_events(counter->group_leader, ppmu->n_counter - 1, ctrs, events); if (n < 0) - return NULL; + return ERR_PTR(-EINVAL); } events[n] = ev; ctrs[n] = counter; if (check_excludes(ctrs, n, 1)) - return NULL; + return ERR_PTR(-EINVAL); if (power_check_constraints(events, n + 1)) - return NULL; + return ERR_PTR(-EINVAL); counter->hw.config = events[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); @@ -687,7 +687,7 @@ hw_perf_counter_init(struct perf_counter *counter) counter->destroy = hw_perf_counter_destroy; if (err) - return NULL; + return ERR_PTR(err); return &power_perf_ops; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f35e89e3d6a..d07b45278b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2453,10 +2453,11 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct hw_perf_counter_ops *hw_ops; struct perf_counter *counter; + long err; counter = kzalloc(sizeof(*counter), gfpflags); if (!counter) - return NULL; + return ERR_PTR(-ENOMEM); /* * Single counters are their own group leaders, with an @@ -2505,12 +2506,18 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hw_ops = tp_perf_counter_init(counter); break; } - - if (!hw_ops) { - kfree(counter); - return NULL; - } done: + err = 0; + if (!hw_ops) + err = -EINVAL; + else if (IS_ERR(hw_ops)) + err = PTR_ERR(hw_ops); + + if (err) { + kfree(counter); + return ERR_PTR(err); + } + counter->hw_ops = hw_ops; return counter; @@ -2583,10 +2590,10 @@ SYSCALL_DEFINE5(perf_counter_open, goto err_put_context; } - ret = -EINVAL; counter = perf_counter_alloc(&hw_event, cpu, ctx, group_leader, GFP_KERNEL); - if (!counter) + ret = PTR_ERR(counter); + if (IS_ERR(counter)) goto err_put_context; ret = anon_inode_getfd("[perf_counter]", &perf_fops, counter, 0); @@ -2658,8 +2665,8 @@ inherit_counter(struct perf_counter *parent_counter, child_counter = perf_counter_alloc(&parent_counter->hw_event, parent_counter->cpu, child_ctx, group_leader, GFP_KERNEL); - if (!child_counter) - return NULL; + if (IS_ERR(child_counter)) + return child_counter; /* * Link it up in the child's context: @@ -2710,15 +2717,17 @@ static int inherit_group(struct perf_counter *parent_counter, { struct perf_counter *leader; struct perf_counter *sub; + struct perf_counter *child_ctr; leader = inherit_counter(parent_counter, parent, parent_ctx, child, NULL, child_ctx); - if (!leader) - return -ENOMEM; + if (IS_ERR(leader)) + return PTR_ERR(leader); list_for_each_entry(sub, &parent_counter->sibling_list, list_entry) { - if (!inherit_counter(sub, parent, parent_ctx, - child, leader, child_ctx)) - return -ENOMEM; + child_ctr = inherit_counter(sub, parent, parent_ctx, + child, leader, child_ctx); + if (IS_ERR(child_ctr)) + return PTR_ERR(child_ctr); } return 0; } From 9ea98e191255ee642e64a5745014424fc63f83b0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:09 +0200 Subject: [PATCH 0220/2061] perf_counter: x86: proper error propagation for the x86 hw_perf_counter_init() Now that Paul cleaned up the error propagation paths, pass down the x86 error as well. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.792822360@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7aab177fb56..b8885ccd804 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -954,7 +954,7 @@ hw_perf_counter_init(struct perf_counter *counter) err = __hw_perf_counter_init(counter); if (err) - return NULL; + return ERR_PTR(err); return &x86_perf_counter_ops; } From 31f004df8d14212f0a8a2fb12a8ed44a3d80e2fb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 30 Mar 2009 19:07:10 +0200 Subject: [PATCH 0221/2061] perf_counter tools: optionally scale counter values in perfstat mode Impact: new functionality This adds add an option to the perfstat mode of kerneltop to scale the reported counter values according to the fraction of time that each counter gets to count. This is invoked with the -l option (I used 'l' because s, c, a and e were all taken already.) This uses the new PERF_RECORD_TOTAL_TIME_{ENABLED,RUNNING} read format options. With this, we get output like this: $ ./perfstat -l -e 0:0,0:1,0:2,0:3,0:4,0:5 ./spin Performance counter stats for './spin': 4016072055 CPU cycles (events) (scaled from 66.53%) 2005887318 instructions (events) (scaled from 66.53%) 1762849 cache references (events) (scaled from 66.69%) 165229 cache misses (events) (scaled from 66.85%) 1001298009 branches (events) (scaled from 66.78%) 41566 branch misses (events) (scaled from 66.61%) Wall-clock time elapsed: 2438.227446 msecs This also lets us detect when a counter is zero because the counter never got to go on the CPU at all. In that case we print rather than 0. Signed-off-by: Paul Mackerras Signed-off-by: Peter Zijlstra Orig-LKML-Reference: <20090330171023.871484899@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 56 +++++++++++++++++++++----- 1 file changed, 45 insertions(+), 11 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 995111dee7f..c0ca01504ff 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -197,6 +197,8 @@ static int delay_secs = 2; static int zero; static int dump_symtab; +static int scale; + struct source_line { uint64_t EIP; unsigned long count; @@ -305,6 +307,7 @@ static void display_perfstat_help(void) display_events_help(); printf( + " -l # scale counter values\n" " -a # system-wide collection\n"); exit(0); } @@ -328,6 +331,7 @@ static void display_help(void) " -c CNT --count=CNT # event period to sample\n\n" " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" + " -l # show scale factor for RR events\n" " -d delay --delay= # sampling/display delay [default: 2]\n" " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" " -s symbol --symbol= # function to be showed annotated one-shot\n" @@ -436,6 +440,9 @@ static void create_perfstat_counter(int counter) hw_event.config = event_id[counter]; hw_event.record_type = PERF_RECORD_SIMPLE; hw_event.nmi = 0; + if (scale) + hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; if (system_wide) { int cpu; @@ -507,28 +514,53 @@ int do_perfstat(int argc, char *argv[]) fprintf(stderr, "\n"); for (counter = 0; counter < nr_counters; counter++) { - int cpu; - __u64 count, single_count; + int cpu, nv; + __u64 count[3], single_count[3]; + int scaled; - count = 0; + count[0] = count[1] = count[2] = 0; + nv = scale ? 3 : 1; for (cpu = 0; cpu < nr_cpus; cpu ++) { res = read(fd[cpu][counter], - (char *) &single_count, sizeof(single_count)); - assert(res == sizeof(single_count)); - count += single_count; + single_count, nv * sizeof(__u64)); + assert(res == nv * sizeof(__u64)); + + count[0] += single_count[0]; + if (scale) { + count[1] += single_count[1]; + count[2] += single_count[2]; + } + } + + scaled = 0; + if (scale) { + if (count[2] == 0) { + fprintf(stderr, " %14s %-20s\n", + "", event_name(counter)); + continue; + } + if (count[2] < count[1]) { + scaled = 1; + count[0] = (unsigned long long) + ((double)count[0] * count[1] / count[2] + 0.5); + } } if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { - double msecs = (double)count / 1000000; + double msecs = (double)count[0] / 1000000; - fprintf(stderr, " %14.6f %-20s (msecs)\n", + fprintf(stderr, " %14.6f %-20s (msecs)", msecs, event_name(counter)); } else { - fprintf(stderr, " %14Ld %-20s (events)\n", - count, event_name(counter)); + fprintf(stderr, " %14Ld %-20s (events)", + count[0], event_name(counter)); } + if (scaled) + fprintf(stderr, " (scaled from %.2f%%)", + (double) count[2] / count[1] * 100); + fprintf(stderr, "\n"); } fprintf(stderr, "\n"); fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", @@ -1049,6 +1081,7 @@ static void process_options(int argc, char *argv[]) {"filter", required_argument, NULL, 'f'}, {"group", required_argument, NULL, 'g'}, {"help", no_argument, NULL, 'h'}, + {"scale", no_argument, NULL, 'l'}, {"nmi", required_argument, NULL, 'n'}, {"pid", required_argument, NULL, 'p'}, {"vmlinux", required_argument, NULL, 'x'}, @@ -1060,7 +1093,7 @@ static void process_options(int argc, char *argv[]) {"munmap_info", no_argument, NULL, 'U'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hn:m:p:s:Sx:zMU", + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU", long_options, &option_index); if (c == -1) break; @@ -1084,6 +1117,7 @@ static void process_options(int argc, char *argv[]) case 'f': count_filter = atoi(optarg); break; case 'g': group = atoi(optarg); break; case 'h': display_help(); break; + case 'l': scale = 1; break; case 'n': nmi = atoi(optarg); break; case 'p': /* CPU and PID are mutually exclusive */ From 78d613eb129fc4edf0e2cabfcc6a4c5285482d21 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:11 +0200 Subject: [PATCH 0222/2061] perf_counter: small cleanup of the output routines Move the nmi argument to the _begin() function, so that _end() only needs the handle. This allows the _begin() function to generate a wakeup on event loss. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171023.959404268@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++++++++++++++++---------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d07b45278b4..4471e7e2c10 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1663,10 +1663,20 @@ struct perf_output_handle { unsigned int offset; unsigned int head; int wakeup; + int nmi; }; +static inline void __perf_output_wakeup(struct perf_output_handle *handle) +{ + if (handle->nmi) + perf_pending_queue(handle->counter); + else + perf_counter_wakeup(handle->counter); +} + static int perf_output_begin(struct perf_output_handle *handle, - struct perf_counter *counter, unsigned int size) + struct perf_counter *counter, unsigned int size, + int nmi) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1676,15 +1686,17 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->counter = counter; + handle->nmi = nmi; + if (!data->nr_pages) - goto out; + goto fail; do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->counter = counter; handle->data = data; handle->offset = offset; handle->head = head; @@ -1692,6 +1704,8 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; +fail: + __perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1733,14 +1747,10 @@ static void perf_output_copy(struct perf_output_handle *handle, #define perf_output_put(handle, x) \ perf_output_copy((handle), &(x), sizeof(x)) -static void perf_output_end(struct perf_output_handle *handle, int nmi) +static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) { - if (nmi) - perf_pending_queue(handle->counter); - else - perf_counter_wakeup(handle->counter); - } + if (handle->wakeup) + __perf_output_wakeup(handle); rcu_read_unlock(); } @@ -1750,12 +1760,12 @@ static int perf_output_write(struct perf_counter *counter, int nmi, struct perf_output_handle handle; int ret; - ret = perf_output_begin(&handle, counter, size); + ret = perf_output_begin(&handle, counter, size, nmi); if (ret) goto out; perf_output_copy(&handle, buf, size); - perf_output_end(&handle, nmi); + perf_output_end(&handle); out: return ret; @@ -1804,7 +1814,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi) size = sizeof(header) + counter->nr_siblings * sizeof(entry); - ret = perf_output_begin(&handle, counter, size); + ret = perf_output_begin(&handle, counter, size, nmi); if (ret) return; @@ -1824,7 +1834,7 @@ static void perf_output_group(struct perf_counter *counter, int nmi) perf_output_put(&handle, entry); } - perf_output_end(&handle, nmi); + perf_output_end(&handle); } void perf_counter_output(struct perf_counter *counter, @@ -1869,7 +1879,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size); + int ret = perf_output_begin(&handle, counter, size, 0); if (ret) return; @@ -1877,7 +1887,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, perf_output_put(&handle, mmap_event->event); perf_output_copy(&handle, mmap_event->file_name, mmap_event->file_size); - perf_output_end(&handle, 0); + perf_output_end(&handle); } static int perf_counter_mmap_match(struct perf_counter *counter, From 5ed00415e304203a0a9dcaef226d6d3f1106070e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:12 +0200 Subject: [PATCH 0223/2061] perf_counter: re-arrange the perf_event_type Breaks ABI yet again :-) Change the event type so that [0, 2^31-1] are regular event types, but [2^31, 2^32-1] forms a bitmask for overflow events. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.047961770@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 ++-- kernel/perf_counter.c | 56 ++++++++++++++++-------------------- 2 files changed, 29 insertions(+), 33 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 037a81145ac..edf5bfb7ff5 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -210,13 +210,15 @@ struct perf_event_header { }; enum perf_event_type { - PERF_EVENT_IP = 0, + PERF_EVENT_GROUP = 1, PERF_EVENT_MMAP = 2, PERF_EVENT_MUNMAP = 3, - __PERF_EVENT_TID = 0x100, + PERF_EVENT_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = 1UL << 30, + __PERF_EVENT_TID = 1UL << 29, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4471e7e2c10..d93e9ddf784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1754,50 +1754,44 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static int perf_output_write(struct perf_counter *counter, int nmi, - void *buf, ssize_t size) -{ - struct perf_output_handle handle; - int ret; - - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - goto out; - - perf_output_copy(&handle, buf, size); - perf_output_end(&handle); - -out: - return ret; -} - static void perf_output_simple(struct perf_counter *counter, int nmi, struct pt_regs *regs) { - unsigned int size; + int ret; + struct perf_output_handle handle; + struct perf_event_header header; + u64 ip; struct { - struct perf_event_header header; - u64 ip; u32 pid, tid; - } event; + } tid_entry; - event.header.type = PERF_EVENT_IP; - event.ip = instruction_pointer(regs); + header.type = PERF_EVENT_OVERFLOW; + header.size = sizeof(header); - size = sizeof(event); + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); if (counter->hw_event.include_tid) { /* namespace issues */ - event.pid = current->group_leader->pid; - event.tid = current->pid; + tid_entry.pid = current->group_leader->pid; + tid_entry.tid = current->pid; - event.header.type |= __PERF_EVENT_TID; - } else - size -= sizeof(u64); + header.type |= __PERF_EVENT_TID; + header.size += sizeof(tid_entry); + } - event.header.size = size; + ret = perf_output_begin(&handle, counter, header.size, nmi); + if (ret) + return; - perf_output_write(counter, nmi, &event, size); + perf_output_put(&handle, header); + perf_output_put(&handle, ip); + + if (counter->hw_event.include_tid) + perf_output_put(&handle, tid_entry); + + perf_output_end(&handle); } static void perf_output_group(struct perf_counter *counter, int nmi) From 023c54c42288416b4f43c67bfd5049a76926fad6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:13 +0200 Subject: [PATCH 0224/2061] perf_counter tools: kerneltop: update event_types Go along with the new perf_event_type ABI. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.133985461@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index c0ca01504ff..430810dae1f 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -1263,8 +1263,8 @@ static void mmap_read(struct mmap_data *md) old += size; switch (event->header.type) { - case PERF_EVENT_IP: - case PERF_EVENT_IP | __PERF_EVENT_TID: + case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP: + case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID: process_event(event->ip.ip, md->counter); break; From 394ee07623cf556c8daae2b3c00cf5fea47f0811 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:14 +0200 Subject: [PATCH 0225/2061] perf_counter: provide generic callchain bits Provide the generic callchain support bits. If hw_event->callchain is set the arch specific perf_callchain() function is called upon to provide a perf_callchain_entry structure filled with the current callchain. If it does so, it is added to the overflow output event. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.254266860@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 13 ++++++++++++- kernel/perf_counter.c | 27 +++++++++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index edf5bfb7ff5..43083afffe0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -140,8 +140,9 @@ struct perf_counter_hw_event { include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + callchain : 1, /* add callchain data */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 __reserved_4; @@ -219,6 +220,7 @@ enum perf_event_type { PERF_EVENT_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = 1UL << 30, __PERF_EVENT_TID = 1UL << 29, + __PERF_EVENT_CALLCHAIN = 1UL << 28, }; #ifdef __KERNEL__ @@ -504,6 +506,15 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +#define MAX_STACK_DEPTH 255 + +struct perf_callchain_entry { + u64 nr; + u64 ip[MAX_STACK_DEPTH]; +}; + +extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d93e9ddf784..860cdc26bd7 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1653,6 +1653,17 @@ void perf_counter_do_pending(void) __perf_pending_run(); } +/* + * Callchain support -- arch specific + */ + +struct perf_callchain_entry * +__attribute__((weak)) +perf_callchain(struct pt_regs *regs) +{ + return NULL; +} + /* * Output */ @@ -1764,6 +1775,8 @@ static void perf_output_simple(struct perf_counter *counter, struct { u32 pid, tid; } tid_entry; + struct perf_callchain_entry *callchain = NULL; + int callchain_size = 0; header.type = PERF_EVENT_OVERFLOW; header.size = sizeof(header); @@ -1781,6 +1794,17 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (counter->hw_event.callchain) { + callchain = perf_callchain(regs); + + if (callchain) { + callchain_size = (1 + callchain->nr) * sizeof(u64); + + header.type |= __PERF_EVENT_CALLCHAIN; + header.size += callchain_size; + } + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1791,6 +1815,9 @@ static void perf_output_simple(struct perf_counter *counter, if (counter->hw_event.include_tid) perf_output_put(&handle, tid_entry); + if (callchain) + perf_output_copy(&handle, callchain, callchain_size); + perf_output_end(&handle); } From d7d59fb323833682b117b528d77eeb8ef587036a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:15 +0200 Subject: [PATCH 0226/2061] perf_counter: x86: callchain support Provide the x86 perf_callchain() implementation. Code based on the ftrace/sysprof code from Soeren Sandmann Pedersen. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Cc: Soeren Sandmann Pedersen Cc: Frederic Weisbecker Cc: Steven Rostedt Orig-LKML-Reference: <20090330171024.341993293@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 154 +++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index b8885ccd804..e16dfafc6d7 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -16,8 +16,10 @@ #include #include #include +#include #include +#include static bool perf_counters_initialized __read_mostly; @@ -958,3 +960,155 @@ hw_perf_counter_init(struct perf_counter *counter) return &x86_perf_counter_ops; } + +/* + * callchain support + */ + +static inline +void callchain_store(struct perf_callchain_entry *entry, unsigned long ip) +{ + if (entry->nr < MAX_STACK_DEPTH) + entry->ip[entry->nr++] = ip; +} + +static DEFINE_PER_CPU(struct perf_callchain_entry, irq_entry); +static DEFINE_PER_CPU(struct perf_callchain_entry, nmi_entry); + + +static void +backtrace_warning_symbol(void *data, char *msg, unsigned long symbol) +{ + /* Ignore warnings */ +} + +static void backtrace_warning(void *data, char *msg) +{ + /* Ignore warnings */ +} + +static int backtrace_stack(void *data, char *name) +{ + /* Don't bother with IRQ stacks for now */ + return -1; +} + +static void backtrace_address(void *data, unsigned long addr, int reliable) +{ + struct perf_callchain_entry *entry = data; + + if (reliable) + callchain_store(entry, addr); +} + +static const struct stacktrace_ops backtrace_ops = { + .warning = backtrace_warning, + .warning_symbol = backtrace_warning_symbol, + .stack = backtrace_stack, + .address = backtrace_address, +}; + +static void +perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + unsigned long bp; + char *stack; + + callchain_store(entry, instruction_pointer(regs)); + + stack = ((char *)regs + sizeof(struct pt_regs)); +#ifdef CONFIG_FRAME_POINTER + bp = frame_pointer(regs); +#else + bp = 0; +#endif + + dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); +} + + +struct stack_frame { + const void __user *next_fp; + unsigned long return_address; +}; + +static int copy_stack_frame(const void __user *fp, struct stack_frame *frame) +{ + int ret; + + if (!access_ok(VERIFY_READ, fp, sizeof(*frame))) + return 0; + + ret = 1; + pagefault_disable(); + if (__copy_from_user_inatomic(frame, fp, sizeof(*frame))) + ret = 0; + pagefault_enable(); + + return ret; +} + +static void +perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + struct stack_frame frame; + const void __user *fp; + + regs = (struct pt_regs *)current->thread.sp0 - 1; + fp = (void __user *)regs->bp; + + callchain_store(entry, regs->ip); + + while (entry->nr < MAX_STACK_DEPTH) { + frame.next_fp = NULL; + frame.return_address = 0; + + if (!copy_stack_frame(fp, &frame)) + break; + + if ((unsigned long)fp < user_stack_pointer(regs)) + break; + + callchain_store(entry, frame.return_address); + fp = frame.next_fp; + } +} + +static void +perf_do_callchain(struct pt_regs *regs, struct perf_callchain_entry *entry) +{ + int is_user; + + if (!regs) + return; + + is_user = user_mode(regs); + + if (!current || current->pid == 0) + return; + + if (is_user && current->state != TASK_RUNNING) + return; + + if (!is_user) + perf_callchain_kernel(regs, entry); + + if (current->mm) + perf_callchain_user(regs, entry); +} + +struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) +{ + struct perf_callchain_entry *entry; + + if (in_nmi()) + entry = &__get_cpu_var(nmi_entry); + else + entry = &__get_cpu_var(irq_entry); + + entry->nr = 0; + + perf_do_callchain(regs, entry); + + return entry; +} From 4e935e47177c3b26cf383e79849bae2a464d0160 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 Mar 2009 19:07:16 +0200 Subject: [PATCH 0227/2061] perf_counter: pmc arbitration Follow the example set by powerpc and try to play nice with oprofile and the nmi watchdog. Signed-off-by: Peter Zijlstra Acked-by: Paul Mackerras Orig-LKML-Reference: <20090330171024.459968444@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 75 ++++++++++++++++++++++++++++++ 1 file changed, 75 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index e16dfafc6d7..2a946a160ca 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -20,6 +20,7 @@ #include #include +#include static bool perf_counters_initialized __read_mostly; @@ -172,6 +173,65 @@ again: atomic64_sub(delta, &hwc->period_left); } +static atomic_t num_counters; +static DEFINE_MUTEX(pmc_reserve_mutex); + +static bool reserve_pmc_hardware(void) +{ + int i; + + if (nmi_watchdog == NMI_LOCAL_APIC) + disable_lapic_nmi_watchdog(); + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + goto perfctr_fail; + } + + for (i = 0; i < nr_counters_generic; i++) { + if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + goto eventsel_fail; + } + + return true; + +eventsel_fail: + for (i--; i >= 0; i--) + release_evntsel_nmi(pmc_ops->eventsel + i); + + i = nr_counters_generic; + +perfctr_fail: + for (i--; i >= 0; i--) + release_perfctr_nmi(pmc_ops->perfctr + i); + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); + + return false; +} + +static void release_pmc_hardware(void) +{ + int i; + + for (i = 0; i < nr_counters_generic; i++) { + release_perfctr_nmi(pmc_ops->perfctr + i); + release_evntsel_nmi(pmc_ops->eventsel + i); + } + + if (nmi_watchdog == NMI_LOCAL_APIC) + enable_lapic_nmi_watchdog(); +} + +static void hw_perf_counter_destroy(struct perf_counter *counter) +{ + if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + release_pmc_hardware(); + mutex_unlock(&pmc_reserve_mutex); + } +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -179,10 +239,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; struct hw_perf_counter *hwc = &counter->hw; + int err; if (unlikely(!perf_counters_initialized)) return -EINVAL; + err = 0; + if (atomic_inc_not_zero(&num_counters)) { + mutex_lock(&pmc_reserve_mutex); + if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + err = -EBUSY; + else + atomic_inc(&num_counters); + mutex_unlock(&pmc_reserve_mutex); + } + if (err) + return err; + /* * Generate PMC IRQs: * (keep 'enabled' bit clear for now) @@ -230,6 +303,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); } + counter->destroy = hw_perf_counter_destroy; + return 0; } From 9dd499889bdb12ac0e412ccdd718fe0d348258f2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 27 Mar 2009 12:13:43 +0100 Subject: [PATCH 0228/2061] perf_counter tools: kerneltop: add real-time data acquisition thread Decouple kerneltop display from event acquisition by introducing a separate data acquisition thread. This fixes annnoying kerneltop display refresh jitter and missed events. Also add a -r option, to switch the data acquisition thread to real-time priority. Signed-off-by: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Orig-LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 57 ++++++++++++++++++-------- 1 file changed, 39 insertions(+), 18 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 430810dae1f..33b4fcf6e48 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -77,6 +77,8 @@ #include #include #include +#include +#include #include #include @@ -181,6 +183,7 @@ static int tid = -1; static int profile_cpu = -1; static int nr_cpus = 0; static int nmi = 1; +static unsigned int realtime_prio = 0; static int group = 0; static unsigned int page_size; static unsigned int mmap_pages = 16; @@ -334,6 +337,7 @@ static void display_help(void) " -l # show scale factor for RR events\n" " -d delay --delay= # sampling/display delay [default: 2]\n" " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" + " -r prio --realtime= # event acquisition runs with SCHED_FIFO policy\n" " -s symbol --symbol= # function to be showed annotated one-shot\n" " -x path --vmlinux= # the vmlinux binary, required for -s use\n" " -z --zero # zero counts after display\n" @@ -620,7 +624,6 @@ static int compare(const void *__sym1, const void *__sym2) return sym_weight(sym1) < sym_weight(sym2); } -static time_t last_refresh; static long events; static long userspace_events; static const char CONSOLE_CLEAR[] = ""; @@ -634,6 +637,7 @@ static void print_sym_table(void) float events_per_sec = events/delay_secs; float kevents_per_sec = (events-userspace_events)/delay_secs; + events = userspace_events = 0; memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); @@ -714,8 +718,6 @@ static void print_sym_table(void) if (sym_filter_entry) show_details(sym_filter_entry); - last_refresh = time(NULL); - { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; @@ -726,6 +728,16 @@ static void print_sym_table(void) } } +static void *display_thread(void *arg) +{ + printf("KernelTop refresh period: %d seconds\n", delay_secs); + + while (!sleep(delay_secs)) + print_sym_table(); + + return NULL; +} + static int read_symbol(FILE *in, struct sym_entry *s) { static int filter_match = 0; @@ -1081,19 +1093,20 @@ static void process_options(int argc, char *argv[]) {"filter", required_argument, NULL, 'f'}, {"group", required_argument, NULL, 'g'}, {"help", no_argument, NULL, 'h'}, - {"scale", no_argument, NULL, 'l'}, {"nmi", required_argument, NULL, 'n'}, + {"mmap_info", no_argument, NULL, 'M'}, + {"mmap_pages", required_argument, NULL, 'm'}, + {"munmap_info", no_argument, NULL, 'U'}, {"pid", required_argument, NULL, 'p'}, - {"vmlinux", required_argument, NULL, 'x'}, + {"realtime", required_argument, NULL, 'r'}, + {"scale", no_argument, NULL, 'l'}, {"symbol", required_argument, NULL, 's'}, {"stat", no_argument, NULL, 'S'}, + {"vmlinux", required_argument, NULL, 'x'}, {"zero", no_argument, NULL, 'z'}, - {"mmap_pages", required_argument, NULL, 'm'}, - {"mmap_info", no_argument, NULL, 'M'}, - {"munmap_info", no_argument, NULL, 'U'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:s:Sx:zMU", + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", long_options, &option_index); if (c == -1) break; @@ -1127,6 +1140,7 @@ static void process_options(int argc, char *argv[]) profile_cpu = -1; } tid = atoi(optarg); break; + case 'r': realtime_prio = atoi(optarg); break; case 's': sym_filter = strdup(optarg); break; case 'S': run_perfstat = 1; break; case 'x': vmlinux = strdup(optarg); break; @@ -1289,6 +1303,7 @@ int main(int argc, char *argv[]) struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; + pthread_t thread; int i, counter, group_fd, nr_poll = 0; unsigned int cpu; int ret; @@ -1363,8 +1378,20 @@ int main(int argc, char *argv[]) } } - printf("KernelTop refresh period: %d seconds\n", delay_secs); - last_refresh = time(NULL); + if (pthread_create(&thread, NULL, display_thread, NULL)) { + printf("Could not create display thread.\n"); + exit(-1); + } + + if (realtime_prio) { + struct sched_param param; + + param.sched_priority = realtime_prio; + if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { + printf("Could not set realtime priority.\n"); + exit(-1); + } + } while (1) { int hits = events; @@ -1374,14 +1401,8 @@ int main(int argc, char *argv[]) mmap_read(&mmap_array[i][counter]); } - if (time(NULL) >= last_refresh + delay_secs) { - print_sym_table(); - events = userspace_events = 0; - } - if (hits == events) - ret = poll(event_array, nr_poll, 1000); - hits = events; + ret = poll(event_array, nr_poll, 100); } return 0; From 8a057d84912f36e53f970c4d177cb4bb6b2f9e08 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:11:59 +0200 Subject: [PATCH 0229/2061] perf_counter: move the event overflow output bits to record_type Per suggestion from Paul, move the event overflow bits to record_type and sanitize the enums a bit. Breaks the ABI -- again ;-) Suggested-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.151921176@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 50 +++++++++------- kernel/perf_counter.c | 107 ++++++++++++++--------------------- 2 files changed, 71 insertions(+), 86 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 43083afffe0..06a6fba9f53 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,15 +73,6 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -/* - * IRQ-notification data record type: - */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, -}; - #define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -102,6 +93,17 @@ enum perf_counter_record_type { #define PERF_COUNTER_EVENT_SHIFT 0 #define PERF_COUNTER_EVENT_MASK __PERF_COUNTER_MASK(EVENT) +/* + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. + */ +enum perf_counter_record_format { + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_GROUP = 1U << 2, + PERF_RECORD_CALLCHAIN = 1U << 3, +}; + /* * Bits that can be set in hw_event.read_format to request that * reads on the counter should return the indicated quantities, @@ -125,8 +127,8 @@ struct perf_counter_hw_event { __u64 config; __u64 irq_period; - __u64 record_type; - __u64 read_format; + __u32 record_type; + __u32 read_format; __u64 disabled : 1, /* off by default */ nmi : 1, /* NMI sampling */ @@ -137,12 +139,10 @@ struct perf_counter_hw_event { exclude_kernel : 1, /* ditto kernel */ exclude_hv : 1, /* ditto hypervisor */ exclude_idle : 1, /* don't count when idle */ - include_tid : 1, /* include the tid */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ - callchain : 1, /* add callchain data */ - __reserved_1 : 51; + __reserved_1 : 53; __u32 extra_config_len; __u32 __reserved_4; @@ -212,15 +212,21 @@ struct perf_event_header { enum perf_event_type { - PERF_EVENT_GROUP = 1, + PERF_EVENT_MMAP = 1, + PERF_EVENT_MUNMAP = 2, - PERF_EVENT_MMAP = 2, - PERF_EVENT_MUNMAP = 3, - - PERF_EVENT_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = 1UL << 30, - __PERF_EVENT_TID = 1UL << 29, - __PERF_EVENT_CALLCHAIN = 1UL << 28, + /* + * Half the event type space is reserved for the counter overflow + * bitfields, as found in hw_event.record_type. + * + * These events will have types of the form: + * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + */ + PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, + __PERF_EVENT_IP = PERF_RECORD_IP, + __PERF_EVENT_TID = PERF_RECORD_TID, + __PERF_EVENT_GROUP = PERF_RECORD_GROUP, + __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 860cdc26bd7..995063df910 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1765,27 +1765,34 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -static void perf_output_simple(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; + u64 record_type = counter->hw_event.record_type; struct perf_output_handle handle; struct perf_event_header header; u64 ip; struct { u32 pid, tid; } tid_entry; + struct { + u64 event; + u64 counter; + } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; - header.type = PERF_EVENT_OVERFLOW; + header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); - ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; - header.size += sizeof(ip); + if (record_type & PERF_RECORD_IP) { + ip = instruction_pointer(regs); + header.type |= __PERF_EVENT_IP; + header.size += sizeof(ip); + } - if (counter->hw_event.include_tid) { + if (record_type & PERF_RECORD_TID) { /* namespace issues */ tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; @@ -1794,7 +1801,13 @@ static void perf_output_simple(struct perf_counter *counter, header.size += sizeof(tid_entry); } - if (counter->hw_event.callchain) { + if (record_type & PERF_RECORD_GROUP) { + header.type |= __PERF_EVENT_GROUP; + header.size += sizeof(u64) + + counter->nr_siblings * sizeof(group_entry); + } + + if (record_type & PERF_RECORD_CALLCHAIN) { callchain = perf_callchain(regs); if (callchain) { @@ -1810,71 +1823,37 @@ static void perf_output_simple(struct perf_counter *counter, return; perf_output_put(&handle, header); - perf_output_put(&handle, ip); - if (counter->hw_event.include_tid) + if (record_type & PERF_RECORD_IP) + perf_output_put(&handle, ip); + + if (record_type & PERF_RECORD_TID) perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_GROUP) { + struct perf_counter *leader, *sub; + u64 nr = counter->nr_siblings; + + perf_output_put(&handle, nr); + + leader = counter->group_leader; + list_for_each_entry(sub, &leader->sibling_list, list_entry) { + if (sub != counter) + sub->hw_ops->read(sub); + + group_entry.event = sub->hw_event.config; + group_entry.counter = atomic64_read(&sub->count); + + perf_output_put(&handle, group_entry); + } + } + if (callchain) perf_output_copy(&handle, callchain, callchain_size); perf_output_end(&handle); } -static void perf_output_group(struct perf_counter *counter, int nmi) -{ - struct perf_output_handle handle; - struct perf_event_header header; - struct perf_counter *leader, *sub; - unsigned int size; - struct { - u64 event; - u64 counter; - } entry; - int ret; - - size = sizeof(header) + counter->nr_siblings * sizeof(entry); - - ret = perf_output_begin(&handle, counter, size, nmi); - if (ret) - return; - - header.type = PERF_EVENT_GROUP; - header.size = size; - - perf_output_put(&handle, header); - - leader = counter->group_leader; - list_for_each_entry(sub, &leader->sibling_list, list_entry) { - if (sub != counter) - sub->hw_ops->read(sub); - - entry.event = sub->hw_event.config; - entry.counter = atomic64_read(&sub->count); - - perf_output_put(&handle, entry); - } - - perf_output_end(&handle); -} - -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) -{ - switch (counter->hw_event.record_type) { - case PERF_RECORD_SIMPLE: - return; - - case PERF_RECORD_IRQ: - perf_output_simple(counter, nmi, regs); - break; - - case PERF_RECORD_GROUP: - perf_output_group(counter, nmi); - break; - } -} - /* * mmap tracking */ From c457810ab4a825161aec6ef71b581e1bc8febd1a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:01 +0200 Subject: [PATCH 0230/2061] perf_counter: per event wakeups By request, provide a way to request a wakeup every 'n' events instead of every page of output. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.323309784@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- kernel/perf_counter.c | 10 +++++++++- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 06a6fba9f53..5428ba120d7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -145,7 +145,7 @@ struct perf_counter_hw_event { __reserved_1 : 53; __u32 extra_config_len; - __u32 __reserved_4; + __u32 wakeup_events; /* wakeup every n events */ __u64 __reserved_2; __u64 __reserved_3; @@ -321,6 +321,7 @@ struct perf_mmap_data { int nr_pages; atomic_t wakeup; atomic_t head; + atomic_t events; struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 995063df910..9bcab10e735 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1760,7 +1760,15 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - if (handle->wakeup) + int wakeup_events = handle->counter->hw_event.wakeup_events; + + if (wakeup_events) { + int events = atomic_inc_return(&handle->data->events); + if (events >= wakeup_events) { + atomic_sub(wakeup_events, &handle->data->events); + __perf_output_wakeup(handle); + } + } else if (handle->wakeup) __perf_output_wakeup(handle); rcu_read_unlock(); } From 3df70fd623bb109e0079e697c0276d220a4b7908 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:02 +0200 Subject: [PATCH 0231/2061] perf_counter: kerneltop: update to new ABI Update to reflect the new record_type ABI changes. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.407283141@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 33b4fcf6e48..4f8d7917aba 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -442,7 +442,7 @@ static void create_perfstat_counter(int counter) memset(&hw_event, 0, sizeof(hw_event)); hw_event.config = event_id[counter]; - hw_event.record_type = PERF_RECORD_SIMPLE; + hw_event.record_type = 0; hw_event.nmi = 0; if (scale) hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | @@ -1277,8 +1277,8 @@ static void mmap_read(struct mmap_data *md) old += size; switch (event->header.type) { - case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP: - case PERF_EVENT_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID: + case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP: + case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID: process_event(event->ip.ip, md->counter); break; @@ -1337,9 +1337,8 @@ int main(int argc, char *argv[]) memset(&hw_event, 0, sizeof(hw_event)); hw_event.config = event_id[counter]; hw_event.irq_period = event_count[counter]; - hw_event.record_type = PERF_RECORD_IRQ; + hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; hw_event.nmi = nmi; - hw_event.include_tid = 1; hw_event.mmap = use_mmap; hw_event.munmap = use_munmap; From 5872bdb88a35fae7d224bd6b21e5f377e854ccfc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:03 +0200 Subject: [PATCH 0232/2061] perf_counter: add more context information Put in counts to tell which ips belong to what context. ----- | | hv | -- nr | | kernel | -- | | user ----- Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.493101305@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++++++ include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 2 +- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2a946a160ca..c74e20d593a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1088,6 +1088,7 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) { unsigned long bp; char *stack; + int nr = entry->nr; callchain_store(entry, instruction_pointer(regs)); @@ -1099,6 +1100,8 @@ perf_callchain_kernel(struct pt_regs *regs, struct perf_callchain_entry *entry) #endif dump_trace(NULL, regs, (void *)stack, bp, &backtrace_ops, entry); + + entry->kernel = entry->nr - nr; } @@ -1128,6 +1131,7 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) { struct stack_frame frame; const void __user *fp; + int nr = entry->nr; regs = (struct pt_regs *)current->thread.sp0 - 1; fp = (void __user *)regs->bp; @@ -1147,6 +1151,8 @@ perf_callchain_user(struct pt_regs *regs, struct perf_callchain_entry *entry) callchain_store(entry, frame.return_address); fp = frame.next_fp; } + + entry->user = entry->nr - nr; } static void @@ -1182,6 +1188,9 @@ struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) entry = &__get_cpu_var(irq_entry); entry->nr = 0; + entry->hv = 0; + entry->kernel = 0; + entry->user = 0; perf_do_callchain(regs, entry); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5428ba120d7..90cce0c74a0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,10 +513,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 255 +#define MAX_STACK_DEPTH 254 struct perf_callchain_entry { - u64 nr; + u32 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 9bcab10e735..f105a6e696c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1819,7 +1819,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (1 + callchain->nr) * sizeof(u64); + callchain_size = (2 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; From 92f22a3865abe87eea2609a6f8e5be5123f7ce4f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 2 Apr 2009 11:12:04 +0200 Subject: [PATCH 0233/2061] perf_counter: update mmap() counter read Paul noted that we don't need SMP barriers for the mmap() counter read because its always on the same cpu (otherwise you can't access the hw counter anyway). So remove the SMP barriers and replace them with regular compiler barriers. Further, update the comment to include a race free method of reading said hardware counter. The primary change is putting the pmc_read inside the seq-loop, otherwise we can still race and read rubbish. Noticed-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Orig-LKML-Reference: <20090402091319.577951445@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 22 ++++++++++------------ kernel/perf_counter.c | 4 ++-- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 90cce0c74a0..f2b914de3f0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -167,30 +167,28 @@ struct perf_counter_mmap_page { /* * Bits needed to read the hw counters in user-space. * - * The index and offset should be read atomically using the seqlock: - * - * __u32 seq, index; - * __s64 offset; + * u32 seq; + * s64 count; * * again: - * rmb(); * seq = pc->lock; - * * if (unlikely(seq & 1)) { * cpu_relax(); * goto again; * } * - * index = pc->index; - * offset = pc->offset; + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * rmb(); + * barrier(); * if (pc->lock != seq) * goto again; * - * After this, index contains architecture specific counter index + 1, - * so that 0 means unavailable, offset contains the value to be added - * to the result of the raw timer read to obtain this counter's value. + * NOTE: for obvious reason this only works on self-monitoring + * processes. */ __u32 lock; /* seqlock for synchronization */ __u32 index; /* hardware counter identifier */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f105a6e696c..2a5d4f52556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1340,13 +1340,13 @@ void perf_counter_update_userpage(struct perf_counter *counter) */ preempt_disable(); ++userpg->lock; - smp_wmb(); + barrier(); userpg->index = counter->hw.idx; userpg->offset = atomic64_read(&counter->count); if (counter->state == PERF_COUNTER_STATE_ACTIVE) userpg->offset -= atomic64_read(&counter->hw.prev_count); - smp_wmb(); + barrier(); ++userpg->lock; preempt_enable(); unlock: From ca5f9524d61f54b1f618293ab92fc6b49cac864d Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:33 -0700 Subject: [PATCH 0234/2061] futex: separate futex_wait_queue_me() logic from futex_wait() Refactor futex_wait() in preparation for futex_wait_requeue_pi(). In order to reuse a good chunk of the futex_wait() code for the upcoming futex_wait_requeue_pi() function, this patch breaks out the queue-to-wakeup section of futex_wait() into futex_wait_queue_me(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 138 +++++++++++++++++++++++++++---------------------- 1 file changed, 76 insertions(+), 62 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6b50a024bca..ebb48d6d1a8 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1115,24 +1115,87 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal + * @hb: the futex hash bucket, must be locked by the caller + * @q: the futex_q to queue up on + * @timeout: the prepared hrtimer_sleeper, or null for no timeout + * @wait: the wait_queue to add to the futex_q after queueing in the hb + */ +static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, + struct hrtimer_sleeper *timeout, + wait_queue_t *wait) +{ + queue_me(q, hb); + + /* + * There might have been scheduling since the queue_me(), as we + * cannot hold a spinlock across the get_user() in case it + * faults, and we cannot just set TASK_INTERRUPTIBLE state when + * queueing ourselves into the futex hash. This code thus has to + * rely on the futex_wake() code removing us from hash when it + * wakes us up. + */ + + /* add_wait_queue is the barrier after __set_current_state. */ + __set_current_state(TASK_INTERRUPTIBLE); + + /* + * Add current as the futex_q waiter. We don't remove ourselves from + * the wait_queue because we are the only user of it. + */ + add_wait_queue(&q->waiter, wait); + + /* Arm the timer */ + if (timeout) { + hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); + if (!hrtimer_active(&timeout->timer)) + timeout->task = NULL; + } + + /* + * !plist_node_empty() is safe here without any lock. + * q.lock_ptr != 0 is not safe, because of ordering against wakeup. + */ + if (likely(!plist_node_empty(&q->list))) { + /* + * If the timer has already expired, current will already be + * flagged for rescheduling. Only call schedule if there + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) + schedule(); + } + __set_current_state(TASK_RUNNING); +} + static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { - struct task_struct *curr = current; + struct hrtimer_sleeper timeout, *to = NULL; + DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; - DECLARE_WAITQUEUE(wait, curr); struct futex_hash_bucket *hb; struct futex_q q; u32 uval; int ret; - struct hrtimer_sleeper t; - int rem = 0; if (!bitset) return -EINVAL; q.pi_state = NULL; q.bitset = bitset; + + if (abs_time) { + to = &timeout; + + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1178,75 +1241,22 @@ retry_private: goto retry; } ret = -EWOULDBLOCK; + + /* Only actually queue if *uaddr contained val. */ if (unlikely(uval != val)) { queue_unlock(&q, hb); goto out_put_key; } - /* Only actually queue if *uaddr contained val. */ - queue_me(&q, hb); - - /* - * There might have been scheduling since the queue_me(), as we - * cannot hold a spinlock across the get_user() in case it - * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to - * rely on the futex_wake() code removing us from hash when it - * wakes us up. - */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - add_wait_queue(&q.waiter, &wait); - /* - * !plist_node_empty() is safe here without any lock. - * q.lock_ptr != 0 is not safe, because of ordering against wakeup. - */ - if (likely(!plist_node_empty(&q.list))) { - if (!abs_time) - schedule(); - else { - hrtimer_init_on_stack(&t.timer, - clockrt ? CLOCK_REALTIME : - CLOCK_MONOTONIC, - HRTIMER_MODE_ABS); - hrtimer_init_sleeper(&t, current); - hrtimer_set_expires_range_ns(&t.timer, *abs_time, - current->timer_slack_ns); - - hrtimer_start_expires(&t.timer, HRTIMER_MODE_ABS); - if (!hrtimer_active(&t.timer)) - t.task = NULL; - - /* - * the timer could have already expired, in which - * case current would be flagged for rescheduling. - * Don't bother calling schedule. - */ - if (likely(t.task)) - schedule(); - - hrtimer_cancel(&t.timer); - - /* Flag if a timeout occured */ - rem = (t.task == NULL); - - destroy_hrtimer_on_stack(&t.timer); - } - } - __set_current_state(TASK_RUNNING); - - /* - * NOTE: we don't remove ourselves from the waitqueue because - * we are the only user of it. - */ + /* queue_me and wait for wakeup, timeout, or a signal. */ + futex_wait_queue_me(hb, &q, to, &wait); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; if (!unqueue_me(&q)) goto out_put_key; ret = -ETIMEDOUT; - if (rem) + if (to && !to->task) goto out_put_key; /* @@ -1275,6 +1285,10 @@ retry_private: out_put_key: put_futex_key(fshared, &q.key); out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } return ret; } From 4b1c486b3587d2abf50bee4a05eb488cd4045f2c Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:42 -0700 Subject: [PATCH 0235/2061] futex: add helper to find the top prio waiter of a futex Improve legibility by wrapping finding the top waiter in a function. This will be used by the follow-on patches for enabling requeue pi. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/kernel/futex.c b/kernel/futex.c index ebb48d6d1a8..421fb5e42a1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -276,6 +276,25 @@ void put_futex_key(int fshared, union futex_key *key) drop_futex_key_refs(key); } +/** + * futex_top_waiter() - Return the highest priority waiter on a futex + * @hb: the hash bucket the futex_q's reside in + * @key: the futex key (to distinguish it from other futex futex_q's) + * + * Must be called with the hb lock held. + */ +static struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, + union futex_key *key) +{ + struct futex_q *this; + + plist_for_each_entry(this, &hb->chain, list) { + if (match_futex(&this->key, key)) + return this; + } + return NULL; +} + static u32 cmpxchg_futex_value_locked(u32 __user *uaddr, u32 uval, u32 newval) { u32 curval; From 1a52084d0919c2799258737c21fb328a9de159b5 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:39:52 -0700 Subject: [PATCH 0236/2061] futex: split out atomic logic from futex_lock_pi() Refactor the atomic portion of futex_lock_pi() into futex_lock_pi_atomic(). This logic will be needed by requeue_pi, so modularize it to reduce code duplication. The only significant change is passing of the task to try and take the lock for. This simplifies the -EDEADLK test as if the lock is owned by task t, it's a deadlock, regardless of if we are doing requeue pi or not. This patch updates the corresponding comment accordingly. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 224 ++++++++++++++++++++++++++++--------------------- 1 file changed, 130 insertions(+), 94 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 421fb5e42a1..986b16e4453 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -556,6 +556,127 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, return 0; } +/** + * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the lookup + * @task: the task to perform the atomic lock work for. This will be + * "current" except in the case of requeue pi. + * + * Returns: + * 0 - ready to wait + * 1 - acquired the lock + * <0 - error + * + * The hb->lock and futex_key refs shall be held by the caller. + */ +static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, + union futex_key *key, + struct futex_pi_state **ps, + struct task_struct *task) +{ + int lock_taken, ret, ownerdied = 0; + u32 uval, newval, curval; + +retry: + ret = lock_taken = 0; + + /* + * To avoid races, we attempt to take the lock here again + * (by doing a 0 -> TID atomic cmpxchg), while holding all + * the locks. It will most likely not succeed. + */ + newval = task_pid_vnr(task); + + curval = cmpxchg_futex_value_locked(uaddr, 0, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + + /* + * Detect deadlocks. + */ + if ((unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(task)))) + return -EDEADLK; + + /* + * Surprise - we got the lock. Just return to userspace: + */ + if (unlikely(!curval)) + return 1; + + uval = curval; + + /* + * Set the FUTEX_WAITERS flag, so the owner will know it has someone + * to wake at the next unlock. + */ + newval = curval | FUTEX_WAITERS; + + /* + * There are two cases, where a futex might have no owner (the + * owner TID is 0): OWNER_DIED. We take over the futex in this + * case. We also do an unconditional take over, when the owner + * of the futex died. + * + * This is safe as we are protected by the hash bucket lock ! + */ + if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { + /* Keep the OWNER_DIED bit */ + newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(task); + ownerdied = 0; + lock_taken = 1; + } + + curval = cmpxchg_futex_value_locked(uaddr, uval, newval); + + if (unlikely(curval == -EFAULT)) + return -EFAULT; + if (unlikely(curval != uval)) + goto retry; + + /* + * We took the lock due to owner died take over. + */ + if (unlikely(lock_taken)) + return 1; + + /* + * We dont have the lock. Look up the PI state (or create it if + * we are the first waiter): + */ + ret = lookup_pi_state(uval, hb, key, ps); + + if (unlikely(ret)) { + switch (ret) { + case -ESRCH: + /* + * No owner found for this futex. Check if the + * OWNER_DIED bit is set to figure out whether + * this is a robust futex or not. + */ + if (get_futex_value_locked(&curval, uaddr)) + return -EFAULT; + + /* + * We simply start over in case of a robust + * futex. The code above will take the futex + * and return happy. + */ + if (curval & FUTEX_OWNER_DIED) { + ownerdied = 1; + goto retry; + } + default: + break; + } + } + + return ret; +} + /* * The hash bucket lock must be held when this is called. * Afterwards, the futex_q must not be accessed. @@ -1340,9 +1461,9 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct task_struct *curr = current; struct futex_hash_bucket *hb; - u32 uval, newval, curval; + u32 uval; struct futex_q q; - int ret, lock_taken, ownerdied = 0; + int ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1365,81 +1486,15 @@ retry: retry_private: hb = queue_lock(&q); -retry_locked: - ret = lock_taken = 0; - - /* - * To avoid races, we attempt to take the lock here again - * (by doing a 0 -> TID atomic cmpxchg), while holding all - * the locks. It will most likely not succeed. - */ - newval = task_pid_vnr(current); - - curval = cmpxchg_futex_value_locked(uaddr, 0, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - - /* - * Detect deadlocks. In case of REQUEUE_PI this is a valid - * situation and we return success to user space. - */ - if (unlikely((curval & FUTEX_TID_MASK) == task_pid_vnr(current))) { - ret = -EDEADLK; - goto out_unlock_put_key; - } - - /* - * Surprise - we got the lock. Just return to userspace: - */ - if (unlikely(!curval)) - goto out_unlock_put_key; - - uval = curval; - - /* - * Set the WAITERS flag, so the owner will know it has someone - * to wake at next unlock - */ - newval = curval | FUTEX_WAITERS; - - /* - * There are two cases, where a futex might have no owner (the - * owner TID is 0): OWNER_DIED. We take over the futex in this - * case. We also do an unconditional take over, when the owner - * of the futex died. - * - * This is safe as we are protected by the hash bucket lock ! - */ - if (unlikely(ownerdied || !(curval & FUTEX_TID_MASK))) { - /* Keep the OWNER_DIED bit */ - newval = (curval & ~FUTEX_TID_MASK) | task_pid_vnr(current); - ownerdied = 0; - lock_taken = 1; - } - - curval = cmpxchg_futex_value_locked(uaddr, uval, newval); - - if (unlikely(curval == -EFAULT)) - goto uaddr_faulted; - if (unlikely(curval != uval)) - goto retry_locked; - - /* - * We took the lock due to owner died take over. - */ - if (unlikely(lock_taken)) - goto out_unlock_put_key; - - /* - * We dont have the lock. Look up the PI state (or create it if - * we are the first waiter): - */ - ret = lookup_pi_state(uval, hb, &q.key, &q.pi_state); - + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); if (unlikely(ret)) { switch (ret) { - + case 1: + /* We got the lock. */ + ret = 0; + goto out_unlock_put_key; + case -EFAULT: + goto uaddr_faulted; case -EAGAIN: /* * Task is exiting and we just wait for the @@ -1449,25 +1504,6 @@ retry_locked: put_futex_key(fshared, &q.key); cond_resched(); goto retry; - - case -ESRCH: - /* - * No owner found for this futex. Check if the - * OWNER_DIED bit is set to figure out whether - * this is a robust futex or not. - */ - if (get_futex_value_locked(&curval, uaddr)) - goto uaddr_faulted; - - /* - * We simply start over in case of a robust - * futex. The code above will take the futex - * and return happy. - */ - if (curval & FUTEX_OWNER_DIED) { - ownerdied = 1; - goto retry_locked; - } default: goto out_unlock_put_key; } From dd9739980b50c8cde33e1f8eb08b7e0140bcd61e Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:02 -0700 Subject: [PATCH 0237/2061] futex: split out fixup owner logic from futex_lock_pi() Refactor the post lock acquisition logic from futex_lock_pi(). This code will be reused in futex_wait_requeue_pi(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 158 ++++++++++++++++++++++++++++--------------------- 1 file changed, 89 insertions(+), 69 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 986b16e4453..af831fbb7fb 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1255,6 +1255,79 @@ handle_fault: static long futex_wait_restart(struct restart_block *restart); +/** + * fixup_owner() - Post lock pi_state and corner case management + * @uaddr: user address of the futex + * @fshared: whether the futex is shared (1) or not (0) + * @q: futex_q (contains pi_state and access to the rt_mutex) + * @locked: if the attempt to take the rt_mutex succeeded (1) or not (0) + * + * After attempting to lock an rt_mutex, this function is called to cleanup + * the pi_state owner as well as handle race conditions that may allow us to + * acquire the lock. Must be called with the hb lock held. + * + * Returns: + * 1 - success, lock taken + * 0 - success, lock not taken + * <0 - on error (-EFAULT) + */ +static int fixup_owner(u32 __user *uaddr, int fshared, struct futex_q *q, + int locked) +{ + struct task_struct *owner; + int ret = 0; + + if (locked) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case: + */ + if (q->pi_state->owner != current) + ret = fixup_pi_state_owner(uaddr, q, current, fshared); + goto out; + } + + /* + * Catch the rare case, where the lock was released when we were on the + * way back before we locked the hash bucket. + */ + if (q->pi_state->owner == current) { + /* + * Try to get the rt_mutex now. This might fail as some other + * task acquired the rt_mutex after we removed ourself from the + * rt_mutex waiters list. + */ + if (rt_mutex_trylock(&q->pi_state->pi_mutex)) { + locked = 1; + goto out; + } + + /* + * pi_state is incorrect, some other task did a lock steal and + * we returned due to timeout or signal without taking the + * rt_mutex. Too late. We can access the rt_mutex_owner without + * locking, as the other task is now blocked on the hash bucket + * lock. Fix the state up. + */ + owner = rt_mutex_owner(&q->pi_state->pi_mutex); + ret = fixup_pi_state_owner(uaddr, q, owner, fshared); + goto out; + } + + /* + * Paranoia check. If we did not take the lock, then we should not be + * the owner, nor the pending owner, of the rt_mutex. + */ + if (rt_mutex_owner(&q->pi_state->pi_mutex) == current) + printk(KERN_ERR "fixup_owner: ret = %d pi-mutex: %p " + "pi-state %p\n", ret, + q->pi_state->pi_mutex.owner, + q->pi_state->owner); + +out: + return ret ? ret : locked; +} + /** * futex_wait_queue_me() - queue_me() and wait for wakeup, timeout, or signal * @hb: the futex hash bucket, must be locked by the caller @@ -1459,11 +1532,10 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, int detect, ktime_t *time, int trylock) { struct hrtimer_sleeper timeout, *to = NULL; - struct task_struct *curr = current; struct futex_hash_bucket *hb; u32 uval; struct futex_q q; - int ret; + int res, ret; if (refill_pi_state_cache()) return -ENOMEM; @@ -1527,71 +1599,21 @@ retry_private: } spin_lock(q.lock_ptr); - - if (!ret) { - /* - * Got the lock. We might not be the anticipated owner - * if we did a lock-steal - fix up the PI-state in - * that case: - */ - if (q.pi_state->owner != curr) - ret = fixup_pi_state_owner(uaddr, &q, curr, fshared); - } else { - /* - * Catch the rare case, where the lock was released - * when we were on the way back before we locked the - * hash bucket. - */ - if (q.pi_state->owner == curr) { - /* - * Try to get the rt_mutex now. This might - * fail as some other task acquired the - * rt_mutex after we removed ourself from the - * rt_mutex waiters list. - */ - if (rt_mutex_trylock(&q.pi_state->pi_mutex)) - ret = 0; - else { - /* - * pi_state is incorrect, some other - * task did a lock steal and we - * returned due to timeout or signal - * without taking the rt_mutex. Too - * late. We can access the - * rt_mutex_owner without locking, as - * the other task is now blocked on - * the hash bucket lock. Fix the state - * up. - */ - struct task_struct *owner; - int res; - - owner = rt_mutex_owner(&q.pi_state->pi_mutex); - res = fixup_pi_state_owner(uaddr, &q, owner, - fshared); - - /* propagate -EFAULT, if the fixup failed */ - if (res) - ret = res; - } - } else { - /* - * Paranoia check. If we did not take the lock - * in the trylock above, then we should not be - * the owner of the rtmutex, neither the real - * nor the pending one: - */ - if (rt_mutex_owner(&q.pi_state->pi_mutex) == curr) - printk(KERN_ERR "futex_lock_pi: ret = %d " - "pi-mutex: %p pi-state %p\n", ret, - q.pi_state->pi_mutex.owner, - q.pi_state->owner); - } - } + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it acquired + * the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; /* - * If fixup_pi_state_owner() faulted and was unable to handle the - * fault, unlock it and return the fault to userspace. + * If fixup_owner() faulted and was unable to handle the fault, unlock + * it and return the fault to userspace. */ if (ret && (rt_mutex_owner(&q.pi_state->pi_mutex) == current)) rt_mutex_unlock(&q.pi_state->pi_mutex); @@ -1599,9 +1621,7 @@ retry_private: /* Unqueue and drop the lock */ unqueue_me_pi(&q); - if (to) - destroy_hrtimer_on_stack(&to->timer); - return ret != -EINTR ? ret : -ERESTARTNOINTR; + goto out; out_unlock_put_key: queue_unlock(&q, hb); @@ -1611,7 +1631,7 @@ out_put_key: out: if (to) destroy_hrtimer_on_stack(&to->timer); - return ret; + return ret != -EINTR ? ret : -ERESTARTNOINTR; uaddr_faulted: /* From 8dac456a681bd94272ff50ecb31be6b669382c2b Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:12 -0700 Subject: [PATCH 0238/2061] rt_mutex: add proxy lock routines This patch is a prerequisite for futex requeue_pi. It basically splits rt_mutex_slowlock() right down the middle, just before the first call to schedule(). It further adds helper functions which make use of the split and provide the rt-mutex preliminaries for futex requeue_pi. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/rtmutex.c | 288 +++++++++++++++++++++++++++++----------- kernel/rtmutex_common.h | 8 ++ 2 files changed, 219 insertions(+), 77 deletions(-) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..fec77e7e056 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -300,7 +300,8 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * assigned pending owner [which might not have taken the * lock yet]: */ -static inline int try_to_steal_lock(struct rt_mutex *lock) +static inline int try_to_steal_lock(struct rt_mutex *lock, + struct task_struct *task) { struct task_struct *pendowner = rt_mutex_owner(lock); struct rt_mutex_waiter *next; @@ -309,11 +310,11 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) if (!rt_mutex_owner_pending(lock)) return 0; - if (pendowner == current) + if (pendowner == task) return 1; spin_lock_irqsave(&pendowner->pi_lock, flags); - if (current->prio >= pendowner->prio) { + if (task->prio >= pendowner->prio) { spin_unlock_irqrestore(&pendowner->pi_lock, flags); return 0; } @@ -338,21 +339,21 @@ static inline int try_to_steal_lock(struct rt_mutex *lock) * We are going to steal the lock and a waiter was * enqueued on the pending owners pi_waiters queue. So * we have to enqueue this waiter into - * current->pi_waiters list. This covers the case, - * where current is boosted because it holds another + * task->pi_waiters list. This covers the case, + * where task is boosted because it holds another * lock and gets unboosted because the booster is * interrupted, so we would delay a waiter with higher - * priority as current->normal_prio. + * priority as task->normal_prio. * * Note: in the rare case of a SCHED_OTHER task changing * its priority and thus stealing the lock, next->task - * might be current: + * might be task: */ - if (likely(next->task != current)) { - spin_lock_irqsave(¤t->pi_lock, flags); - plist_add(&next->pi_list_entry, ¤t->pi_waiters); - __rt_mutex_adjust_prio(current); - spin_unlock_irqrestore(¤t->pi_lock, flags); + if (likely(next->task != task)) { + spin_lock_irqsave(&task->pi_lock, flags); + plist_add(&next->pi_list_entry, &task->pi_waiters); + __rt_mutex_adjust_prio(task); + spin_unlock_irqrestore(&task->pi_lock, flags); } return 1; } @@ -389,7 +390,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ mark_rt_mutex_waiters(lock); - if (rt_mutex_owner(lock) && !try_to_steal_lock(lock)) + if (rt_mutex_owner(lock) && !try_to_steal_lock(lock, current)) return 0; /* We got the lock. */ @@ -411,6 +412,7 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock) */ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) { struct task_struct *owner = rt_mutex_owner(lock); @@ -418,21 +420,21 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, unsigned long flags; int chain_walk = 0, res; - spin_lock_irqsave(¤t->pi_lock, flags); - __rt_mutex_adjust_prio(current); - waiter->task = current; + spin_lock_irqsave(&task->pi_lock, flags); + __rt_mutex_adjust_prio(task); + waiter->task = task; waiter->lock = lock; - plist_node_init(&waiter->list_entry, current->prio); - plist_node_init(&waiter->pi_list_entry, current->prio); + plist_node_init(&waiter->list_entry, task->prio); + plist_node_init(&waiter->pi_list_entry, task->prio); /* Get the top priority waiter on the lock */ if (rt_mutex_has_waiters(lock)) top_waiter = rt_mutex_top_waiter(lock); plist_add(&waiter->list_entry, &lock->wait_list); - current->pi_blocked_on = waiter; + task->pi_blocked_on = waiter; - spin_unlock_irqrestore(¤t->pi_lock, flags); + spin_unlock_irqrestore(&task->pi_lock, flags); if (waiter == rt_mutex_top_waiter(lock)) { spin_lock_irqsave(&owner->pi_lock, flags); @@ -460,7 +462,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, spin_unlock(&lock->wait_lock); res = rt_mutex_adjust_prio_chain(owner, detect_deadlock, lock, waiter, - current); + task); spin_lock(&lock->wait_lock); @@ -605,6 +607,85 @@ void rt_mutex_adjust_pi(struct task_struct *task) rt_mutex_adjust_prio_chain(task, 0, NULL, NULL, task); } +/** + * __rt_mutex_slowlock() - Perform the wait-wake-try-to-take loop + * @lock: the rt_mutex to take + * @state: the state the task should block in (TASK_INTERRUPTIBLE + * or TASK_UNINTERRUPTIBLE) + * @timeout: the pre-initialized and started timer, or NULL for none + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: passed to task_blocks_on_rt_mutex + * + * lock->wait_lock must be held by the caller. + */ +static int __sched +__rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret = 0; + + for (;;) { + /* Try to acquire the lock: */ + if (try_to_take_rt_mutex(lock)) + break; + + /* + * TASK_INTERRUPTIBLE checks for signals and + * timeout. Ignored otherwise. + */ + if (unlikely(state == TASK_INTERRUPTIBLE)) { + /* Signal pending? */ + if (signal_pending(current)) + ret = -EINTR; + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + if (ret) + break; + } + + /* + * waiter->task is NULL the first time we come here and + * when we have been woken up by the previous owner + * but the lock got stolen by a higher prio task. + */ + if (!waiter->task) { + ret = task_blocks_on_rt_mutex(lock, waiter, current, + detect_deadlock); + /* + * If we got woken up by the owner then start loop + * all over without going into schedule to try + * to get the lock now: + */ + if (unlikely(!waiter->task)) { + /* + * Reset the return value. We might + * have returned with -EDEADLK and the + * owner released the lock while we + * were walking the pi chain. + */ + ret = 0; + continue; + } + if (unlikely(ret)) + break; + } + + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + if (waiter->task) + schedule_rt_mutex(lock); + + spin_lock(&lock->wait_lock); + set_current_state(state); + } + + return ret; +} + /* * Slow path lock function: */ @@ -636,62 +717,8 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, timeout->task = NULL; } - for (;;) { - /* Try to acquire the lock: */ - if (try_to_take_rt_mutex(lock)) - break; - - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (unlikely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; - if (ret) - break; - } - - /* - * waiter.task is NULL the first time we come here and - * when we have been woken up by the previous owner - * but the lock got stolen by a higher prio task. - */ - if (!waiter.task) { - ret = task_blocks_on_rt_mutex(lock, &waiter, - detect_deadlock); - /* - * If we got woken up by the owner then start loop - * all over without going into schedule to try - * to get the lock now: - */ - if (unlikely(!waiter.task)) { - /* - * Reset the return value. We might - * have returned with -EDEADLK and the - * owner released the lock while we - * were walking the pi chain. - */ - ret = 0; - continue; - } - if (unlikely(ret)) - break; - } - - spin_unlock(&lock->wait_lock); - - debug_rt_mutex_print_deadlock(&waiter); - - if (waiter.task) - schedule_rt_mutex(lock); - - spin_lock(&lock->wait_lock); - set_current_state(state); - } + ret = __rt_mutex_slowlock(lock, state, timeout, &waiter, + detect_deadlock); set_current_state(TASK_RUNNING); @@ -985,6 +1012,59 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock, rt_mutex_deadlock_account_unlock(proxy_owner); } +/** + * rt_mutex_start_proxy_lock() - Start lock acquisition for another task + * @lock: the rt_mutex to take + * @waiter: the pre-initialized rt_mutex_waiter + * @task: the task to prepare + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Returns: + * 0 - task blocked on lock + * 1 - acquired the lock for task, caller should wake it up + * <0 - error + * + * Special API call for FUTEX_REQUEUE_PI support. + */ +int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + mark_rt_mutex_waiters(lock); + + if (!rt_mutex_owner(lock) || try_to_steal_lock(lock, task)) { + /* We got the lock for task. */ + debug_rt_mutex_lock(lock); + + rt_mutex_set_owner(lock, task, 0); + + rt_mutex_deadlock_account_lock(lock, task); + return 1; + } + + ret = task_blocks_on_rt_mutex(lock, waiter, task, detect_deadlock); + + + if (ret && !waiter->task) { + /* + * Reset the return value. We might have + * returned with -EDEADLK and the owner + * released the lock while we were walking the + * pi chain. Let the waiter sort it out. + */ + ret = 0; + } + spin_unlock(&lock->wait_lock); + + debug_rt_mutex_print_deadlock(waiter); + + return ret; +} + /** * rt_mutex_next_owner - return the next owner of the lock * @@ -1004,3 +1084,57 @@ struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock) return rt_mutex_top_waiter(lock)->task; } + +/** + * rt_mutex_finish_proxy_lock() - Complete lock acquisition + * @lock: the rt_mutex we were woken on + * @to: the timeout, null if none. hrtimer should already have + * been started. + * @waiter: the pre-initialized rt_mutex_waiter + * @detect_deadlock: perform deadlock detection (1) or not (0) + * + * Complete the lock acquisition started our behalf by another thread. + * + * Returns: + * 0 - success + * <0 - error, one of -EINTR, -ETIMEDOUT, or -EDEADLK + * + * Special API call for PI-futex requeue support + */ +int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock) +{ + int ret; + + spin_lock(&lock->wait_lock); + + set_current_state(TASK_INTERRUPTIBLE); + + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, + detect_deadlock); + + set_current_state(TASK_RUNNING); + + if (unlikely(waiter->task)) + remove_waiter(lock, waiter); + + /* + * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might + * have to fix that up. + */ + fixup_rt_mutex_waiters(lock); + + spin_unlock(&lock->wait_lock); + + /* + * Readjust priority, when we did not get the lock. We might have been + * the pending owner and boosted. Since we did not take the lock, the + * PI boost has to go. + */ + if (unlikely(ret)) + rt_mutex_adjust_prio(current); + + return ret; +} diff --git a/kernel/rtmutex_common.h b/kernel/rtmutex_common.h index e124bf5800e..97a2f81866a 100644 --- a/kernel/rtmutex_common.h +++ b/kernel/rtmutex_common.h @@ -120,6 +120,14 @@ extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock, struct task_struct *proxy_owner); +extern int rt_mutex_start_proxy_lock(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + int detect_deadlock); +extern int rt_mutex_finish_proxy_lock(struct rt_mutex *lock, + struct hrtimer_sleeper *to, + struct rt_mutex_waiter *waiter, + int detect_deadlock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" From a72188d8a64ebe74722f1cf7ffac41b41ffdba21 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:22 -0700 Subject: [PATCH 0239/2061] futex: add FUTEX_HAS_TIMEOUT flag to restart.futex.flags Currently restart is only used if there is a timeout. The requeue_pi functionality requires restarting to futex_lock_pi() on signal after wakeup in futex_wait_requeue_pi() regardless of if there was a timeout or not. Using 0 for the timeout value is confusing as that could indicate an expired timer. The flag makes this explicit. While the check is not technically needed in futex_wait_restart(), doing so makes the code consistent with and will avoid confusion should the need arise to restart wait without a timeout. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index af831fbb7fb..6b597cf33b0 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1252,6 +1252,7 @@ handle_fault: */ #define FLAGS_SHARED 0x01 #define FLAGS_CLOCKRT 0x02 +#define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); @@ -1486,7 +1487,7 @@ retry_private: restart->futex.val = val; restart->futex.time = abs_time->tv64; restart->futex.bitset = bitset; - restart->futex.flags = 0; + restart->futex.flags = FLAGS_HAS_TIMEOUT; if (fshared) restart->futex.flags |= FLAGS_SHARED; @@ -1510,13 +1511,16 @@ static long futex_wait_restart(struct restart_block *restart) { u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; int fshared = 0; - ktime_t t; + ktime_t t, *tp = NULL; - t.tv64 = restart->futex.time; + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } restart->fn = do_no_restart_syscall; if (restart->futex.flags & FLAGS_SHARED) fshared = 1; - return (long)futex_wait(uaddr, fshared, restart->futex.val, &t, + return (long)futex_wait(uaddr, fshared, restart->futex.val, tp, restart->futex.bitset, restart->futex.flags & FLAGS_CLOCKRT); } From 9121e4783cd5c7e2a407763f3b61c2d573891133 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:31 -0700 Subject: [PATCH 0240/2061] futex: distangle futex_requeue() futex_requeue() is getting a bit long-winded, and will be getting more so after the requeue_pi patch. Factor out the actual requeueing into a nicely contained inline function to reduce function length and improve legibility. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 6b597cf33b0..e76942e2a79 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -940,6 +940,34 @@ out: return ret; } +/** + * requeue_futex() - Requeue a futex_q from one hb to another + * @q: the futex_q to requeue + * @hb1: the source hash_bucket + * @hb2: the target hash_bucket + * @key2: the new key for the requeued futex_q + */ +static inline +void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, union futex_key *key2) +{ + + /* + * If key1 and key2 hash to the same bucket, no need to + * requeue. + */ + if (likely(&hb1->chain != &hb2->chain)) { + plist_del(&q->list, &hb1->chain); + plist_add(&q->list, &hb2->chain); + q->lock_ptr = &hb2->lock; +#ifdef CONFIG_DEBUG_PI_LIST + q->list.plist.lock = &hb2->lock; +#endif + } + get_futex_key_refs(key2); + q->key = *key2; +} + /* * Requeue all waiters hashed on one physical page to another * physical page. @@ -999,20 +1027,7 @@ retry_private: if (++ret <= nr_wake) { wake_futex(this); } else { - /* - * If key1 and key2 hash to the same bucket, no need to - * requeue. - */ - if (likely(head1 != &hb2->chain)) { - plist_del(&this->list, &hb1->chain); - plist_add(&this->list, &hb2->chain); - this->lock_ptr = &hb2->lock; -#ifdef CONFIG_DEBUG_PI_LIST - this->list.plist.lock = &hb2->lock; -#endif - } - this->key = key2; - get_futex_key_refs(&key2); + requeue_futex(this, hb1, hb2, &key2); drop_count++; if (ret - nr_wake >= nr_requeue) From f801073f87aa22ddf0e9146355fec3993163790f Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:40 -0700 Subject: [PATCH 0241/2061] futex: split out futex value validation code Refactor the code to validate the expected futex value in order to reuse it with the requeue_pi code. Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- kernel/futex.c | 130 ++++++++++++++++++++++++++++++------------------- 1 file changed, 79 insertions(+), 51 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index e76942e2a79..dbe857aa438 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1398,6 +1398,82 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, __set_current_state(TASK_RUNNING); } +/** + * futex_wait_setup() - Prepare to wait on a futex + * @uaddr: the futex userspace address + * @val: the expected value + * @fshared: whether the futex is shared (1) or not (0) + * @q: the associated futex_q + * @hb: storage for hash_bucket pointer to be returned to caller + * + * Setup the futex_q and locate the hash_bucket. Get the futex value and + * compare it with the expected value. Handle atomic faults internally. + * Return with the hb lock held and a q.key reference on success, and unlocked + * with no q.key reference on failure. + * + * Returns: + * 0 - uaddr contains val and hb has been locked + * <1 - -EFAULT or -EWOULDBLOCK (uaddr does not contain val) and hb is unlcoked + */ +static int futex_wait_setup(u32 __user *uaddr, u32 val, int fshared, + struct futex_q *q, struct futex_hash_bucket **hb) +{ + u32 uval; + int ret; + + /* + * Access the page AFTER the hash-bucket is locked. + * Order is important: + * + * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); + * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } + * + * The basic logical guarantee of a futex is that it blocks ONLY + * if cond(var) is known to be true at the time of blocking, for + * any cond. If we queued after testing *uaddr, that would open + * a race condition where we could block indefinitely with + * cond(var) false, which would violate the guarantee. + * + * A consequence is that futex_wait() can return zero and absorb + * a wakeup when *uaddr != val on entry to the syscall. This is + * rare, but normal. + */ +retry: + q->key = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr, fshared, &q->key); + if (unlikely(ret != 0)) + goto out; + +retry_private: + *hb = queue_lock(q); + + ret = get_futex_value_locked(&uval, uaddr); + + if (ret) { + queue_unlock(q, *hb); + + ret = get_user(uval, uaddr); + if (ret) + goto out; + + if (!fshared) + goto retry_private; + + put_futex_key(fshared, &q->key); + goto retry; + } + + if (uval != val) { + queue_unlock(q, *hb); + ret = -EWOULDBLOCK; + } + +out: + if (ret) + put_futex_key(fshared, &q->key); + return ret; +} + static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { @@ -1406,7 +1482,6 @@ static int futex_wait(u32 __user *uaddr, int fshared, struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q; - u32 uval; int ret; if (!bitset) @@ -1425,58 +1500,11 @@ static int futex_wait(u32 __user *uaddr, int fshared, current->timer_slack_ns); } -retry: - q.key = FUTEX_KEY_INIT; - ret = get_futex_key(uaddr, fshared, &q.key); - if (unlikely(ret != 0)) + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) goto out; -retry_private: - hb = queue_lock(&q); - - /* - * Access the page AFTER the hash-bucket is locked. - * Order is important: - * - * Userspace waiter: val = var; if (cond(val)) futex_wait(&var, val); - * Userspace waker: if (cond(var)) { var = new; futex_wake(&var); } - * - * The basic logical guarantee of a futex is that it blocks ONLY - * if cond(var) is known to be true at the time of blocking, for - * any cond. If we queued after testing *uaddr, that would open - * a race condition where we could block indefinitely with - * cond(var) false, which would violate the guarantee. - * - * A consequence is that futex_wait() can return zero and absorb - * a wakeup when *uaddr != val on entry to the syscall. This is - * rare, but normal. - * - * For shared futexes, we hold the mmap semaphore, so the mapping - * cannot have changed since we looked it up in get_futex_key. - */ - ret = get_futex_value_locked(&uval, uaddr); - - if (unlikely(ret)) { - queue_unlock(&q, hb); - - ret = get_user(uval, uaddr); - if (ret) - goto out_put_key; - - if (!fshared) - goto retry_private; - - put_futex_key(fshared, &q.key); - goto retry; - } - ret = -EWOULDBLOCK; - - /* Only actually queue if *uaddr contained val. */ - if (unlikely(uval != val)) { - queue_unlock(&q, hb); - goto out_put_key; - } - /* queue_me and wait for wakeup, timeout, or a signal. */ futex_wait_queue_me(hb, &q, to, &wait); From 52400ba946759af28442dee6265c5c0180ac7122 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 3 Apr 2009 13:40:49 -0700 Subject: [PATCH 0242/2061] futex: add requeue_pi functionality PI Futexes and their underlying rt_mutex cannot be left ownerless if there are pending waiters as this will break the PI boosting logic, so the standard requeue commands aren't sufficient. The new commands properly manage pi futex ownership by ensuring a futex with waiters has an owner at all times. This will allow glibc to properly handle pi mutexes with pthread_condvars. The approach taken here is to create two new futex op codes: FUTEX_WAIT_REQUEUE_PI: Tasks will use this op code to wait on a futex (such as a non-pi waitqueue) and wake after they have been requeued to a pi futex. Prior to returning to userspace, they will acquire this pi futex (and the underlying rt_mutex). futex_wait_requeue_pi() is the result of a high speed collision between futex_wait() and futex_lock_pi() (with the first part of futex_lock_pi() being done by futex_proxy_trylock_atomic() on behalf of the top_waiter). FUTEX_REQUEUE_PI (and FUTEX_CMP_REQUEUE_PI): This call must be used to wake tasks waiting with FUTEX_WAIT_REQUEUE_PI, regardless of how many tasks the caller intends to wake or requeue. pthread_cond_broadcast() should call this with nr_wake=1 and nr_requeue=INT_MAX. pthread_cond_signal() should call this with nr_wake=1 and nr_requeue=0. The reason being we need both callers to get the benefit of the futex_proxy_trylock_atomic() routine. futex_requeue() also enqueues the top_waiter on the rt_mutex via rt_mutex_start_proxy_lock(). Signed-off-by: Darren Hart Reviewed-by: Thomas Gleixner Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 8 + include/linux/thread_info.h | 3 +- kernel/futex.c | 523 ++++++++++++++++++++++++++++++++++-- 3 files changed, 512 insertions(+), 22 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index 3bf5bb5a34f..b05519ca9e5 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -23,6 +23,9 @@ union ktime; #define FUTEX_TRYLOCK_PI 8 #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 +#define FUTEX_WAIT_REQUEUE_PI 11 +#define FUTEX_REQUEUE_PI 12 +#define FUTEX_CMP_REQUEUE_PI 13 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -38,6 +41,11 @@ union ktime; #define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_BITSET_PRIVATE (FUTEX_WAIT_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) +#define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) +#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) +#define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ + FUTEX_PRIVATE_FLAG) /* * Support for robust futexes: the kernel cleans up held futexes at diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index e6b820f8b56..a8cc4e13434 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -21,13 +21,14 @@ struct restart_block { struct { unsigned long arg0, arg1, arg2, arg3; }; - /* For futex_wait */ + /* For futex_wait and futex_wait_requeue_pi */ struct { u32 *uaddr; u32 val; u32 flags; u32 bitset; u64 time; + u32 *uaddr2; } futex; /* For nanosleep */ struct { diff --git a/kernel/futex.c b/kernel/futex.c index dbe857aa438..185c981d89e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -19,6 +19,10 @@ * PRIVATE futexes by Eric Dumazet * Copyright (C) 2007 Eric Dumazet * + * Requeue-PI support by Darren Hart + * Copyright (C) IBM Corporation, 2009 + * Thanks to Thomas Gleixner for conceptual design and careful reviews. + * * Thanks to Ben LaHaise for yelling "hashed waitqueues" loudly * enough at me, Linus for the original (flawed) idea, Matthew * Kirkwood for proof-of-concept implementation. @@ -109,6 +113,9 @@ struct futex_q { struct futex_pi_state *pi_state; struct task_struct *task; + /* rt_waiter storage for requeue_pi: */ + struct rt_mutex_waiter *rt_waiter; + /* Bitset for the optional bitmasked wakeup */ u32 bitset; }; @@ -827,7 +834,7 @@ static int futex_wake(u32 __user *uaddr, int fshared, int nr_wake, u32 bitset) plist_for_each_entry_safe(this, next, head, list) { if (match_futex (&this->key, &key)) { - if (this->pi_state) { + if (this->pi_state || this->rt_waiter) { ret = -EINVAL; break; } @@ -968,20 +975,138 @@ void requeue_futex(struct futex_q *q, struct futex_hash_bucket *hb1, q->key = *key2; } -/* - * Requeue all waiters hashed on one physical page to another - * physical page. +/** + * requeue_pi_wake_futex() - Wake a task that acquired the lock during requeue + * q: the futex_q + * key: the key of the requeue target futex + * + * During futex_requeue, with requeue_pi=1, it is possible to acquire the + * target futex if it is uncontended or via a lock steal. Set the futex_q key + * to the requeue target futex so the waiter can detect the wakeup on the right + * futex, but remove it from the hb and NULL the rt_waiter so it can detect + * atomic lock acquisition. Must be called with the q->lock_ptr held. + */ +static inline +void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) +{ + drop_futex_key_refs(&q->key); + get_futex_key_refs(key); + q->key = *key; + + WARN_ON(plist_node_empty(&q->list)); + plist_del(&q->list, &q->list.plist); + + WARN_ON(!q->rt_waiter); + q->rt_waiter = NULL; + + wake_up(&q->waiter); +} + +/** + * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * + * Try and get the lock on behalf of the top waiter if we can do it atomically. + * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * + * Returns: + * 0 - failed to acquire the lock atomicly + * 1 - acquired the lock + * <0 - error + */ +static int futex_proxy_trylock_atomic(u32 __user *pifutex, + struct futex_hash_bucket *hb1, + struct futex_hash_bucket *hb2, + union futex_key *key1, union futex_key *key2, + struct futex_pi_state **ps) +{ + struct futex_q *top_waiter; + u32 curval; + int ret; + + if (get_futex_value_locked(&curval, pifutex)) + return -EFAULT; + + top_waiter = futex_top_waiter(hb1, key1); + + /* There are no waiters, nothing for us to do. */ + if (!top_waiter) + return 0; + + /* + * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. + * The pi_state is returned in ps in contended cases. + */ + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + if (ret == 1) + requeue_pi_wake_futex(top_waiter, key2); + + return ret; +} + +/** + * futex_requeue() - Requeue waiters from uaddr1 to uaddr2 + * uaddr1: source futex user address + * uaddr2: target futex user address + * nr_wake: number of waiters to wake (must be 1 for requeue_pi) + * nr_requeue: number of waiters to requeue (0-INT_MAX) + * requeue_pi: if we are attempting to requeue from a non-pi futex to a + * pi futex (pi to pi requeue is not supported) + * + * Requeue waiters on uaddr1 to uaddr2. In the requeue_pi case, try to acquire + * uaddr2 atomically on behalf of the top waiter. + * + * Returns: + * >=0 - on success, the number of tasks requeued or woken + * <0 - on error */ static int futex_requeue(u32 __user *uaddr1, int fshared, u32 __user *uaddr2, - int nr_wake, int nr_requeue, u32 *cmpval) + int nr_wake, int nr_requeue, u32 *cmpval, + int requeue_pi) { union futex_key key1 = FUTEX_KEY_INIT, key2 = FUTEX_KEY_INIT; + int drop_count = 0, task_count = 0, ret; + struct futex_pi_state *pi_state = NULL; struct futex_hash_bucket *hb1, *hb2; struct plist_head *head1; struct futex_q *this, *next; - int ret, drop_count = 0; + u32 curval2; + + if (requeue_pi) { + /* + * requeue_pi requires a pi_state, try to allocate it now + * without any locks in case it fails. + */ + if (refill_pi_state_cache()) + return -ENOMEM; + /* + * requeue_pi must wake as many tasks as it can, up to nr_wake + * + nr_requeue, since it acquires the rt_mutex prior to + * returning to userspace, so as to not leave the rt_mutex with + * waiters and no owner. However, second and third wake-ups + * cannot be predicted as they involve race conditions with the + * first wake and a fault while looking up the pi_state. Both + * pthread_cond_signal() and pthread_cond_broadcast() should + * use nr_wake=1. + */ + if (nr_wake != 1) + return -EINVAL; + } retry: + if (pi_state != NULL) { + /* + * We will have to lookup the pi_state again, so free this one + * to keep the accounting correct. + */ + free_pi_state(pi_state); + pi_state = NULL; + } + ret = get_futex_key(uaddr1, fshared, &key1); if (unlikely(ret != 0)) goto out; @@ -1020,19 +1145,94 @@ retry_private: } } + if (requeue_pi && (task_count - nr_wake < nr_requeue)) { + /* Attempt to acquire uaddr2 and wake the top_waiter. */ + ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, + &key2, &pi_state); + + /* + * At this point the top_waiter has either taken uaddr2 or is + * waiting on it. If the former, then the pi_state will not + * exist yet, look it up one more time to ensure we have a + * reference to it. + */ + if (ret == 1) { + WARN_ON(pi_state); + task_count++; + ret = get_futex_value_locked(&curval2, uaddr2); + if (!ret) + ret = lookup_pi_state(curval2, hb2, &key2, + &pi_state); + } + + switch (ret) { + case 0: + break; + case -EFAULT: + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + ret = get_user(curval2, uaddr2); + if (!ret) + goto retry; + goto out; + case -EAGAIN: + /* The owner was exiting, try again. */ + double_unlock_hb(hb1, hb2); + put_futex_key(fshared, &key2); + put_futex_key(fshared, &key1); + cond_resched(); + goto retry; + default: + goto out_unlock; + } + } + head1 = &hb1->chain; plist_for_each_entry_safe(this, next, head1, list) { - if (!match_futex (&this->key, &key1)) - continue; - if (++ret <= nr_wake) { - wake_futex(this); - } else { - requeue_futex(this, hb1, hb2, &key2); - drop_count++; + if (task_count - nr_wake >= nr_requeue) + break; - if (ret - nr_wake >= nr_requeue) - break; + if (!match_futex(&this->key, &key1)) + continue; + + WARN_ON(!requeue_pi && this->rt_waiter); + WARN_ON(requeue_pi && !this->rt_waiter); + + /* + * Wake nr_wake waiters. For requeue_pi, if we acquired the + * lock, we already woke the top_waiter. If not, it will be + * woken by futex_unlock_pi(). + */ + if (++task_count <= nr_wake && !requeue_pi) { + wake_futex(this); + continue; } + + /* + * Requeue nr_requeue waiters and possibly one more in the case + * of requeue_pi if we couldn't acquire the lock atomically. + */ + if (requeue_pi) { + /* Prepare the waiter to take the rt_mutex. */ + atomic_inc(&pi_state->refcount); + this->pi_state = pi_state; + ret = rt_mutex_start_proxy_lock(&pi_state->pi_mutex, + this->rt_waiter, + this->task, 1); + if (ret == 1) { + /* We got the lock. */ + requeue_pi_wake_futex(this, &key2); + continue; + } else if (ret) { + /* -EDEADLK */ + this->pi_state = NULL; + free_pi_state(pi_state); + goto out_unlock; + } + } + requeue_futex(this, hb1, hb2, &key2); + drop_count++; } out_unlock: @@ -1047,7 +1247,9 @@ out_put_keys: out_put_key1: put_futex_key(fshared, &key1); out: - return ret; + if (pi_state != NULL) + free_pi_state(pi_state); + return ret ? ret : task_count; } /* The key must be already stored in q->key. */ @@ -1270,6 +1472,7 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); +static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1489,6 +1692,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, q.pi_state = NULL; q.bitset = bitset; + q.rt_waiter = NULL; if (abs_time) { to = &timeout; @@ -1596,6 +1800,7 @@ static int futex_lock_pi(u32 __user *uaddr, int fshared, } q.pi_state = NULL; + q.rt_waiter = NULL; retry: q.key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q.key); @@ -1701,6 +1906,20 @@ uaddr_faulted: goto retry; } +static long futex_lock_pi_restart(struct restart_block *restart) +{ + u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; + ktime_t t, *tp = NULL; + int fshared = restart->futex.flags & FLAGS_SHARED; + + if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { + t.tv64 = restart->futex.time; + tp = &t; + } + restart->fn = do_no_restart_syscall; + + return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); +} /* * Userspace attempted a TID -> 0 atomic transition, and failed. @@ -1803,6 +2022,253 @@ pi_faulted: return ret; } +/** + * handle_early_requeue_pi_wakeup() - Detect early wakeup on the initial futex + * @hb: the hash_bucket futex_q was original enqueued on + * @q: the futex_q woken while waiting to be requeued + * @key2: the futex_key of the requeue target futex + * @timeout: the timeout associated with the wait (NULL if none) + * + * Detect if the task was woken on the initial futex as opposed to the requeue + * target futex. If so, determine if it was a timeout or a signal that caused + * the wakeup and return the appropriate error code to the caller. Must be + * called with the hb lock held. + * + * Returns + * 0 - no early wakeup detected + * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + */ +static inline +int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, + struct futex_q *q, union futex_key *key2, + struct hrtimer_sleeper *timeout) +{ + int ret = 0; + + /* + * With the hb lock held, we avoid races while we process the wakeup. + * We only need to hold hb (and not hb2) to ensure atomicity as the + * wakeup code can't change q.key from uaddr to uaddr2 if we hold hb. + * It can't be requeued from uaddr2 to something else since we don't + * support a PI aware source futex for requeue. + */ + if (!match_futex(&q->key, key2)) { + WARN_ON(q->lock_ptr && (&hb->lock != q->lock_ptr)); + /* + * We were woken prior to requeue by a timeout or a signal. + * Unqueue the futex_q and determine which it was. + */ + plist_del(&q->list, &q->list.plist); + drop_futex_key_refs(&q->key); + + if (timeout && !timeout->task) + ret = -ETIMEDOUT; + else { + /* + * We expect signal_pending(current), but another + * thread may have handled it for us already. + */ + /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if + * the user specified SA_RESTART or not? */ + ret = -ERESTARTSYS; + } + } + return ret; +} + +/** + * futex_wait_requeue_pi() - Wait on uaddr and take uaddr2 + * @uaddr: the futex we initialyl wait on (non-pi) + * @fshared: whether the futexes are shared (1) or not (0). They must be + * the same type, no requeueing from private to shared, etc. + * @val: the expected value of uaddr + * @abs_time: absolute timeout + * @bitset: 32 bit wakeup bitset set by userspace, defaults to all. + * @clockrt: whether to use CLOCK_REALTIME (1) or CLOCK_MONOTONIC (0) + * @uaddr2: the pi futex we will take prior to returning to user-space + * + * The caller will wait on uaddr and will be requeued by futex_requeue() to + * uaddr2 which must be PI aware. Normal wakeup will wake on uaddr2 and + * complete the acquisition of the rt_mutex prior to returning to userspace. + * This ensures the rt_mutex maintains an owner when it has waiters; without + * one, the pi logic wouldn't know which task to boost/deboost, if there was a + * need to. + * + * We call schedule in futex_wait_queue_me() when we enqueue and return there + * via the following: + * 1) wakeup on uaddr2 after an atomic lock acquisition by futex_requeue() + * 2) wakeup on uaddr2 after a requeue and subsequent unlock + * 3) signal (before or after requeue) + * 4) timeout (before or after requeue) + * + * If 3, we setup a restart_block with futex_wait_requeue_pi() as the function. + * + * If 2, we may then block on trying to take the rt_mutex and return via: + * 5) successful lock + * 6) signal + * 7) timeout + * 8) other lock acquisition failure + * + * If 6, we setup a restart_block with futex_lock_pi() as the function. + * + * If 4 or 7, we cleanup and return with -ETIMEDOUT. + * + * Returns: + * 0 - On success + * <0 - On error + */ +static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, + u32 val, ktime_t *abs_time, u32 bitset, + int clockrt, u32 __user *uaddr2) +{ + struct hrtimer_sleeper timeout, *to = NULL; + struct rt_mutex_waiter rt_waiter; + struct rt_mutex *pi_mutex = NULL; + DECLARE_WAITQUEUE(wait, current); + struct restart_block *restart; + struct futex_hash_bucket *hb; + union futex_key key2; + struct futex_q q; + int res, ret; + u32 uval; + + if (!bitset) + return -EINVAL; + + if (abs_time) { + to = &timeout; + hrtimer_init_on_stack(&to->timer, clockrt ? CLOCK_REALTIME : + CLOCK_MONOTONIC, HRTIMER_MODE_ABS); + hrtimer_init_sleeper(to, current); + hrtimer_set_expires_range_ns(&to->timer, *abs_time, + current->timer_slack_ns); + } + + /* + * The waiter is allocated on our stack, manipulated by the requeue + * code while we sleep on uaddr. + */ + debug_rt_mutex_init_waiter(&rt_waiter); + rt_waiter.task = NULL; + + q.pi_state = NULL; + q.bitset = bitset; + q.rt_waiter = &rt_waiter; + + key2 = FUTEX_KEY_INIT; + ret = get_futex_key(uaddr2, fshared, &key2); + if (unlikely(ret != 0)) + goto out; + + /* Prepare to wait on uaddr. */ + ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); + if (ret) { + put_futex_key(fshared, &key2); + goto out; + } + + /* Queue the futex_q, drop the hb lock, wait for wakeup. */ + futex_wait_queue_me(hb, &q, to, &wait); + + spin_lock(&hb->lock); + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out_put_keys; + + /* + * In order for us to be here, we know our q.key == key2, and since + * we took the hb->lock above, we also know that futex_requeue() has + * completed and we no longer have to concern ourselves with a wakeup + * race with the atomic proxy lock acquition by the requeue code. + */ + + /* Check if the requeue code acquired the second futex for us. */ + if (!q.rt_waiter) { + /* + * Got the lock. We might not be the anticipated owner if we + * did a lock-steal - fix up the PI-state in that case. + */ + if (q.pi_state && (q.pi_state->owner != current)) { + spin_lock(q.lock_ptr); + ret = fixup_pi_state_owner(uaddr2, &q, current, + fshared); + spin_unlock(q.lock_ptr); + } + } else { + /* + * We have been woken up by futex_unlock_pi(), a timeout, or a + * signal. futex_unlock_pi() will not destroy the lock_ptr nor + * the pi_state. + */ + WARN_ON(!&q.pi_state); + pi_mutex = &q.pi_state->pi_mutex; + ret = rt_mutex_finish_proxy_lock(pi_mutex, to, &rt_waiter, 1); + debug_rt_mutex_free_waiter(&rt_waiter); + + spin_lock(q.lock_ptr); + /* + * Fixup the pi_state owner and possibly acquire the lock if we + * haven't already. + */ + res = fixup_owner(uaddr2, fshared, &q, !ret); + /* + * If fixup_owner() returned an error, proprogate that. If it + * acquired the lock, clear our -ETIMEDOUT or -EINTR. + */ + if (res) + ret = (res < 0) ? res : 0; + + /* Unqueue and drop the lock. */ + unqueue_me_pi(&q); + } + + /* + * If fixup_pi_state_owner() faulted and was unable to handle the + * fault, unlock the rt_mutex and return the fault to userspace. + */ + if (ret == -EFAULT) { + if (rt_mutex_owner(pi_mutex) == current) + rt_mutex_unlock(pi_mutex); + } else if (ret == -EINTR) { + ret = -EFAULT; + if (get_user(uval, uaddr2)) + goto out_put_keys; + + /* + * We've already been requeued, so restart by calling + * futex_lock_pi() directly, rather then returning to this + * function. + */ + ret = -ERESTART_RESTARTBLOCK; + restart = ¤t_thread_info()->restart_block; + restart->fn = futex_lock_pi_restart; + restart->futex.uaddr = (u32 *)uaddr2; + restart->futex.val = uval; + restart->futex.flags = 0; + if (abs_time) { + restart->futex.flags |= FLAGS_HAS_TIMEOUT; + restart->futex.time = abs_time->tv64; + } + + if (fshared) + restart->futex.flags |= FLAGS_SHARED; + if (clockrt) + restart->futex.flags |= FLAGS_CLOCKRT; + } + +out_put_keys: + put_futex_key(fshared, &q.key); + put_futex_key(fshared, &key2); + +out: + if (to) { + hrtimer_cancel(&to->timer); + destroy_hrtimer_on_stack(&to->timer); + } + return ret; +} + /* * Support for robust futexes: the kernel cleans up held futexes at * thread exit time. @@ -2025,7 +2491,7 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, fshared = 1; clockrt = op & FUTEX_CLOCK_REALTIME; - if (clockrt && cmd != FUTEX_WAIT_BITSET) + if (clockrt && cmd != FUTEX_WAIT_BITSET && cmd != FUTEX_WAIT_REQUEUE_PI) return -ENOSYS; switch (cmd) { @@ -2040,10 +2506,11 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wake(uaddr, fshared, val, val3); break; case FUTEX_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 0); break; case FUTEX_CMP_REQUEUE: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3); + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 0); break; case FUTEX_WAKE_OP: ret = futex_wake_op(uaddr, fshared, uaddr2, val, val2, val3); @@ -2060,6 +2527,18 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, if (futex_cmpxchg_enabled) ret = futex_lock_pi(uaddr, fshared, 0, timeout, 1); break; + case FUTEX_WAIT_REQUEUE_PI: + val3 = FUTEX_BITSET_MATCH_ANY; + ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, + clockrt, uaddr2); + break; + case FUTEX_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); + break; + case FUTEX_CMP_REQUEUE_PI: + ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, + 1); + break; default: ret = -ENOSYS; } @@ -2077,7 +2556,8 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, int cmd = op & FUTEX_CMD_MASK; if (utime && (cmd == FUTEX_WAIT || cmd == FUTEX_LOCK_PI || - cmd == FUTEX_WAIT_BITSET)) { + cmd == FUTEX_WAIT_BITSET || + cmd == FUTEX_WAIT_REQUEUE_PI)) { if (copy_from_user(&ts, utime, sizeof(ts)) != 0) return -EFAULT; if (!timespec_valid(&ts)) @@ -2089,10 +2569,11 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, tp = &t; } /* - * requeue parameter in 'utime' if cmd == FUTEX_REQUEUE. + * requeue parameter in 'utime' if cmd == FUTEX_*_REQUEUE_*. * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || + cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; From 7ba5779533819fc061b4afafcb4a609d55f37057 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Mon, 6 Apr 2009 20:49:14 +0900 Subject: [PATCH 0243/2061] tomoyo: remove "undelete domain" command. Since TOMOYO's policy management tools does not use the "undelete domain" command, we decided to remove that command. Signed-off-by: Kentaro Takeda Signed-off-by: Tetsuo Handa Signed-off-by: Toshiharu Harada Signed-off-by: James Morris --- security/tomoyo/common.c | 7 +--- security/tomoyo/common.h | 8 +--- security/tomoyo/domain.c | 90 ++-------------------------------------- 3 files changed, 5 insertions(+), 100 deletions(-) diff --git a/security/tomoyo/common.c b/security/tomoyo/common.c index 92cea656ad2..a0affd9cfca 100644 --- a/security/tomoyo/common.c +++ b/security/tomoyo/common.c @@ -1252,15 +1252,12 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head) struct tomoyo_domain_info *domain = head->write_var1; bool is_delete = false; bool is_select = false; - bool is_undelete = false; unsigned int profile; if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_DELETE)) is_delete = true; else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_SELECT)) is_select = true; - else if (tomoyo_str_starts(&data, TOMOYO_KEYWORD_UNDELETE)) - is_undelete = true; if (is_select && tomoyo_is_select_one(head, data)) return 0; /* Don't allow updating policies by non manager programs. */ @@ -1274,9 +1271,7 @@ static int tomoyo_write_domain_policy(struct tomoyo_io_buffer *head) down_read(&tomoyo_domain_list_lock); domain = tomoyo_find_domain(data); up_read(&tomoyo_domain_list_lock); - } else if (is_undelete) - domain = tomoyo_undelete_domain(data); - else + } else domain = tomoyo_find_or_assign_new_domain(data, 0); head->write_var1 = domain; return 0; diff --git a/security/tomoyo/common.h b/security/tomoyo/common.h index 26a76d67aa1..e77e6a6de0f 100644 --- a/security/tomoyo/common.h +++ b/security/tomoyo/common.h @@ -88,10 +88,7 @@ struct tomoyo_domain_info { /* Name of this domain. Never NULL. */ const struct tomoyo_path_info *domainname; u8 profile; /* Profile number to use. */ - u8 is_deleted; /* Delete flag. - 0 = active. - 1 = deleted but undeletable. - 255 = deleted and no longer undeletable. */ + bool is_deleted; /* Delete flag. */ bool quota_warned; /* Quota warnning flag. */ /* DOMAIN_FLAGS_*. Use tomoyo_set_domain_flag() to modify. */ u8 flags; @@ -144,7 +141,6 @@ struct tomoyo_double_path_acl_record { #define TOMOYO_KEYWORD_NO_INITIALIZE_DOMAIN "no_initialize_domain " #define TOMOYO_KEYWORD_NO_KEEP_DOMAIN "no_keep_domain " #define TOMOYO_KEYWORD_SELECT "select " -#define TOMOYO_KEYWORD_UNDELETE "undelete " #define TOMOYO_KEYWORD_USE_PROFILE "use_profile " #define TOMOYO_KEYWORD_IGNORE_GLOBAL_ALLOW_READ "ignore_global_allow_read" /* A domain definition starts with . */ @@ -267,8 +263,6 @@ struct tomoyo_domain_info *tomoyo_find_domain(const char *domainname); struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char * domainname, const u8 profile); -/* Undelete a domain. */ -struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname); /* Check mode for specified functionality. */ unsigned int tomoyo_check_flags(const struct tomoyo_domain_info *domain, const u8 index); diff --git a/security/tomoyo/domain.c b/security/tomoyo/domain.c index 093a756030b..2f2b449ffd2 100644 --- a/security/tomoyo/domain.c +++ b/security/tomoyo/domain.c @@ -551,9 +551,7 @@ int tomoyo_write_alias_policy(char *data, const bool is_delete) return tomoyo_update_alias_entry(data, cp, is_delete); } -/* Domain create/delete/undelete handler. */ - -/* #define TOMOYO_DEBUG_DOMAIN_UNDELETE */ +/* Domain create/delete handler. */ /** * tomoyo_delete_domain - Delete a domain. @@ -571,41 +569,15 @@ int tomoyo_delete_domain(char *domainname) tomoyo_fill_path_info(&name); /***** EXCLUSIVE SECTION START *****/ down_write(&tomoyo_domain_list_lock); -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "tomoyo_delete_domain %s\n", domainname); - list_for_each_entry(domain, &tomoyo_domain_list, list) { - if (tomoyo_pathcmp(domain->domainname, &name)) - continue; - printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted); - } -#endif /* Is there an active domain? */ list_for_each_entry(domain, &tomoyo_domain_list, list) { - struct tomoyo_domain_info *domain2; /* Never delete tomoyo_kernel_domain */ if (domain == &tomoyo_kernel_domain) continue; if (domain->is_deleted || tomoyo_pathcmp(domain->domainname, &name)) continue; - /* Mark already deleted domains as non undeletable. */ - list_for_each_entry(domain2, &tomoyo_domain_list, list) { - if (!domain2->is_deleted || - tomoyo_pathcmp(domain2->domainname, &name)) - continue; -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - if (domain2->is_deleted != 255) - printk(KERN_DEBUG - "Marked %p as non undeletable\n", - domain2); -#endif - domain2->is_deleted = 255; - } - /* Delete and mark active domain as undeletable. */ - domain->is_deleted = 1; -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "Marked %p as undeletable\n", domain); -#endif + domain->is_deleted = true; break; } up_write(&tomoyo_domain_list_lock); @@ -613,58 +585,6 @@ int tomoyo_delete_domain(char *domainname) return 0; } -/** - * tomoyo_undelete_domain - Undelete a domain. - * - * @domainname: The name of domain. - * - * Returns pointer to "struct tomoyo_domain_info" on success, NULL otherwise. - */ -struct tomoyo_domain_info *tomoyo_undelete_domain(const char *domainname) -{ - struct tomoyo_domain_info *domain; - struct tomoyo_domain_info *candidate_domain = NULL; - struct tomoyo_path_info name; - - name.name = domainname; - tomoyo_fill_path_info(&name); - /***** EXCLUSIVE SECTION START *****/ - down_write(&tomoyo_domain_list_lock); -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "tomoyo_undelete_domain %s\n", domainname); - list_for_each_entry(domain, &tomoyo_domain_list, list) { - if (tomoyo_pathcmp(domain->domainname, &name)) - continue; - printk(KERN_DEBUG "List: %p %u\n", domain, domain->is_deleted); - } -#endif - list_for_each_entry(domain, &tomoyo_domain_list, list) { - if (tomoyo_pathcmp(&name, domain->domainname)) - continue; - if (!domain->is_deleted) { - /* This domain is active. I can't undelete. */ - candidate_domain = NULL; -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "%p is active. I can't undelete.\n", - domain); -#endif - break; - } - /* Is this domain undeletable? */ - if (domain->is_deleted == 1) - candidate_domain = domain; - } - if (candidate_domain) { - candidate_domain->is_deleted = 0; -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "%p was undeleted.\n", candidate_domain); -#endif - } - up_write(&tomoyo_domain_list_lock); - /***** EXCLUSIVE SECTION END *****/ - return candidate_domain; -} - /** * tomoyo_find_or_assign_new_domain - Create a domain. * @@ -711,10 +631,6 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char * /***** CRITICAL SECTION END *****/ if (flag) continue; -#ifdef TOMOYO_DEBUG_DOMAIN_UNDELETE - printk(KERN_DEBUG "Reusing %p %s\n", domain, - domain->domainname->name); -#endif list_for_each_entry(ptr, &domain->acl_info_list, list) { ptr->type |= TOMOYO_ACL_DELETED; } @@ -722,7 +638,7 @@ struct tomoyo_domain_info *tomoyo_find_or_assign_new_domain(const char * domain->profile = profile; domain->quota_warned = false; mb(); /* Avoid out-of-order execution. */ - domain->is_deleted = 0; + domain->is_deleted = false; goto out; } /* No memory reusable. Create using new memory. */ From a2e87d06ddbe6e6fdb8d6d2e5e985efe4efb07dd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:44:59 +0200 Subject: [PATCH 0244/2061] perf_counter: update mmap() counter read, take 2 Update the userspace read method. Paul noted that: - userspace cannot observe ->lock & 1 on the same cpu. - we need a barrier() between reading ->lock and ->index to ensure we read them in that prticular order. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.368446033@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 24 ++++++++++-------------- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f2b914de3f0..e22ab47a2f4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -170,22 +170,18 @@ struct perf_counter_mmap_page { * u32 seq; * s64 count; * - * again: - * seq = pc->lock; - * if (unlikely(seq & 1)) { - * cpu_relax(); - * goto again; - * } + * do { + * seq = pc->lock; * - * if (pc->index) { - * count = pmc_read(pc->index - 1); - * count += pc->offset; - * } else - * goto regular_read; + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; * - * barrier(); - * if (pc->lock != seq) - * goto again; + * barrier(); + * } while (pc->lock != seq); * * NOTE: for obvious reason this only works on self-monitoring * processes. From 9c03d88e328d5f28f13191622c2ea1349c36b799 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:00 +0200 Subject: [PATCH 0245/2061] perf_counter: add more context information Change the callchain context entries to u16, so as to gain some space. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.457320003@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 6 ++---- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e22ab47a2f4..f9d5cf0bfbd 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -507,10 +507,10 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); -#define MAX_STACK_DEPTH 254 +#define MAX_STACK_DEPTH 255 struct perf_callchain_entry { - u32 nr, hv, kernel, user; + u16 nr, hv, kernel, user; u64 ip[MAX_STACK_DEPTH]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2a5d4f52556..727624db507 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1657,9 +1657,7 @@ void perf_counter_do_pending(void) * Callchain support -- arch specific */ -struct perf_callchain_entry * -__attribute__((weak)) -perf_callchain(struct pt_regs *regs) +__weak struct perf_callchain_entry *perf_callchain(struct pt_regs *regs) { return NULL; } @@ -1819,7 +1817,7 @@ void perf_counter_output(struct perf_counter *counter, callchain = perf_callchain(regs); if (callchain) { - callchain_size = (2 + callchain->nr) * sizeof(u64); + callchain_size = (1 + callchain->nr) * sizeof(u64); header.type |= __PERF_EVENT_CALLCHAIN; header.size += callchain_size; From 3c446b3d3b38f991f97e9d2df0ad26a60a94dcff Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:01 +0200 Subject: [PATCH 0246/2061] perf_counter: SIGIO support Provide support for fcntl() I/O availability signals. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.579788800@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 20 +++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f9d5cf0bfbd..8d5d11b8d01 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -238,6 +238,7 @@ enum perf_event_type { #include #include #include +#include #include struct task_struct; @@ -398,6 +399,7 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; + struct fasync_struct *fasync; /* optional: for NMIs */ struct perf_wakeup_entry wakeup; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 727624db507..c58cc64319e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1526,6 +1526,22 @@ out: return ret; } +static int perf_fasync(int fd, struct file *filp, int on) +{ + struct perf_counter *counter = filp->private_data; + struct inode *inode = filp->f_path.dentry->d_inode; + int retval; + + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &counter->fasync); + mutex_unlock(&inode->i_mutex); + + if (retval < 0) + return retval; + + return 0; +} + static const struct file_operations perf_fops = { .release = perf_release, .read = perf_read, @@ -1533,6 +1549,7 @@ static const struct file_operations perf_fops = { .unlocked_ioctl = perf_ioctl, .compat_ioctl = perf_ioctl, .mmap = perf_mmap, + .fasync = perf_fasync, }; /* @@ -1549,7 +1566,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_lock(); data = rcu_dereference(counter->data); if (data) { - (void)atomic_xchg(&data->wakeup, POLL_IN); + atomic_set(&data->wakeup, POLL_IN); /* * Ensure all data writes are issued before updating the * user-space data head information. The matching rmb() @@ -1561,6 +1578,7 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); + kill_fasync(&counter->fasync, SIGIO, POLL_IN); } /* From 671dec5daf3b3c43c5777be282f00120a44cf37f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:02 +0200 Subject: [PATCH 0247/2061] perf_counter: generalize pending infrastructure Prepare the pending infrastructure to do more than wakeups. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.634732847@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++-- kernel/perf_counter.c | 53 ++++++++++++++++++++++-------------- 2 files changed, 36 insertions(+), 24 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8d5d11b8d01..977fb15a53f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -321,8 +321,9 @@ struct perf_mmap_data { void *data_pages[0]; }; -struct perf_wakeup_entry { - struct perf_wakeup_entry *next; +struct perf_pending_entry { + struct perf_pending_entry *next; + void (*func)(struct perf_pending_entry *); }; /** @@ -401,7 +402,7 @@ struct perf_counter { wait_queue_head_t waitq; struct fasync_struct *fasync; /* optional: for NMIs */ - struct perf_wakeup_entry wakeup; + struct perf_pending_entry pending; void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c58cc64319e..0a2ade2e4f1 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1581,6 +1581,14 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } +static void perf_pending_wakeup(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + perf_counter_wakeup(counter); +} + /* * Pending wakeups * @@ -1590,45 +1598,47 @@ void perf_counter_wakeup(struct perf_counter *counter) * single linked list and use cmpxchg() to add entries lockless. */ -#define PENDING_TAIL ((struct perf_wakeup_entry *)-1UL) +#define PENDING_TAIL ((struct perf_pending_entry *)-1UL) -static DEFINE_PER_CPU(struct perf_wakeup_entry *, perf_wakeup_head) = { +static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { PENDING_TAIL, }; -static void perf_pending_queue(struct perf_counter *counter) +static void perf_pending_queue(struct perf_pending_entry *entry, + void (*func)(struct perf_pending_entry *)) { - struct perf_wakeup_entry **head; - struct perf_wakeup_entry *prev, *next; + struct perf_pending_entry **head; - if (cmpxchg(&counter->wakeup.next, NULL, PENDING_TAIL) != NULL) + if (cmpxchg(&entry->next, NULL, PENDING_TAIL) != NULL) return; - head = &get_cpu_var(perf_wakeup_head); + entry->func = func; + + head = &get_cpu_var(perf_pending_head); do { - prev = counter->wakeup.next = *head; - next = &counter->wakeup; - } while (cmpxchg(head, prev, next) != prev); + entry->next = *head; + } while (cmpxchg(head, entry->next, entry) != entry->next); set_perf_counter_pending(); - put_cpu_var(perf_wakeup_head); + put_cpu_var(perf_pending_head); } static int __perf_pending_run(void) { - struct perf_wakeup_entry *list; + struct perf_pending_entry *list; int nr = 0; - list = xchg(&__get_cpu_var(perf_wakeup_head), PENDING_TAIL); + list = xchg(&__get_cpu_var(perf_pending_head), PENDING_TAIL); while (list != PENDING_TAIL) { - struct perf_counter *counter = container_of(list, - struct perf_counter, wakeup); + void (*func)(struct perf_pending_entry *); + struct perf_pending_entry *entry = list; list = list->next; - counter->wakeup.next = NULL; + func = entry->func; + entry->next = NULL; /* * Ensure we observe the unqueue before we issue the wakeup, * so that we won't be waiting forever. @@ -1636,7 +1646,7 @@ static int __perf_pending_run(void) */ smp_wmb(); - perf_counter_wakeup(counter); + func(entry); nr++; } @@ -1658,7 +1668,7 @@ static inline int perf_not_pending(struct perf_counter *counter) * so that we do not miss the wakeup. -- see perf_pending_handle() */ smp_rmb(); - return counter->wakeup.next == NULL; + return counter->pending.next == NULL; } static void perf_pending_sync(struct perf_counter *counter) @@ -1695,9 +1705,10 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { - if (handle->nmi) - perf_pending_queue(handle->counter); - else + if (handle->nmi) { + perf_pending_queue(&handle->counter->pending, + perf_pending_wakeup); + } else perf_counter_wakeup(handle->counter); } From b6276f353bf490add62dcf7db0ebd75baa3e1a37 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:03 +0200 Subject: [PATCH 0248/2061] perf_counter: x86: self-IPI for pending work Implement set_perf_counter_pending() with a self-IPI so that it will run ASAP in a usable context. For now use a second IRQ vector, because the primary vector pokes the apic in funny ways that seem to confuse things. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.724626696@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/entry_arch.h | 1 + arch/x86/include/asm/hardirq.h | 1 + arch/x86/include/asm/hw_irq.h | 1 + arch/x86/include/asm/irq_vectors.h | 5 +++++ arch/x86/include/asm/perf_counter.h | 3 ++- arch/x86/kernel/cpu/perf_counter.c | 14 ++++++++++++++ arch/x86/kernel/entry_64.S | 2 ++ arch/x86/kernel/irq.c | 5 +++++ arch/x86/kernel/irqinit_32.c | 1 + arch/x86/kernel/irqinit_64.c | 1 + 10 files changed, 33 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/entry_arch.h b/arch/x86/include/asm/entry_arch.h index c2e6bedaf25..fe24d280249 100644 --- a/arch/x86/include/asm/entry_arch.h +++ b/arch/x86/include/asm/entry_arch.h @@ -50,6 +50,7 @@ BUILD_INTERRUPT(spurious_interrupt,SPURIOUS_APIC_VECTOR) #ifdef CONFIG_PERF_COUNTERS BUILD_INTERRUPT(perf_counter_interrupt, LOCAL_PERF_VECTOR) +BUILD_INTERRUPT(perf_pending_interrupt, LOCAL_PENDING_VECTOR) #endif #ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/include/asm/hardirq.h b/arch/x86/include/asm/hardirq.h index 25454427cee..f5ebe2aaca4 100644 --- a/arch/x86/include/asm/hardirq.h +++ b/arch/x86/include/asm/hardirq.h @@ -14,6 +14,7 @@ typedef struct { #endif unsigned int generic_irqs; /* arch dependent */ unsigned int apic_perf_irqs; + unsigned int apic_pending_irqs; #ifdef CONFIG_SMP unsigned int irq_resched_count; unsigned int irq_call_count; diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index ae80f64973e..7309c0ad690 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -30,6 +30,7 @@ extern void apic_timer_interrupt(void); extern void generic_interrupt(void); extern void error_interrupt(void); extern void perf_counter_interrupt(void); +extern void perf_pending_interrupt(void); extern void spurious_interrupt(void); extern void thermal_interrupt(void); diff --git a/arch/x86/include/asm/irq_vectors.h b/arch/x86/include/asm/irq_vectors.h index 3cbd79bbb47..545bb811ccb 100644 --- a/arch/x86/include/asm/irq_vectors.h +++ b/arch/x86/include/asm/irq_vectors.h @@ -116,6 +116,11 @@ */ #define GENERIC_INTERRUPT_VECTOR 0xed +/* + * Performance monitoring pending work vector: + */ +#define LOCAL_PENDING_VECTOR 0xec + /* * First APIC vector available to drivers: (vectors 0x30-0xee) we * start at 0x31(0x41) to spread out vectors evenly between priority diff --git a/arch/x86/include/asm/perf_counter.h b/arch/x86/include/asm/perf_counter.h index e2b0e66b235..d08dd52cb8f 100644 --- a/arch/x86/include/asm/perf_counter.h +++ b/arch/x86/include/asm/perf_counter.h @@ -84,7 +84,8 @@ union cpuid10_edx { #define MSR_ARCH_PERFMON_FIXED_CTR2 0x30b #define X86_PMC_IDX_FIXED_BUS_CYCLES (X86_PMC_IDX_FIXED + 2) -#define set_perf_counter_pending() do { } while (0) +extern void set_perf_counter_pending(void); + #define clear_perf_counter_pending() do { } while (0) #define test_perf_counter_pending() (0) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c74e20d593a..438415866fe 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -849,6 +849,20 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_exit(); } +void smp_perf_pending_interrupt(struct pt_regs *regs) +{ + irq_enter(); + ack_APIC_irq(); + inc_irq_stat(apic_pending_irqs); + perf_counter_do_pending(); + irq_exit(); +} + +void set_perf_counter_pending(void) +{ + apic->send_IPI_self(LOCAL_PENDING_VECTOR); +} + void perf_counters_lapic_init(int nmi) { u32 apic_val; diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 3f129d963a0..1d46cba56fd 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1028,6 +1028,8 @@ apicinterrupt SPURIOUS_APIC_VECTOR \ #ifdef CONFIG_PERF_COUNTERS apicinterrupt LOCAL_PERF_VECTOR \ perf_counter_interrupt smp_perf_counter_interrupt +apicinterrupt LOCAL_PENDING_VECTOR \ + perf_pending_interrupt smp_perf_pending_interrupt #endif /* diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 9c2754302ec..d465487da58 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,6 +67,10 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); + seq_printf(p, "PND: "); + for_each_online_cpu(j) + seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); + seq_printf(p, " Performance pending work\n"); #endif if (generic_interrupt_extension) { seq_printf(p, "PLT: "); @@ -171,6 +175,7 @@ u64 arch_irq_stat_cpu(unsigned int cpu) sum += irq_stats(cpu)->apic_timer_irqs; sum += irq_stats(cpu)->irq_spurious_count; sum += irq_stats(cpu)->apic_perf_irqs; + sum += irq_stats(cpu)->apic_pending_irqs; #endif if (generic_interrupt_extension) sum += irq_stats(cpu)->generic_irqs; diff --git a/arch/x86/kernel/irqinit_32.c b/arch/x86/kernel/irqinit_32.c index 925d87cfc55..3190a6b961e 100644 --- a/arch/x86/kernel/irqinit_32.c +++ b/arch/x86/kernel/irqinit_32.c @@ -166,6 +166,7 @@ static void __init apic_intr_init(void) alloc_intr_gate(ERROR_APIC_VECTOR, error_interrupt); # ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); # endif # ifdef CONFIG_X86_MCE_P4THERMAL diff --git a/arch/x86/kernel/irqinit_64.c b/arch/x86/kernel/irqinit_64.c index 665e2ab48ab..53ceb26f80f 100644 --- a/arch/x86/kernel/irqinit_64.c +++ b/arch/x86/kernel/irqinit_64.c @@ -156,6 +156,7 @@ static void __init apic_intr_init(void) /* Performance monitoring interrupt: */ #ifdef CONFIG_PERF_COUNTERS alloc_intr_gate(LOCAL_PERF_VECTOR, perf_counter_interrupt); + alloc_intr_gate(LOCAL_PENDING_VECTOR, perf_pending_interrupt); #endif } From f6c7d5fe58b4846ee0cb4b98b6042489705eced4 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:04 +0200 Subject: [PATCH 0249/2061] perf_counter: theres more to overflow than writing events Prepare for more generic overflow handling. The new perf_counter_overflow() method will handle the generic bits of the counter overflow, and can return a !0 return value, in which case the counter should be (soft) disabled, so that it won't count until it's properly disabled. XXX: do powerpc and swcounter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.812109629@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/x86/kernel/cpu/perf_counter.c | 3 ++- include/linux/perf_counter.h | 4 ++-- kernel/perf_counter.c | 29 +++++++++++++++++++++++------ 4 files changed, 28 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0a4d14f279a..f88c35d0710 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -732,7 +732,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_output(counter, 1, regs); + perf_counter_overflow(counter, 1, regs); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 438415866fe..1116a41bc7b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,8 @@ again: continue; perf_save_and_restart(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + __pmc_generic_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 977fb15a53f..ca2d4df29e0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -491,8 +491,8 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_counter_context *ctx, int cpu); extern void perf_counter_update_userpage(struct perf_counter *counter); -extern void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs); +extern int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs); /* * Return 1 for a software counter, 0 for a hardware counter */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0a2ade2e4f1..195e976eb07 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1800,8 +1800,8 @@ static void perf_output_end(struct perf_output_handle *handle) rcu_read_unlock(); } -void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) +static void perf_counter_output(struct perf_counter *counter, + int nmi, struct pt_regs *regs) { int ret; u64 record_type = counter->hw_event.record_type; @@ -2033,6 +2033,17 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * Generic counter overflow handling. + */ + +int perf_counter_overflow(struct perf_counter *counter, + int nmi, struct pt_regs *regs) +{ + perf_counter_output(counter, nmi, regs); + return 0; +} + /* * Generic software counter infrastructure */ @@ -2077,6 +2088,7 @@ static void perf_swcounter_set_period(struct perf_counter *counter) static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) { + enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; @@ -2092,12 +2104,14 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) !counter->hw_event.exclude_user) regs = task_pt_regs(current); - if (regs) - perf_counter_output(counter, 0, regs); + if (regs) { + if (perf_counter_overflow(counter, 0, regs)) + ret = HRTIMER_NORESTART; + } hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); - return HRTIMER_RESTART; + return ret; } static void perf_swcounter_overflow(struct perf_counter *counter, @@ -2105,7 +2119,10 @@ static void perf_swcounter_overflow(struct perf_counter *counter, { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - perf_counter_output(counter, nmi, regs); + if (perf_counter_overflow(counter, nmi, regs)) + /* soft-disable the counter */ + ; + } static int perf_swcounter_match(struct perf_counter *counter, From ebb3c4c4cb81d64cc041356915ec015e2c57092a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:05 +0200 Subject: [PATCH 0250/2061] perf_counter: fix the mlock accounting Reading through the code I saw I forgot the finish the mlock accounting. Do so now. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094517.899767331@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 195e976eb07..c841563de04 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1461,13 +1461,14 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } } static struct vm_operations_struct perf_mmap_vmops = { - .open = perf_mmap_open, + .open = perf_mmap_open, .close = perf_mmap_close, .fault = perf_mmap_fault, }; @@ -1499,24 +1500,32 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_pgoff != 0) return -EINVAL; - locked = vma_size >> PAGE_SHIFT; - locked += vma->vm_mm->locked_vm; + mutex_lock(&counter->mmap_mutex); + if (atomic_inc_not_zero(&counter->mmap_count)) { + if (nr_pages != counter->data->nr_pages) + ret = -EINVAL; + goto unlock; + } + + locked = vma->vm_mm->locked_vm; + locked += nr_pages + 1; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; - if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) - return -EPERM; - - mutex_lock(&counter->mmap_mutex); - if (atomic_inc_not_zero(&counter->mmap_count)) - goto out; + if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { + ret = -EPERM; + goto unlock; + } WARN_ON(counter->data); ret = perf_mmap_data_alloc(counter, nr_pages); - if (!ret) - atomic_set(&counter->mmap_count, 1); -out: + if (ret) + goto unlock; + + atomic_set(&counter->mmap_count, 1); + vma->vm_mm->locked_vm += nr_pages + 1; +unlock: mutex_unlock(&counter->mmap_mutex); vma->vm_flags &= ~VM_MAYWRITE; From 339f7c90b8a2f3aa2dd4267e79f797999e8a3c59 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:06 +0200 Subject: [PATCH 0251/2061] perf_counter: PERF_RECORD_TIME By popular request, provide means to log a timestamp along with the counter overflow event. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.024173282@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 14 ++++++++++++++ 2 files changed, 16 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ca2d4df29e0..928a7fae096 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -102,6 +102,7 @@ enum perf_counter_record_format { PERF_RECORD_TID = 1U << 1, PERF_RECORD_GROUP = 1U << 2, PERF_RECORD_CALLCHAIN = 1U << 3, + PERF_RECORD_TIME = 1U << 4, }; /* @@ -221,6 +222,7 @@ enum perf_event_type { __PERF_EVENT_TID = PERF_RECORD_TID, __PERF_EVENT_GROUP = PERF_RECORD_GROUP, __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, + __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c841563de04..19990d1f021 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1826,6 +1826,7 @@ static void perf_counter_output(struct perf_counter *counter, } group_entry; struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; + u64 time; header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); @@ -1862,6 +1863,16 @@ static void perf_counter_output(struct perf_counter *counter, } } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= __PERF_EVENT_TIME; + header.size += sizeof(u64); + } + ret = perf_output_begin(&handle, counter, header.size, nmi); if (ret) return; @@ -1895,6 +1906,9 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + perf_output_end(&handle); } From 79f146415623fe74f39af67c0f6adc208939a410 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:07 +0200 Subject: [PATCH 0252/2061] perf_counter: counter overflow limit Provide means to auto-disable the counter after 'n' overflow events. Create the counter with hw_event.disabled = 1, and then issue an ioctl(fd, PREF_COUNTER_IOC_REFRESH, n); to set the limit and enable the counter. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.083139737@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 ++++++--- kernel/perf_counter.c | 51 +++++++++++++++++++++++++++++------- 2 files changed, 50 insertions(+), 13 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 928a7fae096..ef4dcbff75a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -155,8 +155,9 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) +#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) /* * Structure of the page that can be mapped via mmap @@ -403,9 +404,14 @@ struct perf_counter { /* poll related */ wait_queue_head_t waitq; struct fasync_struct *fasync; - /* optional: for NMIs */ + + /* delayed work for NMIs and such */ + int pending_wakeup; + int pending_disable; struct perf_pending_entry pending; + atomic_t event_limit; + void (*destroy)(struct perf_counter *); struct rcu_head rcu_head; #endif diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 19990d1f021..c05e10354bc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -744,6 +744,12 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } +static void perf_counter_refresh(struct perf_counter *counter, int refresh) +{ + atomic_add(refresh, &counter->event_limit); + perf_counter_enable(counter); +} + /* * Enable a counter and all its children. */ @@ -1311,6 +1317,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_DISABLE: perf_counter_disable_family(counter); break; + case PERF_COUNTER_IOC_REFRESH: + perf_counter_refresh(counter, arg); + break; default: err = -ENOTTY; } @@ -1590,14 +1599,6 @@ void perf_counter_wakeup(struct perf_counter *counter) kill_fasync(&counter->fasync, SIGIO, POLL_IN); } -static void perf_pending_wakeup(struct perf_pending_entry *entry) -{ - struct perf_counter *counter = container_of(entry, - struct perf_counter, pending); - - perf_counter_wakeup(counter); -} - /* * Pending wakeups * @@ -1607,6 +1608,22 @@ static void perf_pending_wakeup(struct perf_pending_entry *entry) * single linked list and use cmpxchg() to add entries lockless. */ +static void perf_pending_counter(struct perf_pending_entry *entry) +{ + struct perf_counter *counter = container_of(entry, + struct perf_counter, pending); + + if (counter->pending_disable) { + counter->pending_disable = 0; + perf_counter_disable(counter); + } + + if (counter->pending_wakeup) { + counter->pending_wakeup = 0; + perf_counter_wakeup(counter); + } +} + #define PENDING_TAIL ((struct perf_pending_entry *)-1UL) static DEFINE_PER_CPU(struct perf_pending_entry *, perf_pending_head) = { @@ -1715,8 +1732,9 @@ struct perf_output_handle { static inline void __perf_output_wakeup(struct perf_output_handle *handle) { if (handle->nmi) { + handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, - perf_pending_wakeup); + perf_pending_counter); } else perf_counter_wakeup(handle->counter); } @@ -2063,8 +2081,21 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs) { + int events = atomic_read(&counter->event_limit); + int ret = 0; + + if (events && atomic_dec_and_test(&counter->event_limit)) { + ret = 1; + if (nmi) { + counter->pending_disable = 1; + perf_pending_queue(&counter->pending, + perf_pending_counter); + } else + perf_counter_disable(counter); + } + perf_counter_output(counter, nmi, regs); - return 0; + return ret; } /* From 0c593b3411341e3a05a61f5527df36ab02bd11e8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:08 +0200 Subject: [PATCH 0253/2061] perf_counter: comment the perf_event_type stuff Describe the event format. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.211174347@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index ef4dcbff75a..81220188d05 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -208,6 +208,20 @@ struct perf_event_header { enum perf_event_type { + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, @@ -217,6 +231,24 @@ enum perf_event_type { * * These events will have types of the form: * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && __PERF_EVENT_IP + * { u32 pid, tid; } && __PERF_EVENT_TID + * + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * + * { u64 time; } && __PERF_EVENT_TIME + * }; */ PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, __PERF_EVENT_IP = PERF_RECORD_IP, From 4c9e25428ff46b968a30f1dfafdba550cb6e4141 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:09 +0200 Subject: [PATCH 0254/2061] perf_counter: change event definition Currently the definition of an event is slightly ambiguous. We have wakeup events, for poll() and SIGIO, which are either generated when a record crosses a page boundary (hw_events.wakeup_events == 0), or every wakeup_events new records. Now a record can be either a counter overflow record, or a number of different things, like the mmap PROT_EXEC region notifications. Then there is the PERF_COUNTER_IOC_REFRESH event limit, which only considers counter overflows. This patch changes then wakeup_events and SIGIO notification to only consider overflow events. Furthermore it changes the SIGIO notification to report SIGHUP when the event limit is reached and the counter will be disabled. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.266679874@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 81220188d05..0f5a4005048 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -439,6 +439,7 @@ struct perf_counter { /* delayed work for NMIs and such */ int pending_wakeup; + int pending_kill; int pending_disable; struct perf_pending_entry pending; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c05e10354bc..8c8eaf0625f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1596,7 +1596,11 @@ void perf_counter_wakeup(struct perf_counter *counter) rcu_read_unlock(); wake_up_all(&counter->waitq); - kill_fasync(&counter->fasync, SIGIO, POLL_IN); + + if (counter->pending_kill) { + kill_fasync(&counter->fasync, SIGIO, counter->pending_kill); + counter->pending_kill = 0; + } } /* @@ -1727,6 +1731,7 @@ struct perf_output_handle { unsigned int head; int wakeup; int nmi; + int overflow; }; static inline void __perf_output_wakeup(struct perf_output_handle *handle) @@ -1741,7 +1746,7 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, - int nmi) + int nmi, int overflow) { struct perf_mmap_data *data; unsigned int offset, head; @@ -1751,8 +1756,9 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; - handle->counter = counter; - handle->nmi = nmi; + handle->counter = counter; + handle->nmi = nmi; + handle->overflow = overflow; if (!data->nr_pages) goto fail; @@ -1816,7 +1822,7 @@ static void perf_output_end(struct perf_output_handle *handle) { int wakeup_events = handle->counter->hw_event.wakeup_events; - if (wakeup_events) { + if (handle->overflow && wakeup_events) { int events = atomic_inc_return(&handle->data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &handle->data->events); @@ -1891,7 +1897,7 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } - ret = perf_output_begin(&handle, counter, header.size, nmi); + ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1955,7 +1961,7 @@ static void perf_counter_mmap_output(struct perf_counter *counter, { struct perf_output_handle handle; int size = mmap_event->event.header.size; - int ret = perf_output_begin(&handle, counter, size, 0); + int ret = perf_output_begin(&handle, counter, size, 0, 0); if (ret) return; @@ -2084,8 +2090,10 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; + counter->pending_kill = POLL_HUP; if (nmi) { counter->pending_disable = 1; perf_pending_queue(&counter->pending, From 4af4998b8aa35600f4c4a4f3c3a23baca6081d02 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:10 +0200 Subject: [PATCH 0255/2061] perf_counter: rework context time Since perf_counter_context is switched along with tasks, we can maintain the context time without using the task runtime clock. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.353552838@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++--- kernel/perf_counter.c | 78 ++++++++++++++++-------------------- 2 files changed, 37 insertions(+), 51 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0f5a4005048..7f5d353d78a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -477,14 +477,10 @@ struct perf_counter_context { struct task_struct *task; /* - * time_now is the current time in nanoseconds since an arbitrary - * point in the past. For per-task counters, this is based on the - * task clock, and for per-cpu counters it is based on the cpu clock. - * time_lost is an offset from the task/cpu clock, used to make it - * appear that time only passes while the context is scheduled in. + * Context clock, runs when context enabled. */ - u64 time_now; - u64 time_lost; + u64 time; + u64 timestamp; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8c8eaf0625f..84d85ab4e16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -117,7 +117,7 @@ counter_sched_out(struct perf_counter *counter, return; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_stopped = ctx->time; counter->hw_ops->disable(counter); counter->oncpu = -1; @@ -253,27 +253,20 @@ retry: spin_unlock_irq(&ctx->lock); } -/* - * Get the current time for this context. - * If this is a task context, we use the task's task clock, - * or for a per-cpu context, we use the cpu clock. - */ -static u64 get_context_time(struct perf_counter_context *ctx, int update) +static inline u64 perf_clock(void) { - struct task_struct *curr = ctx->task; - - if (!curr) - return cpu_clock(smp_processor_id()); - - return __task_delta_exec(curr, update) + curr->se.sum_exec_runtime; + return cpu_clock(smp_processor_id()); } /* * Update the record of the current time in a context. */ -static void update_context_time(struct perf_counter_context *ctx, int update) +static void update_context_time(struct perf_counter_context *ctx) { - ctx->time_now = get_context_time(ctx, update) - ctx->time_lost; + u64 now = perf_clock(); + + ctx->time += now - ctx->timestamp; + ctx->timestamp = now; } /* @@ -284,15 +277,17 @@ static void update_counter_times(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; u64 run_end; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - counter->total_time_enabled = ctx->time_now - - counter->tstamp_enabled; - if (counter->state == PERF_COUNTER_STATE_INACTIVE) - run_end = counter->tstamp_stopped; - else - run_end = ctx->time_now; - counter->total_time_running = run_end - counter->tstamp_running; - } + if (counter->state < PERF_COUNTER_STATE_INACTIVE) + return; + + counter->total_time_enabled = ctx->time - counter->tstamp_enabled; + + if (counter->state == PERF_COUNTER_STATE_INACTIVE) + run_end = counter->tstamp_stopped; + else + run_end = ctx->time; + + counter->total_time_running = run_end - counter->tstamp_running; } /* @@ -332,7 +327,7 @@ static void __perf_counter_disable(void *info) * If it is in error state, leave it in error state. */ if (counter->state >= PERF_COUNTER_STATE_INACTIVE) { - update_context_time(ctx, 1); + update_context_time(ctx); update_counter_times(counter); if (counter == counter->group_leader) group_sched_out(counter, cpuctx, ctx); @@ -426,7 +421,7 @@ counter_sched_in(struct perf_counter *counter, return -EAGAIN; } - counter->tstamp_running += ctx->time_now - counter->tstamp_stopped; + counter->tstamp_running += ctx->time - counter->tstamp_stopped; if (!is_software_counter(counter)) cpuctx->active_oncpu++; @@ -493,9 +488,9 @@ static void add_counter_to_ctx(struct perf_counter *counter, list_add_counter(counter, ctx); ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; - counter->tstamp_enabled = ctx->time_now; - counter->tstamp_running = ctx->time_now; - counter->tstamp_stopped = ctx->time_now; + counter->tstamp_enabled = ctx->time; + counter->tstamp_running = ctx->time; + counter->tstamp_stopped = ctx->time; } /* @@ -522,7 +517,7 @@ static void __perf_install_in_context(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); /* * Protect the list operation against NMI by disabling the @@ -648,13 +643,13 @@ static void __perf_counter_enable(void *info) curr_rq_lock_irq_save(&flags); spin_lock(&ctx->lock); - update_context_time(ctx, 1); + update_context_time(ctx); counter->prev_state = counter->state; if (counter->state >= PERF_COUNTER_STATE_INACTIVE) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - counter->total_time_enabled; + counter->tstamp_enabled = ctx->time - counter->total_time_enabled; /* * If the counter is in a group and isn't the group leader, @@ -737,8 +732,8 @@ static void perf_counter_enable(struct perf_counter *counter) */ if (counter->state == PERF_COUNTER_STATE_OFF) { counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; } out: spin_unlock_irq(&ctx->lock); @@ -778,7 +773,7 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, ctx->is_active = 0; if (likely(!ctx->nr_counters)) goto out; - update_context_time(ctx, 0); + update_context_time(ctx); flags = hw_perf_save_disable(); if (ctx->nr_active) { @@ -883,12 +878,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (likely(!ctx->nr_counters)) goto out; - /* - * Add any time since the last sched_out to the lost time - * so it doesn't get included in the total_time_enabled and - * total_time_running measures for counters in the context. - */ - ctx->time_lost = get_context_time(ctx, 0) - ctx->time_now; + ctx->timestamp = perf_clock(); flags = hw_perf_save_disable(); @@ -1043,8 +1033,8 @@ int perf_counter_task_enable(void) if (counter->state > PERF_COUNTER_STATE_OFF) continue; counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = ctx->time_now - - counter->total_time_enabled; + counter->tstamp_enabled = + ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } hw_perf_restore(perf_flags); @@ -1113,7 +1103,7 @@ static void __read(void *info) curr_rq_lock_irq_save(&flags); if (ctx->is_active) - update_context_time(ctx, 1); + update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); curr_rq_unlock_irq_restore(&flags); From a39d6f2556c4a19f58f538c6aa28bf8faca4fcb8 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:11 +0200 Subject: [PATCH 0256/2061] perf_counter: rework the task clock software counter Rework the task clock software counter to use the context time instead of the task runtime clock, this removes the last such user. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.445450972@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 42 ++++++++++++------------------------------ 1 file changed, 12 insertions(+), 30 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84d85ab4e16..56b7eb53d67 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -974,9 +974,6 @@ int perf_counter_task_disable(void) curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -1017,9 +1014,6 @@ int perf_counter_task_enable(void) curr_rq_lock_irq_save(&flags); cpu = smp_processor_id(); - /* force the update of the task clock: */ - __task_delta_exec(curr, 1); - perf_counter_task_sched_out(curr, cpu); spin_lock(&ctx->lock); @@ -2347,38 +2341,28 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -/* - * Called from within the scheduler: - */ -static u64 task_clock_perf_counter_val(struct perf_counter *counter, int update) +static void task_clock_perf_counter_update(struct perf_counter *counter) { - struct task_struct *curr = counter->task; - u64 delta; - - delta = __task_delta_exec(curr, update); - - return curr->se.sum_exec_runtime + delta; -} - -static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) -{ - u64 prev; + u64 prev, now; s64 delta; - prev = atomic64_read(&counter->hw.prev_count); - - atomic64_set(&counter->hw.prev_count, now); + update_context_time(counter->ctx); + now = counter->ctx->time; + prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; - atomic64_add(delta, &counter->count); } static int task_clock_perf_counter_enable(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; + u64 now; - atomic64_set(&hwc->prev_count, task_clock_perf_counter_val(counter, 0)); + update_context_time(counter->ctx); + now = counter->ctx->time; + + atomic64_set(&hwc->prev_count, now); hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { @@ -2393,14 +2377,12 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 0)); + task_clock_perf_counter_update(counter); } static void task_clock_perf_counter_read(struct perf_counter *counter) { - task_clock_perf_counter_update(counter, - task_clock_perf_counter_val(counter, 1)); + task_clock_perf_counter_update(counter); } static const struct hw_perf_counter_ops perf_ops_task_clock = { From 849691a6cd40270ff5f4a8846d5f6bf8df663ffc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:12 +0200 Subject: [PATCH 0257/2061] perf_counter: remove rq->lock usage Now that all the task runtime clock users are gone, remove the ugly rq->lock usage from perf counters, which solves the nasty deadlock seen when a software task clock counter was read from an NMI overflow context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.531137582@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 2 -- kernel/perf_counter.c | 42 ++++++++++++++----------------------- kernel/sched.c | 20 ------------------ 3 files changed, 16 insertions(+), 48 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index b6d2887a5d8..080d1fd461d 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,8 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern void curr_rq_lock_irq_save(unsigned long *flags); -extern void curr_rq_unlock_irq_restore(unsigned long *flags); extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 56b7eb53d67..f4f7596f784 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -172,8 +172,7 @@ static void __perf_counter_remove_from_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); counter_sched_out(counter, cpuctx, ctx); @@ -198,8 +197,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } @@ -319,8 +317,7 @@ static void __perf_counter_disable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); /* * If the counter is on, turn it off. @@ -336,8 +333,7 @@ static void __perf_counter_disable(void *info) counter->state = PERF_COUNTER_STATE_OFF; } - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -515,8 +511,7 @@ static void __perf_install_in_context(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); /* @@ -565,8 +560,7 @@ static void __perf_install_in_context(void *info) unlock: hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -641,8 +635,7 @@ static void __perf_counter_enable(void *info) if (ctx->task && cpuctx->task_ctx != ctx) return; - curr_rq_lock_irq_save(&flags); - spin_lock(&ctx->lock); + spin_lock_irqsave(&ctx->lock, flags); update_context_time(ctx); counter->prev_state = counter->state; @@ -678,8 +671,7 @@ static void __perf_counter_enable(void *info) } unlock: - spin_unlock(&ctx->lock); - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); } /* @@ -971,7 +963,7 @@ int perf_counter_task_disable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -992,9 +984,7 @@ int perf_counter_task_disable(void) hw_perf_restore(perf_flags); - spin_unlock(&ctx->lock); - - curr_rq_unlock_irq_restore(&flags); + spin_unlock_irqrestore(&ctx->lock, flags); return 0; } @@ -1011,7 +1001,7 @@ int perf_counter_task_enable(void) if (likely(!ctx->nr_counters)) return 0; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); cpu = smp_processor_id(); perf_counter_task_sched_out(curr, cpu); @@ -1037,7 +1027,7 @@ int perf_counter_task_enable(void) perf_counter_task_sched_in(curr, cpu); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); return 0; } @@ -1095,12 +1085,12 @@ static void __read(void *info) struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); counter->hw_ops->read(counter); update_counter_times(counter); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } static u64 perf_counter_read(struct perf_counter *counter) @@ -2890,7 +2880,7 @@ __perf_counter_exit_task(struct task_struct *child, * Be careful about zapping the list - IRQ/NMI context * could still be processing it: */ - curr_rq_lock_irq_save(&flags); + local_irq_save(flags); perf_flags = hw_perf_save_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -2903,7 +2893,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; hw_perf_restore(perf_flags); - curr_rq_unlock_irq_restore(&flags); + local_irq_restore(flags); } parent_counter = child_counter->parent; diff --git a/kernel/sched.c b/kernel/sched.c index f76e3c0188a..0de2f814fb1 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -997,26 +997,6 @@ static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) } } -void curr_rq_lock_irq_save(unsigned long *flags) - __acquires(rq->lock) -{ - struct rq *rq; - - local_irq_save(*flags); - rq = cpu_rq(smp_processor_id()); - spin_lock(&rq->lock); -} - -void curr_rq_unlock_irq_restore(unsigned long *flags) - __releases(rq->lock) -{ - struct rq *rq; - - rq = cpu_rq(smp_processor_id()); - spin_unlock(&rq->lock); - local_irq_restore(*flags); -} - void task_rq_unlock_wait(struct task_struct *p) { struct rq *rq = task_rq(p); From bce379bf358738ab8efc8cda2594a299ac685085 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 6 Apr 2009 11:45:13 +0200 Subject: [PATCH 0258/2061] perf_counter: minimize context time updates Push the update_context_time() calls up the stack so that we get less invokations and thereby a less noisy output: before: # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null Performance counter stats for 'ls': 10.163691 cpu clock ticks (msecs) (scaled from 98.94%) 10.215360 task clock ticks (msecs) (scaled from 98.18%) 10.185549 task clock ticks (msecs) (scaled from 98.53%) 10.183581 task clock ticks (msecs) (scaled from 98.71%) Wall-clock time elapsed: 11.912858 msecs after: # ./perfstat -e 1:0 -e 1:1 -e 1:1 -e 1:1 -l ls > /dev/null Performance counter stats for 'ls': 9.316630 cpu clock ticks (msecs) 9.280789 task clock ticks (msecs) 9.280789 task clock ticks (msecs) 9.280789 task clock ticks (msecs) Wall-clock time elapsed: 9.574872 msecs Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090406094518.618876874@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4f7596f784..863703b3158 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -319,6 +319,8 @@ static void __perf_counter_disable(void *info) spin_lock_irqsave(&ctx->lock, flags); + update_context_time(ctx); + /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. @@ -797,6 +799,8 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) if (likely(!cpuctx->task_ctx)) return; + update_context_time(ctx); + regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); __perf_counter_sched_out(ctx, cpuctx); @@ -2336,7 +2340,6 @@ static void task_clock_perf_counter_update(struct perf_counter *counter) u64 prev, now; s64 delta; - update_context_time(counter->ctx); now = counter->ctx->time; prev = atomic64_xchg(&counter->hw.prev_count, now); @@ -2349,7 +2352,6 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; u64 now; - update_context_time(counter->ctx); now = counter->ctx->time; atomic64_set(&hwc->prev_count, now); @@ -2372,6 +2374,7 @@ static void task_clock_perf_counter_disable(struct perf_counter *counter) static void task_clock_perf_counter_read(struct perf_counter *counter) { + update_context_time(counter->ctx); task_clock_perf_counter_update(counter); } From 6278af660ff83fbafb18e53fc2747eb2ee6780fa Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Thu, 2 Apr 2009 10:40:28 +0200 Subject: [PATCH 0259/2061] perf_counter tools: kerneltop: display per function percentage along with event count ------------------------------------------------------------------------------ KernelTop: 90551 irqs/sec kernel:15.0% [NMI, 100000 CPU cycles], (all, 4 CPUs) ------------------------------------------------------------------------------ events pcnt RIP kernel function ______ ______ _____ ________________ _______________ 16871.00 - 19.1% - ffffffff80328e20 : clear_page_c 8810.00 - 9.9% - ffffffff8048ce80 : page_fault 4746.00 - 5.4% - ffffffff8048cae2 : _spin_lock 4428.00 - 5.0% - ffffffff80328e70 : copy_page_c 3340.00 - 3.8% - ffffffff80329090 : copy_user_generic_string! 2679.00 - 3.0% - ffffffff8028a16b : get_page_from_freelist 2254.00 - 2.5% - ffffffff80296f19 : unmap_vmas 2082.00 - 2.4% - ffffffff80297e19 : handle_mm_fault 1754.00 - 2.0% - ffffffff80288dc8 : __rmqueue_smallest 1553.00 - 1.8% - ffffffff8048ca58 : _spin_lock_irqsave 1400.00 - 1.6% - ffffffff8028cdc8 : release_pages 1337.00 - 1.5% - ffffffff80285400 : find_get_page 1335.00 - 1.5% - ffffffff80225a23 : do_page_fault 1299.00 - 1.5% - ffffffff802ba8e7 : __d_lookup 1174.00 - 1.3% - ffffffff802b38f3 : __link_path_walk 1155.00 - 1.3% - ffffffff802843e1 : perf_swcounter_ctx_event! 1137.00 - 1.3% - ffffffff8028d118 : ____pagevec_lru_add 963.00 - 1.1% - ffffffff802a670b : kmem_cache_alloc 885.00 - 1.0% - ffffffff8024bc61 : __wake_up_bit Display per function percentage along with event count. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 45 +++++++++++++------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 4f8d7917aba..15f3a5f9019 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -636,16 +636,20 @@ static void print_sym_table(void) int counter; float events_per_sec = events/delay_secs; float kevents_per_sec = (events-userspace_events)/delay_secs; + float sum_kevents = 0.0; events = userspace_events = 0; memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); + for (i = 0; i < sym_table_count && tmp[i].count[0]; i++) + sum_kevents += tmp[i].count[0]; + write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); printf( "------------------------------------------------------------------------------\n"); - printf( " KernelTop:%8.0f irqs/sec kernel:%3.1f%% [%s, ", + printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ", events_per_sec, 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), nmi ? "NMI" : "IRQ"); @@ -679,34 +683,31 @@ static void print_sym_table(void) printf("------------------------------------------------------------------------------\n\n"); if (nr_counters == 1) - printf(" events"); + printf(" events pcnt"); else - printf(" weight events"); + printf(" weight events pcnt"); printf(" RIP kernel function\n" - " ______ ______ ________________ _______________\n\n" + " ______ ______ _____ ________________ _______________\n\n" ); - printed = 0; - for (i = 0; i < sym_table_count; i++) { + for (i = 0, printed = 0; i < sym_table_count; i++) { + float pcnt; int count; - if (nr_counters == 1) { - if (printed <= 18 && - tmp[i].count[0] >= count_filter) { - printf("%19.2f - %016llx : %s\n", - sym_weight(tmp + i), tmp[i].addr, tmp[i].sym); - printed++; - } - } else { - if (printed <= 18 && - tmp[i].count[0] >= count_filter) { - printf("%8.1f %10ld - %016llx : %s\n", - sym_weight(tmp + i), - tmp[i].count[0], - tmp[i].addr, tmp[i].sym); - printed++; - } + if (printed <= 18 && tmp[i].count[0] >= count_filter) { + pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); + + if (nr_counters == 1) + printf("%19.2f - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + pcnt, tmp[i].addr, tmp[i].sym); + else + printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + tmp[i].count[0], + pcnt, tmp[i].addr, tmp[i].sym); + printed++; } /* * Add decay to the counts: From 98c2aaf8be5baf7193be37fb28bce8e7327158bc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 7 Apr 2009 11:30:17 +0200 Subject: [PATCH 0260/2061] x86, perfcounters: add atomic64_xchg() Complete atomic64_t support on the 32-bit side by adding atomic64_xch(). Cc: Peter Zijlstra LKML-Reference: <20090406094518.445450972@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic_32.h | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/arch/x86/include/asm/atomic_32.h b/arch/x86/include/asm/atomic_32.h index 977250ed8b8..aff9f1fcdcd 100644 --- a/arch/x86/include/asm/atomic_32.h +++ b/arch/x86/include/asm/atomic_32.h @@ -290,6 +290,28 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, return cmpxchg8b(&ptr->counter, old_val, new_val); } +/** + * atomic64_xchg - xchg atomic64 variable + * @ptr: pointer to type atomic64_t + * @new_val: value to assign + * @old_val: old value that was there + * + * Atomically xchgs the value of @ptr to @new_val and returns + * the old value. + */ + +static inline unsigned long long +atomic64_xchg(atomic64_t *ptr, unsigned long long new_val) +{ + unsigned long long old_val; + + do { + old_val = atomic_read(ptr); + } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + + return old_val; +} + /** * atomic64_set - set atomic64 variable * @ptr: pointer to type atomic64_t @@ -299,11 +321,7 @@ atomic64_cmpxchg(atomic64_t *ptr, unsigned long long old_val, */ static inline void atomic64_set(atomic64_t *ptr, unsigned long long new_val) { - unsigned long long old_val; - - do { - old_val = atomic_read(ptr); - } while (atomic64_cmpxchg(ptr, old_val, new_val) != old_val); + atomic64_xchg(ptr, new_val); } /** From cac94f979326212831c0ea44ed9ea1622b4f4e93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:33 +0200 Subject: [PATCH 0261/2061] x86, bts: fix race when bts tracer is removed When the bts tracer is removed while the traced task is running, the write to clear the bts tracer pointer races with context switch code. Read the tracer once during a context switch. When a new tracer is installed, the bts tracer is set in the ds context before the tracer is initialized in order to claim the context for that tracer. This may result in write accesses using an uninitialized trace configuration when scheduling timestamps have been requested. Store active tracing flags separately and only set active flags after the tracing configuration has been initialized. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144548.881338000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 58 ++++++++++++++++++++++++++++---------------- 1 file changed, 37 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index b1d6e1f502f..c730155bf54 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -89,6 +89,9 @@ struct bts_tracer { /* Buffer overflow notification function: */ bts_ovfl_callback_t ovfl; + + /* Active flags affecting trace collection. */ + unsigned int flags; }; struct pebs_tracer { @@ -799,6 +802,8 @@ void ds_suspend_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = 0; + task = tracer->ds.context->task; if (!task || (task == current)) @@ -820,6 +825,8 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!tracer) return; + tracer->flags = tracer->trace.ds.flags; + task = tracer->ds.context->task; control = ds_cfg.ctl[dsf_bts]; @@ -1037,43 +1044,52 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) } } +static inline void ds_take_timestamp(struct ds_context *context, + enum bts_qualifier qualifier, + struct task_struct *task) +{ + struct bts_tracer *tracer = context->bts_master; + struct bts_struct ts; + + /* Prevent compilers from reading the tracer pointer twice. */ + barrier(); + + if (!tracer || !(tracer->flags & BTS_TIMESTAMPS)) + return; + + memset(&ts, 0, sizeof(ts)); + ts.qualifier = qualifier; + ts.variant.timestamp.jiffies = jiffies_64; + ts.variant.timestamp.pid = task->pid; + + bts_write(tracer, &ts); +} + /* * Change the DS configuration from tracing prev to tracing next. */ void ds_switch_to(struct task_struct *prev, struct task_struct *next) { - struct ds_context *prev_ctx = prev->thread.ds_ctx; - struct ds_context *next_ctx = next->thread.ds_ctx; + struct ds_context *prev_ctx = prev->thread.ds_ctx; + struct ds_context *next_ctx = next->thread.ds_ctx; + unsigned long debugctlmsr = next->thread.debugctlmsr; + + /* Make sure all data is read before we start. */ + barrier(); if (prev_ctx) { update_debugctlmsr(0); - if (prev_ctx->bts_master && - (prev_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_departs, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = prev->pid - }; - bts_write(prev_ctx->bts_master, &ts); - } + ds_take_timestamp(prev_ctx, bts_task_departs, prev); } if (next_ctx) { - if (next_ctx->bts_master && - (next_ctx->bts_master->trace.ds.flags & BTS_TIMESTAMPS)) { - struct bts_struct ts = { - .qualifier = bts_task_arrives, - .variant.timestamp.jiffies = jiffies_64, - .variant.timestamp.pid = next->pid - }; - bts_write(next_ctx->bts_master, &ts); - } + ds_take_timestamp(next_ctx, bts_task_arrives, next); wrmsrl(MSR_IA32_DS_AREA, (unsigned long)next_ctx->ds); } - update_debugctlmsr(next->thread.debugctlmsr); + update_debugctlmsr(debugctlmsr); } void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) From a26b89f05d194413c7238e0bea071054f6b5d3c8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:34 +0200 Subject: [PATCH 0262/2061] sched, hw-branch-tracer: add wait_task_context_switch() function to sched.h Add a function to wait until some other task has been switched out at least once. This differs from wait_task_inactive() subtly, in that the latter will wait until the task has left the CPU. Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144549.794157000@intel.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 2 ++ kernel/sched.c | 43 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 45 insertions(+) diff --git a/include/linux/sched.h b/include/linux/sched.h index b94f3541f67..a5b9a83065f 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1993,8 +1993,10 @@ extern void set_task_comm(struct task_struct *tsk, char *from); extern char *get_task_comm(char *to, struct task_struct *tsk); #ifdef CONFIG_SMP +extern void wait_task_context_switch(struct task_struct *p); extern unsigned long wait_task_inactive(struct task_struct *, long match_state); #else +static inline void wait_task_context_switch(struct task_struct *p) {} static inline unsigned long wait_task_inactive(struct task_struct *p, long match_state) { diff --git a/kernel/sched.c b/kernel/sched.c index 6cc1fd5d507..f91bc8141dc 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2002,6 +2002,49 @@ migrate_task(struct task_struct *p, int dest_cpu, struct migration_req *req) return 1; } +/* + * wait_task_context_switch - wait for a thread to complete at least one + * context switch. + * + * @p must not be current. + */ +void wait_task_context_switch(struct task_struct *p) +{ + unsigned long nvcsw, nivcsw, flags; + int running; + struct rq *rq; + + nvcsw = p->nvcsw; + nivcsw = p->nivcsw; + for (;;) { + /* + * The runqueue is assigned before the actual context + * switch. We need to take the runqueue lock. + * + * We could check initially without the lock but it is + * very likely that we need to take the lock in every + * iteration. + */ + rq = task_rq_lock(p, &flags); + running = task_running(rq, p); + task_rq_unlock(rq, &flags); + + if (likely(!running)) + break; + /* + * The switch count is incremented before the actual + * context switch. We thus wait for two switches to be + * sure at least one completed. + */ + if ((p->nvcsw - nvcsw) > 1) + break; + if ((p->nivcsw - nivcsw) > 1) + break; + + cpu_relax(); + } +} + /* * wait_task_inactive - wait for a thread to unschedule. * From e2b371f00a6f529f6362654239bdec8dcd510760 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:35 +0200 Subject: [PATCH 0263/2061] mm, x86, ptrace, bts: defer branch trace stopping When a ptraced task is unlinked, we need to stop branch tracing for that task. Since the unlink is called with interrupts disabled, and we need interrupts enabled to stop branch tracing, we defer the work. Collect all branch tracing related stuff in a branch tracing context. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: Andrew Morton Cc: Peter Zijlstra Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144550.712401000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 4 - arch/x86/kernel/ptrace.c | 254 ++++++++++++++++++++----------- include/linux/mm.h | 3 +- include/linux/sched.h | 9 +- mm/mlock.c | 13 +- 5 files changed, 179 insertions(+), 104 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2f..2483807e06e 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -458,10 +458,6 @@ struct thread_struct { /* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ struct ds_context *ds_ctx; #endif /* CONFIG_X86_DS */ -#ifdef CONFIG_X86_PTRACE_BTS -/* the signal to send on a bts buffer overflow */ - unsigned int bts_ovfl_signal; -#endif /* CONFIG_X86_PTRACE_BTS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index fe9345c967d..7c21d1e8cae 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -577,17 +578,119 @@ static int ioperm_get(struct task_struct *target, } #ifdef CONFIG_X86_PTRACE_BTS +/* + * A branch trace store context. + * + * Contexts may only be installed by ptrace_bts_config() and only for + * ptraced tasks. + * + * Contexts are destroyed when the tracee is detached from the tracer. + * The actual destruction work requires interrupts enabled, so the + * work is deferred and will be scheduled during __ptrace_unlink(). + * + * Contexts hold an additional task_struct reference on the traced + * task, as well as a reference on the tracer's mm. + * + * Ptrace already holds a task_struct for the duration of ptrace operations, + * but since destruction is deferred, it may be executed after both + * tracer and tracee exited. + */ +struct bts_context { + /* The branch trace handle. */ + struct bts_tracer *tracer; + + /* The buffer used to store the branch trace and its size. */ + void *buffer; + unsigned int size; + + /* The mm that paid for the above buffer. */ + struct mm_struct *mm; + + /* The task this context belongs to. */ + struct task_struct *task; + + /* The signal to send on a bts buffer overflow. */ + unsigned int bts_ovfl_signal; + + /* The work struct to destroy a context. */ + struct work_struct work; +}; + +static inline void alloc_bts_buffer(struct bts_context *context, + unsigned int size) +{ + void *buffer; + + buffer = alloc_locked_buffer(size); + if (buffer) { + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + } +} + +static inline void free_bts_buffer(struct bts_context *context) +{ + if (!context->buffer) + return; + + kfree(context->buffer); + context->buffer = NULL; + + refund_locked_buffer_memory(context->mm, context->size); + context->size = 0; + + mmput(context->mm); + context->mm = NULL; +} + +static void free_bts_context_work(struct work_struct *w) +{ + struct bts_context *context; + + context = container_of(w, struct bts_context, work); + + ds_release_bts(context->tracer); + put_task_struct(context->task); + free_bts_buffer(context); + kfree(context); +} + +static inline void free_bts_context(struct bts_context *context) +{ + INIT_WORK(&context->work, free_bts_context_work); + schedule_work(&context->work); +} + +static inline struct bts_context *alloc_bts_context(struct task_struct *task) +{ + struct bts_context *context = kzalloc(sizeof(*context), GFP_KERNEL); + if (context) { + context->task = task; + task->bts = context; + + get_task_struct(task); + } + + return context; +} + static int ptrace_bts_read_record(struct task_struct *child, size_t index, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; struct bts_struct bts; const unsigned char *at; int error; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; at = trace->ds.top - ((index + 1) * trace->ds.size); if ((void *)at < trace->ds.begin) @@ -596,7 +699,7 @@ static int ptrace_bts_read_record(struct task_struct *child, size_t index, if (!trace->read) return -EOPNOTSUPP; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -610,13 +713,18 @@ static int ptrace_bts_drain(struct task_struct *child, long size, struct bts_struct __user *out) { + struct bts_context *context; const struct bts_trace *trace; const unsigned char *at; int error, drained = 0; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; if (!trace->read) return -EOPNOTSUPP; @@ -627,9 +735,8 @@ static int ptrace_bts_drain(struct task_struct *child, for (at = trace->ds.begin; (void *)at < trace->ds.top; out++, drained++, at += trace->ds.size) { struct bts_struct bts; - int error; - error = trace->read(child->bts, at, &bts); + error = trace->read(context->tracer, at, &bts); if (error < 0) return error; @@ -639,35 +746,18 @@ static int ptrace_bts_drain(struct task_struct *child, memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - error = ds_reset_bts(child->bts); + error = ds_reset_bts(context->tracer); if (error < 0) return error; return drained; } -static int ptrace_bts_allocate_buffer(struct task_struct *child, size_t size) -{ - child->bts_buffer = alloc_locked_buffer(size); - if (!child->bts_buffer) - return -ENOMEM; - - child->bts_size = size; - - return 0; -} - -static void ptrace_bts_free_buffer(struct task_struct *child) -{ - free_locked_buffer(child->bts_buffer, child->bts_size); - child->bts_buffer = NULL; - child->bts_size = 0; -} - static int ptrace_bts_config(struct task_struct *child, long cfg_size, const struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; struct ptrace_bts_config cfg; unsigned int flags = 0; @@ -677,28 +767,31 @@ static int ptrace_bts_config(struct task_struct *child, if (copy_from_user(&cfg, ucfg, sizeof(cfg))) return -EFAULT; - if (child->bts) { - ds_release_bts(child->bts); - child->bts = NULL; - } + context = child->bts; + if (!context) + context = alloc_bts_context(child); + if (!context) + return -ENOMEM; if (cfg.flags & PTRACE_BTS_O_SIGNAL) { if (!cfg.signal) return -EINVAL; - child->thread.bts_ovfl_signal = cfg.signal; return -EOPNOTSUPP; + context->bts_ovfl_signal = cfg.signal; } - if ((cfg.flags & PTRACE_BTS_O_ALLOC) && - (cfg.size != child->bts_size)) { - int error; + ds_release_bts(context->tracer); + context->tracer = NULL; - ptrace_bts_free_buffer(child); + if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + free_bts_buffer(context); + if (!cfg.size) + return 0; - error = ptrace_bts_allocate_buffer(child, cfg.size); - if (error < 0) - return error; + alloc_bts_buffer(context, cfg.size); + if (!context->buffer) + return -ENOMEM; } if (cfg.flags & PTRACE_BTS_O_TRACE) @@ -707,15 +800,13 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - child->bts = ds_request_bts(child, child->bts_buffer, child->bts_size, - /* ovfl = */ NULL, /* th = */ (size_t)-1, - flags); - if (IS_ERR(child->bts)) { - int error = PTR_ERR(child->bts); - - ptrace_bts_free_buffer(child); - child->bts = NULL; + context->tracer = ds_request_bts(child, context->buffer, context->size, + NULL, (size_t)-1, flags); + if (unlikely(IS_ERR(context->tracer))) { + int error = PTR_ERR(context->tracer); + free_bts_buffer(context); + context->tracer = NULL; return error; } @@ -726,20 +817,25 @@ static int ptrace_bts_status(struct task_struct *child, long cfg_size, struct ptrace_bts_config __user *ucfg) { + struct bts_context *context; const struct bts_trace *trace; struct ptrace_bts_config cfg; + context = child->bts; + if (!context) + return -ESRCH; + if (cfg_size < sizeof(cfg)) return -EIO; - trace = ds_read_bts(child->bts); + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(&cfg, 0, sizeof(cfg)); - cfg.size = trace->ds.end - trace->ds.begin; - cfg.signal = child->thread.bts_ovfl_signal; - cfg.bts_size = sizeof(struct bts_struct); + cfg.size = trace->ds.end - trace->ds.begin; + cfg.signal = context->bts_ovfl_signal; + cfg.bts_size = sizeof(struct bts_struct); if (cfg.signal) cfg.flags |= PTRACE_BTS_O_SIGNAL; @@ -758,67 +854,56 @@ static int ptrace_bts_status(struct task_struct *child, static int ptrace_bts_clear(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; memset(trace->ds.begin, 0, trace->ds.n * trace->ds.size); - return ds_reset_bts(child->bts); + return ds_reset_bts(context->tracer); } static int ptrace_bts_size(struct task_struct *child) { + struct bts_context *context; const struct bts_trace *trace; - trace = ds_read_bts(child->bts); + context = child->bts; + if (!context) + return -ESRCH; + + trace = ds_read_bts(context->tracer); if (!trace) - return -EPERM; + return -ESRCH; return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static void ptrace_bts_fork(struct task_struct *tsk) +static inline void ptrace_bts_fork(struct task_struct *tsk) { tsk->bts = NULL; - tsk->bts_buffer = NULL; - tsk->bts_size = 0; - tsk->thread.bts_ovfl_signal = 0; } -static void ptrace_bts_untrace(struct task_struct *child) +/* + * Called from __ptrace_unlink() after the child has been moved back + * to its original parent. + */ +static inline void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { - ds_release_bts(child->bts); + free_bts_context(child->bts); child->bts = NULL; - - /* We cannot update total_vm and locked_vm since - child's mm is already gone. But we can reclaim the - memory. */ - kfree(child->bts_buffer); - child->bts_buffer = NULL; - child->bts_size = 0; } } - -static void ptrace_bts_detach(struct task_struct *child) -{ - /* - * Ptrace_detach() races with ptrace_untrace() in case - * the child dies and is reaped by another thread. - * - * We only do the memory accounting at this point and - * leave the buffer deallocation and the bts tracer - * release to ptrace_bts_untrace() which will be called - * later on with tasklist_lock held. - */ - release_locked_buffer(child->bts_buffer, child->bts_size); -} #else static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_detach(struct task_struct *child) {} static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ @@ -843,7 +928,6 @@ void ptrace_disable(struct task_struct *child) #ifdef TIF_SYSCALL_EMU clear_tsk_thread_flag(child, TIF_SYSCALL_EMU); #endif - ptrace_bts_detach(child); } #if defined CONFIG_X86_32 || defined CONFIG_IA32_EMULATION diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..64d8ed2538a 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,6 +13,7 @@ #include #include #include +#include struct mempolicy; struct anon_vma; @@ -1321,6 +1322,6 @@ void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); extern void free_locked_buffer(void *buffer, size_t size); -extern void release_locked_buffer(void *buffer, size_t size); +extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/include/linux/sched.h b/include/linux/sched.h index a5b9a83065f..52b8cd049c2 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -96,8 +96,8 @@ struct exec_domain; struct futex_pi_state; struct robust_list_head; struct bio; -struct bts_tracer; struct fs_struct; +struct bts_context; /* * List of flags we want to share for kernel threads, @@ -1210,12 +1210,7 @@ struct task_struct { * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ - struct bts_tracer *bts; - /* - * The buffer to hold the BTS data. - */ - void *bts_buffer; - size_t bts_size; + struct bts_context *bts; #endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ diff --git a/mm/mlock.c b/mm/mlock.c index cbe9e0581b7..749383b442c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -660,21 +660,20 @@ void *alloc_locked_buffer(size_t size) return buffer; } -void release_locked_buffer(void *buffer, size_t size) +void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); - current->mm->total_vm -= pgsz; - current->mm->locked_vm -= pgsz; + mm->total_vm -= pgsz; + mm->locked_vm -= pgsz; - up_write(¤t->mm->mmap_sem); + up_write(&mm->mmap_sem); } void free_locked_buffer(void *buffer, size_t size) { - release_locked_buffer(buffer, size); - + refund_locked_buffer_memory(current->mm, size); kfree(buffer); } From 8d99b3ac2726e5edd97ad147fa5c1f2acb63a745 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:36 +0200 Subject: [PATCH 0264/2061] x86, bts: wait until traced task has been scheduled out In order to stop branch tracing for a running task, we need to first clear the branch tracing control bits before we may free the tracing buffer. If the traced task is running, the cpu might still trace that task after the branch trace control bits have cleared. Wait until the traced task has been scheduled out before proceeding. A similar problem affects the task debug store context. We first remove the context, then we need to wait until the task has been scheduled out before we can free the context memory. Reviewed-by: Oleg Nesterov Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144551.919636000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index c730155bf54..5cd137ab267 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -299,6 +299,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) static inline void ds_put_context(struct ds_context *context) { + struct task_struct *task; unsigned long irq; if (!context) @@ -313,14 +314,20 @@ static inline void ds_put_context(struct ds_context *context) *(context->this) = NULL; - if (context->task) - clear_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + task = context->task; - if (!context->task || (context->task == current)) + if (task) + clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); + + if (!task || (task == current)) wrmsrl(MSR_IA32_DS_AREA, 0); spin_unlock_irqrestore(&ds_lock, irq); + /* The context might still be in use for context switching. */ + if (task && (task != current)) + wait_task_context_switch(task); + kfree(context); } @@ -781,15 +788,23 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, void ds_release_bts(struct bts_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_bts(tracer); WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; - put_tracer(tracer->ds.context->task); + /* Make sure tracing stopped and the tracer is not in use. */ + if (task && (task != current)) + wait_task_context_switch(task); + + put_tracer(task); ds_put_context(tracer->ds.context); kfree(tracer); From 38f801129ad07b9afa7f9bd3779f61b805416d8c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:37 +0200 Subject: [PATCH 0265/2061] x86, bts: fix race between per-task and per-cpu branch tracing Per-task branch tracing installs a debug store context with the traced task. This immediately results in the branch trace control bits to be cleared for the next context switch of that task, if not set before. Either per-cpu or per-task tracing are allowed at the same time. An active per-cpu tracing would be disabled even if the per-task tracing request is rejected and the task debug store context removed. Check the tracing type (per-cpu or per-task) before installing a task debug store context. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144552.856000000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 72 +++++++++++++++++++++++++------------------- 1 file changed, 41 insertions(+), 31 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 5cd137ab267..f03f117eff8 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -193,12 +193,28 @@ static DEFINE_SPINLOCK(ds_lock); */ static atomic_t tracers = ATOMIC_INIT(0); -static inline void get_tracer(struct task_struct *task) +static inline int get_tracer(struct task_struct *task) { - if (task) + int error; + + spin_lock_irq(&ds_lock); + + if (task) { + error = -EPERM; + if (atomic_read(&tracers) < 0) + goto out; atomic_inc(&tracers); - else + } else { + error = -EPERM; + if (atomic_read(&tracers) > 0) + goto out; atomic_dec(&tracers); + } + + error = 0; +out: + spin_unlock_irq(&ds_lock); + return error; } static inline void put_tracer(struct task_struct *task) @@ -209,14 +225,6 @@ static inline void put_tracer(struct task_struct *task) atomic_inc(&tracers); } -static inline int check_tracer(struct task_struct *task) -{ - return task ? - (atomic_read(&tracers) >= 0) : - (atomic_read(&tracers) <= 0); -} - - /* * The DS context is either attached to a thread or to a cpu: * - in the former case, the thread_struct contains a pointer to the @@ -677,6 +685,10 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -684,7 +696,7 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -695,14 +707,9 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->bts_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->bts_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -716,13 +723,13 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -741,6 +748,10 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (ovfl) goto out; + error = get_tracer(task); + if (error < 0) + goto out; + /* * Per-cpu tracing is typically requested using smp_call_function(). * We must not sleep. @@ -748,7 +759,7 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, error = -ENOMEM; tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); if (!tracer) - goto out; + goto out_put_tracer; tracer->ovfl = ovfl; error = ds_request(&tracer->ds, &tracer->trace.ds, @@ -758,14 +769,9 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, spin_lock_irqsave(&ds_lock, irq); - error = -EPERM; - if (!check_tracer(task)) - goto out_unlock; - get_tracer(task); - error = -EPERM; if (tracer->ds.context->pebs_master) - goto out_put_tracer; + goto out_unlock; tracer->ds.context->pebs_master = tracer; spin_unlock_irqrestore(&ds_lock, irq); @@ -775,13 +781,13 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return tracer; - out_put_tracer: - put_tracer(task); out_unlock: spin_unlock_irqrestore(&ds_lock, irq); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); + out_put_tracer: + put_tracer(task); out: return ERR_PTR(error); } @@ -804,8 +810,8 @@ void ds_release_bts(struct bts_tracer *tracer) if (task && (task != current)) wait_task_context_switch(task); - put_tracer(task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } @@ -861,16 +867,20 @@ void ds_resume_bts(struct bts_tracer *tracer) void ds_release_pebs(struct pebs_tracer *tracer) { + struct task_struct *task; + if (!tracer) return; + task = tracer->ds.context->task; + ds_suspend_pebs(tracer); WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; - put_tracer(tracer->ds.context->task); ds_put_context(tracer->ds.context); + put_tracer(task); kfree(tracer); } From 15879d042164650b93d83281ad5f87ad323bfbfe Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:38 +0200 Subject: [PATCH 0266/2061] x86, bts: use trace_clock_global() for timestamps Rename the bts_struct timestamp field to event. Use trace_clock_global() for time measurement. Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144553.773216000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 4 ++-- arch/x86/kernel/ds.c | 17 +++++++++-------- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index a8f672ba100..772f141afb9 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -170,9 +170,9 @@ struct bts_struct { } lbr; /* BTS_TASK_ARRIVES or BTS_TASK_DEPARTS */ struct { - __u64 jiffies; + __u64 clock; pid_t pid; - } timestamp; + } event; } variant; }; diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index f03f117eff8..2071b992c35 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -25,6 +25,7 @@ #include #include #include +#include #include @@ -471,7 +472,7 @@ enum bts_field { bts_flags, bts_qual = bts_from, - bts_jiffies = bts_to, + bts_clock = bts_to, bts_pid = bts_flags, bts_qual_mask = (bts_qual_max - 1), @@ -517,8 +518,8 @@ bts_read(struct bts_tracer *tracer, const void *at, struct bts_struct *out) memset(out, 0, sizeof(*out)); if ((bts_get(at, bts_qual) & ~bts_qual_mask) == bts_escape) { out->qualifier = (bts_get(at, bts_qual) & bts_qual_mask); - out->variant.timestamp.jiffies = bts_get(at, bts_jiffies); - out->variant.timestamp.pid = bts_get(at, bts_pid); + out->variant.event.clock = bts_get(at, bts_clock); + out->variant.event.pid = bts_get(at, bts_pid); } else { out->qualifier = bts_branch; out->variant.lbr.from = bts_get(at, bts_from); @@ -555,8 +556,8 @@ static int bts_write(struct bts_tracer *tracer, const struct bts_struct *in) case bts_task_arrives: case bts_task_departs: bts_set(raw, bts_qual, (bts_escape | in->qualifier)); - bts_set(raw, bts_jiffies, in->variant.timestamp.jiffies); - bts_set(raw, bts_pid, in->variant.timestamp.pid); + bts_set(raw, bts_clock, in->variant.event.clock); + bts_set(raw, bts_pid, in->variant.event.pid); break; default: return -EINVAL; @@ -1083,9 +1084,9 @@ static inline void ds_take_timestamp(struct ds_context *context, return; memset(&ts, 0, sizeof(ts)); - ts.qualifier = qualifier; - ts.variant.timestamp.jiffies = jiffies_64; - ts.variant.timestamp.pid = task->pid; + ts.qualifier = qualifier; + ts.variant.event.clock = trace_clock_global(); + ts.variant.event.pid = task->pid; bts_write(tracer, &ts); } From 35bb7600c17762bb129588c1877d2717fe325289 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:39 +0200 Subject: [PATCH 0267/2061] x86, debugctlmsr: add _on_cpu variants to debugctlmsr functions Add functions to get and set the debugctlmsr on different cpus. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144554.738772000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 27 +++++++++++++++++++++++++++ 1 file changed, 27 insertions(+) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 2483807e06e..1efeb497f1f 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -785,6 +785,21 @@ static inline unsigned long get_debugctlmsr(void) return debugctlmsr; } +static inline unsigned long get_debugctlmsr_on_cpu(int cpu) +{ + u64 debugctlmsr = 0; + u32 val1, val2; + +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return 0; +#endif + rdmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, &val1, &val2); + debugctlmsr = val1 | ((u64)val2 << 32); + + return debugctlmsr; +} + static inline void update_debugctlmsr(unsigned long debugctlmsr) { #ifndef CONFIG_X86_DEBUGCTLMSR @@ -794,6 +809,18 @@ static inline void update_debugctlmsr(unsigned long debugctlmsr) wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctlmsr); } +static inline void update_debugctlmsr_on_cpu(int cpu, + unsigned long debugctlmsr) +{ +#ifndef CONFIG_X86_DEBUGCTLMSR + if (boot_cpu_data.x86 < 6) + return; +#endif + wrmsr_on_cpu(cpu, MSR_IA32_DEBUGCTLMSR, + (u32)((u64)debugctlmsr), + (u32)((u64)debugctlmsr >> 32)); +} + /* * from system description table in BIOS. Mostly for MCA use, but * others may find it useful: From de79f54f5347ad7ec6ff55ccbb6d4ab2a21f6a93 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:40 +0200 Subject: [PATCH 0268/2061] x86, bts, hw-branch-tracer: add _noirq variants to the debug store interface The hw-branch-tracer uses debug store functions from an on_each_cpu() context, which is simply wrong since the functions may sleep. Add _noirq variants for most functions, which may be called with interrupts disabled. Separate per-cpu and per-task tracing and allow per-cpu tracing to be controlled from any cpu. Make the hw-branch-tracer use the new debug store interface, synchronize with hotplug cpu event using get/put_online_cpus(), and remove the unnecessary spinlock. Make the ptrace bts and the ds selftest code use the new interface. Defer the ds selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144555.658136000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 57 +++- arch/x86/kernel/ds.c | 476 ++++++++++++++++++++++++------- arch/x86/kernel/ds_selftest.c | 9 +- arch/x86/kernel/ptrace.c | 5 +- kernel/trace/trace_hw_branches.c | 191 +++++-------- 5 files changed, 492 insertions(+), 246 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 772f141afb9..413e127e567 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -15,8 +15,8 @@ * - buffer allocation (memory accounting) * * - * Copyright (C) 2007-2008 Intel Corporation. - * Markus Metzger , 2007-2008 + * Copyright (C) 2007-2009 Intel Corporation. + * Markus Metzger , 2007-2009 */ #ifndef _ASM_X86_DS_H @@ -83,8 +83,10 @@ enum ds_feature { * The interrupt threshold is independent from the overflow callback * to allow users to use their own overflow interrupt handling mechanism. * - * task: the task to request recording for; - * NULL for per-cpu recording on the current cpu + * The function might sleep. + * + * task: the task to request recording for + * cpu: the cpu to request recording for * base: the base pointer for the (non-pageable) buffer; * size: the size of the provided buffer in bytes * ovfl: pointer to a function to be called on buffer overflow; @@ -93,19 +95,28 @@ enum ds_feature { * -1 if no interrupt threshold is requested. * flags: a bit-mask of the above flags */ -extern struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, - size_t th, unsigned int flags); -extern struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, - size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); +extern struct pebs_tracer *ds_request_pebs_cpu(int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags); /* * Release BTS or PEBS resources * Suspend and resume BTS or PEBS tracing * + * Must be called with irq's enabled. + * * tracer: the tracer handle returned from ds_request_~() */ extern void ds_release_bts(struct bts_tracer *tracer); @@ -115,6 +126,28 @@ extern void ds_release_pebs(struct pebs_tracer *tracer); extern void ds_suspend_pebs(struct pebs_tracer *tracer); extern void ds_resume_pebs(struct pebs_tracer *tracer); +/* + * Release BTS or PEBS resources + * Suspend and resume BTS or PEBS tracing + * + * Cpu tracers must call this on the traced cpu. + * Task tracers must call ds_release_~_noirq() for themselves. + * + * May be called with irq's disabled. + * + * Returns 0 if successful; + * -EPERM if the cpu tracer does not trace the current cpu. + * -EPERM if the task tracer does not trace itself. + * + * tracer: the tracer handle returned from ds_request_~() + */ +extern int ds_release_bts_noirq(struct bts_tracer *tracer); +extern int ds_suspend_bts_noirq(struct bts_tracer *tracer); +extern int ds_resume_bts_noirq(struct bts_tracer *tracer); +extern int ds_release_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_suspend_pebs_noirq(struct pebs_tracer *tracer); +extern int ds_resume_pebs_noirq(struct pebs_tracer *tracer); + /* * The raw DS buffer state as it is used for BTS and PEBS recording. diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 2071b992c35..21a3852abf6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -245,60 +245,50 @@ struct ds_context { struct pebs_tracer *pebs_master; /* Use count: */ - unsigned long count; + unsigned long count; /* Pointer to the context pointer field: */ struct ds_context **this; - /* The traced task; NULL for current cpu: */ + /* The traced task; NULL for cpu tracing: */ struct task_struct *task; + + /* The traced cpu; only valid if task is NULL: */ + int cpu; }; -static DEFINE_PER_CPU(struct ds_context *, system_context_array); - -#define system_context per_cpu(system_context_array, smp_processor_id()) +static DEFINE_PER_CPU(struct ds_context *, cpu_context); -static inline struct ds_context *ds_get_context(struct task_struct *task) +static struct ds_context *ds_get_context(struct task_struct *task, int cpu) { struct ds_context **p_context = - (task ? &task->thread.ds_ctx : &system_context); + (task ? &task->thread.ds_ctx : &per_cpu(cpu_context, cpu)); struct ds_context *context = NULL; struct ds_context *new_context = NULL; - unsigned long irq; - /* - * Chances are small that we already have a context. - * - * Contexts for per-cpu tracing are allocated using - * smp_call_function(). We must not sleep. - */ - new_context = kzalloc(sizeof(*new_context), GFP_ATOMIC); + /* Chances are small that we already have a context. */ + new_context = kzalloc(sizeof(*new_context), GFP_KERNEL); if (!new_context) return NULL; - spin_lock_irqsave(&ds_lock, irq); + spin_lock_irq(&ds_lock); context = *p_context; - if (!context) { + if (likely(!context)) { context = new_context; context->this = p_context; context->task = task; + context->cpu = cpu; context->count = 0; - if (task) - set_tsk_thread_flag(task, TIF_DS_AREA_MSR); - - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, (unsigned long)context->ds); - *p_context = context; } context->count++; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); if (context != new_context) kfree(new_context); @@ -306,7 +296,7 @@ static inline struct ds_context *ds_get_context(struct task_struct *task) return context; } -static inline void ds_put_context(struct ds_context *context) +static void ds_put_context(struct ds_context *context) { struct task_struct *task; unsigned long irq; @@ -328,8 +318,15 @@ static inline void ds_put_context(struct ds_context *context) if (task) clear_tsk_thread_flag(task, TIF_DS_AREA_MSR); - if (!task || (task == current)) - wrmsrl(MSR_IA32_DS_AREA, 0); + /* + * We leave the (now dangling) pointer to the DS configuration in + * the DS_AREA msr. This is as good or as bad as replacing it with + * NULL - the hardware would crash if we enabled tracing. + * + * This saves us some problems with having to write an msr on a + * different cpu while preventing others from doing the same for the + * next context for that same cpu. + */ spin_unlock_irqrestore(&ds_lock, irq); @@ -340,6 +337,31 @@ static inline void ds_put_context(struct ds_context *context) kfree(context); } +static void ds_install_ds_area(struct ds_context *context) +{ + unsigned long ds; + + ds = (unsigned long)context->ds; + + /* + * There is a race between the bts master and the pebs master. + * + * The thread/cpu access is synchronized via get/put_cpu() for + * task tracing and via wrmsr_on_cpu for cpu tracing. + * + * If bts and pebs are collected for the same task or same cpu, + * the same confiuration is written twice. + */ + if (context->task) { + get_cpu(); + if (context->task == current) + wrmsrl(MSR_IA32_DS_AREA, ds); + set_tsk_thread_flag(context->task, TIF_DS_AREA_MSR); + put_cpu(); + } else + wrmsr_on_cpu(context->cpu, MSR_IA32_DS_AREA, + (u32)((u64)ds), (u32)((u64)ds >> 32)); +} /* * Call the tracer's callback on a buffer overflow. @@ -622,6 +644,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, * The value for 'no threshold' is -1, which will set the * threshold outside of the buffer, just like we want it. */ + ith *= ds_cfg.sizeof_rec[qual]; trace->ith = (void *)(buffer + size - ith); trace->flags = flags; @@ -630,7 +653,7 @@ static void ds_init_ds_trace(struct ds_trace *trace, enum ds_qualifier qual, static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, enum ds_qualifier qual, struct task_struct *task, - void *base, size_t size, size_t th, unsigned int flags) + int cpu, void *base, size_t size, size_t th) { struct ds_context *context; int error; @@ -643,7 +666,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We require some space to do alignment adjustments below. */ + /* We need space for alignment adjustments in ds_init_ds_trace(). */ error = -EINVAL; if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) goto out; @@ -660,25 +683,27 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, tracer->size = size; error = -ENOMEM; - context = ds_get_context(task); + context = ds_get_context(task, cpu); if (!context) goto out; tracer->context = context; - ds_init_ds_trace(trace, qual, base, size, th, flags); + /* + * Defer any tracer-specific initialization work for the context until + * context ownership has been clarified. + */ error = 0; out: return error; } -struct bts_tracer *ds_request_bts(struct task_struct *task, - void *base, size_t size, - bts_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +static struct bts_tracer *ds_request_bts(struct task_struct *task, int cpu, + void *base, size_t size, + bts_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct bts_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -690,42 +715,46 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_bts, task, base, size, th, flags); + ds_bts, task, cpu, base, size, th); if (error < 0) goto out_tracer; - - spin_lock_irqsave(&ds_lock, irq); + /* Claim the bts part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->bts_master) goto out_unlock; tracer->ds.context->bts_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the bts part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_bts, base, size, th, flags); + ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + ds_install_ds_area(tracer->ds.context); tracer->trace.read = bts_read; tracer->trace.write = bts_write; - ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_bts); + /* Start tracing. */ ds_resume_bts(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -735,13 +764,27 @@ struct bts_tracer *ds_request_bts(struct task_struct *task, return ERR_PTR(error); } -struct pebs_tracer *ds_request_pebs(struct task_struct *task, - void *base, size_t size, - pebs_ovfl_callback_t ovfl, size_t th, - unsigned int flags) +struct bts_tracer *ds_request_bts_task(struct task_struct *task, + void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(task, 0, base, size, ovfl, th, flags); +} + +struct bts_tracer *ds_request_bts_cpu(int cpu, void *base, size_t size, + bts_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_bts(NULL, cpu, base, size, ovfl, th, flags); +} + +static struct pebs_tracer *ds_request_pebs(struct task_struct *task, int cpu, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, size_t th, + unsigned int flags) { struct pebs_tracer *tracer; - unsigned long irq; int error; /* Buffer overflow notification is not yet implemented. */ @@ -753,37 +796,43 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, if (error < 0) goto out; - /* - * Per-cpu tracing is typically requested using smp_call_function(). - * We must not sleep. - */ error = -ENOMEM; - tracer = kzalloc(sizeof(*tracer), GFP_ATOMIC); + tracer = kzalloc(sizeof(*tracer), GFP_KERNEL); if (!tracer) goto out_put_tracer; tracer->ovfl = ovfl; + /* Do some more error checking and acquire a tracing context. */ error = ds_request(&tracer->ds, &tracer->trace.ds, - ds_pebs, task, base, size, th, flags); + ds_pebs, task, cpu, base, size, th); if (error < 0) goto out_tracer; - spin_lock_irqsave(&ds_lock, irq); + /* Claim the pebs part of the tracing context we acquired above. */ + spin_lock_irq(&ds_lock); error = -EPERM; if (tracer->ds.context->pebs_master) goto out_unlock; tracer->ds.context->pebs_master = tracer; - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); + /* + * Now that we own the pebs part of the context, let's complete the + * initialization for that part. + */ + ds_init_ds_trace(&tracer->trace.ds, ds_pebs, base, size, th, flags); ds_write_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); + ds_install_ds_area(tracer->ds.context); + + /* Start tracing. */ ds_resume_pebs(tracer); return tracer; out_unlock: - spin_unlock_irqrestore(&ds_lock, irq); + spin_unlock_irq(&ds_lock); ds_put_context(tracer->ds.context); out_tracer: kfree(tracer); @@ -793,17 +842,27 @@ struct pebs_tracer *ds_request_pebs(struct task_struct *task, return ERR_PTR(error); } -void ds_release_bts(struct bts_tracer *tracer) +struct pebs_tracer *ds_request_pebs_task(struct task_struct *task, + void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(task, 0, base, size, ovfl, th, flags); +} + +struct pebs_tracer *ds_request_pebs_cpu(int cpu, void *base, size_t size, + pebs_ovfl_callback_t ovfl, + size_t th, unsigned int flags) +{ + return ds_request_pebs(NULL, cpu, base, size, ovfl, th, flags); +} + +static void ds_free_bts(struct bts_tracer *tracer) { struct task_struct *task; - if (!tracer) - return; - task = tracer->ds.context->task; - ds_suspend_bts(tracer); - WARN_ON_ONCE(tracer->ds.context->bts_master != tracer); tracer->ds.context->bts_master = NULL; @@ -817,9 +876,69 @@ void ds_release_bts(struct bts_tracer *tracer) kfree(tracer); } +void ds_release_bts(struct bts_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_bts(tracer); + ds_free_bts(tracer); +} + +int ds_release_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_bts_noirq(tracer); + ds_free_bts(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void update_task_debugctlmsr(struct task_struct *task, + unsigned long debugctlmsr) +{ + task->thread.debugctlmsr = debugctlmsr; + + get_cpu(); + if (task == current) + update_debugctlmsr(debugctlmsr); + + if (task->thread.debugctlmsr) + set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + else + clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); + put_cpu(); +} + void ds_suspend_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; @@ -827,29 +946,60 @@ void ds_suspend_bts(struct bts_tracer *tracer) tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() & ~BTS_CONTROL); + WARN_ON(!task && irqs_disabled()); - if (task) { - task->thread.debugctlmsr &= ~BTS_CONTROL; + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr &= ~BTS_CONTROL; - if (!task->thread.debugctlmsr) - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); } -void ds_resume_bts(struct bts_tracer *tracer) +int ds_suspend_bts_noirq(struct bts_tracer *tracer) { struct task_struct *task; - unsigned long control; + unsigned long debugctlmsr, irq; + int cpu, error = 0; if (!tracer) - return; + return 0; - tracer->flags = tracer->trace.ds.flags; + tracer->flags = 0; task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr &= ~BTS_CONTROL; + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static unsigned long ds_bts_control(struct bts_tracer *tracer) +{ + unsigned long control; control = ds_cfg.ctl[dsf_bts]; if (!(tracer->trace.ds.flags & BTS_KERNEL)) @@ -857,25 +1007,77 @@ void ds_resume_bts(struct bts_tracer *tracer) if (!(tracer->trace.ds.flags & BTS_USER)) control |= ds_cfg.ctl[dsf_bts_user]; - if (task) { - task->thread.debugctlmsr |= control; - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - } - - if (!task || (task == current)) - update_debugctlmsr(get_debugctlmsr() | control); + return control; } -void ds_release_pebs(struct pebs_tracer *tracer) +void ds_resume_bts(struct bts_tracer *tracer) { struct task_struct *task; + unsigned long debugctlmsr; + int cpu; if (!tracer) return; - task = tracer->ds.context->task; + tracer->flags = tracer->trace.ds.flags; - ds_suspend_pebs(tracer); + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + WARN_ON(!task && irqs_disabled()); + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr_on_cpu(cpu)); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr_on_cpu(cpu, debugctlmsr); +} + +int ds_resume_bts_noirq(struct bts_tracer *tracer) +{ + struct task_struct *task; + unsigned long debugctlmsr, irq; + int cpu, error = 0; + + if (!tracer) + return 0; + + tracer->flags = tracer->trace.ds.flags; + + task = tracer->ds.context->task; + cpu = tracer->ds.context->cpu; + + local_irq_save(irq); + + error = -EPERM; + if (!task && (cpu != smp_processor_id())) + goto out; + + debugctlmsr = (task ? + task->thread.debugctlmsr : + get_debugctlmsr()); + debugctlmsr |= ds_bts_control(tracer); + + if (task) + update_task_debugctlmsr(task, debugctlmsr); + else + update_debugctlmsr(debugctlmsr); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + +static void ds_free_pebs(struct pebs_tracer *tracer) +{ + struct task_struct *task; + + task = tracer->ds.context->task; WARN_ON_ONCE(tracer->ds.context->pebs_master != tracer); tracer->ds.context->pebs_master = NULL; @@ -886,16 +1088,68 @@ void ds_release_pebs(struct pebs_tracer *tracer) kfree(tracer); } +void ds_release_pebs(struct pebs_tracer *tracer) +{ + might_sleep(); + + if (!tracer) + return; + + ds_suspend_pebs(tracer); + ds_free_pebs(tracer); +} + +int ds_release_pebs_noirq(struct pebs_tracer *tracer) +{ + struct task_struct *task; + unsigned long irq; + int error; + + if (!tracer) + return 0; + + task = tracer->ds.context->task; + + local_irq_save(irq); + + error = -EPERM; + if (!task && + (tracer->ds.context->cpu != smp_processor_id())) + goto out; + + error = -EPERM; + if (task && (task != current)) + goto out; + + ds_suspend_pebs_noirq(tracer); + ds_free_pebs(tracer); + + error = 0; + out: + local_irq_restore(irq); + return error; +} + void ds_suspend_pebs(struct pebs_tracer *tracer) { } +int ds_suspend_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + void ds_resume_pebs(struct pebs_tracer *tracer) { } +int ds_resume_pebs_noirq(struct pebs_tracer *tracer) +{ + return 0; +} + const struct bts_trace *ds_read_bts(struct bts_tracer *tracer) { if (!tracer) @@ -1004,26 +1258,6 @@ ds_configure(const struct ds_configuration *cfg, printk(KERN_INFO "[ds] pebs not available\n"); } - if (ds_cfg.sizeof_rec[ds_bts]) { - int error; - - error = ds_selftest_bts(); - if (error) { - WARN(1, "[ds] selftest failed. disabling bts.\n"); - ds_cfg.sizeof_rec[ds_bts] = 0; - } - } - - if (ds_cfg.sizeof_rec[ds_pebs]) { - int error; - - error = ds_selftest_pebs(); - if (error) { - WARN(1, "[ds] selftest failed. disabling pebs.\n"); - ds_cfg.sizeof_rec[ds_pebs] = 0; - } - } - printk(KERN_INFO "[ds] sizes: address: %u bit, ", 8 * ds_cfg.sizeof_ptr_field); printk("bts/pebs record: %u/%u bytes\n", @@ -1127,3 +1361,29 @@ void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) void ds_exit_thread(struct task_struct *tsk) { } + +static __init int ds_selftest(void) +{ + if (ds_cfg.sizeof_rec[ds_bts]) { + int error; + + error = ds_selftest_bts(); + if (error) { + WARN(1, "[ds] selftest failed. disabling bts.\n"); + ds_cfg.sizeof_rec[ds_bts] = 0; + } + } + + if (ds_cfg.sizeof_rec[ds_pebs]) { + int error; + + error = ds_selftest_pebs(); + if (error) { + WARN(1, "[ds] selftest failed. disabling pebs.\n"); + ds_cfg.sizeof_rec[ds_pebs] = 0; + } + } + + return 0; +} +device_initcall(ds_selftest); diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 8c46fbf38c4..e5a263c8a14 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -10,11 +10,12 @@ #include #include +#include #include -#define DS_SELFTEST_BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ static int ds_selftest_bts_consistency(const struct bts_trace *trace) @@ -125,12 +126,12 @@ int ds_selftest_bts(void) struct bts_tracer *tracer; int error = 0; void *top; - unsigned char buffer[DS_SELFTEST_BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE]; printk(KERN_INFO "[ds] bts selftest..."); - tracer = ds_request_bts(NULL, buffer, DS_SELFTEST_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); + tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); if (IS_ERR(tracer)) { error = PTR_ERR(tracer); tracer = NULL; diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 7c21d1e8cae..adbb24322d8 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -800,8 +800,9 @@ static int ptrace_bts_config(struct task_struct *child, if (cfg.flags & PTRACE_BTS_O_SCHED) flags |= BTS_TIMESTAMPS; - context->tracer = ds_request_bts(child, context->buffer, context->size, - NULL, (size_t)-1, flags); + context->tracer = + ds_request_bts_task(child, context->buffer, context->size, + NULL, (size_t)-1, flags); if (unlikely(IS_ERR(context->tracer))) { int error = PTR_ERR(context->tracer); diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 8b2109a6c61..50565d8cd2e 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -4,7 +4,6 @@ * Copyright (C) 2008-2009 Intel Corporation. * Markus Metzger , 2008-2009 */ -#include #include #include #include @@ -21,168 +20,113 @@ #define BTS_BUFFER_SIZE (1 << 13) -/* - * The tracer lock protects the below per-cpu tracer array. - * It needs to be held to: - * - start tracing on all cpus - * - stop tracing on all cpus - * - start tracing on a single hotplug cpu - * - stop tracing on a single hotplug cpu - * - read the trace from all cpus - * - read the trace from a single cpu - */ -static DEFINE_SPINLOCK(bts_tracer_lock); static DEFINE_PER_CPU(struct bts_tracer *, tracer); static DEFINE_PER_CPU(unsigned char[BTS_BUFFER_SIZE], buffer); #define this_tracer per_cpu(tracer, smp_processor_id()) -#define this_buffer per_cpu(buffer, smp_processor_id()) static int trace_hw_branches_enabled __read_mostly; static int trace_hw_branches_suspended __read_mostly; static struct trace_array *hw_branch_trace __read_mostly; -/* - * Initialize the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_init_cpu(void *arg) +static void bts_trace_init_cpu(int cpu) { - if (this_tracer) - ds_release_bts(this_tracer); + per_cpu(tracer, cpu) = + ds_request_bts_cpu(cpu, per_cpu(buffer, cpu), BTS_BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); - this_tracer = ds_request_bts(NULL, this_buffer, BTS_BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(this_tracer)) { - this_tracer = NULL; - return; - } + if (IS_ERR(per_cpu(tracer, cpu))) + per_cpu(tracer, cpu) = NULL; } static int bts_trace_init(struct trace_array *tr) { - int cpu, avail; - - spin_lock(&bts_tracer_lock); + int cpu; hw_branch_trace = tr; + trace_hw_branches_enabled = 0; - on_each_cpu(bts_trace_init_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + bts_trace_init_cpu(cpu); - /* Check on how many cpus we could enable tracing */ - avail = 0; - for_each_online_cpu(cpu) - if (per_cpu(tracer, cpu)) - avail++; - - trace_hw_branches_enabled = (avail ? 1 : 0); + if (likely(per_cpu(tracer, cpu))) + trace_hw_branches_enabled = 1; + } trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); - + put_online_cpus(); /* If we could not enable tracing on a single cpu, we fail. */ - return avail ? 0 : -EOPNOTSUPP; -} - -/* - * Release the tracer for the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_release_cpu(void *arg) -{ - if (this_tracer) { - ds_release_bts(this_tracer); - this_tracer = NULL; - } + return trace_hw_branches_enabled ? 0 : -EOPNOTSUPP; } static void bts_trace_reset(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_release_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) { + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } + } trace_hw_branches_enabled = 0; trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Resume tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_resume_cpu(void *arg) -{ - if (this_tracer) - ds_resume_bts(this_tracer); + put_online_cpus(); } static void bts_trace_start(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_resume_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 0; - - spin_unlock(&bts_tracer_lock); -} - -/* - * Suspend tracing on the current cpu. - * The argument is ignored. - * - * pre: bts_tracer_lock must be locked. - */ -static void bts_trace_suspend_cpu(void *arg) -{ - if (this_tracer) - ds_suspend_bts(this_tracer); + put_online_cpus(); } static void bts_trace_stop(struct trace_array *tr) { - spin_lock(&bts_tracer_lock); + int cpu; - on_each_cpu(bts_trace_suspend_cpu, NULL, 1); + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); trace_hw_branches_suspended = 1; - - spin_unlock(&bts_tracer_lock); + put_online_cpus(); } static int __cpuinit bts_hotcpu_handler(struct notifier_block *nfb, unsigned long action, void *hcpu) { - unsigned int cpu = (unsigned long)hcpu; - - spin_lock(&bts_tracer_lock); - - if (!trace_hw_branches_enabled) - goto out; + int cpu = (long)hcpu; switch (action) { case CPU_ONLINE: case CPU_DOWN_FAILED: - smp_call_function_single(cpu, bts_trace_init_cpu, NULL, 1); + /* The notification is sent with interrupts enabled. */ + if (trace_hw_branches_enabled) { + bts_trace_init_cpu(cpu); - if (trace_hw_branches_suspended) - smp_call_function_single(cpu, bts_trace_suspend_cpu, - NULL, 1); + if (trace_hw_branches_suspended && + likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + } break; + case CPU_DOWN_PREPARE: - smp_call_function_single(cpu, bts_trace_release_cpu, NULL, 1); - break; + /* The notification is sent with interrupts enabled. */ + if (likely(per_cpu(tracer, cpu))) { + ds_release_bts(per_cpu(tracer, cpu)); + per_cpu(tracer, cpu) = NULL; + } } - out: - spin_unlock(&bts_tracer_lock); return NOTIFY_DONE; } @@ -274,7 +218,7 @@ static void trace_bts_at(const struct bts_trace *trace, void *at) /* * Collect the trace on the current cpu and write it into the ftrace buffer. * - * pre: bts_tracer_lock must be locked + * pre: tracing must be suspended on the current cpu */ static void trace_bts_cpu(void *arg) { @@ -291,10 +235,9 @@ static void trace_bts_cpu(void *arg) if (unlikely(!this_tracer)) return; - ds_suspend_bts(this_tracer); trace = ds_read_bts(this_tracer); if (!trace) - goto out; + return; for (at = trace->ds.top; (void *)at < trace->ds.end; at += trace->ds.size) @@ -303,18 +246,27 @@ static void trace_bts_cpu(void *arg) for (at = trace->ds.begin; (void *)at < trace->ds.top; at += trace->ds.size) trace_bts_at(trace, at); - -out: - ds_resume_bts(this_tracer); } static void trace_bts_prepare(struct trace_iterator *iter) { - spin_lock(&bts_tracer_lock); + int cpu; + get_online_cpus(); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_suspend_bts(per_cpu(tracer, cpu)); + /* + * We need to collect the trace on the respective cpu since ftrace + * implicitly adds the record for the current cpu. + * Once that is more flexible, we could collect the data from any cpu. + */ on_each_cpu(trace_bts_cpu, iter->tr, 1); - spin_unlock(&bts_tracer_lock); + for_each_online_cpu(cpu) + if (likely(per_cpu(tracer, cpu))) + ds_resume_bts(per_cpu(tracer, cpu)); + put_online_cpus(); } static void trace_bts_close(struct trace_iterator *iter) @@ -324,12 +276,11 @@ static void trace_bts_close(struct trace_iterator *iter) void trace_hw_branch_oops(void) { - spin_lock(&bts_tracer_lock); - - if (trace_hw_branches_enabled) + if (this_tracer) { + ds_suspend_bts_noirq(this_tracer); trace_bts_cpu(hw_branch_trace); - - spin_unlock(&bts_tracer_lock); + ds_resume_bts_noirq(this_tracer); + } } struct tracer bts_tracer __read_mostly = From 4d657e51dfc042216febd4a007c6f36881f9256d Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:41 +0200 Subject: [PATCH 0269/2061] x86, hw-branch-tracer: allocate selftest iterator on heap Allocate the trace_iterator for the hw-branch-tracer selftest on the heap. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144556.578777000@intel.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_selftest.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index 499d01c44cd..00dd6485bdd 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -757,7 +757,7 @@ int trace_selftest_startup_hw_branches(struct tracer *trace, struct trace_array *tr) { - struct trace_iterator iter; + struct trace_iterator *iter; struct tracer tracer; unsigned long count; int ret; @@ -777,17 +777,21 @@ trace_selftest_startup_hw_branches(struct tracer *trace, * The hw-branch tracer needs to collect the trace from the various * cpu trace buffers - before tracing is stopped. */ - memset(&iter, 0, sizeof(iter)); + iter = kzalloc(sizeof(*iter), GFP_KERNEL); + if (!iter) + return -ENOMEM; + memcpy(&tracer, trace, sizeof(tracer)); - iter.trace = &tracer; - iter.tr = tr; - iter.pos = -1; - mutex_init(&iter.mutex); + iter->trace = &tracer; + iter->tr = tr; + iter->pos = -1; + mutex_init(&iter->mutex); - trace->open(&iter); + trace->open(iter); - mutex_destroy(&iter.mutex); + mutex_destroy(&iter->mutex); + kfree(iter); tracing_stop(); From 353afeea24cc51aafc0ff21a72ec740b6f0af50c Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:42 +0200 Subject: [PATCH 0270/2061] x86, ds: fix compiler warning Size_t is defined differently on i386 and x86_64. Change type to avoid compiler warning. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144557.523964000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e5a263c8a14..e1ba5101b57 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -87,7 +87,7 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, /* Now to the test itself. */ for (at = from; (void *)at < to; at += trace->ds.size) { struct bts_struct bts; - size_t index; + unsigned long index; int error; if (((void *)at - trace->ds.begin) % trace->ds.size) { From 84f201139245c30777ff858e71b8d7e134b8c3ed Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:43 +0200 Subject: [PATCH 0271/2061] x86, ds: fix bounds check in ds selftest Fix a bad bounds check in the debug store selftest. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144558.450027000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index e1ba5101b57..cccc19a38f6 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -47,8 +47,13 @@ static int ds_selftest_bts_consistency(const struct bts_trace *trace) printk(KERN_CONT "bad bts buffer setup..."); error = -1; } + /* + * We allow top in [begin; end], since its not clear when the + * overflow adjustment happens: after the increment or before the + * write. + */ if ((trace->ds.top < trace->ds.begin) || - (trace->ds.end <= trace->ds.top)) { + (trace->ds.end < trace->ds.top)) { printk(KERN_CONT "bts top out of bounds..."); error = -1; } From 01f6569ece6915616f6cae1d7d8b46ab8da9c1bd Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:44 +0200 Subject: [PATCH 0272/2061] x86, ds: selftest each cpu Perform debug store selftests on each cpu. Cover both the normal and the _noirq variant of the debug store interface. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144559.394583000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 182 +++++++++++++++++++++++++--------- 1 file changed, 135 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index cccc19a38f6..599a9630062 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -11,13 +11,21 @@ #include #include #include +#include #include -#define BUFFER_SIZE 1021 /* Intentionally chose an odd size. */ +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +struct ds_selftest_bts_conf { + struct bts_tracer *tracer; + int error; + int (*suspend)(struct bts_tracer *); + int (*resume)(struct bts_tracer *); +}; + static int ds_selftest_bts_consistency(const struct bts_trace *trace) { int error = 0; @@ -125,36 +133,32 @@ static int ds_selftest_bts_read(struct bts_tracer *tracer, return 0; } -int ds_selftest_bts(void) +static void ds_selftest_bts_cpu(void *arg) { + struct ds_selftest_bts_conf *conf = arg; const struct bts_trace *trace; - struct bts_tracer *tracer; - int error = 0; void *top; - unsigned char buffer[BUFFER_SIZE]; - printk(KERN_INFO "[ds] bts selftest..."); - - tracer = ds_request_bts_cpu(smp_processor_id(), buffer, BUFFER_SIZE, - NULL, (size_t)-1, BTS_KERNEL); - if (IS_ERR(tracer)) { - error = PTR_ERR(tracer); - tracer = NULL; + if (IS_ERR(conf->tracer)) { + conf->error = PTR_ERR(conf->tracer); + conf->tracer = NULL; printk(KERN_CONT - "initialization failed (err: %d)...", error); - goto out; + "initialization failed (err: %d)...", conf->error); + return; } - /* The return should already give us enough trace. */ - ds_suspend_bts(tracer); + /* We should meanwhile have enough trace. */ + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; /* Let's see if we can access the trace. */ - trace = ds_read_bts(tracer); + trace = ds_read_bts(conf->tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; /* If everything went well, we should have a few trace entries. */ if (trace->ds.top == trace->ds.begin) { @@ -168,10 +172,11 @@ int ds_selftest_bts(void) } /* Let's try to read the trace we collected. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.top); - if (error < 0) - goto out; + if (conf->error < 0) + return; /* * Let's read the trace again. @@ -179,26 +184,31 @@ int ds_selftest_bts(void) */ top = trace->ds.top; - trace = ds_read_bts(tracer); - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (top != trace->ds.top) { printk(KERN_CONT "suspend not working..."); - error = -1; - goto out; + conf->error = -1; + return; } /* Let's collect some more trace - see if resume is working. */ - ds_resume_bts(tracer); - ds_suspend_bts(tracer); + conf->error = conf->resume(conf->tracer); + if (conf->error < 0) + return; - trace = ds_read_bts(tracer); + conf->error = conf->suspend(conf->tracer); + if (conf->error < 0) + return; - error = ds_selftest_bts_consistency(trace); - if (error < 0) - goto out; + trace = ds_read_bts(conf->tracer); + + conf->error = ds_selftest_bts_consistency(trace); + if (conf->error < 0) + return; if (trace->ds.top == top) { /* @@ -210,35 +220,113 @@ int ds_selftest_bts(void) printk(KERN_CONT "no resume progress/overflow..."); - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else if (trace->ds.top < top) { /* * We had a buffer overflow - the entire buffer should * contain trace records. */ - error = ds_selftest_bts_read(tracer, trace, + conf->error = + ds_selftest_bts_read(conf->tracer, trace, trace->ds.begin, trace->ds.end); } else { /* * It is quite likely that the buffer did not overflow. * Let's just check the delta trace. */ - error = ds_selftest_bts_read(tracer, trace, - top, trace->ds.top); + conf->error = + ds_selftest_bts_read(conf->tracer, trace, top, + trace->ds.top); } - if (error < 0) - goto out; + if (conf->error < 0) + return; - error = 0; + conf->error = 0; +} - /* The final test: release the tracer while tracing is suspended. */ +static int ds_suspend_bts_wrap(struct bts_tracer *tracer) +{ + ds_suspend_bts(tracer); + return 0; +} + +static int ds_resume_bts_wrap(struct bts_tracer *tracer) +{ + ds_resume_bts(tracer); + return 0; +} + +static void ds_release_bts_noirq_wrap(void *tracer) +{ + (void)ds_release_bts_noirq(tracer); +} + +static int ds_selftest_bts_bad_release_noirq(int cpu, + struct bts_tracer *tracer) +{ + int error = -EPERM; + + /* Try to release the tracer on the wrong cpu. */ + get_cpu(); + if (cpu != smp_processor_id()) { + error = ds_release_bts_noirq(tracer); + if (error != -EPERM) + printk(KERN_CONT "release on wrong cpu..."); + } + put_cpu(); + + return error ? 0 : -1; +} + +int ds_selftest_bts(void) +{ + struct ds_selftest_bts_conf conf; + unsigned char buffer[BUFFER_SIZE]; + int cpu; + + printk(KERN_INFO "[ds] bts selftest..."); + conf.error = 0; + + get_online_cpus(); + for_each_online_cpu(cpu) { + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + smp_call_function_single(cpu, ds_selftest_bts_cpu, &conf, 1); + if (conf.error >= 0) { + conf.error = + ds_selftest_bts_bad_release_noirq(cpu, + conf.tracer); + /* We must not release the tracer twice. */ + if (conf.error < 0) + conf.tracer = NULL; + } + smp_call_function_single(cpu, ds_release_bts_noirq_wrap, + conf.tracer, 1); + if (conf.error < 0) + goto out; + } + + conf.error = 0; out: - ds_release_bts(tracer); + put_online_cpus(); + printk(KERN_CONT "%s.\n", (conf.error ? "failed" : "passed")); - printk(KERN_CONT "%s.\n", (error ? "failed" : "passed")); - - return error; + return conf.error; } int ds_selftest_pebs(void) From 3a68eef945b234f286406d96dc690fe17863c203 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:45 +0200 Subject: [PATCH 0273/2061] x86, ds: add task tracing selftest Add selftests to cover per-task branch tracing. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144600.329346000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 71 +++++++++++++++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 599a9630062..a40b2533c71 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -280,10 +280,51 @@ static int ds_selftest_bts_bad_release_noirq(int cpu, return error ? 0 : -1; } +static int ds_selftest_bts_bad_request_cpu(int cpu, void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + ds_release_bts(tracer); + error = 0; + } + + if (error != -EPERM) + printk(KERN_CONT "cpu/task tracing overlap..."); + + return error ? 0 : -1; +} + +static int ds_selftest_bts_bad_request_task(void *buffer) +{ + struct bts_tracer *tracer; + int error; + + /* Try to request cpu tracing while task tracing is active. */ + tracer = ds_request_bts_task(current, buffer, BUFFER_SIZE, NULL, + (size_t)-1, BTS_KERNEL); + error = PTR_ERR(tracer); + if (!IS_ERR(tracer)) { + error = 0; + ds_release_bts(tracer); + } + + if (error != -EPERM) + printk(KERN_CONT "task/cpu tracing overlap..."); + + return error ? 0 : -1; +} + int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; unsigned char buffer[BUFFER_SIZE]; + unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); @@ -297,6 +338,8 @@ int ds_selftest_bts(void) ds_request_bts_cpu(cpu, buffer, BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); ds_release_bts(conf.tracer); if (conf.error < 0) goto out; @@ -315,12 +358,40 @@ int ds_selftest_bts(void) if (conf.error < 0) conf.tracer = NULL; } + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_task(buffer); smp_call_function_single(cpu, ds_release_bts_noirq_wrap, conf.tracer, 1); if (conf.error < 0) goto out; } + conf.suspend = ds_suspend_bts_wrap; + conf.resume = ds_resume_bts_wrap; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts(conf.tracer); + if (conf.error < 0) + goto out; + + conf.suspend = ds_suspend_bts_noirq; + conf.resume = ds_resume_bts_noirq; + conf.tracer = + ds_request_bts_task(current, buffer, BUFFER_SIZE, + NULL, (size_t)-1, BTS_KERNEL); + local_irq_save(irq); + ds_selftest_bts_cpu(&conf); + if (conf.error >= 0) + conf.error = ds_selftest_bts_bad_request_cpu(0, buffer); + ds_release_bts_noirq(conf.tracer); + local_irq_restore(irq); + if (conf.error < 0) + goto out; + conf.error = 0; out: put_online_cpus(); From 2311f0de21c17b2a8b960677a9cccfbfa52beb35 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:46 +0200 Subject: [PATCH 0274/2061] x86, ds: add leakage warning Add a warning in case a debug store context is not removed before the task it is attached to is freed. Remove the old warning at thread exit. It is too early. Declare the debug store context field in thread_struct unconditionally. Remove ds_copy_thread() and ds_exit_thread() and do the work directly in process*.c. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144601.254472000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 9 --------- arch/x86/include/asm/processor.h | 4 +--- arch/x86/kernel/ds.c | 10 ---------- arch/x86/kernel/process.c | 5 +++-- arch/x86/kernel/process_32.c | 3 ++- arch/x86/kernel/process_64.c | 3 ++- 6 files changed, 8 insertions(+), 26 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 413e127e567..149e5208e96 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -285,21 +285,12 @@ extern void __cpuinit ds_init_intel(struct cpuinfo_x86 *); */ extern void ds_switch_to(struct task_struct *prev, struct task_struct *next); -/* - * Task clone/init and cleanup work - */ -extern void ds_copy_thread(struct task_struct *tsk, struct task_struct *father); -extern void ds_exit_thread(struct task_struct *tsk); - #else /* CONFIG_X86_DS */ struct cpuinfo_x86; static inline void __cpuinit ds_init_intel(struct cpuinfo_x86 *ignored) {} static inline void ds_switch_to(struct task_struct *prev, struct task_struct *next) {} -static inline void ds_copy_thread(struct task_struct *tsk, - struct task_struct *father) {} -static inline void ds_exit_thread(struct task_struct *tsk) {} #endif /* CONFIG_X86_DS */ #endif /* _ASM_X86_DS_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 1efeb497f1f..7c39de7e709 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -454,10 +454,8 @@ struct thread_struct { unsigned io_bitmap_max; /* MSR_IA32_DEBUGCTLMSR value to switch in if TIF_DEBUGCTLMSR is set. */ unsigned long debugctlmsr; -#ifdef CONFIG_X86_DS -/* Debug Store context; see include/asm-x86/ds.h; goes into MSR_IA32_DS_AREA */ + /* Debug Store context; see asm/ds.h */ struct ds_context *ds_ctx; -#endif /* CONFIG_X86_DS */ }; static inline unsigned long native_get_debugreg(int regno) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 21a3852abf6..71cab3b62dc 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1352,16 +1352,6 @@ void ds_switch_to(struct task_struct *prev, struct task_struct *next) update_debugctlmsr(debugctlmsr); } -void ds_copy_thread(struct task_struct *tsk, struct task_struct *father) -{ - clear_tsk_thread_flag(tsk, TIF_DS_AREA_MSR); - tsk->thread.ds_ctx = NULL; -} - -void ds_exit_thread(struct task_struct *tsk) -{ -} - static __init int ds_selftest(void) { if (ds_cfg.sizeof_rec[ds_bts]) { diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..fb5dfb891f0 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -14,6 +14,7 @@ #include #include #include +#include unsigned long idle_halt; EXPORT_SYMBOL(idle_halt); @@ -45,6 +46,8 @@ void free_thread_xstate(struct task_struct *tsk) kmem_cache_free(task_xstate_cachep, tsk->thread.xstate); tsk->thread.xstate = NULL; } + + WARN(tsk->thread.ds_ctx, "leaking DS context\n"); } void free_thread_info(struct thread_info *ti) @@ -83,8 +86,6 @@ void exit_thread(void) put_cpu(); kfree(bp); } - - ds_exit_thread(current); } void flush_thread(void) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..b5e4bfef447 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -290,7 +290,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.io_bitmap_max = 0; } - ds_copy_thread(p, current); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..5a1a1de292e 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -335,7 +335,8 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, goto out; } - ds_copy_thread(p, me); + clear_tsk_thread_flag(p, TIF_DS_AREA_MSR); + p->thread.ds_ctx = NULL; clear_tsk_thread_flag(p, TIF_DEBUGCTLMSR); p->thread.debugctlmsr = 0; From ee811517a5604aa63fae803b7c044712699e1303 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:47 +0200 Subject: [PATCH 0275/2061] x86, ds: use single debug store cpu configuration Use a single configuration for all cpus. Reported-by: Ingo Molnar Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144602.191165000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 71cab3b62dc..443f415441d 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -47,9 +47,8 @@ struct ds_configuration { /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; -static DEFINE_PER_CPU(struct ds_configuration, ds_cfg_array); +static struct ds_configuration ds_cfg __read_mostly; -#define ds_cfg per_cpu(ds_cfg_array, smp_processor_id()) /* Maximal size of a DS configuration: */ #define MAX_SIZEOF_DS (12 * 8) @@ -1268,6 +1267,10 @@ ds_configure(const struct ds_configuration *cfg, void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) { + /* Only configure the first cpu. Others are identical. */ + if (ds_cfg.name) + return; + switch (c->x86) { case 0x6: switch (c->x86_model) { From 0f4814065ff8c24ca8bfd75c9b73502be152c287 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:48 +0200 Subject: [PATCH 0276/2061] x86, ptrace: add bts context unconditionally Add the ptrace bts context field to task_struct unconditionally. Initialize the field directly in copy_process(). Remove all the unneeded functionality used to initialize that field. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144603.292754000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ptrace.h | 9 ++++----- arch/x86/kernel/ptrace.c | 20 +------------------- include/linux/ptrace.h | 10 ---------- include/linux/sched.h | 2 -- kernel/fork.c | 4 ++-- kernel/ptrace.c | 10 ---------- 6 files changed, 7 insertions(+), 48 deletions(-) diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index e304b66abee..5cdd19f20b5 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -235,12 +235,11 @@ extern int do_get_thread_area(struct task_struct *p, int idx, extern int do_set_thread_area(struct task_struct *p, int idx, struct user_desc __user *info, int can_allocate); -extern void x86_ptrace_untrace(struct task_struct *); -extern void x86_ptrace_fork(struct task_struct *child, - unsigned long clone_flags); +#ifdef CONFIG_X86_PTRACE_BTS +extern void ptrace_bts_untrace(struct task_struct *tsk); -#define arch_ptrace_untrace(tsk) x86_ptrace_untrace(tsk) -#define arch_ptrace_fork(child, flags) x86_ptrace_fork(child, flags) +#define arch_ptrace_untrace(tsk) ptrace_bts_untrace(tsk) +#endif /* CONFIG_X86_PTRACE_BTS */ #endif /* __KERNEL__ */ diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index adbb24322d8..b32a8ee5338 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -887,37 +887,19 @@ static int ptrace_bts_size(struct task_struct *child) return (trace->ds.top - trace->ds.begin) / trace->ds.size; } -static inline void ptrace_bts_fork(struct task_struct *tsk) -{ - tsk->bts = NULL; -} - /* * Called from __ptrace_unlink() after the child has been moved back * to its original parent. */ -static inline void ptrace_bts_untrace(struct task_struct *child) +void ptrace_bts_untrace(struct task_struct *child) { if (unlikely(child->bts)) { free_bts_context(child->bts); child->bts = NULL; } } -#else -static inline void ptrace_bts_fork(struct task_struct *tsk) {} -static inline void ptrace_bts_untrace(struct task_struct *child) {} #endif /* CONFIG_X86_PTRACE_BTS */ -void x86_ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - ptrace_bts_fork(child); -} - -void x86_ptrace_untrace(struct task_struct *child) -{ - ptrace_bts_untrace(child); -} - /* * Called by kernel/ptrace.c when detaching.. * diff --git a/include/linux/ptrace.h b/include/linux/ptrace.h index 67c15653fc2..59e133d39d5 100644 --- a/include/linux/ptrace.h +++ b/include/linux/ptrace.h @@ -95,7 +95,6 @@ extern void __ptrace_link(struct task_struct *child, struct task_struct *new_parent); extern void __ptrace_unlink(struct task_struct *child); extern void exit_ptrace(struct task_struct *tracer); -extern void ptrace_fork(struct task_struct *task, unsigned long clone_flags); #define PTRACE_MODE_READ 1 #define PTRACE_MODE_ATTACH 2 /* Returns 0 on success, -errno on denial. */ @@ -327,15 +326,6 @@ static inline void user_enable_block_step(struct task_struct *task) #define arch_ptrace_untrace(task) do { } while (0) #endif -#ifndef arch_ptrace_fork -/* - * Do machine-specific work to initialize a new task. - * - * This is called from copy_process(). - */ -#define arch_ptrace_fork(child, clone_flags) do { } while (0) -#endif - extern int task_current_syscall(struct task_struct *target, long *callno, unsigned long args[6], unsigned int maxargs, unsigned long *sp, unsigned long *pc); diff --git a/include/linux/sched.h b/include/linux/sched.h index 52b8cd049c2..451186a22ef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1205,13 +1205,11 @@ struct task_struct { struct list_head ptraced; struct list_head ptrace_entry; -#ifdef CONFIG_X86_PTRACE_BTS /* * This is the tracer handle for the ptrace BTS extension. * This field actually belongs to the ptracer task. */ struct bts_context *bts; -#endif /* CONFIG_X86_PTRACE_BTS */ /* PID/PID hash table linkage. */ struct pid_link pids[PIDTYPE_MAX]; diff --git a/kernel/fork.c b/kernel/fork.c index 660c2b8765b..69bde7a22e9 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1086,8 +1086,8 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif - if (unlikely(current->ptrace)) - ptrace_fork(p, clone_flags); + + p->bts = NULL; /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); diff --git a/kernel/ptrace.c b/kernel/ptrace.c index aaad0ec3419..321127d965c 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -26,16 +26,6 @@ #include -/* - * Initialize a new task whose father had been ptraced. - * - * Called from copy_process(). - */ -void ptrace_fork(struct task_struct *child, unsigned long clone_flags) -{ - arch_ptrace_fork(child, clone_flags); -} - /* * ptrace a task: make the debugger its new parent and * move it to the ptrace list. From 6047550d3d26fed88b18a208b31f8b90b5ef3e9b Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:49 +0200 Subject: [PATCH 0277/2061] x86, ds: dont use TIF_DEBUGCTLMSR Debug store already uses TIF_DS_AREA_MSR to trigger debug store context switch handling. No need to use TIF_DEBUGCTLMSR, as well. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144604.256645000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 443f415441d..cab28320dac 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -925,11 +925,6 @@ static void update_task_debugctlmsr(struct task_struct *task, get_cpu(); if (task == current) update_debugctlmsr(debugctlmsr); - - if (task->thread.debugctlmsr) - set_tsk_thread_flag(task, TIF_DEBUGCTLMSR); - else - clear_tsk_thread_flag(task, TIF_DEBUGCTLMSR); put_cpu(); } From 608780a9048efa3e85fbc4d8649b26805cc588aa Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:50 +0200 Subject: [PATCH 0278/2061] x86, ds: fix bad ds_reset_pebs() Ds_reset_pebs() passed the wrong qualifier to a shared function resulting in a reset of bts, rather than pebs. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144605.206510000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index cab28320dac..ebfb0fde8e6 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -1186,7 +1186,7 @@ int ds_reset_pebs(struct pebs_tracer *tracer) tracer->trace.ds.top = tracer->trace.ds.begin; - ds_set(tracer->ds.context->ds, ds_bts, ds_index, + ds_set(tracer->ds.context->ds, ds_pebs, ds_index, (unsigned long)tracer->trace.ds.top); return 0; From 150f5164c1258e05b7dea16f29e592f354c48f34 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:51 +0200 Subject: [PATCH 0279/2061] x86, ds: allow small debug store buffers Check the buffer size more precisely to allow buffers for exactly one element provided the base address is already properly aligned. Add a debug store selftest. Reported-by: Stephane Eranian Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144606.139137000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds.c | 9 +++++++-- arch/x86/kernel/ds_selftest.c | 6 +++--- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index ebfb0fde8e6..4e05157506a 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -656,6 +656,7 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, { struct ds_context *context; int error; + size_t req_size; error = -EOPNOTSUPP; if (!ds_cfg.sizeof_rec[qual]) @@ -665,9 +666,13 @@ static int ds_request(struct ds_tracer *tracer, struct ds_trace *trace, if (!base) goto out; - /* We need space for alignment adjustments in ds_init_ds_trace(). */ + req_size = ds_cfg.sizeof_rec[qual]; + /* We might need space for alignment adjustments. */ + if (!IS_ALIGNED((unsigned long)base, DS_ALIGNMENT)) + req_size += DS_ALIGNMENT; + error = -EINVAL; - if (size < (DS_ALIGNMENT + ds_cfg.sizeof_rec[qual])) + if (size < req_size) goto out; if (th != (size_t)-1) { diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index a40b2533c71..5f104a0ace6 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -16,8 +16,8 @@ #include -#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ - +#define BUFFER_SIZE 521 /* Intentionally chose an odd size. */ +#define SMALL_BUFFER_SIZE 24 /* A single bts entry. */ struct ds_selftest_bts_conf { struct bts_tracer *tracer; @@ -381,7 +381,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, BUFFER_SIZE, + ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); From 017bc617657c928cb9a0c45a7a7e9f4e66695347 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 3 Apr 2009 16:43:52 +0200 Subject: [PATCH 0280/2061] x86, ds: support Core i7 Add debug store support for Core i7. Core i7 adds a reset value for each performance counter and a new PEBS record format. Signed-off-by: Markus Metzger Cc: roland@redhat.com Cc: eranian@googlemail.com Cc: oleg@redhat.com Cc: juan.villacis@intel.com Cc: ak@linux.jf.intel.com LKML-Reference: <20090403144607.088997000@intel.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/ds.h | 12 +++++-- arch/x86/kernel/ds.c | 69 +++++++++++++++++++++++++++++++++++---- 2 files changed, 71 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/ds.h b/arch/x86/include/asm/ds.h index 149e5208e96..70dac199b09 100644 --- a/arch/x86/include/asm/ds.h +++ b/arch/x86/include/asm/ds.h @@ -234,8 +234,12 @@ struct bts_trace { struct pebs_trace { struct ds_trace ds; - /* the PEBS reset value */ - unsigned long long reset_value; + /* the number of valid counters in the below array */ + unsigned int counters; + +#define MAX_PEBS_COUNTERS 4 + /* the counter reset value */ + unsigned long long counter_reset[MAX_PEBS_COUNTERS]; }; @@ -270,9 +274,11 @@ extern int ds_reset_pebs(struct pebs_tracer *tracer); * Returns 0 on success; -Eerrno on error * * tracer: the tracer handle returned from ds_request_pebs() + * counter: the index of the counter * value: the new counter reset value */ -extern int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value); +extern int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value); /* * Initialization diff --git a/arch/x86/kernel/ds.c b/arch/x86/kernel/ds.c index 4e05157506a..48bfe138603 100644 --- a/arch/x86/kernel/ds.c +++ b/arch/x86/kernel/ds.c @@ -44,6 +44,9 @@ struct ds_configuration { /* The size of a BTS/PEBS record in bytes: */ unsigned char sizeof_rec[2]; + /* The number of pebs counter reset values in the DS structure. */ + unsigned char nr_counter_reset; + /* Control bit-masks indexed by enum ds_feature: */ unsigned long ctl[dsf_ctl_max]; }; @@ -51,7 +54,7 @@ static struct ds_configuration ds_cfg __read_mostly; /* Maximal size of a DS configuration: */ -#define MAX_SIZEOF_DS (12 * 8) +#define MAX_SIZEOF_DS 0x80 /* Maximal size of a BTS record: */ #define MAX_SIZEOF_BTS (3 * 8) @@ -59,6 +62,12 @@ static struct ds_configuration ds_cfg __read_mostly; /* BTS and PEBS buffer alignment: */ #define DS_ALIGNMENT (1 << 3) +/* Number of buffer pointers in DS: */ +#define NUM_DS_PTR_FIELDS 8 + +/* Size of a pebs reset value in DS: */ +#define PEBS_RESET_FIELD_SIZE 8 + /* Mask of control bits in the DS MSR register: */ #define BTS_CONTROL \ ( ds_cfg.ctl[dsf_bts] | \ @@ -1164,9 +1173,12 @@ const struct pebs_trace *ds_read_pebs(struct pebs_tracer *tracer) return NULL; ds_read_config(tracer->ds.context, &tracer->trace.ds, ds_pebs); - tracer->trace.reset_value = - *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)); + + tracer->trace.counters = ds_cfg.nr_counter_reset; + memcpy(tracer->trace.counter_reset, + tracer->ds.context->ds + + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field), + ds_cfg.nr_counter_reset * PEBS_RESET_FIELD_SIZE); return &tracer->trace; } @@ -1197,13 +1209,18 @@ int ds_reset_pebs(struct pebs_tracer *tracer) return 0; } -int ds_set_pebs_reset(struct pebs_tracer *tracer, u64 value) +int ds_set_pebs_reset(struct pebs_tracer *tracer, + unsigned int counter, u64 value) { if (!tracer) return -EINVAL; + if (ds_cfg.nr_counter_reset < counter) + return -EINVAL; + *(u64 *)(tracer->ds.context->ds + - (ds_cfg.sizeof_ptr_field * 8)) = value; + (NUM_DS_PTR_FIELDS * ds_cfg.sizeof_ptr_field) + + (counter * PEBS_RESET_FIELD_SIZE)) = value; return 0; } @@ -1213,16 +1230,26 @@ static const struct ds_configuration ds_cfg_netburst = { .ctl[dsf_bts] = (1 << 2) | (1 << 3), .ctl[dsf_bts_kernel] = (1 << 5), .ctl[dsf_bts_user] = (1 << 6), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_pentium_m = { .name = "Pentium M", .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .nr_counter_reset = 1, }; static const struct ds_configuration ds_cfg_core2_atom = { .name = "Core 2/Atom", .ctl[dsf_bts] = (1 << 6) | (1 << 7), .ctl[dsf_bts_kernel] = (1 << 9), .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 1, +}; +static const struct ds_configuration ds_cfg_core_i7 = { + .name = "Core i7", + .ctl[dsf_bts] = (1 << 6) | (1 << 7), + .ctl[dsf_bts_kernel] = (1 << 9), + .ctl[dsf_bts_user] = (1 << 10), + .nr_counter_reset = 4, }; static void @@ -1239,6 +1266,32 @@ ds_configure(const struct ds_configuration *cfg, nr_pebs_fields = 18; #endif + /* + * Starting with version 2, architectural performance + * monitoring supports a format specifier. + */ + if ((cpuid_eax(0xa) & 0xff) > 1) { + unsigned long perf_capabilities, format; + + rdmsrl(MSR_IA32_PERF_CAPABILITIES, perf_capabilities); + + format = (perf_capabilities >> 8) & 0xf; + + switch (format) { + case 0: + nr_pebs_fields = 18; + break; + case 1: + nr_pebs_fields = 22; + break; + default: + printk(KERN_INFO + "[ds] unknown PEBS format: %lu\n", format); + nr_pebs_fields = 0; + break; + } + } + memset(&ds_cfg, 0, sizeof(ds_cfg)); ds_cfg = *cfg; @@ -1262,7 +1315,7 @@ ds_configure(const struct ds_configuration *cfg, printk("bts/pebs record: %u/%u bytes\n", ds_cfg.sizeof_rec[ds_bts], ds_cfg.sizeof_rec[ds_pebs]); - WARN_ON_ONCE(MAX_SIZEOF_DS < (12 * ds_cfg.sizeof_ptr_field)); + WARN_ON_ONCE(MAX_PEBS_COUNTERS < ds_cfg.nr_counter_reset); } void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) @@ -1284,6 +1337,8 @@ void __cpuinit ds_init_intel(struct cpuinfo_x86 *c) ds_configure(&ds_cfg_core2_atom, c); break; case 0x1a: /* Core i7 */ + ds_configure(&ds_cfg_core_i7, c); + break; default: /* Sorry, don't know about them. */ break; From a5dec5573f3c7e63f2f9b5852b9759ea342a5ff9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 27 Mar 2009 14:55:44 +0800 Subject: [PATCH 0281/2061] tracing: use macros to denote usec and nsec per second Impact: cleanup Use USEC_PER_SEC and NSEC_PER_SEC instead of 1000000 and 1000000000. Signed-off-by: Li Zefan LKML-Reference: <49CC7870.9000309@cn.fujitsu.com> Acked-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_boot.c | 5 +++-- kernel/trace/trace_mmiotrace.c | 6 ++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_boot.c b/kernel/trace/trace_boot.c index 7a30fc4c364..a29ef23ffb4 100644 --- a/kernel/trace/trace_boot.c +++ b/kernel/trace/trace_boot.c @@ -9,6 +9,7 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" @@ -67,7 +68,7 @@ initcall_call_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); call = &field->boot_call; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] calling %s @ %i\n", (unsigned long)ts, nsec_rem, call->func, call->caller); @@ -92,7 +93,7 @@ initcall_ret_print_line(struct trace_iterator *iter) trace_assign_type(field, entry); init_ret = &field->boot_ret; ts = iter->ts; - nsec_rem = do_div(ts, 1000000000); + nsec_rem = do_div(ts, NSEC_PER_SEC); ret = trace_seq_printf(s, "[%5ld.%09ld] initcall %s " "returned %d after %llu msecs\n", diff --git a/kernel/trace/trace_mmiotrace.c b/kernel/trace/trace_mmiotrace.c index 8e37fcddd8b..d53b45ed080 100644 --- a/kernel/trace/trace_mmiotrace.c +++ b/kernel/trace/trace_mmiotrace.c @@ -9,6 +9,8 @@ #include #include #include +#include + #include #include "trace.h" @@ -174,7 +176,7 @@ static enum print_line_t mmio_print_rw(struct trace_iterator *iter) struct mmiotrace_rw *rw; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret = 1; @@ -221,7 +223,7 @@ static enum print_line_t mmio_print_map(struct trace_iterator *iter) struct mmiotrace_map *m; struct trace_seq *s = &iter->seq; unsigned long long t = ns2usecs(iter->ts); - unsigned long usec_rem = do_div(t, 1000000ULL); + unsigned long usec_rem = do_div(t, USEC_PER_SEC); unsigned secs = (unsigned long)t; int ret; From 5452af664f6fba26b80eb2c8c4ceae2999d5cf56 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 27 Mar 2009 00:25:38 +0100 Subject: [PATCH 0282/2061] tracing/ftrace: factorize the tracing files creation Impact: cleanup Most of the tracing files creation follow the same pattern: ret = debugfs_create_file(...) if (!ret) pr_warning("Couldn't create ... entry\n") Unify it! Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker LKML-Reference: <1238109938-11840-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 39 ++------ kernel/trace/ring_buffer.c | 7 +- kernel/trace/trace.c | 143 +++++++++++------------------ kernel/trace/trace.h | 6 ++ kernel/trace/trace_event_profile.c | 1 - kernel/trace/trace_printk.c | 6 +- kernel/trace/trace_stack.c | 13 +-- kernel/trace/trace_sysprof.c | 6 +- 8 files changed, 78 insertions(+), 143 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 678e3d6caf8..6ea5a1ae6a9 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2698,38 +2698,23 @@ static const struct file_operations ftrace_graph_fops = { static __init int ftrace_init_dyn_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("available_filter_functions", 0444, - d_tracer, NULL, &ftrace_avail_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'available_filter_functions' entry\n"); + trace_create_file("available_filter_functions", 0444, + d_tracer, NULL, &ftrace_avail_fops); - entry = debugfs_create_file("failures", 0444, - d_tracer, NULL, &ftrace_failures_fops); - if (!entry) - pr_warning("Could not create debugfs 'failures' entry\n"); + trace_create_file("failures", 0444, + d_tracer, NULL, &ftrace_failures_fops); - entry = debugfs_create_file("set_ftrace_filter", 0644, d_tracer, - NULL, &ftrace_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_filter' entry\n"); + trace_create_file("set_ftrace_filter", 0644, d_tracer, + NULL, &ftrace_filter_fops); - entry = debugfs_create_file("set_ftrace_notrace", 0644, d_tracer, + trace_create_file("set_ftrace_notrace", 0644, d_tracer, NULL, &ftrace_notrace_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_notrace' entry\n"); #ifdef CONFIG_FUNCTION_GRAPH_TRACER - entry = debugfs_create_file("set_graph_function", 0444, d_tracer, + trace_create_file("set_graph_function", 0444, d_tracer, NULL, &ftrace_graph_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_graph_function' entry\n"); #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ return 0; @@ -2987,7 +2972,6 @@ static const struct file_operations ftrace_pid_fops = { static __init int ftrace_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -2995,11 +2979,8 @@ static __init int ftrace_init_debugfs(void) ftrace_init_dyn_debugfs(d_tracer); - entry = debugfs_create_file("set_ftrace_pid", 0644, d_tracer, - NULL, &ftrace_pid_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'set_ftrace_pid' entry\n"); + trace_create_file("set_ftrace_pid", 0644, d_tracer, + NULL, &ftrace_pid_fops); ftrace_profile_debugfs(d_tracer); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 960cbf44c84..74a11808c28 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2845,14 +2845,11 @@ static const struct file_operations rb_simple_fops = { static __init int rb_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_on", 0644, d_tracer, - &ring_buffer_flags, &rb_simple_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_on' entry\n"); + trace_create_file("tracing_on", 0644, d_tracer, + &ring_buffer_flags, &rb_simple_fops); return 0; } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 32653c8c6e2..0615751a3ed 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3581,7 +3581,7 @@ struct dentry *tracing_dentry_percpu(void) static void tracing_init_debugfs_percpu(long cpu) { struct dentry *d_percpu = tracing_dentry_percpu(); - struct dentry *entry, *d_cpu; + struct dentry *d_cpu; /* strlen(cpu) + MAX(log10(cpu)) + '\0' */ char cpu_dir[7]; @@ -3596,21 +3596,15 @@ static void tracing_init_debugfs_percpu(long cpu) } /* per cpu trace_pipe */ - entry = debugfs_create_file("trace_pipe", 0444, d_cpu, - (void *) cpu, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe' entry\n"); + trace_create_file("trace_pipe", 0444, d_cpu, + (void *) cpu, &tracing_pipe_fops); /* per cpu trace */ - entry = debugfs_create_file("trace", 0644, d_cpu, - (void *) cpu, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("trace", 0644, d_cpu, + (void *) cpu, &tracing_fops); - entry = debugfs_create_file("trace_pipe_raw", 0444, d_cpu, - (void *) cpu, &tracing_buffers_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_pipe_raw' entry\n"); + trace_create_file("trace_pipe_raw", 0444, d_cpu, + (void *) cpu, &tracing_buffers_fops); } #ifdef CONFIG_FTRACE_SELFTEST @@ -3766,6 +3760,22 @@ static const struct file_operations trace_options_core_fops = { .write = trace_options_core_write, }; +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops) +{ + struct dentry *ret; + + ret = debugfs_create_file(name, mode, parent, data, fops); + if (!ret) + pr_warning("Could not create debugfs '%s' entry\n", name); + + return ret; +} + + static struct dentry *trace_options_init_dentry(void) { struct dentry *d_tracer; @@ -3793,7 +3803,6 @@ create_trace_option_file(struct trace_option_dentry *topt, struct tracer_opt *opt) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) @@ -3802,11 +3811,9 @@ create_trace_option_file(struct trace_option_dentry *topt, topt->flags = flags; topt->opt = opt; - entry = debugfs_create_file(opt->name, 0644, t_options, topt, + topt->entry = trace_create_file(opt->name, 0644, t_options, topt, &trace_options_fops); - topt->entry = entry; - } static struct trace_option_dentry * @@ -3861,123 +3868,81 @@ static struct dentry * create_trace_option_core_file(const char *option, long index) { struct dentry *t_options; - struct dentry *entry; t_options = trace_options_init_dentry(); if (!t_options) return NULL; - entry = debugfs_create_file(option, 0644, t_options, (void *)index, + return trace_create_file(option, 0644, t_options, (void *)index, &trace_options_core_fops); - - return entry; } static __init void create_trace_options_dir(void) { struct dentry *t_options; - struct dentry *entry; int i; t_options = trace_options_init_dentry(); if (!t_options) return; - for (i = 0; trace_options[i]; i++) { - entry = create_trace_option_core_file(trace_options[i], i); - if (!entry) - pr_warning("Could not create debugfs %s entry\n", - trace_options[i]); - } + for (i = 0; trace_options[i]; i++) + create_trace_option_core_file(trace_options[i], i); } static __init int tracer_init_debugfs(void) { struct dentry *d_tracer; - struct dentry *entry; int cpu; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("tracing_enabled", 0644, d_tracer, - &global_trace, &tracing_ctrl_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_enabled' entry\n"); + trace_create_file("tracing_enabled", 0644, d_tracer, + &global_trace, &tracing_ctrl_fops); - entry = debugfs_create_file("trace_options", 0644, d_tracer, - NULL, &tracing_iter_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace_options' entry\n"); + trace_create_file("trace_options", 0644, d_tracer, + NULL, &tracing_iter_fops); - create_trace_options_dir(); + trace_create_file("tracing_cpumask", 0644, d_tracer, + NULL, &tracing_cpumask_fops); - entry = debugfs_create_file("tracing_cpumask", 0644, d_tracer, - NULL, &tracing_cpumask_fops); - if (!entry) - pr_warning("Could not create debugfs 'tracing_cpumask' entry\n"); + trace_create_file("trace", 0644, d_tracer, + (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - entry = debugfs_create_file("trace", 0644, d_tracer, - (void *) TRACE_PIPE_ALL_CPU, &tracing_fops); - if (!entry) - pr_warning("Could not create debugfs 'trace' entry\n"); + trace_create_file("available_tracers", 0444, d_tracer, + &global_trace, &show_traces_fops); - entry = debugfs_create_file("available_tracers", 0444, d_tracer, - &global_trace, &show_traces_fops); - if (!entry) - pr_warning("Could not create debugfs 'available_tracers' entry\n"); + trace_create_file("current_tracer", 0444, d_tracer, + &global_trace, &set_tracer_fops); - entry = debugfs_create_file("current_tracer", 0444, d_tracer, - &global_trace, &set_tracer_fops); - if (!entry) - pr_warning("Could not create debugfs 'current_tracer' entry\n"); + trace_create_file("tracing_max_latency", 0644, d_tracer, + &tracing_max_latency, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_max_latency", 0644, d_tracer, - &tracing_max_latency, - &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_max_latency' entry\n"); + trace_create_file("tracing_thresh", 0644, d_tracer, + &tracing_thresh, &tracing_max_lat_fops); - entry = debugfs_create_file("tracing_thresh", 0644, d_tracer, - &tracing_thresh, &tracing_max_lat_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'tracing_thresh' entry\n"); - entry = debugfs_create_file("README", 0644, d_tracer, - NULL, &tracing_readme_fops); - if (!entry) - pr_warning("Could not create debugfs 'README' entry\n"); + trace_create_file("README", 0644, d_tracer, + NULL, &tracing_readme_fops); - entry = debugfs_create_file("trace_pipe", 0444, d_tracer, + trace_create_file("trace_pipe", 0444, d_tracer, (void *) TRACE_PIPE_ALL_CPU, &tracing_pipe_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_pipe' entry\n"); - entry = debugfs_create_file("buffer_size_kb", 0644, d_tracer, - &global_trace, &tracing_entries_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'buffer_size_kb' entry\n"); + trace_create_file("buffer_size_kb", 0644, d_tracer, + &global_trace, &tracing_entries_fops); - entry = debugfs_create_file("trace_marker", 0220, d_tracer, - NULL, &tracing_mark_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'trace_marker' entry\n"); + trace_create_file("trace_marker", 0220, d_tracer, + NULL, &tracing_mark_fops); #ifdef CONFIG_DYNAMIC_FTRACE - entry = debugfs_create_file("dyn_ftrace_total_info", 0444, d_tracer, - &ftrace_update_tot_cnt, - &tracing_dyn_info_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'dyn_ftrace_total_info' entry\n"); + trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, + &ftrace_update_tot_cnt, &tracing_dyn_info_fops); #endif #ifdef CONFIG_SYSPROF_TRACER init_tracer_sysprof_debugfs(d_tracer); #endif + create_trace_options_dir(); + for_each_tracing_cpu(cpu) tracing_init_debugfs_percpu(cpu); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 47aa6d0c97a..f76a8f8689d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -470,6 +470,12 @@ void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); int tracing_open_generic(struct inode *inode, struct file *filp); +struct dentry *trace_create_file(const char *name, + mode_t mode, + struct dentry *parent, + void *data, + const struct file_operations *fops); + struct dentry *tracing_init_dentry(void); void init_tracer_sysprof_debugfs(struct dentry *d_tracer); diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 22cba997077..199de9c7422 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -28,4 +28,3 @@ void ftrace_profile_disable(int event_id) return event->profile_disable(event); } } - diff --git a/kernel/trace/trace_printk.c b/kernel/trace/trace_printk.c index eb81556107f..9bece9687b6 100644 --- a/kernel/trace/trace_printk.c +++ b/kernel/trace/trace_printk.c @@ -245,17 +245,13 @@ static const struct file_operations ftrace_formats_fops = { static __init int init_trace_printk_function_export(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); if (!d_tracer) return 0; - entry = debugfs_create_file("printk_formats", 0444, d_tracer, + trace_create_file("printk_formats", 0444, d_tracer, NULL, &ftrace_formats_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'printk_formats' entry\n"); return 0; } diff --git a/kernel/trace/trace_stack.c b/kernel/trace/trace_stack.c index c750f65f966..1796f00524e 100644 --- a/kernel/trace/trace_stack.c +++ b/kernel/trace/trace_stack.c @@ -352,19 +352,14 @@ __setup("stacktrace", enable_stacktrace); static __init int stack_trace_init(void) { struct dentry *d_tracer; - struct dentry *entry; d_tracer = tracing_init_dentry(); - entry = debugfs_create_file("stack_max_size", 0644, d_tracer, - &max_stack_size, &stack_max_size_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_max_size' entry\n"); + trace_create_file("stack_max_size", 0644, d_tracer, + &max_stack_size, &stack_max_size_fops); - entry = debugfs_create_file("stack_trace", 0444, d_tracer, - NULL, &stack_trace_fops); - if (!entry) - pr_warning("Could not create debugfs 'stack_trace' entry\n"); + trace_create_file("stack_trace", 0444, d_tracer, + NULL, &stack_trace_fops); if (stack_tracer_enabled) register_ftrace_function(&trace_ops); diff --git a/kernel/trace/trace_sysprof.c b/kernel/trace/trace_sysprof.c index 91fd19c2149..e04b76cc238 100644 --- a/kernel/trace/trace_sysprof.c +++ b/kernel/trace/trace_sysprof.c @@ -321,11 +321,7 @@ static const struct file_operations sysprof_sample_fops = { void init_tracer_sysprof_debugfs(struct dentry *d_tracer) { - struct dentry *entry; - entry = debugfs_create_file("sysprof_sample_period", 0644, + trace_create_file("sysprof_sample_period", 0644, d_tracer, NULL, &sysprof_sample_fops); - if (entry) - return; - pr_warning("Could not create debugfs 'sysprof_sample_period' entry\n"); } From 597af81537654097b67fd7a0c92775e66d4a86fe Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 3 Apr 2009 15:24:12 -0400 Subject: [PATCH 0283/2061] function-graph: use int instead of atomic for ftrace_graph_active Impact: cleanup The variable ftrace_graph_active is only modified under the ftrace_lock mutex, thus an atomic is not necessary for modification. Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ftrace.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 6ea5a1ae6a9..8e6a0b5c994 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3092,7 +3092,7 @@ ftrace_enable_sysctl(struct ctl_table *table, int write, #ifdef CONFIG_FUNCTION_GRAPH_TRACER -static atomic_t ftrace_graph_active; +static int ftrace_graph_active; static struct notifier_block ftrace_suspend_notifier; int ftrace_graph_entry_stub(struct ftrace_graph_ent *trace) @@ -3244,7 +3244,7 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, mutex_lock(&ftrace_lock); /* we currently allow only one tracer registered at a time */ - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { ret = -EBUSY; goto out; } @@ -3252,10 +3252,10 @@ int register_ftrace_graph(trace_func_graph_ret_t retfunc, ftrace_suspend_notifier.notifier_call = ftrace_suspend_notifier_call; register_pm_notifier(&ftrace_suspend_notifier); - atomic_inc(&ftrace_graph_active); + ftrace_graph_active++; ret = start_graph_tracing(); if (ret) { - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; goto out; } @@ -3273,10 +3273,10 @@ void unregister_ftrace_graph(void) { mutex_lock(&ftrace_lock); - if (!unlikely(atomic_read(&ftrace_graph_active))) + if (unlikely(!ftrace_graph_active)) goto out; - atomic_dec(&ftrace_graph_active); + ftrace_graph_active--; unregister_trace_sched_switch(ftrace_graph_probe_sched_switch); ftrace_graph_return = (trace_func_graph_ret_t)ftrace_stub; ftrace_graph_entry = ftrace_graph_entry_stub; @@ -3290,7 +3290,7 @@ void unregister_ftrace_graph(void) /* Allocate a return stack for newly created task */ void ftrace_graph_init_task(struct task_struct *t) { - if (atomic_read(&ftrace_graph_active)) { + if (ftrace_graph_active) { t->ret_stack = kmalloc(FTRACE_RETFUNC_DEPTH * sizeof(struct ftrace_ret_stack), GFP_KERNEL); From dcef788eb9659b61a2110284fcce3ca6e63480d2 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Tue, 31 Mar 2009 15:26:14 +0800 Subject: [PATCH 0284/2061] ftrace: clean up enable logic for sched_switch Unify sched_switch and sched_wakeup's action to following logic: Do record_cmdline when start_cmdline_record() is called. Start tracing events when the tracer is started. Signed-off-by: Zhao Lei LKML-Reference: <49D1C596.5050203@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_switch.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9117cea6f1a..9d8cccdfaa0 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -29,13 +29,13 @@ probe_sched_switch(struct rq *__rq, struct task_struct *prev, int cpu; int pc; - if (!sched_ref || sched_stopped) + if (unlikely(!sched_ref)) return; tracing_record_cmdline(prev); tracing_record_cmdline(next); - if (!tracer_enabled) + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); @@ -56,15 +56,15 @@ probe_sched_wakeup(struct rq *__rq, struct task_struct *wakee, int success) unsigned long flags; int cpu, pc; - if (!likely(tracer_enabled)) + if (unlikely(!sched_ref)) + return; + + tracing_record_cmdline(current); + + if (!tracer_enabled || sched_stopped) return; pc = preempt_count(); - tracing_record_cmdline(current); - - if (sched_stopped) - return; - local_irq_save(flags); cpu = raw_smp_processor_id(); data = ctx_trace->data[cpu]; From 9c83633ad38138855181af6936e8ac570ef7e2cb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 7 Apr 2009 14:48:16 +0300 Subject: [PATCH 0285/2061] missing unlock in jfs_quota_write() We should unlock &inode->i_mutex on the error path. This bug was in ext2_quota_write(). I sent a patch to them today as well. Found by smatch (http://repo.or.cz/w/smatch.git). Compile tested. regards, dan carpenter Signed-off-by: Dan Carpenter Signed-off-by: Dave Kleikamp --- fs/jfs/super.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 6f21adf9479..d9b0e92b360 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -720,8 +720,10 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, blk++; } out: - if (len == towrite) + if (len == towrite) { + mutex_unlock(&inode->i_mutex); return err; + } if (inode->i_size < off+len-towrite) i_size_write(inode, off+len-towrite); inode->i_version++; From 169aafbc8d3f05431b5cfeb60294a12b8ef2bcee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 7 Apr 2009 13:37:26 -0700 Subject: [PATCH 0286/2061] lguest: update lazy mmu changes to match lguest's use of kvm hypercalls Duplicate hcall -> kvm_hypercall0 convertion from "lguest: use KVM hypercalls". Signed-off-by: Jeremy Fitzhardinge Cc: Matias Zabaljauregui Cc: Rusty Russell --- arch/x86/lguest/boot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index 5ab239711cc..cfb2d68dc79 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -168,7 +168,7 @@ static void lazy_hcall3(unsigned long call, * issue the do-nothing hypercall to flush any stored calls. */ static void lguest_leave_lazy_mmu_mode(void) { - hcall(LHCALL_FLUSH_ASYNC, 0, 0, 0); + kvm_hypercall0(LHCALL_FLUSH_ASYNC); paravirt_leave_lazy_mmu(); } From 44bc9dc729e33a4ec6ebed4d0b6c08e8d20b42cf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:47:17 +0200 Subject: [PATCH 0287/2061] mm, x86, ptrace, bts: defer branch trace stopping, cleanup Andrew Morton noticed that mm.h needlessly includes sched.h - remove it. Reported-by: Andrew Morton Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 64d8ed2538a..776b641f37e 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -13,7 +13,6 @@ #include #include #include -#include struct mempolicy; struct anon_vma; From a34b50ddc265bae058c66661b096ef6384c5a8b1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 8 Apr 2009 10:56:54 +0200 Subject: [PATCH 0288/2061] mm, x86, ptrace, bts: defer branch trace stopping, remove dead code Remove the unused free_locked_buffer() API. Signed-off-by: Ingo Molnar --- include/linux/mm.h | 1 - mm/mlock.c | 6 ------ 2 files changed, 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index 776b641f37e..a3963ba23a6 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1320,7 +1320,6 @@ int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); extern void *alloc_locked_buffer(size_t size); -extern void free_locked_buffer(void *buffer, size_t size); extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 749383b442c..28be15ead9c 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -671,9 +671,3 @@ void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) up_write(&mm->mmap_sem); } - -void free_locked_buffer(void *buffer, size_t size) -{ - refund_locked_buffer_memory(current->mm, size); - kfree(buffer); -} From dc66270b51a62b1a6888d5309229e638a305c47b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 8 Apr 2009 20:30:10 +1000 Subject: [PATCH 0289/2061] perf_counter: fix powerpc build Commit 4af4998b ("perf_counter: rework context time") changed struct perf_counter_context to have a 'time' field instead of a 'time_now' field, but neglected to fix the place in the powerpc perf_counter.c where the time_now field was accessed. This fixes it. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18908.31922.411398.147810@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index f88c35d0710..0e5651385dd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -457,8 +457,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) { counter->state = PERF_COUNTER_STATE_ACTIVE; counter->oncpu = cpu; - counter->tstamp_running += counter->ctx->time_now - - counter->tstamp_stopped; + counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) counter->hw_ops->enable(counter); } From f708223d49ac39f5af1643985056206c98033f5b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 8 Apr 2009 20:30:18 +1000 Subject: [PATCH 0290/2061] perf_counter: powerpc: set sample enable bit for marked instruction events Impact: enable access to hardware feature POWER processors have the ability to "mark" a subset of the instructions and provide more detailed information on what happens to the marked instructions as they flow through the pipeline. This marking is enabled by the "sample enable" bit in MMCRA, and there are synchronization requirements around setting and clearing the bit. This adds logic to the processor-specific back-ends so that they know which events relate to marked instructions and set the sampling enable bit if any event that we want to put on the PMU is a marked instruction event. It also adds logic to the generic powerpc code to do the necessary synchronization if that bit is set. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18908.31930.1024.228867@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 28 +++++-- arch/powerpc/kernel/power5+-pmu.c | 103 ++++++++++++++++++++++- arch/powerpc/kernel/power5-pmu.c | 96 +++++++++++++++++++++- arch/powerpc/kernel/power6-pmu.c | 126 ++++++++++++++++++++++++++++- arch/powerpc/kernel/ppc970-pmu.c | 72 ++++++++++++++++- 5 files changed, 413 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0e5651385dd..0697ade84dd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -306,6 +306,15 @@ u64 hw_perf_save_disable(void) cpuhw->pmcs_enabled = 1; } + /* + * Disable instruction sampling if it was enabled + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mtspr(SPRN_MMCRA, + cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); + mb(); + } + /* * Set the 'freeze counters' bit. * The barrier is to make sure the mtspr has been @@ -347,12 +356,11 @@ void hw_perf_restore(u64 disable) * (possibly updated for removal of counters). */ if (!cpuhw->n_added) { - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); if (cpuhw->n_counters == 0) get_lppaca()->pmcregs_in_use = 0; - goto out; + goto out_enable; } /* @@ -385,7 +393,7 @@ void hw_perf_restore(u64 disable) * Then unfreeze the counters. */ get_lppaca()->pmcregs_in_use = 1; - mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2] & ~MMCRA_SAMPLE_ENABLE); mtspr(SPRN_MMCR1, cpuhw->mmcr[1]); mtspr(SPRN_MMCR0, (cpuhw->mmcr[0] & ~(MMCR0_PMC1CE | MMCR0_PMCjCE)) | MMCR0_FC); @@ -421,10 +429,20 @@ void hw_perf_restore(u64 disable) write_pmc(counter->hw.idx, val); perf_counter_update_userpage(counter); } - mb(); cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; + + out_enable: + mb(); mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + /* + * Enable instruction sampling if necessary + */ + if (cpuhw->mmcr[2] & MMCRA_SAMPLE_ENABLE) { + mb(); + mtspr(SPRN_MMCRA, cpuhw->mmcr[2]); + } + out: local_irq_restore(flags); } diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index cec21ea65b0..1222c8ea3c2 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -1,5 +1,5 @@ /* - * Performance counter support for POWER5 (not POWER5++) processors. + * Performance counter support for POWER5+/++ (not POWER5) processors. * * Copyright 2009 Paul Mackerras, IBM Corporation. * @@ -281,10 +281,107 @@ static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) return nalt; } +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. + */ +static unsigned char direct_event_is_marked[0x28] = { + 0, /* 00 */ + 0x1f, /* 01 PM_IOPS_CMPL */ + 0x2, /* 02 PM_MRK_GRP_DISP */ + 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0, /* 04 */ + 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ + 0x80, /* 06 */ + 0x80, /* 07 */ + 0, 0, 0,/* 08 - 0a */ + 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ + 0, /* 0c */ + 0x80, /* 0d */ + 0x80, /* 0e */ + 0, /* 0f */ + 0, /* 10 */ + 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ + 0, /* 12 */ + 0x10, /* 13 PM_MRK_GRP_CMPL */ + 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x2, /* 15 PM_MRK_GRP_ISSUED */ + 0x80, /* 16 */ + 0x80, /* 17 */ + 0, 0, 0, 0, 0, + 0x80, /* 1d */ + 0x80, /* 1e */ + 0, /* 1f */ + 0x80, /* 20 */ + 0x80, /* 21 */ + 0x80, /* 22 */ + 0x80, /* 23 */ + 0x80, /* 24 */ + 0x80, /* 25 */ + 0x80, /* 26 */ + 0x80, /* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int power5p_marked_instr_event(unsigned int event) +{ + int pmc, psel; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + if (direct_event_is_marked[psel] & (1 << pmc)) + return 1; + if (direct_event_is_marked[psel] & 0x80) + bit = 4; + else if (psel == 0x08) + bit = pmc - 1; + else if (psel == 0x10) + bit = 4 - pmc; + else if (psel == 0x1b && (pmc == 1 || pmc == 3)) + bit = 4; + } else if ((psel & 0x48) == 0x40) { + bit = psel & 7; + } else if (psel == 0x28) { + bit = pmc - 1; + } else if (pmc == 3 && (psel == 0x2e || psel == 0x2f)) { + bit = 4; + } + + if (!(event & PM_BUSEVENT_MSK) || bit == -1) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit == PM_LSU0) { + /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ + mask = 0x5dff00; + } else if (unit == PM_LSU1 && byte >= 4) { + byte -= 4; + /* byte 5 bits 6-7, byte 6 bits 0,4, byte 7 bits 0-4,6 */ + mask = 0x5f11c000; + } else + return 0; + + return (mask >> (byte * 8 + bit)) & 1; +} + static int power5p_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; unsigned int pmc, unit, byte, psel; unsigned int ttm; int i, isbus, bit, grsel; @@ -404,6 +501,8 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; mmcr1 |= (u64)grsel << grsel_shift[bit]; } + if (power5p_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; if ((psel & 0x58) == 0x40 && (byte & 1) != ((pmc >> 1) & 1)) /* select alternate byte lane */ psel |= 0x10; @@ -419,7 +518,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0x3e) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 379ed1087cc..116c4bb1809 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -290,10 +290,102 @@ static int power5_get_alternatives(unsigned int event, unsigned int alt[]) return nalt; } +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value, bit i (LE) set if PMC i is a marked event. + * Bit 0 is set if it is marked for all PMCs. + * The 0x80 bit indicates a byte decode PMCSEL value. + */ +static unsigned char direct_event_is_marked[0x28] = { + 0, /* 00 */ + 0x1f, /* 01 PM_IOPS_CMPL */ + 0x2, /* 02 PM_MRK_GRP_DISP */ + 0xe, /* 03 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0, /* 04 */ + 0x1c, /* 05 PM_MRK_BRU_FIN, PM_MRK_INST_FIN, PM_MRK_CRU_FIN */ + 0x80, /* 06 */ + 0x80, /* 07 */ + 0, 0, 0,/* 08 - 0a */ + 0x18, /* 0b PM_THRESH_TIMEO, PM_MRK_GRP_TIMEO */ + 0, /* 0c */ + 0x80, /* 0d */ + 0x80, /* 0e */ + 0, /* 0f */ + 0, /* 10 */ + 0x14, /* 11 PM_MRK_GRP_BR_REDIR, PM_MRK_GRP_IC_MISS */ + 0, /* 12 */ + 0x10, /* 13 PM_MRK_GRP_CMPL */ + 0x1f, /* 14 PM_GRP_MRK, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x2, /* 15 PM_MRK_GRP_ISSUED */ + 0x80, /* 16 */ + 0x80, /* 17 */ + 0, 0, 0, 0, 0, + 0x80, /* 1d */ + 0x80, /* 1e */ + 0, /* 1f */ + 0x80, /* 20 */ + 0x80, /* 21 */ + 0x80, /* 22 */ + 0x80, /* 23 */ + 0x80, /* 24 */ + 0x80, /* 25 */ + 0x80, /* 26 */ + 0x80, /* 27 */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int power5_marked_instr_event(unsigned int event) +{ + int pmc, psel; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + if (direct_event_is_marked[psel] & (1 << pmc)) + return 1; + if (direct_event_is_marked[psel] & 0x80) + bit = 4; + else if (psel == 0x08) + bit = pmc - 1; + else if (psel == 0x10) + bit = 4 - pmc; + else if (psel == 0x1b && (pmc == 1 || pmc == 3)) + bit = 4; + } else if ((psel & 0x58) == 0x40) + bit = psel & 7; + + if (!(event & PM_BUSEVENT_MSK)) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + if (unit == PM_LSU0) { + /* byte 1 bits 0-7, byte 2 bits 0,2-4,6 */ + mask = 0x5dff00; + } else if (unit == PM_LSU1 && byte >= 4) { + byte -= 4; + /* byte 4 bits 1,3,5,7, byte 5 bits 6-7, byte 7 bits 0-4,6 */ + mask = 0x5f00c0aa; + } else + return 0; + + return (mask >> (byte * 8 + bit)) & 1; +} + static int power5_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; unsigned int pmc, unit, byte, psel; unsigned int ttm, grp; int i, isbus, bit, grsel; @@ -430,6 +522,8 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev, grsel = (event[i] >> PM_GRS_SH) & PM_GRS_MSK; mmcr1 |= (u64)grsel << grsel_shift[bit]; } + if (power5_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; if (pmc <= 3) mmcr1 |= psel << MMCR1_PMCSEL_SH(pmc); hwc[i] = pmc; @@ -442,7 +536,7 @@ static int power5_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0x3e) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index b1f61f3c97b..fce1fc290a1 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -48,6 +48,127 @@ #define MMCR1_PMCSEL_SH(n) (MMCR1_PMC1SEL_SH - (n) * 8) #define MMCR1_PMCSEL_MSK 0xff +/* + * Map of which direct events on which PMCs are marked instruction events. + * Indexed by PMCSEL value >> 1. + * Bottom 4 bits are a map of which PMCs are interesting, + * top 4 bits say what sort of event: + * 0 = direct marked event, + * 1 = byte decode event, + * 4 = add/and event (PMC1 -> bits 0 & 4), + * 5 = add/and event (PMC1 -> bits 1 & 5), + * 6 = add/and event (PMC1 -> bits 2 & 6), + * 7 = add/and event (PMC1 -> bits 3 & 7). + */ +static unsigned char direct_event_is_marked[0x60 >> 1] = { + 0, /* 00 */ + 0, /* 02 */ + 0, /* 04 */ + 0x07, /* 06 PM_MRK_ST_CMPL, PM_MRK_ST_GPS, PM_MRK_ST_CMPL_INT */ + 0x04, /* 08 PM_MRK_DFU_FIN */ + 0x06, /* 0a PM_MRK_IFU_FIN, PM_MRK_INST_FIN */ + 0, /* 0c */ + 0, /* 0e */ + 0x02, /* 10 PM_MRK_INST_DISP */ + 0x08, /* 12 PM_MRK_LSU_DERAT_MISS */ + 0, /* 14 */ + 0, /* 16 */ + 0x0c, /* 18 PM_THRESH_TIMEO, PM_MRK_INST_FIN */ + 0x0f, /* 1a PM_MRK_INST_DISP, PM_MRK_{FXU,FPU,LSU}_FIN */ + 0x01, /* 1c PM_MRK_INST_ISSUED */ + 0, /* 1e */ + 0, /* 20 */ + 0, /* 22 */ + 0, /* 24 */ + 0, /* 26 */ + 0x15, /* 28 PM_MRK_DATA_FROM_L2MISS, PM_MRK_DATA_FROM_L3MISS */ + 0, /* 2a */ + 0, /* 2c */ + 0, /* 2e */ + 0x4f, /* 30 */ + 0x7f, /* 32 */ + 0x4f, /* 34 */ + 0x5f, /* 36 */ + 0x6f, /* 38 */ + 0x4f, /* 3a */ + 0, /* 3c */ + 0x08, /* 3e PM_MRK_INST_TIMEO */ + 0x1f, /* 40 */ + 0x1f, /* 42 */ + 0x1f, /* 44 */ + 0x1f, /* 46 */ + 0x1f, /* 48 */ + 0x1f, /* 4a */ + 0x1f, /* 4c */ + 0x1f, /* 4e */ + 0, /* 50 */ + 0x05, /* 52 PM_MRK_BR_TAKEN, PM_MRK_BR_MPRED */ + 0x1c, /* 54 PM_MRK_PTEG_FROM_L3MISS, PM_MRK_PTEG_FROM_L2MISS */ + 0x02, /* 56 PM_MRK_LD_MISS_L1 */ + 0, /* 58 */ + 0, /* 5a */ + 0, /* 5c */ + 0, /* 5e */ +}; + +/* + * Masks showing for each unit which bits are marked events. + * These masks are in LE order, i.e. 0x00000001 is byte 0, bit 0. + */ +static u32 marked_bus_events[16] = { + 0x01000000, /* direct events set 1: byte 3 bit 0 */ + 0x00010000, /* direct events set 2: byte 2 bit 0 */ + 0, 0, 0, 0, /* IDU, IFU, nest: nothing */ + 0x00000088, /* VMX set 1: byte 0 bits 3, 7 */ + 0x000000c0, /* VMX set 2: byte 0 bits 4-7 */ + 0x04010000, /* LSU set 1: byte 2 bit 0, byte 3 bit 2 */ + 0xff010000u, /* LSU set 2: byte 2 bit 0, all of byte 3 */ + 0, /* LSU set 3 */ + 0x00000010, /* VMX set 3: byte 0 bit 4 */ + 0, /* BFP set 1 */ + 0x00000022, /* BFP set 2: byte 0 bits 1, 5 */ + 0, 0 +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int power6_marked_instr_event(unsigned int event) +{ + int pmc, psel, ptype; + int bit, byte, unit; + u32 mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = (event & PM_PMCSEL_MSK) >> 1; /* drop edge/level bit */ + if (pmc >= 5) + return 0; + + bit = -1; + if (psel < sizeof(direct_event_is_marked)) { + ptype = direct_event_is_marked[psel]; + if (pmc == 0 || !(ptype & (1 << (pmc - 1)))) + return 0; + ptype >>= 4; + if (ptype == 0) + return 1; + if (ptype == 1) + bit = 0; + else + bit = ptype ^ (pmc - 1); + } else if ((psel & 0x48) == 0x40) + bit = psel & 7; + + if (!(event & PM_BUSEVENT_MSK) || bit == -1) + return 0; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = marked_bus_events[unit]; + return (mask >> (byte * 8 + bit)) & 1; +} + /* * Assign PMC numbers and compute MMCR1 value for a set of events */ @@ -55,6 +176,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; + u64 mmcra = 0; int i; unsigned int pmc, ev, b, u, s, psel; unsigned int ttmset = 0; @@ -116,6 +238,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, if (ev & PM_LLAV) mmcr1 |= MMCR1_PMC1_LLA_VALUE >> pmc; } + if (power6_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); } mmcr[0] = 0; @@ -124,7 +248,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, if (pmc_inuse & 0xe) mmcr[0] |= MMCR0_PMCjCE; mmcr[1] = mmcr1; - mmcr[2] = 0; + mmcr[2] = mmcra; return 0; } diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index c3256580be1..aed8ccd7c07 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -19,6 +19,8 @@ #define PM_PMC_MSK 0xf #define PM_UNIT_SH 8 /* TTMMUX number and setting - unit select */ #define PM_UNIT_MSK 0xf +#define PM_SPCSEL_SH 6 +#define PM_SPCSEL_MSK 3 #define PM_BYTE_SH 4 /* Byte number of event bus to use */ #define PM_BYTE_MSK 3 #define PM_PMCSEL_MSK 0xf @@ -88,8 +90,11 @@ static short mmcr1_adder_bits[8] = { * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 - * <><>[ >[ >[ >< >< >< >< ><><><><><><><><> - * T0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * <><><>[ >[ >[ >< >< >< >< ><><><><><><><><> + * SPT0T1 UC PS1 PS2 B0 B1 B2 B3 P1P2P3P4P5P6P7P8 + * + * SP - SPCSEL constraint + * 48-49: SPCSEL value 0x3_0000_0000_0000 * * T0 - TTM0 constraint * 46-47: TTM0SEL value (0=FPU, 2=IFU, 3=VPU) 0xC000_0000_0000 @@ -126,6 +131,57 @@ static short mmcr1_adder_bits[8] = { * 0-13: Count of events needing PMC2..PMC8 */ +static unsigned char direct_marked_event[8] = { + (1<<2) | (1<<3), /* PMC1: PM_MRK_GRP_DISP, PM_MRK_ST_CMPL */ + (1<<3) | (1<<5), /* PMC2: PM_THRESH_TIMEO, PM_MRK_BRU_FIN */ + (1<<3) | (1<<5), /* PMC3: PM_MRK_ST_CMPL_INT, PM_MRK_VMX_FIN */ + (1<<4) | (1<<5), /* PMC4: PM_MRK_GRP_CMPL, PM_MRK_CRU_FIN */ + (1<<4) | (1<<5), /* PMC5: PM_GRP_MRK, PM_MRK_GRP_TIMEO */ + (1<<3) | (1<<4) | (1<<5), + /* PMC6: PM_MRK_ST_STS, PM_MRK_FXU_FIN, PM_MRK_GRP_ISSUED */ + (1<<4) | (1<<5), /* PMC7: PM_MRK_FPU_FIN, PM_MRK_INST_FIN */ + (1<<4) /* PMC8: PM_MRK_LSU_FIN */ +}; + +/* + * Returns 1 if event counts things relating to marked instructions + * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. + */ +static int p970_marked_instr_event(unsigned int event) +{ + int pmc, psel, unit, byte, bit; + unsigned int mask; + + pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + psel = event & PM_PMCSEL_MSK; + if (pmc) { + if (direct_marked_event[pmc - 1] & (1 << psel)) + return 1; + if (psel == 0) /* add events */ + bit = (pmc <= 4)? pmc - 1: 8 - pmc; + else if (psel == 7 || psel == 13) /* decode events */ + bit = 4; + else + return 0; + } else + bit = psel; + + byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; + unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; + mask = 0; + switch (unit) { + case PM_VPU: + mask = 0x4c; /* byte 0 bits 2,3,6 */ + case PM_LSU0: + /* byte 2 bits 0,2,3,4,6; all of byte 1 */ + mask = 0x085dff00; + case PM_LSU1L: + mask = 0x50 << 24; /* byte 3 bits 4,6 */ + break; + } + return (mask >> (byte * 8 + bit)) & 1; +} + /* Masks and values for using events from the various units */ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_FPU] = { 0xc80000000000ull, 0x040000000000ull }, @@ -138,7 +194,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) { - int pmc, byte, unit, sh; + int pmc, byte, unit, sh, spcsel; u64 mask = 0, value = 0; int grp = -1; @@ -177,6 +233,11 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) mask |= 0x800000000ull; value |= 0x100000000ull; } + spcsel = (event >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; + if (spcsel) { + mask |= 3ull << 48; + value |= (u64)spcsel << 48; + } *maskp = mask; *valp = value; return 0; @@ -209,6 +270,7 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev, unsigned char ttmuse[2]; unsigned char pmcsel[8]; int i; + int spcsel; if (n_ev > 8) return -1; @@ -316,6 +378,10 @@ static int p970_compute_mmcr(unsigned int event[], int n_ev, } pmcsel[pmc] = psel; hwc[i] = pmc; + spcsel = (event[i] >> PM_SPCSEL_SH) & PM_SPCSEL_MSK; + mmcr1 |= spcsel; + if (p970_marked_instr_event(event[i])) + mmcra |= MMCRA_SAMPLE_ENABLE; } for (pmc = 0; pmc < 2; ++pmc) mmcr0 |= pmcsel[pmc] << (MMCR0_PMC1SEL_SH - 7 * pmc); From bab5bc9e857638880facef76e4b4c3fa807f8c73 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Tue, 7 Apr 2009 23:23:50 -0700 Subject: [PATCH 0291/2061] futex: fixup unlocked requeue pi case Thomas's testing caught a problem when the requeue target futex is unowned and multiple tasks are requeued to it. This patch ensures the FUTEX_WAITERS bit gets set if futex_requeue() will requeue one or more tasks in addition to the one acquiring the lock. Signed-off-by: Darren Hart Signed-off-by: Thomas Gleixner --- kernel/futex.c | 65 ++++++++++++++++++++++++++++++++++---------------- 1 file changed, 44 insertions(+), 21 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 185c981d89e..041bf3ac4be 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -565,12 +565,14 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, /** * futex_lock_pi_atomic() - atomic work required to acquire a pi aware futex - * @uaddr: the pi futex user address - * @hb: the pi futex hash bucket - * @key: the futex key associated with uaddr and hb - * @ps: the pi_state pointer where we store the result of the lookup - * @task: the task to perform the atomic lock work for. This will be - * "current" except in the case of requeue pi. + * @uaddr: the pi futex user address + * @hb: the pi futex hash bucket + * @key: the futex key associated with uaddr and hb + * @ps: the pi_state pointer where we store the result of the + * lookup + * @task: the task to perform the atomic lock work for. This will + * be "current" except in the case of requeue pi. + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Returns: * 0 - ready to wait @@ -582,7 +584,7 @@ lookup_pi_state(u32 uval, struct futex_hash_bucket *hb, static int futex_lock_pi_atomic(u32 __user *uaddr, struct futex_hash_bucket *hb, union futex_key *key, struct futex_pi_state **ps, - struct task_struct *task) + struct task_struct *task, int set_waiters) { int lock_taken, ret, ownerdied = 0; u32 uval, newval, curval; @@ -596,6 +598,8 @@ retry: * the locks. It will most likely not succeed. */ newval = task_pid_vnr(task); + if (set_waiters) + newval |= FUTEX_WAITERS; curval = cmpxchg_futex_value_locked(uaddr, 0, newval); @@ -1004,14 +1008,18 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) /** * futex_proxy_trylock_atomic() - Attempt an atomic lock for the top waiter - * @pifutex: the user address of the to futex - * @hb1: the from futex hash bucket, must be locked by the caller - * @hb2: the to futex hash bucket, must be locked by the caller - * @key1: the from futex key - * @key2: the to futex key + * @pifutex: the user address of the to futex + * @hb1: the from futex hash bucket, must be locked by the caller + * @hb2: the to futex hash bucket, must be locked by the caller + * @key1: the from futex key + * @key2: the to futex key + * @ps: address to store the pi_state pointer + * @set_waiters: force setting the FUTEX_WAITERS bit (1) or not (0) * * Try and get the lock on behalf of the top waiter if we can do it atomically. - * Wake the top waiter if we succeed. hb1 and hb2 must be held by the caller. + * Wake the top waiter if we succeed. If the caller specified set_waiters, + * then direct futex_lock_pi_atomic() to force setting the FUTEX_WAITERS bit. + * hb1 and hb2 must be held by the caller. * * Returns: * 0 - failed to acquire the lock atomicly @@ -1022,15 +1030,23 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, struct futex_hash_bucket *hb1, struct futex_hash_bucket *hb2, union futex_key *key1, union futex_key *key2, - struct futex_pi_state **ps) + struct futex_pi_state **ps, int set_waiters) { - struct futex_q *top_waiter; + struct futex_q *top_waiter = NULL; u32 curval; int ret; if (get_futex_value_locked(&curval, pifutex)) return -EFAULT; + /* + * Find the top_waiter and determine if there are additional waiters. + * If the caller intends to requeue more than 1 waiter to pifutex, + * force futex_lock_pi_atomic() to set the FUTEX_WAITERS bit now, + * as we have means to handle the possible fault. If not, don't set + * the bit unecessarily as it will force the subsequent unlock to enter + * the kernel. + */ top_waiter = futex_top_waiter(hb1, key1); /* There are no waiters, nothing for us to do. */ @@ -1038,10 +1054,12 @@ static int futex_proxy_trylock_atomic(u32 __user *pifutex, return 0; /* - * Either take the lock for top_waiter or set the FUTEX_WAITERS bit. - * The pi_state is returned in ps in contended cases. + * Try to take the lock for top_waiter. Set the FUTEX_WAITERS bit in + * the contended case or if set_waiters is 1. The pi_state is returned + * in ps in contended cases. */ - ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task); + ret = futex_lock_pi_atomic(pifutex, hb2, key2, ps, top_waiter->task, + set_waiters); if (ret == 1) requeue_pi_wake_futex(top_waiter, key2); @@ -1146,9 +1164,14 @@ retry_private: } if (requeue_pi && (task_count - nr_wake < nr_requeue)) { - /* Attempt to acquire uaddr2 and wake the top_waiter. */ + /* + * Attempt to acquire uaddr2 and wake the top waiter. If we + * intend to requeue waiters, force setting the FUTEX_WAITERS + * bit. We force this here where we are able to easily handle + * faults rather in the requeue loop below. + */ ret = futex_proxy_trylock_atomic(uaddr2, hb1, hb2, &key1, - &key2, &pi_state); + &key2, &pi_state, nr_requeue); /* * At this point the top_waiter has either taken uaddr2 or is @@ -1810,7 +1833,7 @@ retry: retry_private: hb = queue_lock(&q); - ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current); + ret = futex_lock_pi_atomic(uaddr, hb, &q.key, &q.pi_state, current, 0); if (unlikely(ret)) { switch (ret) { case 1: From fcb2ac5bdfa3a7a04fb9749b916f64400f4c35a8 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:58 +0200 Subject: [PATCH 0292/2061] x86_32: introduce restore_fpu_checking() Impact: cleanup, prepare FPU code unificaton Like on x86_64, return an error from restore_fpu and kill the task if it fails. Also rename restore_fpu to restore_fpu_checking which allows ifdefs to be removed in math_state_restore(). Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-1-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 9 ++++----- arch/x86/kernel/traps.c | 5 +---- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 71c9e518398..09a2d6dfd85 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -185,12 +185,10 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline void restore_fpu(struct task_struct *tsk) +static inline int restore_fpu_checking(struct task_struct *tsk) { - if (task_thread_info(tsk)->status & TS_XSAVE) { - xrstor_checking(&tsk->thread.xstate->xsave); - return; - } + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); /* * The "nop" is needed to make the instructions the same * length. @@ -200,6 +198,7 @@ static inline void restore_fpu(struct task_struct *tsk) "fxrstor %1", X86_FEATURE_FXSR, "m" (tsk->thread.xstate->fxsave)); + return 0; } /* We need a safe address that is cheap to find and that is already diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a1d288327ff..d696145855b 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -839,9 +839,6 @@ asmlinkage void math_state_restore(void) } clts(); /* Allow maths ops (or we recurse) */ -#ifdef CONFIG_X86_32 - restore_fpu(tsk); -#else /* * Paranoid restore. send a SIGSEGV if we fail to restore the state. */ @@ -850,7 +847,7 @@ asmlinkage void math_state_restore(void) force_sig(SIGSEGV, tsk); return; } -#endif + thread->status |= TS_USEDFPU; /* So we fnsave on switch_to() */ tsk->fpu_counter++; } From 34ba476a01e128aad51e02f9be854584e9ec73cf Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:31:59 +0200 Subject: [PATCH 0293/2061] x86: unify restore_fpu_checking Impact: cleanup On x86_32, separate f*rstor to an inline function which makes restore_fpu_checking the same on both platforms -> move it outside the ifdefs. Signed-off-by: Jiri Slaby LKML-Reference: <1239190320-23952-2-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 09a2d6dfd85..7a6f21d95cf 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -75,14 +75,6 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) return err; } -static inline int restore_fpu_checking(struct task_struct *tsk) -{ - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); - else - return fxrstor_checking(&tsk->thread.xstate->fxsave); -} - /* AMD CPUs don't save/restore FDP/FIP/FOP unless an exception is pending. Clear the x87 state here by setting it to fixed values. The kernel data segment can be sometimes 0 and sometimes @@ -185,10 +177,9 @@ static inline void tolerant_fwait(void) asm volatile("fnclex ; fwait"); } -static inline int restore_fpu_checking(struct task_struct *tsk) +/* perform fxrstor iff the processor has extended states, otherwise frstor */ +static inline int fxrstor_checking(struct i387_fxsave_struct *fx) { - if (task_thread_info(tsk)->status & TS_XSAVE) - return xrstor_checking(&tsk->thread.xstate->xsave); /* * The "nop" is needed to make the instructions the same * length. @@ -197,7 +188,8 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "nop ; frstor %1", "fxrstor %1", X86_FEATURE_FXSR, - "m" (tsk->thread.xstate->fxsave)); + "m" (*fx)); + return 0; } @@ -261,6 +253,14 @@ end: #endif /* CONFIG_X86_64 */ +static inline int restore_fpu_checking(struct task_struct *tsk) +{ + if (task_thread_info(tsk)->status & TS_XSAVE) + return xrstor_checking(&tsk->thread.xstate->xsave); + else + return fxrstor_checking(&tsk->thread.xstate->fxsave); +} + /* * Signal frame handlers... */ From 4ecf458492c2d97b3f9d850a5f92d79792e0a7e7 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 8 Apr 2009 13:32:00 +0200 Subject: [PATCH 0294/2061] x86_64: fix incorrect comments Impact: cleanup The comments which fxrstor_checking and fxsave_uset refer to is now in fxsave. Change the comments appropriately. Signed-off-by: Jiri Slaby Cc: Jiri Slaby LKML-Reference: <1239190320-23952-3-git-send-email-jirislaby@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/i387.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 7a6f21d95cf..63d185087d9 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -67,7 +67,7 @@ static inline int fxrstor_checking(struct i387_fxsave_struct *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err) -#if 0 /* See comment in __save_init_fpu() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "m" (*fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "m" (*fx), "0" (0)); @@ -112,7 +112,7 @@ static inline int fxsave_user(struct i387_fxsave_struct __user *fx) ".previous\n" _ASM_EXTABLE(1b, 3b) : [err] "=r" (err), "=m" (*fx) -#if 0 /* See comment in __fxsave_clear() below. */ +#if 0 /* See comment in fxsave() below. */ : [fx] "r" (fx), "0" (0)); #else : [fx] "cdaSDb" (fx), "0" (0)); From a59dacfdc9ba06903652fa4883bf1106278b18ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 14:38:08 +0200 Subject: [PATCH 0295/2061] x86 early quirks: eliminate unused function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Impact: cleanup this warning: arch/x86/kernel/early-quirks.c:99: warning: ‘ati_ixp4x0_rev’ defined but not used triggers because ati_ixp4x0_rev() is only used in the ACPI && X86_IO_APIC case. Signed-off-by: Ingo Molnar --- arch/x86/kernel/early-quirks.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 76b8cd953de..ebdb85cf268 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -96,6 +96,7 @@ static void __init nvidia_bugs(int num, int slot, int func) } +#if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) #if defined(CONFIG_ACPI) && defined(CONFIG_X86_IO_APIC) static u32 __init ati_ixp4x0_rev(int num, int slot, int func) { @@ -114,6 +115,7 @@ static u32 __init ati_ixp4x0_rev(int num, int slot, int func) d &= 0xff; return d; } +#endif static void __init ati_bugs(int num, int slot, int func) { From ceb5ac3264686e75e6951de6a18d4baa9bdecb92 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:15 -0500 Subject: [PATCH 0296/2061] swiotlb: comment corrections Impact: cleanup swiotlb_map/unmap_single are now swiotlb_map/unmap_page; trivially change all the comments to reference new names. Also, there were some comments that should have been referring to just plain old map_single, not swiotlb_map_single; fix those as well. Also change a use of the word "pointer", when what is referred to is actually a dma/physical address. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-2-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2b0b5a7d2ce..170cf56af6a 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -60,8 +60,8 @@ enum dma_sync_target { int swiotlb_force; /* - * Used to do a quick range check in swiotlb_unmap_single and - * swiotlb_sync_single_*, to see if the memory was in fact allocated by this + * Used to do a quick range check in unmap_single and + * sync_single_*, to see if the memory was in fact allocated by this * API. */ static char *io_tlb_start, *io_tlb_end; @@ -560,7 +560,6 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, size)) { /* * The allocated memory isn't reachable by the device. - * Fall back on swiotlb_map_single(). */ free_pages((unsigned long) ret, order); ret = NULL; @@ -568,9 +567,8 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, if (!ret) { /* * We are either out of memory or the device can't DMA - * to GFP_DMA memory; fall back on - * swiotlb_map_single(), which will grab memory from - * the lowest available address range. + * to GFP_DMA memory; fall back on map_single(), which + * will grab memory from the lowest available address range. */ ret = map_single(hwdev, 0, size, DMA_FROM_DEVICE); if (!ret) @@ -634,7 +632,7 @@ swiotlb_full(struct device *dev, size_t size, int dir, int do_panic) * physical address to use is returned. * * Once the device is given the dma address, the device owns this memory until - * either swiotlb_unmap_single or swiotlb_dma_sync_single is performed. + * either swiotlb_unmap_page or swiotlb_dma_sync_single is performed. */ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, unsigned long offset, size_t size, @@ -648,7 +646,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, BUG_ON(dir == DMA_NONE); /* - * If the pointer passed in happens to be in the device's DMA window, + * If the address happens to be in the device's DMA window, * we can safely return the device addr and not worry about bounce * buffering it. */ @@ -679,7 +677,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); /* * Unmap a single streaming mode DMA translation. The dma_addr and size must - * match what was provided for in a previous swiotlb_map_single call. All + * match what was provided for in a previous swiotlb_map_page call. All * other usages are undefined. * * After this call, reads by the cpu to the buffer are guaranteed to see @@ -703,7 +701,7 @@ EXPORT_SYMBOL_GPL(swiotlb_unmap_page); * Make physical memory consistent for a single streaming mode DMA translation * after a transfer. * - * If you perform a swiotlb_map_single() but wish to interrogate the buffer + * If you perform a swiotlb_map_page() but wish to interrogate the buffer * using the cpu, yet do not wish to teardown the dma mapping, you must * call this function before doing so. At the next point you give the dma * address back to the card, you must first perform a @@ -777,7 +775,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); /* * Map a set of buffers described by scatterlist in streaming mode for DMA. - * This is the scatter-gather version of the above swiotlb_map_single + * This is the scatter-gather version of the above swiotlb_map_page * interface. Here the scatter gather list elements are each tagged with the * appropriate dma address and length. They are obtained via * sg_dma_{address,length}(SG). @@ -788,7 +786,7 @@ EXPORT_SYMBOL_GPL(swiotlb_sync_single_range_for_device); * The routine returns the number of addr/length pairs actually * used, at most nents. * - * Device ownership issues as mentioned above for swiotlb_map_single are the + * Device ownership issues as mentioned above for swiotlb_map_page are the * same here. */ int @@ -836,7 +834,7 @@ EXPORT_SYMBOL(swiotlb_map_sg); /* * Unmap a set of streaming mode DMA translations. Again, cpu read rules - * concerning calls here are the same as for swiotlb_unmap_single() above. + * concerning calls here are the same as for swiotlb_unmap_page() above. */ void swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, From 67131ad0514d7105b55003a0506209cf1bba3f00 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:16 -0500 Subject: [PATCH 0297/2061] swiotlb: fix compile warning Squash a build warning seen on 32-bit powerpc caused by calling min() with 2 different types. Use min_t() instead. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-3-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 170cf56af6a..4fd6a76e728 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -341,7 +341,7 @@ static void swiotlb_bounce(phys_addr_t phys, char *dma_addr, size_t size, unsigned long flags; while (size) { - sz = min(PAGE_SIZE - offset, size); + sz = min_t(size_t, PAGE_SIZE - offset, size); local_irq_save(flags); buffer = kmap_atomic(pfn_to_page(pfn), From dd6b02fe427f30520d0adc94aa52352367227873 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:17 -0500 Subject: [PATCH 0298/2061] swiotlb: map_page fix for highmem systems The current code calls virt_to_phys() on address that might be in highmem, which is bad. This wasn't needed, anyway, because we already have the physical address we need. Get rid of the now-unused virtual address as well. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-4-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 4fd6a76e728..e8a47c8cf77 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -640,7 +640,6 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, struct dma_attrs *attrs) { phys_addr_t phys = page_to_phys(page) + offset; - void *ptr = page_address(page) + offset; dma_addr_t dev_addr = swiotlb_phys_to_bus(dev, phys); void *map; @@ -651,7 +650,7 @@ dma_addr_t swiotlb_map_page(struct device *dev, struct page *page, * buffering it. */ if (!address_needs_mapping(dev, dev_addr, size) && - !range_needs_mapping(virt_to_phys(ptr), size)) + !range_needs_mapping(phys, size)) return dev_addr; /* From ef5722f698bde01cfec2b98fff733a48663ebf55 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:18 -0500 Subject: [PATCH 0299/2061] swiotlb: allow arch override of address_needs_mapping Some architectures require additional checking to determine if a device can dma to an address and need to provide their own address_needs_mapping.. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-5-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index e8a47c8cf77..d81afab8516 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -145,6 +145,12 @@ static void *swiotlb_bus_to_virt(dma_addr_t address) return phys_to_virt(swiotlb_bus_to_phys(address)); } +int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, + dma_addr_t addr, size_t size) +{ + return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); +} + int __weak swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size) { return 0; @@ -309,10 +315,10 @@ cleanup1: return -ENOMEM; } -static int +static inline int address_needs_mapping(struct device *hwdev, dma_addr_t addr, size_t size) { - return !is_buffer_dma_capable(dma_get_mask(hwdev), addr, size); + return swiotlb_arch_address_needs_mapping(hwdev, addr, size); } static inline int range_needs_mapping(phys_addr_t paddr, size_t size) From 7fcebbd2d984eac3fdd6da2f4453e7c43d32de89 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:19 -0500 Subject: [PATCH 0300/2061] swiotlb: rename unmap_single to do_unmap_single Previously, swiotlb_unmap_page and swiotlb_unmap_sg were duplicating very similar code. Refactor that code into a new unmap_single and unmap_single use do_unmap_single. Note that the swiotlb_unmap_sg code was previously doing a complicated comparison to determine if an addresses needed to be unmapped where a simple is_swiotlb_buffer() call would have sufficed. Signed-off-by: Becky Bruce Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-6-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 42 +++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 17 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d81afab8516..2bde54a40d8 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -482,7 +482,7 @@ found: * dma_addr is the kernel virtual address of the bounce buffer to unmap. */ static void -unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) +do_unmap_single(struct device *hwdev, char *dma_addr, size_t size, int dir) { unsigned long flags; int i, count, nslots = ALIGN(size, 1 << IO_TLB_SHIFT) >> IO_TLB_SHIFT; @@ -591,7 +591,7 @@ swiotlb_alloc_coherent(struct device *hwdev, size_t size, (unsigned long long)dev_addr); /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, ret, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, ret, size, DMA_TO_DEVICE); return NULL; } *dma_handle = dev_addr; @@ -608,7 +608,7 @@ swiotlb_free_coherent(struct device *hwdev, size_t size, void *vaddr, free_pages((unsigned long) vaddr, get_order(size)); else /* DMA_TO_DEVICE to avoid memcpy in unmap_single */ - unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); + do_unmap_single(hwdev, vaddr, size, DMA_TO_DEVICE); } EXPORT_SYMBOL(swiotlb_free_coherent); @@ -688,17 +688,29 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); * After this call, reads by the cpu to the buffer are guaranteed to see * whatever the device wrote there. */ -void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, - size_t size, enum dma_data_direction dir, - struct dma_attrs *attrs) +static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, + size_t size, int dir) { char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - unmap_single(hwdev, dma_addr, size, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + + if (is_swiotlb_buffer(dma_addr)) { + do_unmap_single(hwdev, dma_addr, size, dir); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); +} + +void swiotlb_unmap_page(struct device *hwdev, dma_addr_t dev_addr, + size_t size, enum dma_data_direction dir, + struct dma_attrs *attrs) +{ + unmap_single(hwdev, dev_addr, size, dir); } EXPORT_SYMBOL_GPL(swiotlb_unmap_page); @@ -850,13 +862,9 @@ swiotlb_unmap_sg_attrs(struct device *hwdev, struct scatterlist *sgl, BUG_ON(dir == DMA_NONE); - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - unmap_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), - sg->dma_length, dir); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } + for_each_sg(sgl, sg, nelems, i) + unmap_single(hwdev, sg->dma_address, sg->dma_length, dir); + } EXPORT_SYMBOL(swiotlb_unmap_sg_attrs); From 380d687833aee098c4a2c3b35beaefe1c1f48d01 Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:20 -0500 Subject: [PATCH 0301/2061] swiotlb: use swiotlb_sync_single instead of duplicating code Right now both swiotlb_sync_single_range and swiotlb_sync_sg were duplicating the code in swiotlb_sync_single. Just call it instead. Also rearrange the sync_single code for readability. Note that the swiotlb_sync_sg code was previously doing a complicated comparison to determine if an addresses needed to be unmapped where a simple is_swiotlb_buffer() call would have sufficed. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-7-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- lib/swiotlb.c | 30 ++++++++++++------------------ 1 file changed, 12 insertions(+), 18 deletions(-) diff --git a/lib/swiotlb.c b/lib/swiotlb.c index 2bde54a40d8..d912f068145 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -731,10 +731,16 @@ swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, char *dma_addr = swiotlb_bus_to_virt(dev_addr); BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) + + if (is_swiotlb_buffer(dma_addr)) { sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + return; + } + + if (dir != DMA_FROM_DEVICE) + return; + + dma_mark_clean(dma_addr, size); } void @@ -761,13 +767,7 @@ swiotlb_sync_single_range(struct device *hwdev, dma_addr_t dev_addr, unsigned long offset, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr) + offset; - - BUG_ON(dir == DMA_NONE); - if (is_swiotlb_buffer(dma_addr)) - sync_single(hwdev, dma_addr, size, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(dma_addr, size); + swiotlb_sync_single(hwdev, dev_addr + offset, size, dir, target); } void @@ -890,15 +890,9 @@ swiotlb_sync_sg(struct device *hwdev, struct scatterlist *sgl, struct scatterlist *sg; int i; - BUG_ON(dir == DMA_NONE); - - for_each_sg(sgl, sg, nelems, i) { - if (sg->dma_address != swiotlb_phys_to_bus(hwdev, sg_phys(sg))) - sync_single(hwdev, swiotlb_bus_to_virt(sg->dma_address), + for_each_sg(sgl, sg, nelems, i) + swiotlb_sync_single(hwdev, sg->dma_address, sg->dma_length, dir, target); - else if (dir == DMA_FROM_DEVICE) - dma_mark_clean(swiotlb_bus_to_virt(sg->dma_address), sg->dma_length); - } } void From 42d7c5e353cef9062129b0de3ec9ddf10567b9ca Mon Sep 17 00:00:00 2001 From: Becky Bruce Date: Wed, 8 Apr 2009 09:09:21 -0500 Subject: [PATCH 0302/2061] swiotlb: change swiotlb_bus_to[phys,virt] prototypes Add a hwdev argument that is needed on some architectures in order to access a per-device offset that is taken into account when producing a physical address (also needed to get from bus address to virtual address because the physical address is an intermediate step). Also make swiotlb_bus_to_virt weak so architectures can override it. Signed-off-by: Becky Bruce Acked-by: FUJITA Tomonori Signed-off-by: Kumar Gala Cc: jeremy@goop.org Cc: ian.campbell@citrix.com LKML-Reference: <1239199761-22886-8-git-send-email-galak@kernel.crashing.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-swiotlb.c | 2 +- include/linux/swiotlb.h | 3 ++- lib/swiotlb.c | 10 +++++----- 3 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/pci-swiotlb.c b/arch/x86/kernel/pci-swiotlb.c index 34f12e9996e..887388a1c57 100644 --- a/arch/x86/kernel/pci-swiotlb.c +++ b/arch/x86/kernel/pci-swiotlb.c @@ -28,7 +28,7 @@ dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index ac9ff54f7cb..cb1a6631b8f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -29,7 +29,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t address); -extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(struct device *hwdev, + dma_addr_t address); extern int swiotlb_arch_range_needs_mapping(phys_addr_t paddr, size_t size); diff --git a/lib/swiotlb.c b/lib/swiotlb.c index d912f068145..bffe6d7ef9d 100644 --- a/lib/swiotlb.c +++ b/lib/swiotlb.c @@ -129,7 +129,7 @@ dma_addr_t __weak swiotlb_phys_to_bus(struct device *hwdev, phys_addr_t paddr) return paddr; } -phys_addr_t __weak swiotlb_bus_to_phys(dma_addr_t baddr) +phys_addr_t __weak swiotlb_bus_to_phys(struct device *hwdev, dma_addr_t baddr) { return baddr; } @@ -140,9 +140,9 @@ static dma_addr_t swiotlb_virt_to_bus(struct device *hwdev, return swiotlb_phys_to_bus(hwdev, virt_to_phys(address)); } -static void *swiotlb_bus_to_virt(dma_addr_t address) +void * __weak swiotlb_bus_to_virt(struct device *hwdev, dma_addr_t address) { - return phys_to_virt(swiotlb_bus_to_phys(address)); + return phys_to_virt(swiotlb_bus_to_phys(hwdev, address)); } int __weak swiotlb_arch_address_needs_mapping(struct device *hwdev, @@ -691,7 +691,7 @@ EXPORT_SYMBOL_GPL(swiotlb_map_page); static void unmap_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); @@ -728,7 +728,7 @@ static void swiotlb_sync_single(struct device *hwdev, dma_addr_t dev_addr, size_t size, int dir, int target) { - char *dma_addr = swiotlb_bus_to_virt(dev_addr); + char *dma_addr = swiotlb_bus_to_virt(hwdev, dev_addr); BUG_ON(dir == DMA_NONE); From a4e94ef0dd391eae05bdeacd12b8da3510957a97 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 27 Mar 2009 17:07:05 +0800 Subject: [PATCH 0303/2061] printk: add support of hh length modifier for printk Impact: new feature, extend vsprintf format strings hh is used as length modifier for signed char or unsigned char. It is supported by glibc, we add kernel support now. Signed-off-by: Zhao Lei Acked-by: Lai Jiangshan Acked-by: Frederic Weisbecker Cc: torvalds@linux-foundation.org Cc: Steven Rostedt LKML-Reference: <49CC9739.30107@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 37 ++++++++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index 7536acea135..b56f6d039d2 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -408,6 +408,8 @@ enum format_type { FORMAT_TYPE_LONG_LONG, FORMAT_TYPE_ULONG, FORMAT_TYPE_LONG, + FORMAT_TYPE_UBYTE, + FORMAT_TYPE_BYTE, FORMAT_TYPE_USHORT, FORMAT_TYPE_SHORT, FORMAT_TYPE_UINT, @@ -853,11 +855,15 @@ qualifier: spec->qualifier = -1; if (*fmt == 'h' || *fmt == 'l' || *fmt == 'L' || *fmt == 'Z' || *fmt == 'z' || *fmt == 't') { - spec->qualifier = *fmt; - ++fmt; - if (spec->qualifier == 'l' && *fmt == 'l') { - spec->qualifier = 'L'; - ++fmt; + spec->qualifier = *fmt++; + if (unlikely(spec->qualifier == *fmt)) { + if (spec->qualifier == 'l') { + spec->qualifier = 'L'; + ++fmt; + } else if (spec->qualifier == 'h') { + spec->qualifier = 'H'; + ++fmt; + } } } @@ -919,6 +925,11 @@ qualifier: spec->type = FORMAT_TYPE_SIZE_T; } else if (spec->qualifier == 't') { spec->type = FORMAT_TYPE_PTRDIFF; + } else if (spec->qualifier == 'H') { + if (spec->flags & SIGN) + spec->type = FORMAT_TYPE_BYTE; + else + spec->type = FORMAT_TYPE_UBYTE; } else if (spec->qualifier == 'h') { if (spec->flags & SIGN) spec->type = FORMAT_TYPE_SHORT; @@ -1087,6 +1098,12 @@ int vsnprintf(char *buf, size_t size, const char *fmt, va_list args) case FORMAT_TYPE_PTRDIFF: num = va_arg(args, ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = (unsigned char) va_arg(args, int); + break; + case FORMAT_TYPE_BYTE: + num = (signed char) va_arg(args, int); + break; case FORMAT_TYPE_USHORT: num = (unsigned short) va_arg(args, int); break; @@ -1363,6 +1380,10 @@ do { \ case FORMAT_TYPE_PTRDIFF: save_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + case FORMAT_TYPE_BYTE: + save_arg(char); + break; case FORMAT_TYPE_USHORT: case FORMAT_TYPE_SHORT: save_arg(short); @@ -1538,6 +1559,12 @@ int bstr_printf(char *buf, size_t size, const char *fmt, const u32 *bin_buf) case FORMAT_TYPE_PTRDIFF: num = get_arg(ptrdiff_t); break; + case FORMAT_TYPE_UBYTE: + num = get_arg(unsigned char); + break; + case FORMAT_TYPE_BYTE: + num = get_arg(signed char); + break; case FORMAT_TYPE_USHORT: num = get_arg(unsigned short); break; From 7333a8003cdc0470e8c0ae8b949cbc44f3165ff3 Mon Sep 17 00:00:00 2001 From: Hidetoshi Seto Date: Wed, 25 Mar 2009 10:50:34 +0900 Subject: [PATCH 0304/2061] x86: smarten /proc/interrupts output for new counters Now /proc/interrupts of tip tree has new counters: CNT: Performance counter interrupts Format change of output, as like that by commit: commit 7a81d9a7da03d2f27840d659f97ef140d032f609 x86: smarten /proc/interrupts output should be applied to these new counters too. Signed-off-by: Hidetoshi Seto Cc: Jan Beulich LKML-Reference: <49C98DEA.8060208@jp.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index d465487da58..dccaaa85578 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -63,7 +63,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->irq_spurious_count); seq_printf(p, " Spurious interrupts\n"); - seq_printf(p, "CNT: "); + seq_printf(p, "%*s: ", prec, "CNT"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); From ae9e6bc9f74f8247cbca50a6a93c80e0d686fa19 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 2 Apr 2009 13:19:54 +0900 Subject: [PATCH 0305/2061] percpu: don't put the first chunk in reverse-map rbtree Impact: both first chunks don't use rbtree, no functional change There can be two first chunks - reserved and dynamic with the former one being optional. Dynamic first chunk was linked on reverse-mapping rbtree while the reserved one was mapped manually using the start address and reserved offset limit. This patch makes both first chunks to be looked up manually without using the rbtree. This is to help getting rid of the rbtree. Signed-off-by: Tejun Heo Cc: Martin Schwidefsky Cc: rusty@rustcorp.com.au Cc: Paul Mundt Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: ralf@linux-mips.org Cc: davem@davemloft.net Cc: cooloney@kernel.org Cc: kyle@mcmartin.ca Cc: matthew@wil.cx Cc: grundler@parisc-linux.org Cc: takata@linux-m32r.org Cc: benh@kernel.crashing.org Cc: rth@twiddle.net Cc: ink@jurassic.park.msu.ru Cc: heiko.carstens@de.ibm.com Cc: Linus Torvalds Cc: Nick Piggin Cc: Christoph Lameter LKML-Reference: <49D43CEA.3040609@kernel.org> Signed-off-by: Ingo Molnar --- mm/percpu.c | 41 ++++++++++++++++++++++++----------------- 1 file changed, 24 insertions(+), 17 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 1aa5d8fbca1..bf1bf1f4a72 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -110,9 +110,21 @@ static size_t pcpu_chunk_struct_size __read_mostly; void *pcpu_base_addr __read_mostly; EXPORT_SYMBOL_GPL(pcpu_base_addr); -/* optional reserved chunk, only accessible for reserved allocations */ +/* + * The first chunk which always exists. Note that unlike other + * chunks, this one can be allocated and mapped in several different + * ways and thus often doesn't live in the vmalloc area. + */ +static struct pcpu_chunk *pcpu_first_chunk; + +/* + * Optional reserved chunk. This chunk reserves part of the first + * chunk and serves it for reserved allocations. The amount of + * reserved offset is in pcpu_reserved_chunk_limit. When reserved + * area doesn't exist, the following variables contain NULL and 0 + * respectively. + */ static struct pcpu_chunk *pcpu_reserved_chunk; -/* offset limit of the reserved chunk */ static int pcpu_reserved_chunk_limit; /* @@ -297,15 +309,16 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, */ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { + void *first_start = pcpu_first_chunk->vm->addr; struct rb_node *n, *parent; struct pcpu_chunk *chunk; - /* is it in the reserved chunk? */ - if (pcpu_reserved_chunk) { - void *start = pcpu_reserved_chunk->vm->addr; - - if (addr >= start && addr < start + pcpu_reserved_chunk_limit) + /* is it in the first chunk? */ + if (addr >= first_start && addr < first_start + pcpu_chunk_size) { + /* is it in the reserved area? */ + if (addr < first_start + pcpu_reserved_chunk_limit) return pcpu_reserved_chunk; + return pcpu_first_chunk; } /* nah... search the regular ones */ @@ -1147,7 +1160,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (reserved_size) { schunk->free_size = reserved_size; - pcpu_reserved_chunk = schunk; /* not for dynamic alloc */ + pcpu_reserved_chunk = schunk; + pcpu_reserved_chunk_limit = static_size + reserved_size; } else { schunk->free_size = dyn_size; dyn_size = 0; /* dynamic area covered */ @@ -1158,8 +1172,6 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, if (schunk->free_size) schunk->map[schunk->map_used++] = schunk->free_size; - pcpu_reserved_chunk_limit = static_size + schunk->free_size; - /* init dynamic chunk if necessary */ if (dyn_size) { dchunk = alloc_bootmem(sizeof(struct pcpu_chunk)); @@ -1226,13 +1238,8 @@ size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn, } /* link the first chunk in */ - if (!dchunk) { - pcpu_chunk_relocate(schunk, -1); - pcpu_chunk_addr_insert(schunk); - } else { - pcpu_chunk_relocate(dchunk, -1); - pcpu_chunk_addr_insert(dchunk); - } + pcpu_first_chunk = dchunk ?: schunk; + pcpu_chunk_relocate(pcpu_first_chunk, -1); /* we're done */ pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0); From e1b9aa3f47242e757c776a3771bb6613e675bf9c Mon Sep 17 00:00:00 2001 From: Christoph Lameter Date: Thu, 2 Apr 2009 13:21:44 +0900 Subject: [PATCH 0306/2061] percpu: remove rbtree and use page->index instead Impact: use page->index for addr to chunk mapping instead of dedicated rbtree The rbtree is used to determine the chunk from the virtual address. However, we can already determine the page struct from a virtual address and there are several unused fields in page struct used by vmalloc. Use the index field to store a pointer to the chunk. Then there is no need anymore for an rbtree. tj: * s/(set|get)_chunk/pcpu_\1_page_chunk/ * Drop inline from the above two functions and moved them upwards so that they are with other simple helpers. * Initial pages might not (actually most of the time don't) live in the vmalloc area. With the previous patch to manually reverse-map both first chunks, this is no longer an issue. Removed pcpu_set_chunk() call on initial pages. Signed-off-by: Christoph Lameter Signed-off-by: Tejun Heo Cc: Martin Schwidefsky Cc: rusty@rustcorp.com.au Cc: Paul Mundt Cc: rmk@arm.linux.org.uk Cc: starvik@axis.com Cc: ralf@linux-mips.org Cc: davem@davemloft.net Cc: cooloney@kernel.org Cc: kyle@mcmartin.ca Cc: matthew@wil.cx Cc: grundler@parisc-linux.org Cc: takata@linux-m32r.org Cc: benh@kernel.crashing.org Cc: rth@twiddle.net Cc: ink@jurassic.park.msu.ru Cc: heiko.carstens@de.ibm.com Cc: Linus Torvalds Cc: Nick Piggin LKML-Reference: <49D43D58.4050102@kernel.org> Signed-off-by: Ingo Molnar --- mm/percpu.c | 100 +++++++++++----------------------------------------- 1 file changed, 20 insertions(+), 80 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index bf1bf1f4a72..c0b2c1a76e8 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -23,7 +23,7 @@ * Allocation is done in offset-size areas of single unit space. Ie, * an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0, * c1:u1, c1:u2 and c1:u3. Percpu access can be done by configuring - * percpu base registers UNIT_SIZE apart. + * percpu base registers pcpu_unit_size apart. * * There are usually many small percpu allocations many of them as * small as 4 bytes. The allocator organizes chunks into lists @@ -38,8 +38,8 @@ * region and negative allocated. Allocation inside a chunk is done * by scanning this map sequentially and serving the first matching * entry. This is mostly copied from the percpu_modalloc() allocator. - * Chunks are also linked into a rb tree to ease address to chunk - * mapping during free. + * Chunks can be determined from the address using the index field + * in the page struct. The index field contains a pointer to the chunk. * * To use this allocator, arch code should do the followings. * @@ -61,7 +61,6 @@ #include #include #include -#include #include #include #include @@ -88,7 +87,6 @@ struct pcpu_chunk { struct list_head list; /* linked to pcpu_slot lists */ - struct rb_node rb_node; /* key is chunk->vm->addr */ int free_size; /* free bytes in the chunk */ int contig_hint; /* max contiguous size hint */ struct vm_struct *vm; /* mapped vmalloc region */ @@ -133,7 +131,7 @@ static int pcpu_reserved_chunk_limit; * There are two locks - pcpu_alloc_mutex and pcpu_lock. The former * protects allocation/reclaim paths, chunks and chunk->page arrays. * The latter is a spinlock and protects the index data structures - - * chunk slots, rbtree, chunks and area maps in chunks. + * chunk slots, chunks and area maps in chunks. * * During allocation, pcpu_alloc_mutex is kept locked all the time and * pcpu_lock is grabbed and released as necessary. All actual memory @@ -152,7 +150,6 @@ static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */ static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */ static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */ -static struct rb_root pcpu_addr_root = RB_ROOT; /* chunks by address */ /* reclaim work to release fully free chunks, scheduled from free path */ static void pcpu_reclaim(struct work_struct *work); @@ -203,6 +200,18 @@ static bool pcpu_chunk_page_occupied(struct pcpu_chunk *chunk, return *pcpu_chunk_pagep(chunk, 0, page_idx) != NULL; } +/* set the pointer to a chunk in a page struct */ +static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu) +{ + page->index = (unsigned long)pcpu; +} + +/* obtain pointer to a chunk from a page struct */ +static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page) +{ + return (struct pcpu_chunk *)page->index; +} + /** * pcpu_mem_alloc - allocate memory * @size: bytes to allocate @@ -269,40 +278,9 @@ static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot) } } -static struct rb_node **pcpu_chunk_rb_search(void *addr, - struct rb_node **parentp) -{ - struct rb_node **p = &pcpu_addr_root.rb_node; - struct rb_node *parent = NULL; - struct pcpu_chunk *chunk; - - while (*p) { - parent = *p; - chunk = rb_entry(parent, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) - p = &(*p)->rb_left; - else if (addr > chunk->vm->addr) - p = &(*p)->rb_right; - else - break; - } - - if (parentp) - *parentp = parent; - return p; -} - /** - * pcpu_chunk_addr_search - search for chunk containing specified address - * @addr: address to search for - * - * Look for chunk which might contain @addr. More specifically, it - * searchs for the chunk with the highest start address which isn't - * beyond @addr. - * - * CONTEXT: - * pcpu_lock. + * pcpu_chunk_addr_search - determine chunk containing specified address + * @addr: address for which the chunk needs to be determined. * * RETURNS: * The address of the found chunk. @@ -310,8 +288,6 @@ static struct rb_node **pcpu_chunk_rb_search(void *addr, static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) { void *first_start = pcpu_first_chunk->vm->addr; - struct rb_node *n, *parent; - struct pcpu_chunk *chunk; /* is it in the first chunk? */ if (addr >= first_start && addr < first_start + pcpu_chunk_size) { @@ -321,42 +297,7 @@ static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr) return pcpu_first_chunk; } - /* nah... search the regular ones */ - n = *pcpu_chunk_rb_search(addr, &parent); - if (!n) { - /* no exactly matching chunk, the parent is the closest */ - n = parent; - BUG_ON(!n); - } - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - - if (addr < chunk->vm->addr) { - /* the parent was the next one, look for the previous one */ - n = rb_prev(n); - BUG_ON(!n); - chunk = rb_entry(n, struct pcpu_chunk, rb_node); - } - - return chunk; -} - -/** - * pcpu_chunk_addr_insert - insert chunk into address rb tree - * @new: chunk to insert - * - * Insert @new into address rb tree. - * - * CONTEXT: - * pcpu_lock. - */ -static void pcpu_chunk_addr_insert(struct pcpu_chunk *new) -{ - struct rb_node **p, *parent; - - p = pcpu_chunk_rb_search(new->vm->addr, &parent); - BUG_ON(*p); - rb_link_node(&new->rb_node, parent, p); - rb_insert_color(&new->rb_node, &pcpu_addr_root); + return pcpu_get_page_chunk(vmalloc_to_page(addr)); } /** @@ -768,6 +709,7 @@ static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size) alloc_mask, 0); if (!*pagep) goto err; + pcpu_set_page_chunk(*pagep, chunk); } } @@ -892,7 +834,6 @@ restart: spin_lock_irq(&pcpu_lock); pcpu_chunk_relocate(chunk, -1); - pcpu_chunk_addr_insert(chunk); goto restart; area_found: @@ -981,7 +922,6 @@ static void pcpu_reclaim(struct work_struct *work) if (chunk == list_first_entry(head, struct pcpu_chunk, list)) continue; - rb_erase(&chunk->rb_node, &pcpu_addr_root); list_move(&chunk->list, &todo); } From e30e08f65c7ef6c230424264f09c3d53f117f58b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:25 +0200 Subject: [PATCH 0307/2061] perf_counter: fix NMI race in task clock We should not be updating ctx->time from NMI context, work around that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.681326666@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 863703b3158..84a39081344 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -319,8 +319,6 @@ static void __perf_counter_disable(void *info) spin_lock_irqsave(&ctx->lock, flags); - update_context_time(ctx); - /* * If the counter is on, turn it off. * If it is in error state, leave it in error state. @@ -2335,13 +2333,11 @@ static const struct hw_perf_counter_ops perf_ops_cpu_clock = { * Software counter: task time clock */ -static void task_clock_perf_counter_update(struct perf_counter *counter) +static void task_clock_perf_counter_update(struct perf_counter *counter, u64 now) { - u64 prev, now; + u64 prev; s64 delta; - now = counter->ctx->time; - prev = atomic64_xchg(&counter->hw.prev_count, now); delta = now - prev; atomic64_add(delta, &counter->count); @@ -2369,13 +2365,24 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { hrtimer_cancel(&counter->hw.hrtimer); - task_clock_perf_counter_update(counter); + task_clock_perf_counter_update(counter, counter->ctx->time); + } static void task_clock_perf_counter_read(struct perf_counter *counter) { - update_context_time(counter->ctx); - task_clock_perf_counter_update(counter); + u64 time; + + if (!in_nmi()) { + update_context_time(counter->ctx); + time = counter->ctx->time; + } else { + u64 now = perf_clock(); + u64 delta = now - counter->ctx->timestamp; + time = counter->ctx->time + delta; + } + + task_clock_perf_counter_update(counter, time); } static const struct hw_perf_counter_ops perf_ops_task_clock = { From 6fab01927e8bdbbc77bafba2abb4810c5591ad52 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:26 +0200 Subject: [PATCH 0308/2061] perf_counter: provide misc bits in the event header Limit the size of each record to 64k (or should we count in multiples of u64 and have a 512K limit?), this gives 16 bits or spare room in the header, which we can use for misc bits, so as to not have to grow the record with u64 every time we have a few bits to report. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.769271806@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 6 +++++- kernel/perf_counter.c | 3 +++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 7f5d353d78a..5bd8817b12d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,9 +201,13 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) + struct perf_event_header { __u32 type; - __u32 size; + __u16 misc; + __u16 size; }; enum perf_event_type { diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 84a39081344..4af98f943d3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1831,6 +1831,9 @@ static void perf_counter_output(struct perf_counter *counter, header.type = PERF_EVENT_COUNTER_OVERFLOW; header.size = sizeof(header); + header.misc = user_mode(regs) ? + PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); header.type |= __PERF_EVENT_IP; From 6b6e5486b3a168f0328c82a8d4376caf901472b1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:27 +0200 Subject: [PATCH 0309/2061] perf_counter: use misc field to widen type Push the PERF_EVENT_COUNTER_OVERFLOW bit into the misc field so that we can have the full 32bit for PERF_RECORD_ bits. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.891867663@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 28 ++++++++++------------------ kernel/perf_counter.c | 15 ++++++++------- 2 files changed, 18 insertions(+), 25 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 5bd8817b12d..4809ae18a94 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -201,8 +201,9 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; -#define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { __u32 type; @@ -230,36 +231,27 @@ enum perf_event_type { PERF_EVENT_MUNMAP = 2, /* - * Half the event type space is reserved for the counter overflow - * bitfields, as found in hw_event.record_type. - * - * These events will have types of the form: - * PERF_EVENT_COUNTER_OVERFLOW { | __PERF_EVENT_* } * + * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * will be PERF_RECORD_* * * struct { * struct perf_event_header header; * - * { u64 ip; } && __PERF_EVENT_IP - * { u32 pid, tid; } && __PERF_EVENT_TID + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID * * { u64 nr; - * { u64 event, val; } cnt[nr]; } && __PERF_EVENT_GROUP + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * * { u16 nr, * hv, * kernel, * user; - * u64 ips[nr]; } && __PERF_EVENT_CALLCHAIN + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * - * { u64 time; } && __PERF_EVENT_TIME + * { u64 time; } && PERF_RECORD_TIME * }; */ - PERF_EVENT_COUNTER_OVERFLOW = 1UL << 31, - __PERF_EVENT_IP = PERF_RECORD_IP, - __PERF_EVENT_TID = PERF_RECORD_TID, - __PERF_EVENT_GROUP = PERF_RECORD_GROUP, - __PERF_EVENT_CALLCHAIN = PERF_RECORD_CALLCHAIN, - __PERF_EVENT_TIME = PERF_RECORD_TIME, }; #ifdef __KERNEL__ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4af98f943d3..bf12df6f353 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1828,15 +1828,16 @@ static void perf_counter_output(struct perf_counter *counter, int callchain_size = 0; u64 time; - header.type = PERF_EVENT_COUNTER_OVERFLOW; + header.type = 0; header.size = sizeof(header); - header.misc = user_mode(regs) ? + header.misc = PERF_EVENT_MISC_OVERFLOW; + header.misc |= user_mode(regs) ? PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; if (record_type & PERF_RECORD_IP) { ip = instruction_pointer(regs); - header.type |= __PERF_EVENT_IP; + header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } @@ -1845,12 +1846,12 @@ static void perf_counter_output(struct perf_counter *counter, tid_entry.pid = current->group_leader->pid; tid_entry.tid = current->pid; - header.type |= __PERF_EVENT_TID; + header.type |= PERF_RECORD_TID; header.size += sizeof(tid_entry); } if (record_type & PERF_RECORD_GROUP) { - header.type |= __PERF_EVENT_GROUP; + header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + counter->nr_siblings * sizeof(group_entry); } @@ -1861,7 +1862,7 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) { callchain_size = (1 + callchain->nr) * sizeof(u64); - header.type |= __PERF_EVENT_CALLCHAIN; + header.type |= PERF_RECORD_CALLCHAIN; header.size += callchain_size; } } @@ -1872,7 +1873,7 @@ static void perf_counter_output(struct perf_counter *counter, */ time = sched_clock(); - header.type |= __PERF_EVENT_TIME; + header.type |= PERF_RECORD_TIME; header.size += sizeof(u64); } From 808382b33bb4c60df6379ec2db39f332cc56b82a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:28 +0200 Subject: [PATCH 0310/2061] perf_counter: kerneltop: keep up with ABI changes Update kerneltop to use PERF_EVENT_MISC_OVERFLOW Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130408.947197470@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/kerneltop.c | 32 +++++++++++++------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c index 15f3a5f9019..042c1b83a87 100644 --- a/Documentation/perf_counter/kerneltop.c +++ b/Documentation/perf_counter/kerneltop.c @@ -1277,22 +1277,22 @@ static void mmap_read(struct mmap_data *md) old += size; - switch (event->header.type) { - case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP: - case PERF_EVENT_COUNTER_OVERFLOW | __PERF_EVENT_IP | __PERF_EVENT_TID: - process_event(event->ip.ip, md->counter); - break; - - case PERF_EVENT_MMAP: - case PERF_EVENT_MUNMAP: - printf("%s: %Lu %Lu %Lu %s\n", - event->header.type == PERF_EVENT_MMAP - ? "mmap" : "munmap", - event->mmap.start, - event->mmap.len, - event->mmap.pgoff, - event->mmap.filename); - break; + if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { + if (event->header.type & PERF_RECORD_IP) + process_event(event->ip.ip, md->counter); + } else { + switch (event->header.type) { + case PERF_EVENT_MMAP: + case PERF_EVENT_MUNMAP: + printf("%s: %Lu %Lu %Lu %s\n", + event->header.type == PERF_EVENT_MMAP + ? "mmap" : "munmap", + event->mmap.start, + event->mmap.len, + event->mmap.pgoff, + event->mmap.filename); + break; + } } } From 8740f9418c78dcad694b46ab25d1645d5aef1f5e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:29 +0200 Subject: [PATCH 0311/2061] perf_counter: add some comments Add a few comments because I was forgetting what field what for what functionality. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.036984214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4809ae18a94..8bf764fc622 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -344,10 +344,12 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; - int nr_pages; - atomic_t wakeup; - atomic_t head; - atomic_t events; + int nr_pages; /* nr of data pages */ + + atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t head; /* write position */ + atomic_t events; /* event limit */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; From 8d1b2d9361b494bfc761700c348c65ebbe3deb5b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:30 +0200 Subject: [PATCH 0312/2061] perf_counter: track task-comm data Similar to the mmap data stream, add one that tracks the task COMM field, so that the userspace reporting knows what to call a task. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.127422406@chello.nl> Signed-off-by: Ingo Molnar --- fs/exec.c | 1 + include/linux/perf_counter.h | 16 ++++++- kernel/perf_counter.c | 93 ++++++++++++++++++++++++++++++++++++ 3 files changed, 109 insertions(+), 1 deletion(-) diff --git a/fs/exec.c b/fs/exec.c index e015c0b5a08..bf47ed0278f 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -951,6 +951,7 @@ void set_task_comm(struct task_struct *tsk, char *buf) task_lock(tsk); strlcpy(tsk->comm, buf, sizeof(tsk->comm)); task_unlock(tsk); + perf_counter_comm(tsk); } int flush_old_exec(struct linux_binprm * bprm) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bf764fc622..a70a55f2759 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -142,8 +142,9 @@ struct perf_counter_hw_event { exclude_idle : 1, /* don't count when idle */ mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ + comm : 1, /* include comm data */ - __reserved_1 : 53; + __reserved_1 : 52; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -230,6 +231,16 @@ enum perf_event_type { PERF_EVENT_MMAP = 1, PERF_EVENT_MUNMAP = 2, + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -545,6 +556,8 @@ extern void perf_counter_mmap(unsigned long addr, unsigned long len, extern void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); +extern void perf_counter_comm(struct task_struct *tsk); + #define MAX_STACK_DEPTH 255 struct perf_callchain_entry { @@ -583,6 +596,7 @@ static inline void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { } +static inline void perf_counter_comm(struct task_struct *tsk) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index bf12df6f353..2d4aebb2982 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1916,6 +1916,99 @@ static void perf_counter_output(struct perf_counter *counter, perf_output_end(&handle); } +/* + * comm tracking + */ + +struct perf_comm_event { + struct task_struct *task; + char *comm; + int comm_size; + + struct { + struct perf_event_header header; + + u32 pid; + u32 tid; + } event; +}; + +static void perf_counter_comm_output(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + struct perf_output_handle handle; + int size = comm_event->event.header.size; + int ret = perf_output_begin(&handle, counter, size, 0, 0); + + if (ret) + return; + + perf_output_put(&handle, comm_event->event); + perf_output_copy(&handle, comm_event->comm, + comm_event->comm_size); + perf_output_end(&handle); +} + +static int perf_counter_comm_match(struct perf_counter *counter, + struct perf_comm_event *comm_event) +{ + if (counter->hw_event.comm && + comm_event->event.header.type == PERF_EVENT_COMM) + return 1; + + return 0; +} + +static void perf_counter_comm_ctx(struct perf_counter_context *ctx, + struct perf_comm_event *comm_event) +{ + struct perf_counter *counter; + + if (system_state != SYSTEM_RUNNING || list_empty(&ctx->event_list)) + return; + + rcu_read_lock(); + list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { + if (perf_counter_comm_match(counter, comm_event)) + perf_counter_comm_output(counter, comm_event); + } + rcu_read_unlock(); +} + +static void perf_counter_comm_event(struct perf_comm_event *comm_event) +{ + struct perf_cpu_context *cpuctx; + unsigned int size; + char *comm = comm_event->task->comm; + + size = ALIGN(strlen(comm), sizeof(u64)); + + comm_event->comm = comm; + comm_event->comm_size = size; + + comm_event->event.header.size = sizeof(comm_event->event) + size; + + cpuctx = &get_cpu_var(perf_cpu_context); + perf_counter_comm_ctx(&cpuctx->ctx, comm_event); + put_cpu_var(perf_cpu_context); + + perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); +} + +void perf_counter_comm(struct task_struct *task) +{ + struct perf_comm_event comm_event = { + .task = task, + .event = { + .header = { .type = PERF_EVENT_COMM, }, + .pid = task->group_leader->pid, + .tid = task->pid, + }, + }; + + perf_counter_comm_event(&comm_event); +} + /* * mmap tracking */ From de9ac07bbf8f51e0ce40e5428c3a8f627bd237c2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:31 +0200 Subject: [PATCH 0313/2061] perf_counter: some simple userspace profiling # perf-record make -j4 kernel/ # perf-report | tail -15 0.39 cc1 [kernel] lock_acquired 0.42 cc1 [kernel] lock_acquire 0.51 cc1 [ user ] /lib64/libc-2.8.90.so: _int_free 0.51 as [kernel] clear_page_c 0.53 cc1 [ user ] /lib64/libc-2.8.90.so: memcpy 0.56 cc1 [ user ] /lib64/libc-2.8.90.so: _IO_vfprintf 0.63 cc1 [kernel] lock_release 0.67 cc1 [ user ] /lib64/libc-2.8.90.so: strlen 0.68 cc1 [kernel] debug_smp_processor_id 1.38 cc1 [ user ] /lib64/libc-2.8.90.so: _int_malloc 1.55 cc1 [ user ] /lib64/libc-2.8.90.so: memset 1.77 cc1 [kernel] __lock_acquire 1.88 cc1 [kernel] clear_page_c 3.61 as [ user ] /usr/bin/as: 59.16 cc1 [ user ] /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090408130409.220518450@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 8 +- Documentation/perf_counter/perf-record.c | 530 ++++++++++++++++++++++ Documentation/perf_counter/perf-report.cc | 472 +++++++++++++++++++ 3 files changed, 1009 insertions(+), 1 deletion(-) create mode 100644 Documentation/perf_counter/perf-record.c create mode 100644 Documentation/perf_counter/perf-report.cc diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 194b6621558..1dd37ee7dbd 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -1,10 +1,16 @@ -BINS = kerneltop perfstat +BINS = kerneltop perfstat perf-record perf-report all: $(BINS) kerneltop: kerneltop.c ../../include/linux/perf_counter.h cc -O6 -Wall -lrt -o $@ $< +perf-record: perf-record.c ../../include/linux/perf_counter.h + cc -O6 -Wall -lrt -o $@ $< + +perf-report: perf-report.cc ../../include/linux/perf_counter.h + g++ -O6 -Wall -lrt -o $@ $< + perfstat: kerneltop ln -sf kerneltop perfstat diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/perf-record.c new file mode 100644 index 00000000000..614de7c468b --- /dev/null +++ b/Documentation/perf_counter/perf-record.c @@ -0,0 +1,530 @@ + + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../include/linux/perf_counter.h" + + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +#ifdef __x86_64__ +#define __NR_perf_counter_open 295 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __i386__ +#define __NR_perf_counter_open 333 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +asmlinkage int sys_perf_counter_open( + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + return syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +} + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) + +static int nr_counters = 0; +static __u64 event_id[MAX_COUNTERS] = { }; +static int default_interval = 100000; +static int event_count[MAX_COUNTERS]; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; +static int nr_cpus = 0; +static unsigned int page_size; +static unsigned int mmap_pages = 16; +static int output; +static char *output_name = "output.perf"; +static int group = 0; +static unsigned int realtime_prio = 0; + +const unsigned int default_count[] = { + 1000000, + 1000000, + 10000, + 10000, + 1000000, + 10000, +}; + +static char *hw_event_names[] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names[] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", + "minor faults", + "major faults", +}; + +struct event_symbol { + __u64 event; + char *symbol; +}; + +static struct event_symbol event_symbols[] = { + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, + + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, +}; + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static __u64 match_event_symbols(char *str) +{ + __u64 config, id; + int type; + unsigned int i; + + if (sscanf(str, "r%llx", &config) == 1) + return config | PERF_COUNTER_RAW_MASK; + + if (sscanf(str, "%d:%llu", &type, &id) == 2) + return EID(type, id); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return ~0ULL; +} + +static int parse_events(char *str) +{ + __u64 config; + +again: + if (nr_counters == MAX_COUNTERS) + return -1; + + config = match_event_symbols(str); + if (config == ~0ULL) + return -1; + + event_id[nr_counters] = config; + nr_counters++; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } + + return 0; +} + +#define __PERF_COUNTER_FIELD(config, name) \ + ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) +#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) +#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) +#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) + +static void display_events_help(void) +{ + unsigned int i; + __u64 e; + + printf( + " -e EVENT --event=EVENT # symbolic-name abbreviations"); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + int type, id; + + e = event_symbols[i].event; + type = PERF_COUNTER_TYPE(e); + id = PERF_COUNTER_ID(e); + + printf("\n %d:%d: %-20s", + type, id, event_symbols[i].symbol); + } + + printf("\n" + " rNNN: raw PMU events (eventsel+umask)\n\n"); +} + +static void display_help(void) +{ + printf( + "Usage: perf-record []\n" + "perf-record Options (up to %d event types can be specified at once):\n\n", + MAX_COUNTERS); + + display_events_help(); + + printf( + " -c CNT --count=CNT # event period to sample\n" + " -m pages --mmap_pages= # number of mmap data pages\n" + " -o file --output= # output file\n" + " -r prio --realtime= # use RT prio\n" + ); + + exit(0); +} + +static void process_options(int argc, char *argv[]) +{ + int error = 0, counter; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"count", required_argument, NULL, 'c'}, + {"event", required_argument, NULL, 'e'}, + {"mmap_pages", required_argument, NULL, 'm'}, + {"output", required_argument, NULL, 'o'}, + {"realtime", required_argument, NULL, 'r'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:c:e:m:o:r:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'c': default_interval = atoi(optarg); break; + case 'e': error = parse_events(optarg); break; + case 'm': mmap_pages = atoi(optarg); break; + case 'o': output_name = strdup(optarg); break; + case 'r': realtime_prio = atoi(optarg); break; + default: error = 1; break; + } + } + if (error) + display_help(); + + if (!nr_counters) { + nr_counters = 1; + event_id[0] = 0; + } + + for (counter = 0; counter < nr_counters; counter++) { + if (event_count[counter]) + continue; + + event_count[counter] = default_interval; + } +} + +struct mmap_data { + int counter; + void *base; + unsigned int mask; + unsigned int prev; +}; + +static unsigned int mmap_read_head(struct mmap_data *md) +{ + struct perf_counter_mmap_page *pc = md->base; + int head; + + head = pc->data_head; + rmb(); + + return head; +} + +static long events; +static struct timeval last_read, this_read; + +static void mmap_read(struct mmap_data *md) +{ + unsigned int head = mmap_read_head(md); + unsigned int old = md->prev; + unsigned char *data = md->base + page_size; + unsigned long size; + void *buf; + int diff; + + gettimeofday(&this_read, NULL); + + /* + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and screw up the events under us. + * + * If we somehow ended up ahead of the head, we got messed up. + * + * In either case, truncate and restart at head. + */ + diff = head - old; + if (diff > md->mask / 2 || diff < 0) { + struct timeval iv; + unsigned long msecs; + + timersub(&this_read, &last_read, &iv); + msecs = iv.tv_sec*1000 + iv.tv_usec/1000; + + fprintf(stderr, "WARNING: failed to keep up with mmap data." + " Last read %lu msecs ago.\n", msecs); + + /* + * head points to a known good entry, start there. + */ + old = head; + } + + last_read = this_read; + + if (old != head) + events++; + + size = head - old; + + if ((old & md->mask) + size != (head & md->mask)) { + buf = &data[old & md->mask]; + size = md->mask + 1 - (old & md->mask); + old += size; + while (size) { + int ret = write(output, buf, size); + if (ret < 0) { + perror("failed to write"); + exit(-1); + } + size -= ret; + buf += ret; + } + } + + buf = &data[old & md->mask]; + size = head - old; + old += size; + while (size) { + int ret = write(output, buf, size); + if (ret < 0) { + perror("failed to write"); + exit(-1); + } + size -= ret; + buf += ret; + } + + md->prev = old; +} + +static volatile int done = 0; + +static void sigchld_handler(int sig) +{ + if (sig == SIGCHLD) + done = 1; +} + +int main(int argc, char *argv[]) +{ + struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; + struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + struct perf_counter_hw_event hw_event; + int i, counter, group_fd, nr_poll = 0; + pid_t pid; + int ret; + + page_size = sysconf(_SC_PAGE_SIZE); + + process_options(argc, argv); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + assert(nr_cpus <= MAX_NR_CPUS); + assert(nr_cpus >= 0); + + output = open(output_name, O_CREAT|O_RDWR, S_IRWXU); + if (output < 0) { + perror("failed to create output file"); + exit(-1); + } + + argc -= optind; + argv += optind; + + for (i = 0; i < nr_cpus; i++) { + group_fd = -1; + for (counter = 0; counter < nr_counters; counter++) { + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.config = event_id[counter]; + hw_event.irq_period = event_count[counter]; + hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; + hw_event.nmi = 1; + hw_event.mmap = 1; + hw_event.comm = 1; + + fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0); + if (fd[i][counter] < 0) { + int err = errno; + printf("kerneltop error: syscall returned with %d (%s)\n", + fd[i][counter], strerror(err)); + if (err == EPERM) + printf("Are you root?\n"); + exit(-1); + } + assert(fd[i][counter] >= 0); + fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); + + /* + * First counter acts as the group leader: + */ + if (group && group_fd == -1) + group_fd = fd[i][counter]; + + event_array[nr_poll].fd = fd[i][counter]; + event_array[nr_poll].events = POLLIN; + nr_poll++; + + mmap_array[i][counter].counter = counter; + mmap_array[i][counter].prev = 0; + mmap_array[i][counter].mask = mmap_pages*page_size - 1; + mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ, MAP_SHARED, fd[i][counter], 0); + if (mmap_array[i][counter].base == MAP_FAILED) { + printf("kerneltop error: failed to mmap with %d (%s)\n", + errno, strerror(errno)); + exit(-1); + } + } + } + + signal(SIGCHLD, sigchld_handler); + + pid = fork(); + if (pid < 0) + perror("failed to fork"); + + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } + } + + if (realtime_prio) { + struct sched_param param; + + param.sched_priority = realtime_prio; + if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { + printf("Could not set realtime priority.\n"); + exit(-1); + } + } + + /* + * TODO: store the current /proc/$/maps information somewhere + */ + + while (!done) { + int hits = events; + + for (i = 0; i < nr_cpus; i++) { + for (counter = 0; counter < nr_counters; counter++) + mmap_read(&mmap_array[i][counter]); + } + + if (hits == events) + ret = poll(event_array, nr_poll, 100); + } + + return 0; +} diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc new file mode 100644 index 00000000000..09da0ba482c --- /dev/null +++ b/Documentation/perf_counter/perf-report.cc @@ -0,0 +1,472 @@ +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../include/linux/perf_counter.h" + +#include +#include +#include + + +static char const *input_name = "output.perf"; +static int input; + +static unsigned long page_size; +static unsigned long mmap_window = 32; + +struct ip_event { + struct perf_event_header header; + __u64 ip; + __u32 pid, tid; +}; +struct mmap_event { + struct perf_event_header header; + __u32 pid, tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; +}; +struct comm_event { + struct perf_event_header header; + __u32 pid,tid; + char comm[16]; +}; + +typedef union event_union { + struct perf_event_header header; + struct ip_event ip; + struct mmap_event mmap; + struct comm_event comm; +} event_t; + +struct section { + uint64_t start; + uint64_t end; + + uint64_t offset; + + std::string name; + + section() { }; + + section(uint64_t stab) : end(stab) { }; + + section(uint64_t start, uint64_t size, uint64_t offset, std::string name) : + start(start), end(start + size), offset(offset), name(name) + { }; + + bool operator < (const struct section &s) const { + return end < s.end; + }; +}; + +typedef std::set sections_t; + +struct symbol { + uint64_t start; + uint64_t end; + + std::string name; + + symbol() { }; + + symbol(uint64_t ip) : start(ip) { } + + symbol(uint64_t start, uint64_t len, std::string name) : + start(start), end(start + len), name(name) + { }; + + bool operator < (const struct symbol &s) const { + return start < s.start; + }; +}; + +typedef std::set symbols_t; + +struct dso { + sections_t sections; + symbols_t syms; +}; + +static std::map dsos; + +static void load_dso_sections(std::string dso_name) +{ + struct dso &dso = dsos[dso_name]; + + std::string cmd = "readelf -DSW " + dso_name; + + FILE *file = popen(cmd.c_str(), "r"); + if (!file) { + perror("failed to open pipe"); + exit(-1); + } + + char *line = NULL; + size_t n = 0; + + while (!feof(file)) { + uint64_t addr, off, size; + char name[32]; + + if (getline(&line, &n, file) < 0) + break; + if (!line) + break; + + if (sscanf(line, " [%*2d] %16s %*14s %Lx %Lx %Lx", + name, &addr, &off, &size) == 4) { + + dso.sections.insert(section(addr, size, addr - off, name)); + } +#if 0 + /* + * for reading readelf symbols (-s), however these don't seem + * to include nearly everything, so use nm for that. + */ + if (sscanf(line, " %*4d %*3d: %Lx %5Lu %*7s %*6s %*7s %3d %s", + &start, &size, §ion, sym) == 4) { + + start -= dso.section_offsets[section]; + + dso.syms.insert(symbol(start, size, std::string(sym))); + } +#endif + } + pclose(file); +} + +static void load_dso_symbols(std::string dso_name, std::string args) +{ + struct dso &dso = dsos[dso_name]; + + std::string cmd = "nm -nSC " + args + " " + dso_name; + + FILE *file = popen(cmd.c_str(), "r"); + if (!file) { + perror("failed to open pipe"); + exit(-1); + } + + char *line = NULL; + size_t n = 0; + + while (!feof(file)) { + uint64_t start, size; + char c; + char sym[1024]; + + if (getline(&line, &n, file) < 0) + break; + if (!line) + break; + + + if (sscanf(line, "%Lx %Lx %c %s", &start, &size, &c, sym) == 4) { + sections_t::const_iterator si = + dso.sections.upper_bound(section(start)); + if (si == dso.sections.end()) { + printf("symbol in unknown section: %s\n", sym); + continue; + } + + start -= si->offset; + + dso.syms.insert(symbol(start, size, sym)); + } + } + pclose(file); +} + +static void load_dso(std::string dso_name) +{ + load_dso_sections(dso_name); + load_dso_symbols(dso_name, "-D"); /* dynamic symbols */ + load_dso_symbols(dso_name, ""); /* regular ones */ +} + +void load_kallsyms(void) +{ + struct dso &dso = dsos["[kernel]"]; + + FILE *file = fopen("/proc/kallsyms", "r"); + if (!file) { + perror("failed to open kallsyms"); + exit(-1); + } + + char *line; + size_t n; + + while (!feof(file)) { + uint64_t start; + char c; + char sym[1024]; + + if (getline(&line, &n, file) < 0) + break; + if (!line) + break; + + if (sscanf(line, "%Lx %c %s", &start, &c, sym) == 3) + dso.syms.insert(symbol(start, 0x1000000, std::string(sym))); + } + fclose(file); +} + +struct map { + uint64_t start; + uint64_t end; + uint64_t pgoff; + + std::string dso; + + map() { }; + + map(uint64_t ip) : end(ip) { } + + map(mmap_event *mmap) { + start = mmap->start; + end = mmap->start + mmap->len; + pgoff = mmap->pgoff; + + dso = std::string(mmap->filename); + + if (dsos.find(dso) == dsos.end()) + load_dso(dso); + }; + + bool operator < (const struct map &m) const { + return end < m.end; + }; +}; + +typedef std::set maps_t; + +static std::map maps; + +static std::map comms; + +static std::map hist; +static std::multimap rev_hist; + +static std::string resolve_comm(int pid) +{ + std::string comm = ""; + std::map::const_iterator ci = comms.find(pid); + if (ci != comms.end()) + comm = ci->second; + + return comm; +} + +static std::string resolve_user_symbol(int pid, uint64_t ip) +{ + std::string sym = ""; + + maps_t &m = maps[pid]; + maps_t::const_iterator mi = m.upper_bound(map(ip)); + if (mi == m.end()) + return sym; + + ip -= mi->start + mi->pgoff; + + symbols_t &s = dsos[mi->dso].syms; + symbols_t::const_iterator si = s.upper_bound(symbol(ip)); + + sym = mi->dso + ": "; + + if (si == s.begin()) + return sym; + si--; + + if (si->start <= ip && ip < si->end) + sym = mi->dso + ": " + si->name; +#if 0 + else if (si->start <= ip) + sym = mi->dso + ": ?" + si->name; +#endif + + return sym; +} + +static std::string resolve_kernel_symbol(uint64_t ip) +{ + std::string sym = ""; + + symbols_t &s = dsos["[kernel]"].syms; + symbols_t::const_iterator si = s.upper_bound(symbol(ip)); + + if (si == s.begin()) + return sym; + si--; + + if (si->start <= ip && ip < si->end) + sym = si->name; + + return sym; +} + +static void display_help(void) +{ + printf( + "Usage: perf-report []\n" + " -i file --input= # input file\n" + ); + + exit(0); +} + +static void process_options(int argc, char *argv[]) +{ + int error = 0; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"input", required_argument, NULL, 'i'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:i:", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'i': input_name = strdup(optarg); break; + default: error = 1; break; + } + } + + if (error) + display_help(); +} + +int main(int argc, char *argv[]) +{ + unsigned long offset = 0; + unsigned long head = 0; + struct stat stat; + char *buf; + event_t *event; + int ret; + unsigned long total = 0; + + page_size = getpagesize(); + + process_options(argc, argv); + + input = open(input_name, O_RDONLY); + if (input < 0) { + perror("failed to open file"); + exit(-1); + } + + ret = fstat(input, &stat); + if (ret < 0) { + perror("failed to stat file"); + exit(-1); + } + + load_kallsyms(); + +remap: + buf = (char *)mmap(NULL, page_size * mmap_window, PROT_READ, + MAP_SHARED, input, offset); + if (buf == MAP_FAILED) { + perror("failed to mmap file"); + exit(-1); + } + +more: + event = (event_t *)(buf + head); + + if (head + event->header.size >= page_size * mmap_window) { + unsigned long shift = page_size * (head / page_size); + + munmap(buf, page_size * mmap_window); + offset += shift; + head -= shift; + goto remap; + } + head += event->header.size; + + if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { + std::string comm, sym, level; + char output[1024]; + + if (event->header.misc & PERF_EVENT_MISC_KERNEL) { + level = "[kernel]"; + sym = resolve_kernel_symbol(event->ip.ip); + } else if (event->header.misc & PERF_EVENT_MISC_USER) { + level = "[ user ]"; + sym = resolve_user_symbol(event->ip.pid, event->ip.ip); + } else { + level = "[ hv ]"; + } + comm = resolve_comm(event->ip.pid); + + snprintf(output, sizeof(output), "%16s %s %s", + comm.c_str(), level.c_str(), sym.c_str()); + hist[output]++; + + total++; + + } else switch (event->header.type) { + case PERF_EVENT_MMAP: + maps[event->mmap.pid].insert(map(&event->mmap)); + break; + + case PERF_EVENT_COMM: + comms[event->comm.pid] = std::string(event->comm.comm); + break; + } + + if (offset + head < stat.st_size) + goto more; + + close(input); + + std::map::iterator hi = hist.begin(); + + while (hi != hist.end()) { + rev_hist.insert(std::pair(hi->second, hi->first)); + hist.erase(hi++); + } + + std::multimap::const_iterator ri = rev_hist.begin(); + + while (ri != rev_hist.end()) { + printf(" %5.2f %s\n", (100.0 * ri->first)/total, ri->second.c_str()); + ri++; + } + + return 0; +} + From 4d855457d84b819fefcd1cd1b0a2a0a0ec475c07 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:32 +0200 Subject: [PATCH 0314/2061] perf_counter: move PERF_RECORD_TIME Move PERF_RECORD_TIME so that all the fixed length items come before the variable length ones. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.307926436@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 9 ++++----- kernel/perf_counter.c | 26 +++++++++++++------------- 2 files changed, 17 insertions(+), 18 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a70a55f2759..8bd1be58c93 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -100,9 +100,9 @@ enum sw_event_ids { enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, - PERF_RECORD_GROUP = 1U << 2, - PERF_RECORD_CALLCHAIN = 1U << 3, - PERF_RECORD_TIME = 1U << 4, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_GROUP = 1U << 3, + PERF_RECORD_CALLCHAIN = 1U << 4, }; /* @@ -250,6 +250,7 @@ enum perf_event_type { * * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -259,8 +260,6 @@ enum perf_event_type { * kernel, * user; * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN - * - * { u64 time; } && PERF_RECORD_TIME * }; */ }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d4aebb2982..4dc8600d282 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1850,6 +1850,16 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(tid_entry); } + if (record_type & PERF_RECORD_TIME) { + /* + * Maybe do better on x86 and provide cpu_clock_nmi() + */ + time = sched_clock(); + + header.type |= PERF_RECORD_TIME; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1867,16 +1877,6 @@ static void perf_counter_output(struct perf_counter *counter, } } - if (record_type & PERF_RECORD_TIME) { - /* - * Maybe do better on x86 and provide cpu_clock_nmi() - */ - time = sched_clock(); - - header.type |= PERF_RECORD_TIME; - header.size += sizeof(u64); - } - ret = perf_output_begin(&handle, counter, header.size, nmi, 1); if (ret) return; @@ -1889,6 +1889,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TID) perf_output_put(&handle, tid_entry); + if (record_type & PERF_RECORD_TIME) + perf_output_put(&handle, time); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -1910,9 +1913,6 @@ static void perf_counter_output(struct perf_counter *counter, if (callchain) perf_output_copy(&handle, callchain, callchain_size); - if (record_type & PERF_RECORD_TIME) - perf_output_put(&handle, time); - perf_output_end(&handle); } From 78f13e9525ba777da25c4ddab89f28e9366a8b7c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 8 Apr 2009 15:01:33 +0200 Subject: [PATCH 0315/2061] perf_counter: allow for data addresses to be recorded Paul suggested we allow for data addresses to be recorded along with the traditional IPs as power can provide these. For now, only the software pagefault events provide data addresses, but in the future power might as well for some events. x86 doesn't seem capable of providing this atm. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090408130409.394816925@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 2 +- arch/powerpc/mm/fault.c | 8 ++++-- arch/x86/kernel/cpu/perf_counter.c | 2 +- arch/x86/mm/fault.c | 8 ++++-- include/linux/perf_counter.h | 14 +++++---- kernel/perf_counter.c | 46 +++++++++++++++++++----------- 6 files changed, 49 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 0697ade84dd..c9d019f1907 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs); + perf_counter_overflow(counter, 1, regs, 0); } /* diff --git a/arch/powerpc/mm/fault.c b/arch/powerpc/mm/fault.c index 17bbf6f91fb..ac0e112031b 100644 --- a/arch/powerpc/mm/fault.c +++ b/arch/powerpc/mm/fault.c @@ -171,7 +171,7 @@ int __kprobes do_page_fault(struct pt_regs *regs, unsigned long address, die("Weird page fault", regs, SIGSEGV); } - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* When running in the kernel we expect faults to occur only to * addresses in user space. All other faults represent errors in the @@ -312,7 +312,8 @@ good_area: } if (ret & VM_FAULT_MAJOR) { current->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); #ifdef CONFIG_PPC_SMLPAR if (firmware_has_feature(FW_FEATURE_CMO)) { preempt_disable(); @@ -322,7 +323,8 @@ good_area: #endif } else { current->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } up_read(&mm->mmap_sem); return 0; diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1116a41bc7b..0fcbaab83f9 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -800,7 +800,7 @@ again: continue; perf_save_and_restart(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, 0)) __pmc_generic_disable(counter, &counter->hw, bit); } diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index f2d3324d921..6f9df2babe4 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -1045,7 +1045,7 @@ do_page_fault(struct pt_regs *regs, unsigned long error_code) if (unlikely(error_code & PF_RSVD)) pgtable_bad(regs, error_code, address); - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS, 1, 0, regs, address); /* * If we're in an interrupt, have no user context or are running @@ -1142,10 +1142,12 @@ good_area: if (fault & VM_FAULT_MAJOR) { tsk->maj_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MAJ, 1, 0, + regs, address); } else { tsk->min_flt++; - perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, regs); + perf_swcounter_event(PERF_COUNT_PAGE_FAULTS_MIN, 1, 0, + regs, address); } check_v8086_mode(regs, address, tsk); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 8bd1be58c93..c22363a4f74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -101,8 +101,9 @@ enum perf_counter_record_format { PERF_RECORD_IP = 1U << 0, PERF_RECORD_TID = 1U << 1, PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_GROUP = 1U << 3, - PERF_RECORD_CALLCHAIN = 1U << 4, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; /* @@ -251,6 +252,7 @@ enum perf_event_type { * { u64 ip; } && PERF_RECORD_IP * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP @@ -537,7 +539,7 @@ extern int hw_perf_group_sched_in(struct perf_counter *group_leader, extern void perf_counter_update_userpage(struct perf_counter *counter); extern int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs); + int nmi, struct pt_regs *regs, u64 addr); /* * Return 1 for a software counter, 0 for a hardware counter */ @@ -547,7 +549,7 @@ static inline int is_software_counter(struct perf_counter *counter) perf_event_type(&counter->hw_event) != PERF_TYPE_HARDWARE; } -extern void perf_swcounter_event(u32, u64, int, struct pt_regs *); +extern void perf_swcounter_event(u32, u64, int, struct pt_regs *, u64); extern void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file); @@ -584,8 +586,8 @@ static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } static inline void -perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) { } - +perf_swcounter_event(u32 event, u64 nr, int nmi, + struct pt_regs *regs, u64 addr) { } static inline void perf_counter_mmap(unsigned long addr, unsigned long len, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4dc8600d282..321c57e3556 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -800,7 +800,7 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) update_context_time(ctx); regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -1810,7 +1810,7 @@ static void perf_output_end(struct perf_output_handle *handle) } static void perf_counter_output(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int ret; u64 record_type = counter->hw_event.record_type; @@ -1860,6 +1860,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_ADDR) { + header.type |= PERF_RECORD_ADDR; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -1892,6 +1897,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_TIME) perf_output_put(&handle, time); + if (record_type & PERF_RECORD_ADDR) + perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2158,7 +2166,7 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, */ int perf_counter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); int ret = 0; @@ -2175,7 +2183,7 @@ int perf_counter_overflow(struct perf_counter *counter, perf_counter_disable(counter); } - perf_counter_output(counter, nmi, regs); + perf_counter_output(counter, nmi, regs, addr); return ret; } @@ -2240,7 +2248,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) regs = task_pt_regs(current); if (regs) { - if (perf_counter_overflow(counter, 0, regs)) + if (perf_counter_overflow(counter, 0, regs, 0)) ret = HRTIMER_NORESTART; } @@ -2250,11 +2258,11 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) } static void perf_swcounter_overflow(struct perf_counter *counter, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { perf_swcounter_update(counter); perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, regs)) + if (perf_counter_overflow(counter, nmi, regs, addr)) /* soft-disable the counter */ ; @@ -2286,16 +2294,17 @@ static int perf_swcounter_match(struct perf_counter *counter, } static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct pt_regs *regs) + int nmi, struct pt_regs *regs, u64 addr) { int neg = atomic64_add_negative(nr, &counter->hw.count); if (counter->hw.irq_period && !neg) - perf_swcounter_overflow(counter, nmi, regs); + perf_swcounter_overflow(counter, nmi, regs, addr); } static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_counter *counter; @@ -2305,7 +2314,7 @@ static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, rcu_read_lock(); list_for_each_entry_rcu(counter, &ctx->event_list, event_entry) { if (perf_swcounter_match(counter, type, event, regs)) - perf_swcounter_add(counter, nr, nmi, regs); + perf_swcounter_add(counter, nr, nmi, regs, addr); } rcu_read_unlock(); } @@ -2325,7 +2334,8 @@ static int *perf_swcounter_recursion_context(struct perf_cpu_context *cpuctx) } static void __perf_swcounter_event(enum perf_event_types type, u32 event, - u64 nr, int nmi, struct pt_regs *regs) + u64 nr, int nmi, struct pt_regs *regs, + u64 addr) { struct perf_cpu_context *cpuctx = &get_cpu_var(perf_cpu_context); int *recursion = perf_swcounter_recursion_context(cpuctx); @@ -2336,10 +2346,11 @@ static void __perf_swcounter_event(enum perf_event_types type, u32 event, (*recursion)++; barrier(); - perf_swcounter_ctx_event(&cpuctx->ctx, type, event, nr, nmi, regs); + perf_swcounter_ctx_event(&cpuctx->ctx, type, event, + nr, nmi, regs, addr); if (cpuctx->task_ctx) { perf_swcounter_ctx_event(cpuctx->task_ctx, type, event, - nr, nmi, regs); + nr, nmi, regs, addr); } barrier(); @@ -2349,9 +2360,10 @@ out: put_cpu_var(perf_cpu_context); } -void perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs) +void +perf_swcounter_event(u32 event, u64 nr, int nmi, struct pt_regs *regs, u64 addr) { - __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs); + __perf_swcounter_event(PERF_TYPE_SOFTWARE, event, nr, nmi, regs, addr); } static void perf_swcounter_read(struct perf_counter *counter) @@ -2548,7 +2560,7 @@ void perf_tpcounter_event(int event_id) if (!regs) regs = task_pt_regs(current); - __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs); + __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } extern int ftrace_profile_enable(int); From 6c0b324435ff49fb3c68fe808a93853d81c7fb97 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 9 Apr 2009 09:27:37 +1000 Subject: [PATCH 0316/2061] perf_counter: add MAINTAINERS entry This adds an entry in MAINTAINERS for the perf_counter subsystem. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <18909.13033.345975.434902@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- MAINTAINERS | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index c3b215970f7..16fb45adb53 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3483,6 +3483,16 @@ M: balbir@linux.vnet.ibm.com L: linux-kernel@vger.kernel.org S: Maintained +PERFORMANCE COUNTER SUBSYSTEM +P: Peter Zijlstra +M: a.p.zijlstra@chello.nl +P: Paul Mackerras +M: paulus@samba.org +P: Ingo Molnar +M: mingo@elte.hu +L: linux-kernel@vger.kernel.org +S: Supported + PERSONALITY HANDLING P: Christoph Hellwig M: hch@infradead.org From ca8f2d7f019a8547f39ddb9ed0144932f12807f2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 9 Apr 2009 14:42:56 +1000 Subject: [PATCH 0317/2061] perf_counter: powerpc: add nmi_enter/nmi_exit calls Impact: fix potential deadlocks on powerpc Now that the core is using in_nmi() (added in e30e08f6, "perf_counter: fix NMI race in task clock"), we need the powerpc perf_counter_interrupt to call nmi_enter() and nmi_exit() in those cases where the interrupt happens when interrupts are soft-disabled. If interrupts were soft-enabled, we can treat it as a regular interrupt and do irq_enter/irq_exit around the whole routine. This lets us get rid of the test_perf_counter_pending() call at the end of perf_counter_interrupt, thus simplifying things a little. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <18909.31952.873098.336615@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 31 ++++++++++++++++-------------- 1 file changed, 17 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index c9d019f1907..bd76d0fa2c3 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -714,7 +714,7 @@ hw_perf_counter_init(struct perf_counter *counter) * here so there is no possibility of being interrupted. */ static void record_and_restart(struct perf_counter *counter, long val, - struct pt_regs *regs) + struct pt_regs *regs, int nmi) { s64 prev, delta, left; int record = 0; @@ -749,7 +749,7 @@ static void record_and_restart(struct perf_counter *counter, long val, * Finally record data if requested. */ if (record) - perf_counter_overflow(counter, 1, regs, 0); + perf_counter_overflow(counter, nmi, regs, 0); } /* @@ -762,6 +762,17 @@ static void perf_counter_interrupt(struct pt_regs *regs) struct perf_counter *counter; long val; int found = 0; + int nmi; + + /* + * If interrupts were soft-disabled when this PMU interrupt + * occurred, treat it as an NMI. + */ + nmi = !regs->softe; + if (nmi) + nmi_enter(); + else + irq_enter(); for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; @@ -769,7 +780,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) if ((int)val < 0) { /* counter has overflowed */ found = 1; - record_and_restart(counter, val, regs); + record_and_restart(counter, val, regs, nmi); } } @@ -796,18 +807,10 @@ static void perf_counter_interrupt(struct pt_regs *regs) */ mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); - /* - * If we need a wakeup, check whether interrupts were soft-enabled - * when we took the interrupt. If they were, we can wake stuff up - * immediately; otherwise we'll have do the wakeup when interrupts - * get soft-enabled. - */ - if (test_perf_counter_pending() && regs->softe) { - irq_enter(); - clear_perf_counter_pending(); - perf_counter_do_pending(); + if (nmi) + nmi_exit(); + else irq_exit(); - } } void hw_perf_counter_setup(int cpu) From d2de688891909b148efe83a6fc9520a9cd6015f0 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 9 Apr 2009 15:17:22 +1000 Subject: [PATCH 0318/2061] sparc64: extend TI_RESTART_BLOCK space by 8 bytes Impact: build fix Today's linux-next build (sparc64 defconfig) failed like this: arch/sparc/kernel/built-in.o: In function `trap_init': (.init.text+0x4): undefined reference to `thread_info_offsets_are_bolixed_dave' Caused by commit 52400ba946759af28442dee6265c5c0180ac7122 ("futex: add requeue_pi functionality") (from the tip-core tree) which changed the size of struct restart_block. Shift TI_KUNA_REGS and TI_KUNA_INSN up by 8 bytes to make space for the larger restart block. Signed-off-by: Stephen Rothwell Acked-by: "David S. Miller" Cc: Darren Hart LKML-Reference: <20090409151722.c8eabb56.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- arch/sparc/include/asm/thread_info_64.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sparc/include/asm/thread_info_64.h b/arch/sparc/include/asm/thread_info_64.h index 639ac805448..65865726b28 100644 --- a/arch/sparc/include/asm/thread_info_64.h +++ b/arch/sparc/include/asm/thread_info_64.h @@ -102,8 +102,8 @@ struct thread_info { #define TI_KERN_CNTD1 0x00000488 #define TI_PCR 0x00000490 #define TI_RESTART_BLOCK 0x00000498 -#define TI_KUNA_REGS 0x000004c0 -#define TI_KUNA_INSN 0x000004c8 +#define TI_KUNA_REGS 0x000004c8 +#define TI_KUNA_INSN 0x000004d0 #define TI_FPREGS 0x00000500 /* We embed this in the uppermost byte of thread_info->flags */ From 002f128b473fb82f454654be5081b0919ee01ab2 Mon Sep 17 00:00:00 2001 From: Paul Turner Date: Wed, 8 Apr 2009 15:29:43 -0700 Subject: [PATCH 0319/2061] sched: remove redundant hierarchy walk in check_preempt_wakeup Impact: micro-optimization Under group scheduling we traverse up until we are at common siblings to make the wakeup comparison on. At this point however, they should have the same parent so continuing to check up the tree is redundant. Signed-off-by: Paul Turner Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched_fair.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/sched_fair.c b/kernel/sched_fair.c index 3816f217f11..5f9650e8fe7 100644 --- a/kernel/sched_fair.c +++ b/kernel/sched_fair.c @@ -1487,17 +1487,10 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int sync) find_matching_se(&se, &pse); - while (se) { - BUG_ON(!pse); + BUG_ON(!pse); - if (wakeup_preempt_entity(se, pse) == 1) { - resched_task(curr); - break; - } - - se = parent_entity(se); - pse = parent_entity(pse); - } + if (wakeup_preempt_entity(se, pse) == 1) + resched_task(curr); } static struct task_struct *pick_next_task_fair(struct rq *rq) From 888fcee066a2f4abd0d0bc9418c0535f9b01e6e5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Apr 2009 09:48:22 +0200 Subject: [PATCH 0320/2061] perf_counter: fix off task->comm by one strlen() does not include the \0. Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 321c57e3556..b07195bbd22 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1989,7 +1989,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) unsigned int size; char *comm = comm_event->task->comm; - size = ALIGN(strlen(comm), sizeof(u64)); + size = ALIGN(strlen(comm)+1, sizeof(u64)); comm_event->comm = comm; comm_event->comm_size = size; @@ -2109,7 +2109,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) } got_name: - size = ALIGN(strlen(name), sizeof(u64)); + size = ALIGN(strlen(name)+1, sizeof(u64)); mmap_event->file_name = name; mmap_event->file_size = size; From b3828ebb3901adfe989d8d4157ed28247aeec132 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 9 Apr 2009 09:50:04 +0200 Subject: [PATCH 0321/2061] perf_counter tools: include PID in perf-report output, tweak user/kernel printut It's handier than an entry. Also replace the kernel/user column with a more compact version: 0.52 cc1 [k] page_fault 0.57 :0 [k] _spin_lock 0.59 :7506 [.] 0.69 as [.] /usr/bin/as: 0.76 cc1 [.] /lib64/libc-2.8.so: _int_free 0.92 cc1 [k] clear_page_c 1.00 :7465 [.] 1.43 cc1 [.] /lib64/libc-2.8.so: memset 1.86 cc1 [.] /lib64/libc-2.8.so: _int_malloc 70.33 cc1 [.] /usr/libexec/gcc/x86_64-redhat-linux/4.3.2/cc1: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf-report.cc | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc index 09da0ba482c..1727317352b 100644 --- a/Documentation/perf_counter/perf-report.cc +++ b/Documentation/perf_counter/perf-report.cc @@ -277,10 +277,17 @@ static std::multimap rev_hist; static std::string resolve_comm(int pid) { - std::string comm = ""; + std::string comm; + std::map::const_iterator ci = comms.find(pid); - if (ci != comms.end()) + if (ci != comms.end()) { comm = ci->second; + } else { + char pid_str[30]; + + sprintf(pid_str, ":%d", pid); + comm = pid_str; + } return comm; } @@ -422,13 +429,13 @@ more: char output[1024]; if (event->header.misc & PERF_EVENT_MISC_KERNEL) { - level = "[kernel]"; + level = " [k] "; sym = resolve_kernel_symbol(event->ip.ip); } else if (event->header.misc & PERF_EVENT_MISC_USER) { - level = "[ user ]"; + level = " [.] "; sym = resolve_user_symbol(event->ip.pid, event->ip.ip); } else { - level = "[ hv ]"; + level = " [H] "; } comm = resolve_comm(event->ip.pid); From 9ee318a7825929bc3734110b83ae8e20e53d9de3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:44 +0200 Subject: [PATCH 0322/2061] perf_counter: optimize mmap/comm tracking Impact: performance optimization The mmap/comm tracking code does quite a lot of work before it discovers there's no interest in it, avoid that by keeping a counter. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.427173196@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 39 ++++++++++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b07195bbd22..76376ecb23b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -38,6 +38,10 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_mmap_tracking __read_mostly; +static atomic_t nr_munmap_tracking __read_mostly; +static atomic_t nr_comm_tracking __read_mostly; + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1186,6 +1190,13 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); + if (counter->hw_event.mmap) + atomic_dec(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_dec(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_dec(&nr_comm_tracking); + if (counter->destroy) counter->destroy(counter); @@ -2005,7 +2016,12 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) void perf_counter_comm(struct task_struct *task) { - struct perf_comm_event comm_event = { + struct perf_comm_event comm_event; + + if (!atomic_read(&nr_comm_tracking)) + return; + + comm_event = (struct perf_comm_event){ .task = task, .event = { .header = { .type = PERF_EVENT_COMM, }, @@ -2128,7 +2144,12 @@ got_name: void perf_counter_mmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { - struct perf_mmap_event mmap_event = { + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_mmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ .file = file, .event = { .header = { .type = PERF_EVENT_MMAP, }, @@ -2146,7 +2167,12 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, void perf_counter_munmap(unsigned long addr, unsigned long len, unsigned long pgoff, struct file *file) { - struct perf_mmap_event mmap_event = { + struct perf_mmap_event mmap_event; + + if (!atomic_read(&nr_munmap_tracking)) + return; + + mmap_event = (struct perf_mmap_event){ .file = file, .event = { .header = { .type = PERF_EVENT_MUNMAP, }, @@ -2725,6 +2751,13 @@ done: counter->hw_ops = hw_ops; + if (counter->hw_event.mmap) + atomic_inc(&nr_mmap_tracking); + if (counter->hw_event.munmap) + atomic_inc(&nr_munmap_tracking); + if (counter->hw_event.comm) + atomic_inc(&nr_comm_tracking); + return counter; } From 1ccd15497869f3ed83b5225d410df53a96e52757 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:45 +0200 Subject: [PATCH 0323/2061] perf_counter: sysctl for system wide perf counters Impact: add sysctl for paranoid/relaxed perfcounters policy Allow the use of system wide perf counters to everybody, but provide a sysctl to disable it for the paranoid security minded. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.514046352@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 4 +++- kernel/sysctl.c | 11 +++++++++++ 3 files changed, 16 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c22363a4f74..98143288530 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -568,6 +568,8 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); +extern int sysctl_perf_counter_priv; + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 76376ecb23b..7efb7ebaaae 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -42,6 +42,8 @@ static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; +int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ + /* * Mutex for (sysadmin-configurable) counter reservations: */ @@ -1132,7 +1134,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) */ if (cpu != -1) { /* Must be root to operate on a CPU counter: */ - if (!capable(CAP_SYS_ADMIN)) + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) return ERR_PTR(-EACCES); if (cpu < 0 || cpu > num_possible_cpus()) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4286b62b34a..8ba457838d9 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include @@ -920,6 +921,16 @@ static struct ctl_table kern_table[] = { .child = slow_work_sysctls, }, #endif +#ifdef CONFIG_PERF_COUNTERS + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_privileged", + .data = &sysctl_perf_counter_priv, + .maxlen = sizeof(sysctl_perf_counter_priv), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif /* * NOTE: do not add new entries to this table unless you have read * Documentation/sysctl/ctl_unnumbered.txt From d3d21c412d8525eb2e208d990ab5eee5fb0fe03d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 9 Apr 2009 10:53:46 +0200 Subject: [PATCH 0324/2061] perf_counter: log full path names Impact: fix perf-report output for /home mounted binaries, etc. dentry_path() only provide path-names up to the mount root, which is unsuited for out purpose, use d_path() instead. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090409085524.601794134@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7efb7ebaaae..7f9521c3c01 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2116,7 +2116,7 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event) name = strncpy(tmp, "//enomem", sizeof(tmp)); goto got_name; } - name = dentry_path(file->f_dentry, buf, PATH_MAX); + name = d_path(&file->f_path, buf, PATH_MAX); if (IS_ERR(name)) { name = strncpy(tmp, "//toolong", sizeof(tmp)); goto got_name; From e7c064889606aab3569669078c69b87b2c527e72 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sat, 7 Mar 2009 23:48:41 -0800 Subject: [PATCH 0325/2061] xen: add FIX_TEXT_POKE to fixmap FIX_TEXT_POKE[01] are used to map kernel addresses, so they're mapping pfns, not mfns. Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/mmu.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c index 77b242c9a11..a96f5b9393e 100644 --- a/arch/x86/xen/mmu.c +++ b/arch/x86/xen/mmu.c @@ -1812,6 +1812,9 @@ static void xen_set_fixmap(unsigned idx, unsigned long phys, pgprot_t prot) #ifdef CONFIG_X86_LOCAL_APIC case FIX_APIC_BASE: /* maps dummy local APIC */ #endif + case FIX_TEXT_POKE0: + case FIX_TEXT_POKE1: + /* All local page mappings */ pte = pfn_pte(phys, prot); break; From 7a734e7dd93b9aea08ed51036a9a0e2c9dfd8dac Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:08:28 -0700 Subject: [PATCH 0326/2061] x86, setup: "glove box" BIOS calls -- infrastructure Impact: new interfaces (not yet used) For all the platforms out there, there is an infinite number of buggy BIOSes. This adds infrastructure to treat BIOS interrupts more like toxic waste and "glove box" them -- we switch out the register set, perform the BIOS interrupt, and then restore the previous state. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/boot/Makefile | 5 +- arch/x86/boot/bioscall.S | 82 ++++++++++++++++++++++++ arch/x86/boot/boot.h | 48 ++++++++++++++ arch/x86/boot/header.S | 2 +- arch/x86/boot/regs.c | 29 +++++++++ arch/x86/boot/setup.ld | 6 ++ arch/x86/kernel/acpi/realmode/Makefile | 2 +- arch/x86/kernel/acpi/realmode/bioscall.S | 1 + arch/x86/kernel/acpi/realmode/regs.c | 1 + 9 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 arch/x86/boot/bioscall.S create mode 100644 arch/x86/boot/regs.c create mode 100644 arch/x86/kernel/acpi/realmode/bioscall.S create mode 100644 arch/x86/kernel/acpi/realmode/regs.c diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505..658bc525cac 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -26,9 +26,10 @@ targets := vmlinux.bin setup.bin setup.elf bzImage targets += fdimage fdimage144 fdimage288 image.iso mtools.conf subdir- := compressed -setup-y += a20.o cmdline.o copy.o cpu.o cpucheck.o edd.o +setup-y += a20.o bioscall.o cmdline.o copy.o cpu.o cpucheck.o edd.o setup-y += header.o main.o mca.o memory.o pm.o pmjump.o -setup-y += printf.o string.o tty.o video.o video-mode.o version.o +setup-y += printf.o regs.o string.o tty.o video.o video-mode.o +setup-y += version.o setup-$(CONFIG_X86_APM_BOOT) += apm.o # The link order of the video-*.o modules can matter. In particular, diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S new file mode 100644 index 00000000000..22b4b3efb9f --- /dev/null +++ b/arch/x86/boot/bioscall.S @@ -0,0 +1,82 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes + * touching memory they shouldn't be. + */ + + .code16 + .text + .globl intcall + .type intcall, @function +intcall: + /* Self-modify the INT instruction. Ugly, but works. */ + cmpb %al, 3f + je 1f + movb %al, 3f + jmp 1f /* Synchronize pipeline */ +1: + /* Save state */ + pushfl + pushw %fs + pushw %gs + pushal + + /* Copy input state to stack frame */ + subw $44, %sp + movw %dx, %si + movw %sp, %di + movw $11, %cx + rep; movsd + + /* Pop full state from the stack */ + popal + popw %gs + popw %fs + popw %es + popw %ds + popfl + + /* Actual INT */ + .byte 0xcd /* INT opcode */ +3: .byte 0 + + /* Push full state to the stack */ + pushfl + pushw %ds + pushw %es + pushw %fs + pushw %gs + pushal + + /* Re-establish C environment invariants */ + cld + movzwl %sp, %esp + movw %cs, %ax + movw %ax, %ds + movw %ax, %es + + /* Copy output state from stack frame */ + movw 68(%esp), %di /* Original %cx == 3rd argument */ + andw %di, %di + jz 4f + movw %sp, %si + movw $11, %cx + rep; movsd +4: addw $44, %sp + + /* Restore state and return */ + popal + popw %gs + popw %fs + popfl + retl + .size intcall, .-intcall diff --git a/arch/x86/boot/boot.h b/arch/x86/boot/boot.h index 7b2692e897e..98239d2658f 100644 --- a/arch/x86/boot/boot.h +++ b/arch/x86/boot/boot.h @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -26,6 +27,7 @@ #include #include "bitops.h" #include +#include /* Useful macros */ #define BUILD_BUG_ON(condition) ((void)sizeof(char[1 - 2*!!(condition)])) @@ -241,6 +243,49 @@ int enable_a20(void); /* apm.c */ int query_apm_bios(void); +/* bioscall.c */ +struct biosregs { + union { + struct { + u32 edi; + u32 esi; + u32 ebp; + u32 _esp; + u32 ebx; + u32 edx; + u32 ecx; + u32 eax; + u32 _fsgs; + u32 _dses; + u32 eflags; + }; + struct { + u16 di, hdi; + u16 si, hsi; + u16 bp, hbp; + u16 _sp, _hsp; + u16 bx, hbx; + u16 dx, hdx; + u16 cx, hcx; + u16 ax, hax; + u16 gs, fs; + u16 es, ds; + u16 flags, hflags; + }; + struct { + u8 dil, dih, edi2, edi3; + u8 sil, sih, esi2, esi3; + u8 bpl, bph, ebp2, ebp3; + u8 _spl, _sph, _esp2, _esp3; + u8 bl, bh, ebx2, ebx3; + u8 dl, dh, edx2, edx3; + u8 cl, ch, ecx2, ecx3; + u8 al, ah, eax2, eax3; + }; + }; +}; +void intcall(u8 int_no, const struct biosregs *ireg, struct biosregs *oreg); + /* cmdline.c */ int cmdline_find_option(const char *option, char *buffer, int bufsize); int cmdline_find_option_bool(const char *option); @@ -279,6 +324,9 @@ int sprintf(char *buf, const char *fmt, ...); int vsprintf(char *buf, const char *fmt, va_list args); int printf(const char *fmt, ...); +/* regs.c */ +void initregs(struct biosregs *regs); + /* string.c */ int strcmp(const char *str1, const char *str2); size_t strnlen(const char *s, size_t maxlen); diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4..486d97fa7f4 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -221,7 +221,7 @@ setup_data: .quad 0 # 64-bit physical pointer to # End of setup header ##################################################### - .section ".inittext", "ax" + .section ".entrytext", "ax" start_of_setup: #ifdef SAFE_RESET_DISK_CONTROLLER # Reset the disk controller. diff --git a/arch/x86/boot/regs.c b/arch/x86/boot/regs.c new file mode 100644 index 00000000000..958019b1cfa --- /dev/null +++ b/arch/x86/boot/regs.c @@ -0,0 +1,29 @@ +/* ----------------------------------------------------------------------- + * + * Copyright 2009 Intel Corporation; author H. Peter Anvin + * + * This file is part of the Linux kernel, and is made available under + * the terms of the GNU General Public License version 2 or (at your + * option) any later version; incorporated herein by reference. + * + * ----------------------------------------------------------------------- */ + +/* + * Simple helper function for initializing a register set. + * + * Note that this sets EFLAGS_CF in the input register set; this + * makes it easier to catch functions which do nothing but don't + * explicitly set CF. + */ + +#include "boot.h" + +void initregs(struct biosregs *reg) +{ + memset(reg, 0, sizeof *reg); + reg->eflags |= X86_EFLAGS_CF; + reg->ds = ds(); + reg->es = ds(); + reg->fs = fs(); + reg->gs = gs(); +} diff --git a/arch/x86/boot/setup.ld b/arch/x86/boot/setup.ld index bb8dc2de796..0f6ec455a2b 100644 --- a/arch/x86/boot/setup.ld +++ b/arch/x86/boot/setup.ld @@ -15,8 +15,11 @@ SECTIONS . = 497; .header : { *(.header) } + .entrytext : { *(.entrytext) } .inittext : { *(.inittext) } .initdata : { *(.initdata) } + __end_init = .; + .text : { *(.text) } .text32 : { *(.text32) } @@ -52,4 +55,7 @@ SECTIONS . = ASSERT(_end <= 0x8000, "Setup too big!"); . = ASSERT(hdr == 0x1f1, "The setup header has the wrong offset!"); + /* Necessary for the very-old-loader check to work... */ + . = ASSERT(__end_init <= 5*512, "init sections too big!"); + } diff --git a/arch/x86/kernel/acpi/realmode/Makefile b/arch/x86/kernel/acpi/realmode/Makefile index 1c31cc0e9de..167bc16ce0e 100644 --- a/arch/x86/kernel/acpi/realmode/Makefile +++ b/arch/x86/kernel/acpi/realmode/Makefile @@ -9,7 +9,7 @@ always := wakeup.bin targets := wakeup.elf wakeup.lds -wakeup-y += wakeup.o wakemain.o video-mode.o copy.o +wakeup-y += wakeup.o wakemain.o video-mode.o copy.o bioscall.o regs.o # The link order of the video-*.o modules can matter. In particular, # video-vga.o *must* be listed first, followed by video-vesa.o. diff --git a/arch/x86/kernel/acpi/realmode/bioscall.S b/arch/x86/kernel/acpi/realmode/bioscall.S new file mode 100644 index 00000000000..f51eb0bb56c --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/bioscall.S @@ -0,0 +1 @@ +#include "../../../boot/bioscall.S" diff --git a/arch/x86/kernel/acpi/realmode/regs.c b/arch/x86/kernel/acpi/realmode/regs.c new file mode 100644 index 00000000000..6206033ba20 --- /dev/null +++ b/arch/x86/kernel/acpi/realmode/regs.c @@ -0,0 +1 @@ +#include "../../../boot/regs.c" From df7699c56421c0476704f24a43409ac8c505f3d2 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:13:46 -0700 Subject: [PATCH 0327/2061] x86, setup: "glove box" BIOS interrupts in the core boot code Impact: BIOS proofing "Glove box" off BIOS interrupts in the core boot code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/a20.c | 9 +++-- arch/x86/boot/main.c | 39 ++++++++++---------- arch/x86/boot/memory.c | 81 +++++++++++++++++++++--------------------- arch/x86/boot/tty.c | 52 ++++++++++++++------------- 4 files changed, 95 insertions(+), 86 deletions(-) diff --git a/arch/x86/boot/a20.c b/arch/x86/boot/a20.c index 7c19ce8c244..64a31a6d751 100644 --- a/arch/x86/boot/a20.c +++ b/arch/x86/boot/a20.c @@ -2,7 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007-2008 rPath, Inc. - All Rights Reserved - * Copyright 2009 Intel Corporation + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -90,8 +90,11 @@ static int a20_test_long(void) static void enable_a20_bios(void) { - asm volatile("pushfl; int $0x15; popfl" - : : "a" ((u16)0x2401)); + struct biosregs ireg; + + initregs(&ireg); + ireg.ax = 0x2401; + intcall(0x15, &ireg, NULL); } static void enable_a20_kbc(void) diff --git a/arch/x86/boot/main.c b/arch/x86/boot/main.c index 58f0415d3ae..140172b895b 100644 --- a/arch/x86/boot/main.c +++ b/arch/x86/boot/main.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -61,11 +62,10 @@ static void copy_boot_params(void) */ static void keyboard_set_repeat(void) { - u16 ax = 0x0305; - u16 bx = 0; - asm volatile("int $0x16" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + struct biosregs ireg; + initregs(&ireg); + ireg.ax = 0x0305; + intcall(0x16, &ireg, NULL); } /* @@ -73,18 +73,22 @@ static void keyboard_set_repeat(void) */ static void query_ist(void) { + struct biosregs ireg, oreg; + /* Some older BIOSes apparently crash on this call, so filter it from machines too old to have SpeedStep at all. */ if (cpu.level < 6) return; - asm("int $0x15" - : "=a" (boot_params.ist_info.signature), - "=b" (boot_params.ist_info.command), - "=c" (boot_params.ist_info.event), - "=d" (boot_params.ist_info.perf_level) - : "a" (0x0000e980), /* IST Support */ - "d" (0x47534943)); /* Request value */ + initregs(&ireg); + ireg.ax = 0xe980; /* IST Support */ + ireg.edx = 0x47534943; /* Request value */ + intcall(0x15, &ireg, &oreg); + + boot_params.ist_info.signature = oreg.eax; + boot_params.ist_info.command = oreg.ebx; + boot_params.ist_info.event = oreg.ecx; + boot_params.ist_info.perf_level = oreg.edx; } /* @@ -93,13 +97,12 @@ static void query_ist(void) static void set_bios_mode(void) { #ifdef CONFIG_X86_64 - u32 eax, ebx; + struct biosregs ireg; - eax = 0xec00; - ebx = 2; - asm volatile("int $0x15" - : "+a" (eax), "+b" (ebx) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ax = 0xec00; + ireg.bx = 2; + intcall(0x15, &ireg, NULL); #endif } diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c index 5054c2ddd1a..d989de810ca 100644 --- a/arch/x86/boot/memory.c +++ b/arch/x86/boot/memory.c @@ -25,12 +25,16 @@ struct e820_ext_entry { static int detect_memory_e820(void) { int count = 0; - u32 next = 0; - u32 size, id, edi; - u8 err; + struct biosregs ireg, oreg; struct e820entry *desc = boot_params.e820_map; static struct e820_ext_entry buf; /* static so it is zeroed */ + initregs(&ireg); + ireg.ax = 0xe820; + ireg.cx = sizeof buf; + ireg.edx = SMAP; + ireg.di = (size_t)&buf; + /* * Set this here so that if the BIOS doesn't change this field * but still doesn't change %ecx, we're still okay... @@ -38,22 +42,13 @@ static int detect_memory_e820(void) buf.ext_flags = 1; do { - size = sizeof buf; - - /* Important: %edx and %esi are clobbered by some BIOSes, - so they must be either used for the error output - or explicitly marked clobbered. Given that, assume there - is something out there clobbering %ebp and %edi, too. */ - asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0" - : "=d" (err), "+b" (next), "=a" (id), "+c" (size), - "=D" (edi), "+m" (buf) - : "D" (&buf), "d" (SMAP), "a" (0xe820) - : "esi"); + intcall(0x15, &ireg, &oreg); + ireg.ebx = oreg.ebx; /* for next iteration... */ /* BIOSes which terminate the chain with CF = 1 as opposed to %ebx = 0 don't always report the SMAP signature on the final, failing, probe. */ - if (err) + if (oreg.eflags & X86_EFLAGS_CF) break; /* Some BIOSes stop returning SMAP in the middle of @@ -61,7 +56,7 @@ static int detect_memory_e820(void) screwed up the map at that point, we might have a partial map, the full map, or complete garbage, so just return failure. */ - if (id != SMAP) { + if (oreg.eax != SMAP) { count = 0; break; } @@ -69,58 +64,62 @@ static int detect_memory_e820(void) /* ACPI 3.0 added the extended flags support. If bit 0 in the extended flags is zero, we're supposed to simply ignore the entry -- a backwards incompatible change! */ - if (size > 20 && !(buf.ext_flags & 1)) + if (oreg.cx > 20 && !(buf.ext_flags & 1)) continue; *desc++ = buf.std; count++; - } while (next && count < ARRAY_SIZE(boot_params.e820_map)); + } while (ireg.ebx && count < ARRAY_SIZE(boot_params.e820_map)); return boot_params.e820_entries = count; } static int detect_memory_e801(void) { - u16 ax, bx, cx, dx; - u8 err; + struct biosregs ireg, oreg; - bx = cx = dx = 0; - ax = 0xe801; - asm("stc; int $0x15; setc %0" - : "=m" (err), "+a" (ax), "+b" (bx), "+c" (cx), "+d" (dx)); + initregs(&ireg); + ireg.ax = 0xe801; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* Do we really need to do this? */ - if (cx || dx) { - ax = cx; - bx = dx; + if (oreg.cx || oreg.dx) { + oreg.ax = oreg.cx; + oreg.bx = oreg.dx; } - if (ax > 15*1024) + if (oreg.ax > 15*1024) { return -1; /* Bogus! */ - - /* This ignores memory above 16MB if we have a memory hole - there. If someone actually finds a machine with a memory - hole at 16MB and no support for 0E820h they should probably - generate a fake e820 map. */ - boot_params.alt_mem_k = (ax == 15*1024) ? (dx << 6)+ax : ax; + } else if (oreg.ax == 15*1024) { + boot_params.alt_mem_k = (oreg.dx << 6) + oreg.ax; + } else { + /* + * This ignores memory above 16MB if we have a memory + * hole there. If someone actually finds a machine + * with a memory hole at 16MB and no support for + * 0E820h they should probably generate a fake e820 + * map. + */ + boot_params.alt_mem_k = oreg.ax; + } return 0; } static int detect_memory_88(void) { - u16 ax; - u8 err; + struct biosregs ireg, oreg; - ax = 0x8800; - asm("stc; int $0x15; setc %0" : "=bcdm" (err), "+a" (ax)); + initregs(&ireg); + ireg.ah = 0x88; + intcall(0x15, &ireg, &oreg); - boot_params.screen_info.ext_mem_k = ax; + boot_params.screen_info.ext_mem_k = oreg.ax; - return -err; + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } int detect_memory(void) diff --git a/arch/x86/boot/tty.c b/arch/x86/boot/tty.c index 7e8e8b25f5f..01ec69c901c 100644 --- a/arch/x86/boot/tty.c +++ b/arch/x86/boot/tty.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,24 +23,23 @@ void __attribute__((section(".inittext"))) putchar(int ch) { - unsigned char c = ch; + struct biosregs ireg; - if (c == '\n') + if (ch == '\n') putchar('\r'); /* \n -> \r\n */ - /* int $0x10 is known to have bugs involving touching registers - it shouldn't. Be extra conservative... */ - asm volatile("pushal; pushw %%ds; int $0x10; popw %%ds; popal" - : : "b" (0x0007), "c" (0x0001), "a" (0x0e00|ch)); + initregs(&ireg); + ireg.bx = 0x0007; + ireg.cx = 0x0001; + ireg.ah = 0x0e; + ireg.al = ch; + intcall(0x10, &ireg, NULL); } void __attribute__((section(".inittext"))) puts(const char *str) { - int n = 0; - while (*str) { + while (*str) putchar(*str++); - n++; - } } /* @@ -49,14 +49,13 @@ void __attribute__((section(".inittext"))) puts(const char *str) static u8 gettime(void) { - u16 ax = 0x0200; - u16 cx, dx; + struct biosregs ireg, oreg; - asm volatile("int $0x1a" - : "+a" (ax), "=c" (cx), "=d" (dx) - : : "ebx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; + intcall(0x1a, &ireg, &oreg); - return dx >> 8; + return oreg.dh; } /* @@ -64,19 +63,24 @@ static u8 gettime(void) */ int getchar(void) { - u16 ax = 0; - asm volatile("int $0x16" : "+a" (ax)); + struct biosregs ireg, oreg; - return ax & 0xff; + initregs(&ireg); + /* ireg.ah = 0x00; */ + intcall(0x16, &ireg, &oreg); + + return oreg.al; } static int kbd_pending(void) { - u8 pending; - asm volatile("int $0x16; setnz %0" - : "=qm" (pending) - : "a" (0x0100)); - return pending; + struct biosregs ireg, oreg; + + initregs(&ireg); + ireg.ah = 0x01; + intcall(0x16, &ireg, &oreg); + + return !(oreg.eflags & X86_EFLAGS_ZF); } void kbd_flush(void) From d54ea252e4c92357226992cf65d94616a96e6fce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:14:26 -0700 Subject: [PATCH 0328/2061] x86, setup: "glove box" BIOS interrupts in the APM code Impact: BIOS proofing "Glove box" off BIOS interrupts in the APM code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Stephen Rothwell --- arch/x86/boot/apm.c | 72 +++++++++++++++++---------------------------- 1 file changed, 27 insertions(+), 45 deletions(-) diff --git a/arch/x86/boot/apm.c b/arch/x86/boot/apm.c index 7aa6033001f..ee274834ea8 100644 --- a/arch/x86/boot/apm.c +++ b/arch/x86/boot/apm.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * Original APM BIOS checking by Stephen Rothwell, May 1994 * (sfr@canb.auug.org.au) @@ -19,75 +20,56 @@ int query_apm_bios(void) { - u16 ax, bx, cx, dx, di; - u32 ebx, esi; - u8 err; + struct biosregs ireg, oreg; /* APM BIOS installation check */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x53; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* No APM BIOS */ - if (bx != 0x504d) /* "PM" signature */ + if (oreg.bx != 0x504d) /* "PM" signature */ return -1; - if (!(cx & 0x02)) /* 32 bits supported? */ + if (!(oreg.cx & 0x02)) /* 32 bits supported? */ return -1; /* Disconnect first, just in case */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - /* Paranoia */ - ebx = esi = 0; - cx = dx = di = 0; + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); /* 32-bit connect */ - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %6" - : "=a" (ax), "+b" (ebx), "+c" (cx), "+d" (dx), - "+S" (esi), "+D" (di), "=m" (err) - : "a" (0x5303)); + ireg.al = 0x03; + intcall(0x15, &ireg, &oreg); - boot_params.apm_bios_info.cseg = ax; - boot_params.apm_bios_info.offset = ebx; - boot_params.apm_bios_info.cseg_16 = cx; - boot_params.apm_bios_info.dseg = dx; - boot_params.apm_bios_info.cseg_len = (u16)esi; - boot_params.apm_bios_info.cseg_16_len = esi >> 16; - boot_params.apm_bios_info.dseg_len = di; + boot_params.apm_bios_info.cseg = oreg.ax; + boot_params.apm_bios_info.offset = oreg.ebx; + boot_params.apm_bios_info.cseg_16 = oreg.cx; + boot_params.apm_bios_info.dseg = oreg.dx; + boot_params.apm_bios_info.cseg_len = oreg.si; + boot_params.apm_bios_info.cseg_16_len = oreg.hsi; + boot_params.apm_bios_info.dseg_len = oreg.di; - if (err) + if (oreg.flags & X86_EFLAGS_CF) return -1; /* Redo the installation check as the 32-bit connect; some BIOSes return different flags this way... */ - ax = 0x5300; - bx = cx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp ; setc %0" - : "=d" (err), "+a" (ax), "+b" (bx), "+c" (cx) - : : "esi", "edi"); + ireg.al = 0x00; + intcall(0x15, &ireg, &oreg); - if (err || bx != 0x504d) { + if ((oreg.eflags & X86_EFLAGS_CF) || oreg.bx != 0x504d) { /* Failure with 32-bit connect, try to disconect and ignore */ - ax = 0x5304; - bx = 0; - asm volatile("pushl %%ebp ; int $0x15 ; popl %%ebp" - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); + ireg.al = 0x04; + intcall(0x15, &ireg, NULL); return -1; } - boot_params.apm_bios_info.version = ax; - boot_params.apm_bios_info.flags = cx; + boot_params.apm_bios_info.version = oreg.ax; + boot_params.apm_bios_info.flags = oreg.cx; return 0; } From 3435d3476c5ed955d56a6216ed2d156847b3a575 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:17:17 -0700 Subject: [PATCH 0329/2061] x86, setup: "glove box" BIOS interrupts in the EDD code Impact: BIOS proofing "Glove box" off BIOS interrupts in the EDD code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin --- arch/x86/boot/edd.c | 69 ++++++++++++++++++++------------------------- 1 file changed, 30 insertions(+), 39 deletions(-) diff --git a/arch/x86/boot/edd.c b/arch/x86/boot/edd.c index 1aae8f3e5ca..c501a5b466f 100644 --- a/arch/x86/boot/edd.c +++ b/arch/x86/boot/edd.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -22,17 +23,17 @@ */ static int read_mbr(u8 devno, void *buf) { - u16 ax, bx, cx, dx; + struct biosregs ireg, oreg; - ax = 0x0201; /* Legacy Read, one sector */ - cx = 0x0001; /* Sector 0-0-1 */ - dx = devno; - bx = (size_t)buf; - asm volatile("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+c" (cx), "+d" (dx), "+b" (bx) - : : "esi", "edi", "memory"); + initregs(&ireg); + ireg.ax = 0x0201; /* Legacy Read, one sector */ + ireg.cx = 0x0001; /* Sector 0-0-1 */ + ireg.dl = devno; + ireg.bx = (size_t)buf; - return -(u8)ax; /* 0 or -1 */ + intcall(0x13, &ireg, &oreg); + + return -(oreg.eflags & X86_EFLAGS_CF); /* 0 or -1 */ } static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) @@ -72,56 +73,46 @@ static u32 read_mbr_sig(u8 devno, struct edd_info *ei, u32 *mbrsig) static int get_edd_info(u8 devno, struct edd_info *ei) { - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; memset(ei, 0, sizeof *ei); /* Check Extensions Present */ - ax = 0x4100; - bx = EDDMAGIC1; - dx = devno; - asm("pushfl; stc; int $0x13; setc %%al; popfl" - : "+a" (ax), "+b" (bx), "=c" (cx), "+d" (dx) - : : "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x41; + ireg.bx = EDDMAGIC1; + ireg.dl = devno; + intcall(0x13, &ireg, &oreg); - if ((u8)ax) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No extended information */ - if (bx != EDDMAGIC2) + if (oreg.bx != EDDMAGIC2) return -1; ei->device = devno; - ei->version = ax >> 8; /* EDD version number */ - ei->interface_support = cx; /* EDD functionality subsets */ + ei->version = oreg.ah; /* EDD version number */ + ei->interface_support = oreg.cx; /* EDD functionality subsets */ /* Extended Get Device Parameters */ ei->params.length = sizeof(ei->params); - ax = 0x4800; - dx = devno; - asm("pushfl; int $0x13; popfl" - : "+a" (ax), "+d" (dx), "=m" (ei->params) - : "S" (&ei->params) - : "ebx", "ecx", "edi"); + ireg.ah = 0x48; + ireg.si = (size_t)&ei->params; + intcall(0x13, &ireg, &oreg); /* Get legacy CHS parameters */ /* Ralf Brown recommends setting ES:DI to 0:0 */ - ax = 0x0800; - dx = devno; - di = 0; - asm("pushw %%es; " - "movw %%di,%%es; " - "pushfl; stc; int $0x13; setc %%al; popfl; " - "popw %%es" - : "+a" (ax), "=b" (bx), "=c" (cx), "+d" (dx), "+D" (di) - : : "esi"); + ireg.ah = 0x08; + ireg.es = 0; + intcall(0x13, &ireg, &oreg); - if ((u8)ax == 0) { - ei->legacy_max_cylinder = (cx >> 8) + ((cx & 0xc0) << 2); - ei->legacy_max_head = dx >> 8; - ei->legacy_sectors_per_track = cx & 0x3f; + if (!(oreg.eflags & X86_EFLAGS_CF)) { + ei->legacy_max_cylinder = oreg.ch + ((oreg.cl & 0xc0) << 2); + ei->legacy_max_head = oreg.dh; + ei->legacy_sectors_per_track = oreg.cl & 0x3f; } return 0; From 0a706db320768f8f6e43bbf73b58d2aabdc93354 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:19:00 -0700 Subject: [PATCH 0330/2061] x86, setup: "glove box" BIOS interrupts in the MCA code Impact: BIOS proofing "Glove box" off BIOS interrupts in the MCA code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: James Bottomley --- arch/x86/boot/mca.c | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/arch/x86/boot/mca.c b/arch/x86/boot/mca.c index 911eaae5d69..a95a531148e 100644 --- a/arch/x86/boot/mca.c +++ b/arch/x86/boot/mca.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -16,26 +17,22 @@ int query_mca(void) { - u8 err; - u16 es, bx, len; + struct biosregs ireg, oreg; + u16 len; - asm("pushw %%es ; " - "int $0x15 ; " - "setc %0 ; " - "movw %%es, %1 ; " - "popw %%es" - : "=acd" (err), "=acdSD" (es), "=b" (bx) - : "a" (0xc000)); + initregs(&ireg); + ireg.ah = 0xc0; + intcall(0x15, &ireg, &oreg); - if (err) + if (oreg.eflags & X86_EFLAGS_CF) return -1; /* No MCA present */ - set_fs(es); - len = rdfs16(bx); + set_fs(oreg.es); + len = rdfs16(oreg.bx); if (len > sizeof(boot_params.sys_desc_table)) len = sizeof(boot_params.sys_desc_table); - copy_from_fs(&boot_params.sys_desc_table, bx, len); + copy_from_fs(&boot_params.sys_desc_table, oreg.bx, len); return 0; } From cf06de7b9cdd3efee7a59dced1977b3c21d43732 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 1 Apr 2009 18:20:11 -0700 Subject: [PATCH 0331/2061] x86, setup: "glove box" BIOS interrupts in the video code Impact: BIOS proofing "Glove box" off BIOS interrupts in the video code. LKML-Reference: <49DE7F79.4030106@zytor.com> Signed-off-by: H. Peter Anvin Cc: Pavel Machek Cc: Rafael J. Wysocki --- arch/x86/boot/video-bios.c | 27 ++++---- arch/x86/boot/video-vesa.c | 137 ++++++++++++++++--------------------- arch/x86/boot/video-vga.c | 95 +++++++++++++++---------- arch/x86/boot/video.c | 42 +++++------- arch/x86/boot/video.h | 14 ---- 5 files changed, 151 insertions(+), 164 deletions(-) diff --git a/arch/x86/boot/video-bios.c b/arch/x86/boot/video-bios.c index 3fa979c9c36..d660be49236 100644 --- a/arch/x86/boot/video-bios.c +++ b/arch/x86/boot/video-bios.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -29,21 +30,21 @@ static int bios_set_mode(struct mode_info *mi) static int set_bios_mode(u8 mode) { - u16 ax; + struct biosregs ireg, oreg; u8 new_mode; - ax = mode; /* AH=0x00 Set Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.al = mode; /* AH=0x00 Set Video Mode */ + intcall(0x10, &ireg, NULL); - ax = 0x0f00; /* Get Current Video Mode */ - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + + ireg.ah = 0x0f; /* Get Current Video Mode */ + intcall(0x10, &ireg, &oreg); do_restore = 1; /* Assume video contents were lost */ - new_mode = ax & 0x7f; /* Not all BIOSes are clean with the top bit */ + + /* Not all BIOSes are clean with the top bit */ + new_mode = ireg.al & 0x7f; if (new_mode == mode) return 0; /* Mode change OK */ @@ -53,10 +54,8 @@ static int set_bios_mode(u8 mode) /* Mode setting failed, but we didn't end up where we started. That's bad. Try to revert to the original video mode. */ - ax = boot_params.screen_info.orig_video_mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = boot_params.screen_info.orig_video_mode; + intcall(0x10, &ireg, NULL); } #endif return -1; diff --git a/arch/x86/boot/video-vesa.c b/arch/x86/boot/video-vesa.c index 4a58c8ce3f6..c700147d6ff 100644 --- a/arch/x86/boot/video-vesa.c +++ b/arch/x86/boot/video-vesa.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -31,7 +32,7 @@ static inline void vesa_store_mode_params_graphics(void) {} static int vesa_probe(void) { #if defined(CONFIG_VIDEO_VESA) || defined(CONFIG_FIRMWARE_EDID) - u16 ax, cx, di; + struct biosregs ireg, oreg; u16 mode; addr_t mode_ptr; struct mode_info *mi; @@ -39,13 +40,12 @@ static int vesa_probe(void) video_vesa.modes = GET_HEAP(struct mode_info, 0); - ax = 0x4f00; - di = (size_t)&vginfo; - asm(INT10 - : "+a" (ax), "+D" (di), "=m" (vginfo) - : : "ebx", "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f00; + ireg.di = (size_t)&vginfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f || + if (ireg.ax != 0x004f || vginfo.signature != VESA_MAGIC || vginfo.version < 0x0102) return 0; /* Not present */ @@ -65,14 +65,12 @@ static int vesa_probe(void) memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + ireg.ax = 0x4f01; + ireg.cx = mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (ireg.ax != 0x004f) continue; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -111,20 +109,19 @@ static int vesa_probe(void) static int vesa_set_mode(struct mode_info *mode) { - u16 ax, bx, cx, di; + struct biosregs ireg, oreg; int is_graphic; u16 vesa_mode = mode->mode - VIDEO_FIRST_VESA; memset(&vminfo, 0, sizeof vminfo); /* Just in case... */ - ax = 0x4f01; - cx = vesa_mode; - di = (size_t)&vminfo; - asm(INT10 - : "+a" (ax), "+c" (cx), "+D" (di), "=m" (vminfo) - : : "ebx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f01; + ireg.cx = vesa_mode; + ireg.di = (size_t)&vminfo; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; if ((vminfo.mode_attr & 0x15) == 0x05) { @@ -141,14 +138,12 @@ static int vesa_set_mode(struct mode_info *mode) } - ax = 0x4f02; - bx = vesa_mode; - di = 0; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "edx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f02; + ireg.bx = vesa_mode; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return -1; graphic_mode = is_graphic; @@ -171,50 +166,45 @@ static int vesa_set_mode(struct mode_info *mode) /* Switch DAC to 8-bit mode */ static void vesa_dac_set_8bits(void) { + struct biosregs ireg, oreg; u8 dac_size = 6; /* If possible, switch the DAC to 8-bit mode */ if (vginfo.capabilities & 1) { - u16 ax, bx; - - ax = 0x4f08; - bx = 0x0800; - asm volatile(INT10 - : "+a" (ax), "+b" (bx) - : : "ecx", "edx", "esi", "edi"); - - if (ax == 0x004f) - dac_size = bx >> 8; + initregs(&ireg); + ireg.ax = 0x4f08; + ireg.bh = 0x08; + intcall(0x10, &ireg, &oreg); + if (oreg.ax == 0x004f) + dac_size = oreg.bh; } /* Set the color sizes to the DAC size, and offsets to 0 */ - boot_params.screen_info.red_size = dac_size; + boot_params.screen_info.red_size = dac_size; boot_params.screen_info.green_size = dac_size; - boot_params.screen_info.blue_size = dac_size; - boot_params.screen_info.rsvd_size = dac_size; + boot_params.screen_info.blue_size = dac_size; + boot_params.screen_info.rsvd_size = dac_size; - boot_params.screen_info.red_pos = 0; - boot_params.screen_info.green_pos = 0; - boot_params.screen_info.blue_pos = 0; - boot_params.screen_info.rsvd_pos = 0; + boot_params.screen_info.red_pos = 0; + boot_params.screen_info.green_pos = 0; + boot_params.screen_info.blue_pos = 0; + boot_params.screen_info.rsvd_pos = 0; } /* Save the VESA protected mode info */ static void vesa_store_pm_info(void) { - u16 ax, bx, di, es; + struct biosregs ireg, oreg; - ax = 0x4f0a; - bx = di = 0; - asm("pushw %%es; "INT10"; movw %%es,%0; popw %%es" - : "=d" (es), "+a" (ax), "+b" (bx), "+D" (di) - : : "ecx", "esi"); + initregs(&ireg); + ireg.ax = 0x4f0a; + intcall(0x10, &ireg, &oreg); - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; - boot_params.screen_info.vesapm_seg = es; - boot_params.screen_info.vesapm_off = di; + boot_params.screen_info.vesapm_seg = oreg.es; + boot_params.screen_info.vesapm_off = oreg.di; } /* @@ -252,7 +242,7 @@ static void vesa_store_mode_params_graphics(void) void vesa_store_edid(void) { #ifdef CONFIG_FIRMWARE_EDID - u16 ax, bx, cx, dx, di; + struct biosregs ireg, oreg; /* Apparently used as a nonsense token... */ memset(&boot_params.edid_info, 0x13, sizeof boot_params.edid_info); @@ -260,33 +250,26 @@ void vesa_store_edid(void) if (vginfo.version < 0x0200) return; /* EDID requires VBE 2.0+ */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0000; /* Report DDC capabilities */ - cx = 0; /* Controller 0 */ - di = 0; /* ES:DI must be 0 by spec */ + initregs(&ireg); + ireg.ax = 0x4f15; /* VBE DDC */ + /* ireg.bx = 0x0000; */ /* Report DDC capabilities */ + /* ireg.cx = 0; */ /* Controller 0 */ + ireg.es = 0; /* ES:DI must be 0 by spec */ + intcall(0x10, &ireg, &oreg); - /* Note: The VBE DDC spec is different from the main VESA spec; - we genuinely have to assume all registers are destroyed here. */ - - asm("pushw %%es; movw %2,%%es; "INT10"; popw %%es" - : "+a" (ax), "+b" (bx), "+c" (cx), "+D" (di) - : : "esi", "edx"); - - if (ax != 0x004f) + if (oreg.ax != 0x004f) return; /* No EDID */ /* BH = time in seconds to transfer EDD information */ /* BL = DDC level supported */ - ax = 0x4f15; /* VBE DDC */ - bx = 0x0001; /* Read EDID */ - cx = 0; /* Controller 0 */ - dx = 0; /* EDID block number */ - di =(size_t) &boot_params.edid_info; /* (ES:)Pointer to block */ - asm(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx), "=m" (boot_params.edid_info), - "+c" (cx), "+D" (di) - : : "esi"); + ireg.ax = 0x4f15; /* VBE DDC */ + ireg.bx = 0x0001; /* Read EDID */ + /* ireg.cx = 0; */ /* Controller 0 */ + /* ireg.dx = 0; */ /* EDID block number */ + ireg.es = ds(); + ireg.di =(size_t)&boot_params.edid_info; /* (ES:)Pointer to block */ + intcall(0x10, &ireg, &oreg); #endif /* CONFIG_FIRMWARE_EDID */ } diff --git a/arch/x86/boot/video-vga.c b/arch/x86/boot/video-vga.c index 9e0587a3776..8f8d827e254 100644 --- a/arch/x86/boot/video-vga.c +++ b/arch/x86/boot/video-vga.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -39,30 +40,30 @@ static __videocard video_vga; /* Set basic 80x25 mode */ static u8 vga_set_basic_mode(void) { + struct biosregs ireg, oreg; u16 ax; u8 rows; u8 mode; + initregs(&ireg); + #ifdef CONFIG_VIDEO_400_HACK if (adapter >= ADAPTER_VGA) { - asm volatile(INT10 - : : "a" (0x1202), "b" (0x0030) - : "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1202; + ireg.bx = 0x0030; + intcall(0x10, &ireg, NULL); } #endif ax = 0x0f00; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); - - mode = (u8)ax; + intcall(0x10, &ireg, &oreg); + mode = oreg.al; set_fs(0); rows = rdfs8(0x484); /* rows minus one */ #ifndef CONFIG_VIDEO_400_HACK - if ((ax == 0x5003 || ax == 0x5007) && + if ((oreg.ax == 0x5003 || oreg.ax == 0x5007) && (rows == 0 || rows == 24)) return mode; #endif @@ -71,10 +72,8 @@ static u8 vga_set_basic_mode(void) mode = 3; /* Set the mode */ - ax = mode; - asm volatile(INT10 - : "+a" (ax) - : : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = mode; /* AH=0: set mode */ + intcall(0x10, &ireg, NULL); do_restore = 1; return mode; } @@ -82,43 +81,69 @@ static u8 vga_set_basic_mode(void) static void vga_set_8font(void) { /* Set 8x8 font - 80x43 on EGA, 80x50 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 8x8 font */ - asm volatile(INT10 : : "a" (0x1112), "b" (0)); + ireg.ax = 0x1112; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Use alternate print screen */ - asm volatile(INT10 : : "a" (0x1200), "b" (0x20)); + ireg.ax = 0x1200; + ireg.bl = 0x20; + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 6-7 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0607)); + ireg.ax = 0x0100; + ireg.cx = 0x0607; + intcall(0x10, &ireg, NULL); } static void vga_set_14font(void) { /* Set 9x14 font - 80x28 on VGA */ + struct biosregs ireg; + + initregs(&ireg); /* Set 9x14 font */ - asm volatile(INT10 : : "a" (0x1111), "b" (0)); + ireg.ax = 0x1111; + /* ireg.bl = 0; */ + intcall(0x10, &ireg, NULL); /* Turn off cursor emulation */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x34)); + ireg.ax = 0x1201; + ireg.bl = 0x34; + intcall(0x10, &ireg, NULL); /* Cursor is scan lines 11-12 */ - asm volatile(INT10 : : "a" (0x0100), "c" (0x0b0c)); + ireg.ax = 0x0100; + ireg.cx = 0x0b0c; + intcall(0x10, &ireg, NULL); } static void vga_set_80x43(void) { /* Set 80x43 mode on VGA (not EGA) */ + struct biosregs ireg; + + initregs(&ireg); /* Set 350 scans */ - asm volatile(INT10 : : "a" (0x1201), "b" (0x30)); + ireg.ax = 0x1201; + ireg.bl = 0x30; + intcall(0x10, &ireg, NULL); /* Reset video mode */ - asm volatile(INT10 : : "a" (0x0003)); + ireg.ax = 0x0003; + intcall(0x10, &ireg, NULL); vga_set_8font(); } @@ -225,8 +250,6 @@ static int vga_set_mode(struct mode_info *mode) */ static int vga_probe(void) { - u16 ega_bx; - static const char *card_name[] = { "CGA/MDA/HGC", "EGA", "VGA" }; @@ -240,26 +263,26 @@ static int vga_probe(void) sizeof(ega_modes)/sizeof(struct mode_info), sizeof(vga_modes)/sizeof(struct mode_info), }; - u8 vga_flag; - asm(INT10 - : "=b" (ega_bx) - : "a" (0x1200), "b" (0x10) /* Check EGA/VGA */ - : "ecx", "edx", "esi", "edi"); + struct biosregs ireg, oreg; + + initregs(&ireg); + + ireg.ax = 0x1200; + ireg.bl = 0x10; /* Check EGA/VGA */ + intcall(0x10, &ireg, &oreg); #ifndef _WAKEUP - boot_params.screen_info.orig_video_ega_bx = ega_bx; + boot_params.screen_info.orig_video_ega_bx = oreg.bx; #endif /* If we have MDA/CGA/HGC then BL will be unchanged at 0x10 */ - if ((u8)ega_bx != 0x10) { + if (oreg.bl != 0x10) { /* EGA/VGA */ - asm(INT10 - : "=a" (vga_flag) - : "a" (0x1a00) - : "ebx", "ecx", "edx", "esi", "edi"); + ireg.ax = 0x1a00; + intcall(0x10, &ireg, &oreg); - if (vga_flag == 0x1a) { + if (oreg.al == 0x1a) { adapter = ADAPTER_VGA; #ifndef _WAKEUP boot_params.screen_info.orig_video_isVGA = 1; diff --git a/arch/x86/boot/video.c b/arch/x86/boot/video.c index 3bef2c1febe..bad728b76fc 100644 --- a/arch/x86/boot/video.c +++ b/arch/x86/boot/video.c @@ -2,6 +2,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * Copyright 2007 rPath, Inc. - All Rights Reserved + * Copyright 2009 Intel Corporation; author H. Peter Anvin * * This file is part of the Linux kernel, and is made available under * the terms of the GNU General Public License version 2. @@ -18,33 +19,29 @@ static void store_cursor_position(void) { - u16 curpos; - u16 ax, bx; + struct biosregs ireg, oreg; - ax = 0x0300; - bx = 0; - asm(INT10 - : "=d" (curpos), "+a" (ax), "+b" (bx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x03; + intcall(0x10, &ireg, &oreg); - boot_params.screen_info.orig_x = curpos; - boot_params.screen_info.orig_y = curpos >> 8; + boot_params.screen_info.orig_x = oreg.dl; + boot_params.screen_info.orig_y = oreg.dh; } static void store_video_mode(void) { - u16 ax, page; + struct biosregs ireg, oreg; /* N.B.: the saving of the video page here is a bit silly, since we pretty much assume page 0 everywhere. */ - ax = 0x0f00; - asm(INT10 - : "+a" (ax), "=b" (page) - : : "ecx", "edx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x0f; + intcall(0x10, &ireg, &oreg); /* Not all BIOSes are clean with respect to the top bit */ - boot_params.screen_info.orig_video_mode = ax & 0x7f; - boot_params.screen_info.orig_video_page = page >> 8; + boot_params.screen_info.orig_video_mode = oreg.al & 0x7f; + boot_params.screen_info.orig_video_page = oreg.bh; } /* @@ -257,7 +254,7 @@ static void restore_screen(void) int y; addr_t dst = 0; u16 *src = saved.data; - u16 ax, bx, dx; + struct biosregs ireg; if (graphic_mode) return; /* Can't restore onto a graphic mode */ @@ -296,12 +293,11 @@ static void restore_screen(void) } /* Restore cursor position */ - ax = 0x0200; /* Set cursor position */ - bx = 0; /* Page number (<< 8) */ - dx = (saved.cury << 8)+saved.curx; - asm volatile(INT10 - : "+a" (ax), "+b" (bx), "+d" (dx) - : : "ecx", "esi", "edi"); + initregs(&ireg); + ireg.ah = 0x02; /* Set cursor position */ + ireg.dh = saved.cury; + ireg.dl = saved.curx; + intcall(0x10, &ireg, NULL); } #else #define save_screen() ((void)0) diff --git a/arch/x86/boot/video.h b/arch/x86/boot/video.h index ee63f5d1446..5bb174a997f 100644 --- a/arch/x86/boot/video.h +++ b/arch/x86/boot/video.h @@ -112,20 +112,6 @@ extern int force_x, force_y; /* Don't query the BIOS for cols/rows */ extern int do_restore; /* Restore screen contents */ extern int graphic_mode; /* Graphics mode with linear frame buffer */ -/* - * int $0x10 is notorious for touching registers it shouldn't. - * gcc doesn't like %ebp being clobbered, so define it as a push/pop - * sequence here. - * - * A number of systems, including the original PC can clobber %bp in - * certain circumstances, like when scrolling. There exists at least - * one Trident video card which could clobber DS under a set of - * circumstances that we are unlikely to encounter (scrolling when - * using an extended graphics mode of more than 800x600 pixels), but - * it's cheap insurance to deal with that here. - */ -#define INT10 "pushl %%ebp; pushw %%ds; int $0x10; popw %%ds; popl %%ebp" - /* Accessing VGA indexed registers */ static inline u8 in_idx(u16 port, u8 index) { From 2062501ae6505dbc5bff3a792246c2661d114050 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 6 Apr 2009 01:49:33 +0200 Subject: [PATCH 0332/2061] tracing/lockdep: report the time waited for a lock While trying to optimize the new lock on reiserfs to replace the bkl, I find the lock tracing very useful though it lacks something important for performance (and latency) instrumentation: the time a task waits for a lock. That's what this patch implements: bash-4816 [000] 202.652815: lock_contended: lock_contended: &sb->s_type->i_mutex_key bash-4816 [000] 202.652819: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652825: lock_acquired: &rq->lock (0.000 us) <...>-4787 [000] 202.652829: lock_acquired: &rq->lock (0.000 us) bash-4816 [000] 202.652833: lock_acquired: &sb->s_type->i_mutex_key (16.005 us) As shown above, the "lock acquired" field is followed by the time it has been waiting for the lock. Usually, a lock contended entry is followed by a near lock_acquired entry with a non-zero time waited. Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Steven Rostedt LKML-Reference: <1238975373-15739-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/trace/lockdep_event_types.h | 23 ++++++++++++++++++----- kernel/lockdep.c | 8 ++++---- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h index adccfcd2ec8..863f1e4583a 100644 --- a/include/trace/lockdep_event_types.h +++ b/include/trace/lockdep_event_types.h @@ -32,11 +32,24 @@ TRACE_FORMAT(lock_contended, TP_FMT("%s", lock->name) ); -TRACE_FORMAT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif #endif diff --git a/kernel/lockdep.c b/kernel/lockdep.c index b0f01186696..c4582a6ea95 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -3061,6 +3061,8 @@ found_it: put_lock_stats(stats); } +DEFINE_TRACE(lock_acquired); + static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3099,6 +3101,8 @@ found_it: hlock->holdtime_stamp = now; } + trace_lock_acquired(lock, ip, waittime); + stats = get_lock_stats(hlock_class(hlock)); if (waittime) { if (hlock->read) @@ -3137,14 +3141,10 @@ void lock_contended(struct lockdep_map *lock, unsigned long ip) } EXPORT_SYMBOL_GPL(lock_contended); -DEFINE_TRACE(lock_acquired); - void lock_acquired(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; - trace_lock_acquired(lock, ip); - if (unlikely(!lock_stat)) return; From e71e99c294058a61b7a8b9bb6da2f745ac51aa4f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 25 Mar 2009 14:30:04 -0400 Subject: [PATCH 0333/2061] x86, function-graph: only save return values on x86_64 Impact: speed up The return to handler portion of the function graph tracer should only need to save the return values. The caller already saved off the registers that the callee can modify. The returning function already saved the registers it modified. When we call our own trace function it too will save the registers that the callee must restore. There's no reason to save off anything more that the registers used to return the values. Note, I did a complete kernel build with this modification and the function graph tracer running on x86_64. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- arch/x86/kernel/entry_64.S | 19 +++---------------- 1 file changed, 3 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index a331ec38af9..1ac99865591 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -147,27 +147,14 @@ END(ftrace_graph_caller) GLOBAL(return_to_handler) subq $80, %rsp + /* Save the return values */ movq %rax, (%rsp) - movq %rcx, 8(%rsp) - movq %rdx, 16(%rsp) - movq %rsi, 24(%rsp) - movq %rdi, 32(%rsp) - movq %r8, 40(%rsp) - movq %r9, 48(%rsp) - movq %r10, 56(%rsp) - movq %r11, 64(%rsp) + movq %rdx, 8(%rsp) call ftrace_return_to_handler movq %rax, 72(%rsp) - movq 64(%rsp), %r11 - movq 56(%rsp), %r10 - movq 48(%rsp), %r9 - movq 40(%rsp), %r8 - movq 32(%rsp), %rdi - movq 24(%rsp), %rsi - movq 16(%rsp), %rdx - movq 8(%rsp), %rcx + movq 8(%rsp), %rdx movq (%rsp), %rax addq $72, %rsp retq From 5cb3d1d9d34ac04bcaa2034139345b2a5fea54c1 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Thu, 9 Apr 2009 14:08:18 +0800 Subject: [PATCH 0334/2061] tracing, net, skb tracepoint: make skb tracepoint use the TRACE_EVENT() macro TRACE_EVENT is a more generic way to define a tracepoint. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Neil Horman Cc: "David S. Miller" Cc: Arnaldo Carvalho de Melo Cc: "Steven Rostedt ;" Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DD90D2.5020604@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/skb.h | 4 +--- include/trace/skb_event_types.h | 38 +++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 include/trace/skb_event_types.h diff --git a/include/trace/skb.h b/include/trace/skb.h index b66206d9be7..d2de7174a6e 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -4,8 +4,6 @@ #include #include -DECLARE_TRACE(kfree_skb, - TP_PROTO(struct sk_buff *skb, void *location), - TP_ARGS(skb, location)); +#include #endif diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h new file mode 100644 index 00000000000..4a1c504c0e1 --- /dev/null +++ b/include/trace/skb_event_types.h @@ -0,0 +1,38 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb + +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index df56f5694be..33b6bfcba93 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -3,3 +3,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index fd13750ca4b..0e2aa80076d 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -3,3 +3,4 @@ #include #include #include +#include From bda869c614c937c318547c3ee1d65a316b693c21 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:05:10 +0200 Subject: [PATCH 0335/2061] x86: cacheinfo: use L3 cache index disable feature only for CPUs that support it AMD family 0x11 CPU doesn't support the feature. Some AMD family 0x10 CPUs do not support it or have an erratum, see erratum #382 in "Revision Guide for AMD Family 10h Processors, 41322 Rev. 3.40 February 2009". Signed-off-by: Andreas Herrmann CC: Mark Langsdorf Cc: Andrew Morton LKML-Reference: <20090409130510.GG31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 483eda96e10..72401264912 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -291,6 +291,14 @@ amd_check_l3_disable(int index, struct _cpuid4_info_regs *this_leaf) { if (index < 3) return; + + if (boot_cpu_data.x86 == 0x11) + return; + + /* see erratum #382 */ + if ((boot_cpu_data.x86 == 0x10) && (boot_cpu_data.x86_model < 0x8)) + return; + this_leaf->can_disable = 1; } From 845d8c761ec763871936c62b837c4a9ea6d0fbdb Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:07:29 +0200 Subject: [PATCH 0336/2061] x86: cacheinfo: correct return value when cache_disable feature is not active Impact: bug fix If user writes to "cache_disable" attribute on a CPU that does not support this feature, the process hangs due to an invalid return value in store_cache_disable(). Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409130729.GH31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 72401264912..1ab46e05adf 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -771,7 +771,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, unsigned int ret, index, val; if (!this_leaf->can_disable) - return 0; + return -EINVAL; if (strlen(buf) > 15) return -EINVAL; From afd9fceec55225d33be878927056a548c2eef26c Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:16:17 +0200 Subject: [PATCH 0337/2061] x86: cacheinfo: use cached K8 NB_MISC devices instead of scanning for it Impact: avoid code duplication Signed-off-by: Andreas Herrmann Cc: Andrew Morton Cc: Mark Langsdorf LKML-Reference: <20090409131617.GI31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 8 ++++++ arch/x86/kernel/cpu/intel_cacheinfo.c | 37 +++------------------------ 2 files changed, 11 insertions(+), 34 deletions(-) diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index 54c8cc53b24..c23b3d171be 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -12,4 +12,12 @@ extern int cache_k8_northbridges(void); extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); +#ifdef CONFIG_K8_NB +#define node_to_k8_nb_misc(node) \ + (node < num_k8_northbridges) ? k8_northbridges[node] : NULL +#else +#define node_to_k8_nb_misc(node) NULL +#endif + + #endif /* _ASM_X86_K8_H */ diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 1ab46e05adf..0cde0715369 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -17,6 +17,7 @@ #include #include +#include #define LVL_1_INST 1 #define LVL_1_DATA 2 @@ -159,14 +160,6 @@ struct _cpuid4_info_regs { unsigned long can_disable; }; -#if defined(CONFIG_PCI) && defined(CONFIG_SYSFS) -static struct pci_device_id k8_nb_id[] = { - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1103) }, - { PCI_DEVICE(PCI_VENDOR_ID_AMD, 0x1203) }, - {} -}; -#endif - unsigned short num_cache_leaves; /* AMD doesn't have CPUID4. Emulate it here to report the same @@ -704,30 +697,6 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -#ifdef CONFIG_PCI -static struct pci_dev *get_k8_northbridge(int node) -{ - struct pci_dev *dev = NULL; - int i; - - for (i = 0; i <= node; i++) { - do { - dev = pci_get_device(PCI_ANY_ID, PCI_ANY_ID, dev); - if (!dev) - break; - } while (!pci_match_id(&k8_nb_id[0], dev)); - if (!dev) - break; - } - return dev; -} -#else -static struct pci_dev *get_k8_northbridge(int node) -{ - return NULL; -} -#endif - static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) { const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); @@ -739,7 +708,7 @@ static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) if (!this_leaf->can_disable) return sprintf(buf, "Feature not enabled\n"); - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; @@ -783,7 +752,7 @@ store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, return -EINVAL; val |= 0xc0000000; - dev = get_k8_northbridge(node); + dev = node_to_k8_nb_misc(node); if (!dev) { printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; From f8b201fc7110c3673437254e8ba02451461ece0b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:18:49 +0200 Subject: [PATCH 0338/2061] x86: cacheinfo: replace sysfs interface for cache_disable feature Impact: replace sysfs attribute Current interface violates against "one-value-per-sysfs-attribute rule". This patch replaces current attribute with two attributes -- one for each L3 Cache Index Disable register. Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409131849.GJ31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 94 +++++++++++++-------------- 1 file changed, 47 insertions(+), 47 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index 0cde0715369..fc28291e40b 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -697,74 +697,70 @@ static ssize_t show_type(struct _cpuid4_info *this_leaf, char *buf) #define to_object(k) container_of(k, struct _index_kobject, kobj) #define to_attr(a) container_of(a, struct _cache_attr, attr) -static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf) +static ssize_t show_cache_disable(struct _cpuid4_info *this_leaf, char *buf, + unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - ssize_t ret = 0; - int i; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned int reg = 0; if (!this_leaf->can_disable) - return sprintf(buf, "Feature not enabled\n"); - - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); return -EINVAL; - } - for (i = 0; i < 2; i++) { - unsigned int reg; + if (!dev) + return -EINVAL; - pci_read_config_dword(dev, 0x1BC + i * 4, ®); - - ret += sprintf(buf, "%sEntry: %d\n", buf, i); - ret += sprintf(buf, "%sReads: %s\tNew Entries: %s\n", - buf, - reg & 0x80000000 ? "Disabled" : "Allowed", - reg & 0x40000000 ? "Disabled" : "Allowed"); - ret += sprintf(buf, "%sSubCache: %x\tIndex: %x\n", - buf, (reg & 0x30000) >> 16, reg & 0xfff); - } - return ret; + pci_read_config_dword(dev, 0x1BC + index * 4, ®); + return sprintf(buf, "%x\n", reg); } -static ssize_t -store_cache_disable(struct _cpuid4_info *this_leaf, const char *buf, - size_t count) +#define SHOW_CACHE_DISABLE(index) \ +static ssize_t \ +show_cache_disable_##index(struct _cpuid4_info *this_leaf, char *buf) \ +{ \ + return show_cache_disable(this_leaf, buf, index); \ +} +SHOW_CACHE_DISABLE(0) +SHOW_CACHE_DISABLE(1) + +static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, + const char *buf, size_t count, unsigned int index) { - const struct cpumask *mask = to_cpumask(this_leaf->shared_cpu_map); - int node = cpu_to_node(cpumask_first(mask)); - struct pci_dev *dev = NULL; - unsigned int ret, index, val; + int cpu = cpumask_first(to_cpumask(this_leaf->shared_cpu_map)); + int node = cpu_to_node(cpu); + struct pci_dev *dev = node_to_k8_nb_misc(node); + unsigned long val = 0; if (!this_leaf->can_disable) return -EINVAL; - if (strlen(buf) > 15) + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!dev) return -EINVAL; - ret = sscanf(buf, "%x %x", &index, &val); - if (ret != 2) - return -EINVAL; - if (index > 1) + if (strict_strtoul(buf, 10, &val) < 0) return -EINVAL; val |= 0xc0000000; - dev = node_to_k8_nb_misc(node); - if (!dev) { - printk(KERN_ERR "Attempting AMD northbridge operation on a system with no northbridge\n"); - return -EINVAL; - } - pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); - - return 1; + return count; } +#define STORE_CACHE_DISABLE(index) \ +static ssize_t \ +store_cache_disable_##index(struct _cpuid4_info *this_leaf, \ + const char *buf, size_t count) \ +{ \ + return store_cache_disable(this_leaf, buf, count, index); \ +} +STORE_CACHE_DISABLE(0) +STORE_CACHE_DISABLE(1) + struct _cache_attr { struct attribute attr; ssize_t (*show)(struct _cpuid4_info *, char *); @@ -785,7 +781,10 @@ define_one_ro(size); define_one_ro(shared_cpu_map); define_one_ro(shared_cpu_list); -static struct _cache_attr cache_disable = __ATTR(cache_disable, 0644, show_cache_disable, store_cache_disable); +static struct _cache_attr cache_disable_0 = __ATTR(cache_disable_0, 0644, + show_cache_disable_0, store_cache_disable_0); +static struct _cache_attr cache_disable_1 = __ATTR(cache_disable_1, 0644, + show_cache_disable_1, store_cache_disable_1); static struct attribute * default_attrs[] = { &type.attr, @@ -797,7 +796,8 @@ static struct attribute * default_attrs[] = { &size.attr, &shared_cpu_map.attr, &shared_cpu_list.attr, - &cache_disable.attr, + &cache_disable_0.attr, + &cache_disable_1.attr, NULL }; From ba518bea2db21c72d44a6cbfd825b026ef9cdcb6 Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:24:06 +0200 Subject: [PATCH 0339/2061] x86: cacheinfo: disable L3 ECC scrubbing when L3 cache index is disabled (Use correct mask to zero out bits 24-28 by Andreas) Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409132406.GK31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index fc28291e40b..d46a849f44a 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -731,6 +731,7 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, int node = cpu_to_node(cpu); struct pci_dev *dev = node_to_k8_nb_misc(node); unsigned long val = 0; + unsigned int scrubber = 0; if (!this_leaf->can_disable) return -EINVAL; @@ -745,6 +746,11 @@ static ssize_t store_cache_disable(struct _cpuid4_info *this_leaf, return -EINVAL; val |= 0xc0000000; + + pci_read_config_dword(dev, 0x58, &scrubber); + scrubber &= ~0x1f000000; + pci_write_config_dword(dev, 0x58, scrubber); + pci_write_config_dword(dev, 0x1BC + index * 4, val & ~0x40000000); wbinvd(); pci_write_config_dword(dev, 0x1BC + index * 4, val); From 2fad2d9bb8310889f3261035b594b4e068b6eb8b Mon Sep 17 00:00:00 2001 From: Mark Langsdorf Date: Thu, 9 Apr 2009 15:31:53 +0200 Subject: [PATCH 0340/2061] x86/docs: add description for cache_disable sysfs interface Signed-off-by: Mark Langsdorf Signed-off-by: Andreas Herrmann Cc: Andrew Morton LKML-Reference: <20090409133153.GL31527@alberich.amd.com> Signed-off-by: Ingo Molnar --- .../ABI/testing/sysfs-devices-cache_disable | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-devices-cache_disable diff --git a/Documentation/ABI/testing/sysfs-devices-cache_disable b/Documentation/ABI/testing/sysfs-devices-cache_disable new file mode 100644 index 00000000000..175bb4f7051 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-devices-cache_disable @@ -0,0 +1,18 @@ +What: /sys/devices/system/cpu/cpu*/cache/index*/cache_disable_X +Date: August 2008 +KernelVersion: 2.6.27 +Contact: mark.langsdorf@amd.com +Description: These files exist in every cpu's cache index directories. + There are currently 2 cache_disable_# files in each + directory. Reading from these files on a supported + processor will return that cache disable index value + for that processor and node. Writing to one of these + files will cause the specificed cache index to be disabled. + + Currently, only AMD Family 10h Processors support cache index + disable, and only for their L3 caches. See the BIOS and + Kernel Developer's Guide at + http://www.amd.com/us-en/assets/content_type/white_papers_and_tech_docs/31116-Public-GH-BKDG_3.20_2-4-09.pdf + for formatting information and other details on the + cache index disable. +Users: joachim.deguara@amd.com From 6265ff19ca08df0d96c859ae5e4dc2d9ad07070e Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Thu, 9 Apr 2009 15:47:10 +0200 Subject: [PATCH 0341/2061] x86: cacheinfo: complete L2/L3 Cache and TLB associativity field definitions See "CPUID Specification" (AMD Publication #: 25481, Rev. 2.28, April 2008) Signed-off-by: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <20090409134710.GA8026@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/intel_cacheinfo.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/intel_cacheinfo.c b/arch/x86/kernel/cpu/intel_cacheinfo.c index d46a849f44a..789efe217e1 100644 --- a/arch/x86/kernel/cpu/intel_cacheinfo.c +++ b/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -200,10 +200,17 @@ union l3_cache { }; static const unsigned short __cpuinitconst assocs[] = { - [1] = 1, [2] = 2, [4] = 4, [6] = 8, - [8] = 16, [0xa] = 32, [0xb] = 48, + [1] = 1, + [2] = 2, + [4] = 4, + [6] = 8, + [8] = 16, + [0xa] = 32, + [0xb] = 48, [0xc] = 64, - [0xf] = 0xffff // ?? + [0xd] = 96, + [0xe] = 128, + [0xf] = 0xffff /* fully associative - no way to show this currently */ }; static const unsigned char __cpuinitconst levels[] = { 1, 1, 2, 3 }; @@ -264,7 +271,8 @@ amd_cpuid4(int leaf, union _cpuid4_leaf_eax *eax, eax->split.type = types[leaf]; eax->split.level = levels[leaf]; if (leaf == 3) - eax->split.num_threads_sharing = current_cpu_data.x86_max_cores - 1; + eax->split.num_threads_sharing = + current_cpu_data.x86_max_cores - 1; else eax->split.num_threads_sharing = 0; eax->split.num_cores_on_die = current_cpu_data.x86_max_cores - 1; From a5a2a0c7fa039c59619bc908b3b1ed24734d442a Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Fri, 10 Apr 2009 09:50:05 -0700 Subject: [PATCH 0342/2061] futex: fix futex_wait_setup key handling If the get_futex_key() call were to fail, the existing code would try and put_futex_key() prior to returning. This patch makes sure we only put_futex_key() if get_futex_key() succeeded. Reported-by: Clark Williams Signed-off-by: Darren Hart LKML-Reference: <20090410165005.14342.16973.stgit@Aeon> Signed-off-by: Thomas Gleixner --- kernel/futex.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/futex.c b/kernel/futex.c index 041bf3ac4be..6d2daa46f9f 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1668,7 +1668,7 @@ retry: q->key = FUTEX_KEY_INIT; ret = get_futex_key(uaddr, fshared, &q->key); if (unlikely(ret != 0)) - goto out; + return ret; retry_private: *hb = queue_lock(q); From cf9972a921470b0a2da7906104bcd540b20e33bf Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Sat, 11 Apr 2009 22:24:05 -0700 Subject: [PATCH 0343/2061] x86, setup: fix comment in the "glove box" code Impact: Comment change only The glove box is about avoiding problems with *registers* being touched, not *memory*. Signed-off-by: H. Peter Anvin --- arch/x86/boot/bioscall.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/boot/bioscall.S b/arch/x86/boot/bioscall.S index 22b4b3efb9f..507793739ea 100644 --- a/arch/x86/boot/bioscall.S +++ b/arch/x86/boot/bioscall.S @@ -10,7 +10,7 @@ /* * "Glove box" for BIOS calls. Avoids the constant problems with BIOSes - * touching memory they shouldn't be. + * touching registers they shouldn't be. */ .code16 From 2de1f33e99cec5fd79542a1d0e26efb9c36a98bb Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 12:55:26 +0530 Subject: [PATCH 0344/2061] x86: apic/x2apic_cluster.c x86_cpu_to_logical_apicid should be static Impact: reduce kernel size a bit, address sparse warning Addresses the problem pointed out by this sparse warning: arch/x86/kernel/apic/x2apic_cluster.c:13:1: warning: symbol 'per_cpu__x86_cpu_to_logical_apicid' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput Cc: Suresh Siddha LKML-Reference: <1239434726.4418.24.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/x2apic_cluster.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/apic/x2apic_cluster.c b/arch/x86/kernel/apic/x2apic_cluster.c index 4a903e2f0d1..8e4cbb255c3 100644 --- a/arch/x86/kernel/apic/x2apic_cluster.c +++ b/arch/x86/kernel/apic/x2apic_cluster.c @@ -10,7 +10,7 @@ #include #include -DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); +static DEFINE_PER_CPU(u32, x86_cpu_to_logical_apicid); static int x2apic_acpi_madt_oem_check(char *oem_id, char *oem_table_id) { From 56c49951747f250d8398582509e02ae5ce1d36d1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:19 -0400 Subject: [PATCH 0345/2061] tracing: Add documentation for the power tracer Signed-off-by: "Theodore Ts'o" Acked-by: Arjan van de Ven Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <1239479479-2603-4-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/power.txt | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) create mode 100644 Documentation/trace/power.txt diff --git a/Documentation/trace/power.txt b/Documentation/trace/power.txt new file mode 100644 index 00000000000..cd805e16dc2 --- /dev/null +++ b/Documentation/trace/power.txt @@ -0,0 +1,17 @@ +The power tracer collects detailed information about C-state and P-state +transitions, instead of just looking at the high-level "average" +information. + +There is a helper script found in scrips/tracing/power.pl in the kernel +sources which can be used to parse this information and create a +Scalable Vector Graphics (SVG) picture from the trace data. + +To use this tracer: + + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + echo power > /sys/kernel/debug/tracing/current_tracer + echo 1 > /sys/kernel/debug/tracing/tracing_enabled + sleep 1 + echo 0 > /sys/kernel/debug/tracing/tracing_enabled + cat /sys/kernel/debug/tracing/trace | \ + perl scripts/tracing/power.pl > out.sv From abd41443ac76d3e9c29a8c1d9e9a3312306cc55e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 11 Apr 2009 15:51:18 -0400 Subject: [PATCH 0346/2061] tracing: Document the event tracing system Signed-off-by: "Theodore Ts'o" Cc: Theodore Ts'o Cc: Steven Rostedt LKML-Reference: <1239479479-2603-3-git-send-email-tytso@mit.edu> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 135 +++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) create mode 100644 Documentation/trace/events.txt diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt new file mode 100644 index 00000000000..abdee664c0f --- /dev/null +++ b/Documentation/trace/events.txt @@ -0,0 +1,135 @@ + Event Tracing + + Documentation written by Theodore Ts'o + +Introduction +============ + +Tracepoints (see Documentation/trace/tracepoints.txt) can be used +without creating custom kernel modules to register probe functions +using the event tracing infrastructure. + +Not all tracepoints can be traced using the event tracing system; +the kernel developer must provide code snippets which define how the +tracing information is saved into the tracing buffer, and how the +the tracing information should be printed. + +Using Event Tracing +=================== + +The events which are available for tracing can be found in the file +/sys/kernel/debug/tracing/available_events. + +To enable a particular event, such as 'sched_wakeup', simply echo it +to /sys/debug/tracing/set_event. For example: + + # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + +[ Note: events can also be enabled/disabled via the 'enabled' toggle + found in the /sys/kernel/tracing/events/ hierarchy of directories. ] + +To disable an event, echo the event name to the set_event file prefixed +with an exclamation point: + + # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + +To disable events, echo an empty line to the set_event file: + + # echo > /sys/kernel/debug/tracing/set_event + +The events are organized into subsystems, such as ext4, irq, sched, +etc., and a full event name looks like this: :. The +subsystem name is optional, but it is displayed in the available_events +file. All of the events in a subsystem can be specified via the syntax +":*"; for example, to enable all irq events, you can use the +command: + + # echo 'irq:*' > /sys/kernel/debug/tracing/set_event + +Defining an event-enabled tracepoint +------------------------------------ + +A kernel developer which wishes to define an event-enabled tracepoint +must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. +This is done via two header files in include/trace. For example, to +event-enable the jbd2 subsystem, we must create two files, +include/trace/jbd2.h and include/trace/jbd2_event_types.h. The +include/trace/jbd2.h file should be included by kernel source files that +will have a tracepoint inserted, and might look like this: + +#ifndef _TRACE_JBD2_H +#define _TRACE_JBD2_H + +#include +#include + +#include + +#endif + +In a file that utilizes a jbd2 tracepoint, this header file would be +included. Note that you still have to use DEFINE_TRACE(). So for +example, if fs/jbd2/commit.c planned to use the jbd2_start_commit +tracepoint, it would have the following near the beginning of the file: + +#include + +DEFINE_TRACE(jbd2_start_commit); + +Then in the function that would call the tracepoint, it would call the +tracepoint function. (For more information, please see the tracepoint +documentation in Documentation/trace/tracepoints.txt): + + trace_jbd2_start_commit(journal, commit_transaction); + +The code snippets which allow jbd2_start_commit to be an event-enabled +tracepoint are placed in the file include/trace/jbd2_event_types.h: + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM jbd2 + +#include + +TRACE_EVENT(jbd2_start_commit, + TP_PROTO(journal_t *journal, transaction_t *commit_transaction), + TP_ARGS(journal, commit_transaction), + TP_STRUCT__entry( + __array( char, devname, BDEVNAME_SIZE+24 ) + __field( int, transaction ) + ), + TP_fast_assign( + memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); + __entry->transaction = commit_transaction->t_tid; + ), + TP_printk("dev %s transaction %d", + __entry->devname, __entry->transaction) +); + +The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. The new +arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and +TP_printk. + +TP_STRUCT__entry defines the data structure which will be stored in the +trace buffer. Normally, fields in __entry will be arrays or simple +types. It is possible to place data structures in __entry --- however, +pointers in the data structure can not be trusted, since they will be +accessed sometime later by TP_printk, and if the data structure contains +fields that will not or cannot be used by TP_printk, this will waste +space in the trace buffer. In general, data structures should be +avoided, unless they do only contain non-pointer types and all of the +fields will be used by TP_printk. + +TP_fast_assign defines the code snippet which saves information into the +__entry data structure, using the passed-in arguments defined in +TP_PROTO and TP_ARGS. + +Finally, TP_printk will print the __entry data structure. At the time +when the code snippet defined by TP_printk is executed, it will not have +access to the TP_ARGS arguments; it can only use the information saved +in the __entry data structure. From 2c1b284e4fa260fd922b9a65c99169e2630c6862 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sat, 11 Apr 2009 00:03:10 +0530 Subject: [PATCH 0347/2061] x86: clean up declarations and variables Impact: cleanup, no code changed - syscalls.h update declarations due to unifications - irq.c declare smp_generic_interrupt() before it gets used - process.c declare sys_fork() and sys_vfork() before they get used - tsc.c rename tsc_khz shadowed variable - apic/probe_32.c declare apic_default before it gets used - apic/nmi.c prev_nmi_count should be unsigned - apic/io_apic.c declare smp_irq_move_cleanup_interrupt() before it gets used - mm/init.c declare direct_gbpages and free_initrd_mem before they get used Signed-off-by: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/include/asm/apic.h | 3 +++ arch/x86/include/asm/hw_irq.h | 4 +++ arch/x86/include/asm/pgtable.h | 2 ++ arch/x86/include/asm/pgtable_64.h | 6 ----- arch/x86/include/asm/syscalls.h | 45 ++++++++++++++++--------------- arch/x86/kernel/apic/io_apic.c | 1 + arch/x86/kernel/apic/nmi.c | 2 +- arch/x86/kernel/apic/probe_32.c | 1 - arch/x86/kernel/irq.c | 1 + arch/x86/kernel/process.c | 1 + arch/x86/kernel/tsc.c | 8 +++--- arch/x86/mm/init.c | 1 + 12 files changed, 42 insertions(+), 33 deletions(-) diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index 42f2f837742..5773660c8cd 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -478,6 +478,9 @@ static inline unsigned int read_apic_id(void) extern void default_setup_apic_routing(void); #ifdef CONFIG_X86_32 + +extern struct apic apic_default; + /* * Set up the logical destination ID. * diff --git a/arch/x86/include/asm/hw_irq.h b/arch/x86/include/asm/hw_irq.h index b762ea49bd7..be9ae4111c9 100644 --- a/arch/x86/include/asm/hw_irq.h +++ b/arch/x86/include/asm/hw_irq.h @@ -78,7 +78,11 @@ extern void eisa_set_level_irq(unsigned int irq); /* SMP */ extern void smp_apic_timer_interrupt(struct pt_regs *); extern void smp_spurious_interrupt(struct pt_regs *); +extern void smp_generic_interrupt(struct pt_regs *); extern void smp_error_interrupt(struct pt_regs *); +#ifdef CONFIG_X86_IO_APIC +extern asmlinkage void smp_irq_move_cleanup_interrupt(void); +#endif #ifdef CONFIG_SMP extern void smp_reschedule_interrupt(struct pt_regs *); extern void smp_call_function_interrupt(struct pt_regs *); diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index 29d96d168bc..3f8d09d94eb 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -503,6 +503,8 @@ static inline int pgd_none(pgd_t pgd) #ifndef __ASSEMBLY__ +extern int direct_gbpages; + /* local pte updates need not use xchg for locking */ static inline pte_t native_local_ptep_get_and_clear(pte_t *ptep) { diff --git a/arch/x86/include/asm/pgtable_64.h b/arch/x86/include/asm/pgtable_64.h index 6b87bc6d501..abde308fdb0 100644 --- a/arch/x86/include/asm/pgtable_64.h +++ b/arch/x86/include/asm/pgtable_64.h @@ -25,10 +25,6 @@ extern pgd_t init_level4_pgt[]; extern void paging_init(void); -#endif /* !__ASSEMBLY__ */ - -#ifndef __ASSEMBLY__ - #define pte_ERROR(e) \ printk("%s:%d: bad pte %p(%016lx).\n", \ __FILE__, __LINE__, &(e), pte_val(e)) @@ -135,8 +131,6 @@ static inline int pgd_large(pgd_t pgd) { return 0; } #define update_mmu_cache(vma, address, pte) do { } while (0) -extern int direct_gbpages; - /* Encode and de-code a swap entry */ #if _PAGE_BIT_FILE < _PAGE_BIT_PROTNONE #define SWP_TYPE_BITS (_PAGE_BIT_FILE - _PAGE_BIT_PRESENT - 1) diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h index 7043408f690..372b76edd63 100644 --- a/arch/x86/include/asm/syscalls.h +++ b/arch/x86/include/asm/syscalls.h @@ -1,7 +1,7 @@ /* * syscalls.h - Linux syscall interfaces (arch-specific) * - * Copyright (c) 2008 Jaswinder Singh + * Copyright (c) 2008 Jaswinder Singh Rajput * * This file is released under the GPLv2. * See the file COPYING for more details. @@ -12,50 +12,55 @@ #include #include -#include #include +#include /* Common in X86_32 and X86_64 */ /* kernel/ioport.c */ asmlinkage long sys_ioperm(unsigned long, unsigned long, int); +/* kernel/process.c */ +int sys_fork(struct pt_regs *); +int sys_vfork(struct pt_regs *); + /* kernel/ldt.c */ asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +/* kernel/signal.c */ +long sys_rt_sigreturn(struct pt_regs *); + /* kernel/tls.c */ asmlinkage int sys_set_thread_area(struct user_desc __user *); asmlinkage int sys_get_thread_area(struct user_desc __user *); /* X86_32 only */ #ifdef CONFIG_X86_32 +/* kernel/ioport.c */ +long sys_iopl(struct pt_regs *); + /* kernel/process_32.c */ -int sys_fork(struct pt_regs *); int sys_clone(struct pt_regs *); -int sys_vfork(struct pt_regs *); int sys_execve(struct pt_regs *); -/* kernel/signal_32.c */ +/* kernel/signal.c */ asmlinkage int sys_sigsuspend(int, int, old_sigset_t); asmlinkage int sys_sigaction(int, const struct old_sigaction __user *, struct old_sigaction __user *); int sys_sigaltstack(struct pt_regs *); unsigned long sys_sigreturn(struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); - -/* kernel/ioport.c */ -long sys_iopl(struct pt_regs *); /* kernel/sys_i386_32.c */ +struct mmap_arg_struct; +struct sel_arg_struct; +struct oldold_utsname; +struct old_utsname; + asmlinkage long sys_mmap2(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct mmap_arg_struct; asmlinkage int old_mmap(struct mmap_arg_struct __user *); -struct sel_arg_struct; asmlinkage int old_select(struct sel_arg_struct __user *); asmlinkage int sys_ipc(uint, int, int, int, void __user *, long); -struct old_utsname; asmlinkage int sys_uname(struct old_utsname __user *); -struct oldold_utsname; asmlinkage int sys_olduname(struct oldold_utsname __user *); /* kernel/vm86_32.c */ @@ -65,29 +70,27 @@ int sys_vm86(struct pt_regs *); #else /* CONFIG_X86_32 */ /* X86_64 only */ +/* kernel/ioport.c */ +asmlinkage long sys_iopl(unsigned int, struct pt_regs *); + /* kernel/process_64.c */ -asmlinkage long sys_fork(struct pt_regs *); asmlinkage long sys_clone(unsigned long, unsigned long, void __user *, void __user *, struct pt_regs *); -asmlinkage long sys_vfork(struct pt_regs *); asmlinkage long sys_execve(char __user *, char __user * __user *, char __user * __user *, struct pt_regs *); long sys_arch_prctl(int, unsigned long); -/* kernel/ioport.c */ -asmlinkage long sys_iopl(unsigned int, struct pt_regs *); - -/* kernel/signal_64.c */ +/* kernel/signal.c */ asmlinkage long sys_sigaltstack(const stack_t __user *, stack_t __user *, struct pt_regs *); -long sys_rt_sigreturn(struct pt_regs *); /* kernel/sys_x86_64.c */ +struct new_utsname; + asmlinkage long sys_mmap(unsigned long, unsigned long, unsigned long, unsigned long, unsigned long, unsigned long); -struct new_utsname; asmlinkage long sys_uname(struct new_utsname __user *); #endif /* CONFIG_X86_32 */ diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 767fe7e46d6..870c92ddaf9 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -59,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/arch/x86/kernel/apic/nmi.c b/arch/x86/kernel/apic/nmi.c index d6bd6240715..02056310f2f 100644 --- a/arch/x86/kernel/apic/nmi.c +++ b/arch/x86/kernel/apic/nmi.c @@ -104,7 +104,7 @@ static __init void nmi_cpu_busy(void *data) } #endif -static void report_broken_nmi(int cpu, int *prev_nmi_count) +static void report_broken_nmi(int cpu, unsigned int *prev_nmi_count) { printk(KERN_CONT "\n"); diff --git a/arch/x86/kernel/apic/probe_32.c b/arch/x86/kernel/apic/probe_32.c index 01eda2ac65e..440a8bccd91 100644 --- a/arch/x86/kernel/apic/probe_32.c +++ b/arch/x86/kernel/apic/probe_32.c @@ -160,7 +160,6 @@ extern struct apic apic_summit; extern struct apic apic_bigsmp; extern struct apic apic_es7000; extern struct apic apic_es7000_cluster; -extern struct apic apic_default; struct apic *apic = &apic_default; EXPORT_SYMBOL_GPL(apic); diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index 3aaf7b9e3a8..2188267f523 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -12,6 +12,7 @@ #include #include #include +#include atomic_t irq_err_count; diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..3e21e38d7e3 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index 7a567ebe636..a8dc0d00b83 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -384,13 +384,13 @@ unsigned long native_calibrate_tsc(void) { u64 tsc1, tsc2, delta, ref1, ref2; unsigned long tsc_pit_min = ULONG_MAX, tsc_ref_min = ULONG_MAX; - unsigned long flags, latch, ms, fast_calibrate, tsc_khz; + unsigned long flags, latch, ms, fast_calibrate, hv_tsc_khz; int hpet = is_hpet_enabled(), i, loopmin; - tsc_khz = get_hypervisor_tsc_freq(); - if (tsc_khz) { + hv_tsc_khz = get_hypervisor_tsc_freq(); + if (hv_tsc_khz) { printk(KERN_INFO "TSC: Frequency read from the hypervisor\n"); - return tsc_khz; + return hv_tsc_khz; } local_irq_save(flags); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd3da1dda1c..40924e445f5 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -1,3 +1,4 @@ +#include #include #include From 02af61bb50f5d5f0322dbe5ab2a0d75808d25c7b Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:26:18 +0800 Subject: [PATCH 0348/2061] tracing, kmemtrace: Separate include/trace/kmemtrace.h to kmemtrace part and tracepoint part Impact: refactor code for future changes Current kmemtrace.h is used both as header file of kmemtrace and kmem's tracepoints definition. Tracepoints' definition file may be used by other code, and should only have definition of tracepoint. We can separate include/trace/kmemtrace.h into 2 files: include/linux/kmemtrace.h: header file for kmemtrace include/trace/kmem.h: definition of kmem tracepoints Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE68A.5040902@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/kmemtrace.h | 25 +++++++++++++++++++++++++ include/linux/slab_def.h | 2 +- include/linux/slub_def.h | 2 +- include/trace/{kmemtrace.h => kmem.h} | 25 +++---------------------- init/main.c | 2 +- kernel/trace/kmemtrace.c | 2 +- kernel/trace/trace.h | 2 +- mm/slab.c | 2 +- mm/slob.c | 2 +- mm/slub.c | 2 +- 10 files changed, 36 insertions(+), 30 deletions(-) create mode 100644 include/linux/kmemtrace.h rename include/trace/{kmemtrace.h => kmem.h} (78%) diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h new file mode 100644 index 00000000000..15c45a27a92 --- /dev/null +++ b/include/linux/kmemtrace.h @@ -0,0 +1,25 @@ +/* + * Copyright (C) 2008 Eduard - Gabriel Munteanu + * + * This file is released under GPL version 2. + */ + +#ifndef _LINUX_KMEMTRACE_H +#define _LINUX_KMEMTRACE_H + +#ifdef __KERNEL__ + +#include + +#ifdef CONFIG_KMEMTRACE +extern void kmemtrace_init(void); +#else +static inline void kmemtrace_init(void) +{ +} +#endif + +#endif /* __KERNEL__ */ + +#endif /* _LINUX_KMEMTRACE_H */ + diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index 5ac9b0bcaf9..713f841ecaa 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -14,7 +14,7 @@ #include /* kmalloc_sizes.h needs PAGE_SIZE */ #include /* kmalloc_sizes.h needs L1_CACHE_BYTES */ #include -#include +#include /* Size description struct for general caches. */ struct cache_sizes { diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 5046f90c117..be5d40c43bd 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -10,7 +10,7 @@ #include #include #include -#include +#include enum stat_item { ALLOC_FASTPATH, /* Allocation from cpu slab */ diff --git a/include/trace/kmemtrace.h b/include/trace/kmem.h similarity index 78% rename from include/trace/kmemtrace.h rename to include/trace/kmem.h index 28ee69f9cd4..24d25192818 100644 --- a/include/trace/kmemtrace.h +++ b/include/trace/kmem.h @@ -1,25 +1,9 @@ -/* - * Copyright (C) 2008 Eduard - Gabriel Munteanu - * - * This file is released under GPL version 2. - */ - -#ifndef _LINUX_KMEMTRACE_H -#define _LINUX_KMEMTRACE_H - -#ifdef __KERNEL__ +#ifndef _TRACE_KMEM_H +#define _TRACE_KMEM_H #include #include -#ifdef CONFIG_KMEMTRACE -extern void kmemtrace_init(void); -#else -static inline void kmemtrace_init(void) -{ -} -#endif - DECLARE_TRACE(kmalloc, TP_PROTO(unsigned long call_site, const void *ptr, @@ -57,7 +41,4 @@ DECLARE_TRACE(kmem_cache_free, TP_PROTO(unsigned long call_site, const void *ptr), TP_ARGS(call_site, ptr)); -#endif /* __KERNEL__ */ - -#endif /* _LINUX_KMEMTRACE_H */ - +#endif /* _TRACE_KMEM_H */ diff --git a/init/main.c b/init/main.c index 3585f073d63..eece40cd8a6 100644 --- a/init/main.c +++ b/init/main.c @@ -64,6 +64,7 @@ #include #include #include +#include #include #include @@ -71,7 +72,6 @@ #include #include #include -#include #ifdef CONFIG_X86_LOCAL_APIC #include diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 5011f4d91e3..7a0aa0e260d 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -12,7 +12,7 @@ #include #include -#include +#include #include "trace_output.h" #include "trace.h" diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index f76a8f8689d..34b94c3f40a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -9,7 +9,7 @@ #include #include #include -#include +#include #include enum trace_type { diff --git a/mm/slab.c b/mm/slab.c index 9a90b00d2f9..f85831da908 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -102,7 +102,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/mm/slob.c b/mm/slob.c index a2d4ab32198..494f05f1941 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -65,7 +65,7 @@ #include #include #include -#include +#include #include /* diff --git a/mm/slub.c b/mm/slub.c index 7ab54ecbd3f..ea9e7160e2e 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -16,7 +16,7 @@ #include #include #include -#include +#include #include #include #include From fc182a4330fc22ea1b68fa3d5064dd85a73a4c4a Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 10 Apr 2009 14:27:38 +0800 Subject: [PATCH 0349/2061] tracing, kmemtrace: Make kmem tracepoints use TRACE_EVENT macro TRACE_EVENT is a more generic way to define tracepoints. Doing so adds these new capabilities to this tracepoint: - zero-copy and per-cpu splice() tracing - binary tracing without printf overhead - structured logging records exposed under /debug/tracing/events - trace events embedded in function tracer output and other plugins - user-defined, per tracepoint filter expressions Signed-off-by: Zhao Lei Acked-by: Eduard - Gabriel Munteanu Acked-by: Pekka Enberg Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49DEE6DA.80600@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem.h | 39 +----- include/trace/kmem_event_types.h | 193 ++++++++++++++++++++++++++++++ include/trace/trace_event_types.h | 1 + include/trace/trace_events.h | 1 + 4 files changed, 197 insertions(+), 37 deletions(-) create mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 24d25192818..46efc2423f0 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,44 +1,9 @@ #ifndef _TRACE_KMEM_H #define _TRACE_KMEM_H -#include #include +#include -DECLARE_TRACE(kmalloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmem_cache_alloc, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags)); -DECLARE_TRACE(kmalloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kmem_cache_alloc_node, - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node)); -DECLARE_TRACE(kfree, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); -DECLARE_TRACE(kmem_cache_free, - TP_PROTO(unsigned long call_site, const void *ptr), - TP_ARGS(call_site, ptr)); +#include #endif /* _TRACE_KMEM_H */ diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h new file mode 100644 index 00000000000..4ff420fe467 --- /dev/null +++ b/include/trace/kmem_event_types.h @@ -0,0 +1,193 @@ + +/* use instead */ +#ifndef TRACE_EVENT +# error Do not include this file directly. +# error Unless you know what you are doing. +#endif + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem + +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h index 33b6bfcba93..552a50e169a 100644 --- a/include/trace/trace_event_types.h +++ b/include/trace/trace_event_types.h @@ -4,3 +4,4 @@ #include #include #include +#include diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h index 0e2aa80076d..13d6b85668c 100644 --- a/include/trace/trace_events.h +++ b/include/trace/trace_events.h @@ -4,3 +4,4 @@ #include #include #include +#include From b78825d608f30a47e3154ab6872a03f0de0c9d45 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:18:53 +0800 Subject: [PATCH 0350/2061] blktrace: fix output of unknown events Not all events are pc (packet command) events. An event is a pc event only if it has BLK_TC_PC bit set. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D3236D.3090705@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..e45e1af1356 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1182,7 +1182,7 @@ static enum print_line_t print_one_line(struct trace_iterator *iter, } if (unlikely(what == 0 || what >= ARRAY_SIZE(what2act))) - ret = trace_seq_printf(s, "Bad pc action %x\n", what); + ret = trace_seq_printf(s, "Unknown action %x\n", what); else { ret = log_action(iter, what2act[what].act[long_act]); if (ret) From 66de7792c02693b49671afe58c771fde3b092fc7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 1 Apr 2009 16:19:19 +0800 Subject: [PATCH 0351/2061] blktrace: fix output of BLK_TC_PC events BLK_TC_PC events should be treated differently with BLK_TC_FS events. Before this patch: # echo 1 > /sys/block/sda/sda1/trace/enable # echo pc > /sys/block/sda/sda1/trace/act_mask # echo blk > /debugfs/tracing/current_tracer # (generate some BLK_TC_PC events) # cat trace bash-2184 [000] 1774.275413: 8,7 I N [bash] bash-2184 [000] 1774.275435: 8,7 D N [bash] bash-2184 [000] 1774.275540: 8,7 I R [bash] bash-2184 [000] 1774.275547: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275580: 8,7 C N 0 [0] bash-2184 [000] 1774.275648: 8,7 I R [bash] bash-2184 [000] 1774.275653: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275682: 8,7 C N 0 [0] bash-2184 [000] 1774.275739: 8,7 I R [bash] bash-2184 [000] 1774.275744: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275771: 8,7 C N 0 [0] bash-2184 [000] 1774.275804: 8,7 I R [bash] bash-2184 [000] 1774.275808: 8,7 D R [bash] ksoftirqd/0-4 [000] 1774.275836: 8,7 C N 0 [0] After this patch: # cat trace bash-2263 [000] 366.782149: 8,7 I N 0 (00 ..) [bash] bash-2263 [000] 366.782323: 8,7 D N 0 (00 ..) [bash] bash-2263 [000] 366.782557: 8,7 I R 8 (25 00 ..) [bash] bash-2263 [000] 366.782560: 8,7 D R 8 (25 00 ..) [bash] ksoftirqd/0-4 [000] 366.782582: 8,7 C N (25 00 ..) [0] bash-2263 [000] 366.782648: 8,7 I R 8 (5a 00 3f 00) [bash] bash-2263 [000] 366.782650: 8,7 D R 8 (5a 00 3f 00) [bash] ksoftirqd/0-4 [000] 366.782669: 8,7 C N (5a 00 3f 00) [0] bash-2263 [000] 366.782710: 8,7 I R 8 (5a 00 08 00) [bash] bash-2263 [000] 366.782713: 8,7 D R 8 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.782730: 8,7 C N (5a 00 08 00) [0] bash-2263 [000] 366.783375: 8,7 I R 36 (5a 00 08 00) [bash] bash-2263 [000] 366.783379: 8,7 D R 36 (5a 00 08 00) [bash] ksoftirqd/0-4 [000] 366.783404: 8,7 C N (5a 00 08 00) [0] This is what we do with PC events in user-space blktrace. Signed-off-by: Li Zefan Acked-by: Arnaldo Carvalho de Melo Cc: Jens Axboe Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49D32387.9040106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 88 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 80 insertions(+), 8 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e45e1af1356..2b98195b338 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -971,6 +971,16 @@ static inline const void *pdu_start(const struct trace_entry *ent) return te_blk_io_trace(ent) + 1; } +static inline u32 t_action(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->action; +} + +static inline u32 t_bytes(const struct trace_entry *ent) +{ + return te_blk_io_trace(ent)->bytes; +} + static inline u32 t_sec(const struct trace_entry *ent) { return te_blk_io_trace(ent)->bytes >> 9; @@ -1031,25 +1041,87 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) MAJOR(t->device), MINOR(t->device), act, rwbs); } +static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) +{ + const char *pdu_buf; + int pdu_len; + int i, end, ret; + + pdu_buf = pdu_start(ent); + pdu_len = te_blk_io_trace(ent)->pdu_len; + + if (!pdu_len) + return 1; + + /* find the last zero that needs to be printed */ + for (end = pdu_len - 1; end >= 0; end--) + if (pdu_buf[end]) + break; + end++; + + if (!trace_seq_putc(s, '(')) + return 0; + + for (i = 0; i < pdu_len; i++) { + + ret = trace_seq_printf(s, "%s%02x", + i == 0 ? "" : " ", pdu_buf[i]); + if (!ret) + return ret; + + /* + * stop when the rest is just zeroes and indicate so + * with a ".." appended + */ + if (i == end && end != pdu_len - 1) + return trace_seq_puts(s, " ..) "); + } + + return trace_seq_puts(s, ") "); +} + static int blk_log_generic(struct trace_seq *s, const struct trace_entry *ent) { char cmd[TASK_COMM_LEN]; trace_find_cmdline(ent->pid, cmd); - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%s]\n", - t_sector(ent), t_sec(ent), cmd); - return trace_seq_printf(s, "[%s]\n", cmd); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = trace_seq_printf(s, "%u ", t_bytes(ent)); + if (!ret) + return 0; + ret = blk_log_dump_pdu(s, ent); + if (!ret) + return 0; + return trace_seq_printf(s, "[%s]\n", cmd); + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%s]\n", + t_sector(ent), t_sec(ent), cmd); + return trace_seq_printf(s, "[%s]\n", cmd); + } } static int blk_log_with_error(struct trace_seq *s, const struct trace_entry *ent) { - if (t_sec(ent)) - return trace_seq_printf(s, "%llu + %u [%d]\n", t_sector(ent), - t_sec(ent), t_error(ent)); - return trace_seq_printf(s, "%llu [%d]\n", t_sector(ent), t_error(ent)); + if (t_action(ent) & BLK_TC_ACT(BLK_TC_PC)) { + int ret; + + ret = blk_log_dump_pdu(s, ent); + if (ret) + return trace_seq_printf(s, "[%d]\n", t_error(ent)); + return 0; + } else { + if (t_sec(ent)) + return trace_seq_printf(s, "%llu + %u [%d]\n", + t_sector(ent), + t_sec(ent), t_error(ent)); + return trace_seq_printf(s, "%llu [%d]\n", + t_sector(ent), t_error(ent)); + } } static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) From 3fa89ca7ba5ba50b3924a11f6604b4bdce5f7842 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Sun, 12 Apr 2009 20:37:25 +0530 Subject: [PATCH 0352/2061] x86: vdso/vma.c declare vdso_enabled and arch_setup_additional_pages before they get used Impact: cleanup, address sparse warnings Addresses the problem pointed out by these sparse warning: arch/x86/vdso/vma.c:19:28: warning: symbol 'vdso_enabled' was not declared. Should it be static? arch/x86/vdso/vma.c:101:5: warning: symbol 'arch_setup_additional_pages' was not declared. Should it be static? Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1239548845.4170.2.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/vdso/vma.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/vdso/vma.c b/arch/x86/vdso/vma.c index 7133cdf9098..cac083386e0 100644 --- a/arch/x86/vdso/vma.c +++ b/arch/x86/vdso/vma.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include From 0f3fd87ce43727d6b8573191ce89e874533b1429 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Mon, 13 Apr 2009 20:24:50 +0100 Subject: [PATCH 0353/2061] perf_counter: fix alignment in /proc/interrupts Trivial fix on columns alignment in /proc/interrupts file. Signed-off-by: Luis Henriques Cc: Peter Zijlstra LKML-Reference: <20090413192449.GA3920@hades.domain.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/irq.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/irq.c b/arch/x86/kernel/irq.c index dccaaa85578..849cfabb1fd 100644 --- a/arch/x86/kernel/irq.c +++ b/arch/x86/kernel/irq.c @@ -67,7 +67,7 @@ static int show_other_interrupts(struct seq_file *p, int prec) for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_perf_irqs); seq_printf(p, " Performance counter interrupts\n"); - seq_printf(p, "PND: "); + seq_printf(p, "%*s: ", prec, "PND"); for_each_online_cpu(j) seq_printf(p, "%10u ", irq_stats(j)->apic_pending_irqs); seq_printf(p, " Performance pending work\n"); From e1112b4d96859367a93468027c9635e2ac04eb3f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:48:49 -0500 Subject: [PATCH 0354/2061] tracing/filters: add run-time field descriptions to TRACE_EVENT_FORMAT events This patch adds run-time field descriptions to all the event formats exported using TRACE_EVENT_FORMAT. It also hooks up all the tracers that use them (i.e. the tracers in the 'ftrace subsystem') so they can also have their output filtered by the event-filtering mechanism. When I was testing this, there were a couple of things that fooled me into thinking the filters weren't working, when actually they were - I'll mention them here so others don't make the same mistakes (and file bug reports. ;-) One is that some of the tracers trace multiple events e.g. the sched_switch tracer uses the context_switch and wakeup events, and if you don't set filters on all of the traced events, the unfiltered output from the events without filters on them can make it look like the filtering as a whole isn't working properly, when actually it is doing what it was asked to do - it just wasn't asked to do the right thing. The other is that for the really high-volume tracers e.g. the function tracer, the volume of filtered events can be so high that it pushes the unfiltered events out of the ring buffer before they can be read so e.g. cat'ing the trace file repeatedly shows either no output, or once in awhile some output but that isn't there the next time you read the trace, which isn't what you normally expect when reading the trace file. If you read from the trace_pipe file though, you can catch them before they disappear. Changes from v1: As suggested by Frederic Weisbecker: - get rid of externs in functions - added unlikely() to filter_check_discard() Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 6 +++ kernel/trace/trace.c | 25 ++++++++++++ kernel/trace/trace.h | 20 ++++++++++ kernel/trace/trace_branch.c | 3 ++ kernel/trace/trace_event_types.h | 6 ++- kernel/trace/trace_events.c | 7 ++++ kernel/trace/trace_events_filter.c | 4 +- kernel/trace/trace_events_stage_2.h | 7 ---- kernel/trace/trace_export.c | 59 +++++++++++++++++++++++++++-- kernel/trace/trace_hw_branches.c | 2 + kernel/trace/trace_power.c | 4 ++ 11 files changed, 128 insertions(+), 15 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 7a0aa0e260d..9419ad10541 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -42,6 +42,7 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, gfp_t gfp_flags, int node) { + struct ftrace_event_call *call = &event_kmem_alloc; struct trace_array *tr = kmemtrace_array; struct kmemtrace_alloc_entry *entry; struct ring_buffer_event *event; @@ -62,6 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); @@ -71,6 +74,7 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, unsigned long call_site, const void *ptr) { + struct ftrace_event_call *call = &event_kmem_free; struct trace_array *tr = kmemtrace_array; struct kmemtrace_free_entry *entry; struct ring_buffer_event *event; @@ -86,6 +90,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4865459f609..962e6179994 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -898,6 +898,7 @@ trace_function(struct trace_array *tr, unsigned long ip, unsigned long parent_ip, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_function; struct ring_buffer_event *event; struct ftrace_entry *entry; @@ -912,6 +913,9 @@ trace_function(struct trace_array *tr, entry = ring_buffer_event_data(event); entry->ip = ip; entry->parent_ip = parent_ip; + + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); } @@ -921,6 +925,7 @@ static int __trace_graph_entry(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_entry; struct ring_buffer_event *event; struct ftrace_graph_ent_entry *entry; @@ -933,6 +938,7 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); return 1; @@ -943,6 +949,7 @@ static void __trace_graph_return(struct trace_array *tr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_funcgraph_exit; struct ring_buffer_event *event; struct ftrace_graph_ret_entry *entry; @@ -955,6 +962,7 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -973,6 +981,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, int skip, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_kernel_stack; struct ring_buffer_event *event; struct stack_entry *entry; struct stack_trace trace; @@ -990,6 +999,7 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1015,6 +1025,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, unsigned long flags, int pc) { #ifdef CONFIG_STACKTRACE + struct ftrace_event_call *call = &event_user_stack; struct ring_buffer_event *event; struct userstack_entry *entry; struct stack_trace trace; @@ -1036,6 +1047,7 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,6 +1064,7 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { + struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1064,6 +1077,7 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } @@ -1080,6 +1094,7 @@ tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *next, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_context_switch; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1095,6 +1110,9 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_prio = next->prio; entry->next_state = next->state; entry->next_cpu = task_cpu(next); + + filter_check_discard(call, entry, event); + trace_buffer_unlock_commit(tr, event, flags, pc); } @@ -1104,6 +1122,7 @@ tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *curr, unsigned long flags, int pc) { + struct ftrace_event_call *call = &event_wakeup; struct ring_buffer_event *event; struct ctx_switch_entry *entry; @@ -1120,6 +1139,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); @@ -1221,6 +1242,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; static u32 trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_bprint; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1260,6 +1282,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: @@ -1279,6 +1302,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) static raw_spinlock_t trace_buf_lock = __RAW_SPIN_LOCK_UNLOCKED; static char trace_buf[TRACE_BUF_SIZE]; + struct ftrace_event_call *call = &event_print; struct ring_buffer_event *event; struct trace_array *tr = &global_trace; struct trace_array_cpu *data; @@ -1314,6 +1338,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; + filter_check_discard(call, entry, event); ring_buffer_unlock_commit(tr->buffer, event); out_unlock: diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 34b94c3f40a..e7737281953 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,6 +866,21 @@ extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +static inline void +filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + if (unlikely(call->preds) && !filter_match_preds(call, rec)) + ring_buffer_event_discard(event); +} + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -897,4 +912,9 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ + extern struct ftrace_event_call event_##call; +#include "trace_event_types.h" + #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index e6e32912ffb..c95c25d838e 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -30,6 +30,7 @@ static struct trace_array *branch_tracer; static void probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) { + struct ftrace_event_call *call = &event_branch; struct trace_array *tr = branch_tracer; struct ring_buffer_event *event; struct trace_branch *entry; @@ -73,6 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; + filter_check_discard(call, entry, event); + ring_buffer_unlock_commit(tr->buffer, event); out: diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index fd78bee71dd..95b147aac22 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -122,8 +122,10 @@ TRACE_EVENT_FORMAT(print, TRACE_PRINT, print_entry, ignore, TRACE_EVENT_FORMAT(branch, TRACE_BRANCH, trace_branch, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned int, line, line) - TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, func) - TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, file) + TRACE_FIELD_SPECIAL(char func[TRACE_FUNC_SIZE+1], func, + TRACE_FUNC_SIZE+1, func) + TRACE_FIELD_SPECIAL(char file[TRACE_FUNC_SIZE+1], file, + TRACE_FUNC_SIZE+1, file) TRACE_FIELD(char, correct, correct) ), TP_RAW_FMT("%u:%s:%s (%u)") diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64ec4d278ff..be9299a53e2 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -680,6 +680,7 @@ static struct dentry * event_subsystem_dir(const char *name, struct dentry *d_events) { struct event_subsystem *system; + struct dentry *entry; /* First see if we did not already create this dir */ list_for_each_entry(system, &event_subsystems, list) { @@ -708,6 +709,12 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->preds = NULL; + entry = debugfs_create_file("filter", 0644, system->entry, system, + &ftrace_subsystem_filter_fops); + if (!entry) + pr_warning("Could not create debugfs " + "'%s/filter' entry\n", name); + return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 026be412f35..470ad9487ec 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -185,7 +185,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) } events_for_each(call) { - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (!strcmp(call->system, system->name)) @@ -324,7 +324,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, events_for_each(call) { int err; - if (!call->name || !call->regfunc) + if (!call->define_fields) continue; if (strcmp(call->system, system->name)) diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 30743f7d411..1c94b87c718 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -146,13 +146,6 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 07a22c33ebf..f4e46616c48 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -30,7 +30,7 @@ #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ ret = trace_seq_printf(s, "\tfield special:" #type_item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ @@ -85,18 +85,69 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_ENTRY entry #undef TRACE_FIELD_SPECIAL -#define TRACE_FIELD_SPECIAL(type_item, item, cmd) \ +#define TRACE_FIELD_SPECIAL(type_item, item, len, cmd) \ cmd; #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int ftrace_define_fields_##call(void); \ +static int ftrace_raw_init_event_##call(void); \ \ -static struct ftrace_event_call __used \ +struct ftrace_event_call __used \ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .id = proto, \ .system = __stringify(TRACE_SYSTEM), \ + .raw_init = ftrace_raw_init_event_##call, \ .show_format = ftrace_format_##call, \ -} + .define_fields = ftrace_define_fields_##call, \ +}; \ +static int ftrace_raw_init_event_##call(void) \ +{ \ + INIT_LIST_HEAD(&event_##call.fields); \ + return 0; \ +} \ + +#include "trace_event_types.h" + +#undef TRACE_FIELD +#define TRACE_FIELD(type, item, assign) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SPECIAL +#define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_ZERO_CHAR +#define TRACE_FIELD_ZERO_CHAR(item) + +#undef TRACE_EVENT_FORMAT +#define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_event_call *event_call = &event_##call; \ + struct args field; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + #include "trace_event_types.h" diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index 7bfdf4c2347..e6b275b22ac 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -168,6 +168,7 @@ static enum print_line_t bts_trace_print_line(struct trace_iterator *iter) void trace_hw_branch(u64 from, u64 to) { + struct ftrace_event_call *call = &event_hw_branch; struct trace_array *tr = hw_branch_trace; struct ring_buffer_event *event; struct hw_branch_entry *entry; @@ -194,6 +195,7 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index bae791ebcc5..8ce7d7d62c0 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -36,6 +36,7 @@ static void probe_power_start(struct power_trace *it, unsigned int type, static void probe_power_end(struct power_trace *it) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -54,6 +55,7 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); @@ -62,6 +64,7 @@ static void probe_power_end(struct power_trace *it) static void probe_power_mark(struct power_trace *it, unsigned int type, unsigned int level) { + struct ftrace_event_call *call = &event_power; struct ring_buffer_event *event; struct trace_power *entry; struct trace_array_cpu *data; @@ -84,6 +87,7 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; + filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); From e45f2e2bd298e1ff687448e5fd15a3588b5807ec Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 31 Mar 2009 00:49:16 -0500 Subject: [PATCH 0355/2061] tracing/filters: add TRACE_EVENT_FORMAT_NOFILTER event macro Frederic Weisbecker suggested that the trace_special event shouldn't be filterable; this patch adds a TRACE_EVENT_FORMAT_NOFILTER event macro that allows an event format to be exported without having a filter attached, and removes filtering from the trace_special event. Signed-off-by: Tom Zanussi Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 2 -- kernel/trace/trace.h | 2 ++ kernel/trace/trace_event_types.h | 2 +- kernel/trace/trace_export.c | 33 ++++++++++++++++++++++++++++++++ 4 files changed, 36 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 962e6179994..c209d214169 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1064,7 +1064,6 @@ ftrace_trace_special(void *__tr, unsigned long arg1, unsigned long arg2, unsigned long arg3, int pc) { - struct ftrace_event_call *call = &event_special; struct ring_buffer_event *event; struct trace_array *tr = __tr; struct special_entry *entry; @@ -1077,7 +1076,6 @@ ftrace_trace_special(void *__tr, entry->arg1 = arg1; entry->arg2 = arg2; entry->arg3 = arg3; - filter_check_discard(call, entry, event); trace_buffer_unlock_commit(tr, event, 0, pc); } diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index e7737281953..3cf856fa597 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -915,6 +915,8 @@ do { \ #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, tpfmt) #include "trace_event_types.h" #endif /* _LINUX_KERNEL_TRACE_H */ diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index 95b147aac22..cfcecc4fd86 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -57,7 +57,7 @@ TRACE_EVENT_FORMAT(context_switch, TRACE_CTX, ctx_switch_entry, ignore, TP_RAW_FMT("%u:%u:%u ==+ %u:%u:%u [%03u]") ); -TRACE_EVENT_FORMAT(special, TRACE_SPECIAL, special_entry, ignore, +TRACE_EVENT_FORMAT_NOFILTER(special, TRACE_SPECIAL, special_entry, ignore, TRACE_STRUCT( TRACE_FIELD(unsigned long, arg1, arg1) TRACE_FIELD(unsigned long, arg2, arg2) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index f4e46616c48..77c494f5e1d 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -65,6 +65,22 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct args field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: \"%s\"\n", tpfmt); \ + \ + return ret; \ +} + #include "trace_event_types.h" #undef TRACE_ZERO_CHAR @@ -109,6 +125,19 @@ static int ftrace_raw_init_event_##call(void) \ return 0; \ } \ +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) \ + \ +struct ftrace_event_call __used \ +__attribute__((__aligned__(4))) \ +__attribute__((section("_ftrace_events"))) event_##call = { \ + .name = #call, \ + .id = proto, \ + .system = __stringify(TRACE_SYSTEM), \ + .show_format = ftrace_format_##call, \ +}; + #include "trace_event_types.h" #undef TRACE_FIELD @@ -150,4 +179,8 @@ ftrace_define_fields_##call(void) \ return ret; \ } +#undef TRACE_EVENT_FORMAT_NOFILTER +#define TRACE_EVENT_FORMAT_NOFILTER(call, proto, args, fmt, tstruct, \ + tpfmt) + #include "trace_event_types.h" From fa1b47dd85453ec7d4bcfe4aa4a2d172ba452fc3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 00:09:41 -0400 Subject: [PATCH 0356/2061] ring-buffer: add ring_buffer_discard_commit The ring_buffer_discard_commit is similar to ring_buffer_event_discard but it can only be done on an event that has yet to be commited. Unpredictable results can happen otherwise. The main difference between ring_buffer_discard_commit and ring_buffer_event_discard is that ring_buffer_discard_commit will try to free the data in the ring buffer if nothing has addded data after the reserved event. If something did, then it acts almost the same as ring_buffer_event_discard followed by a ring_buffer_unlock_commit. Note, either ring_buffer_commit_discard and ring_buffer_unlock_commit can be called on an event, not both. This commit also exports both discard functions to be usable by GPL modules. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- include/linux/ring_buffer.h | 29 +++++++++ kernel/trace/ring_buffer.c | 125 ++++++++++++++++++++++++++++++------ 2 files changed, 133 insertions(+), 21 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e1b7b217388..f0aa486d131 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -68,8 +68,37 @@ ring_buffer_event_time_delta(struct ring_buffer_event *event) return event->time_delta; } +/* + * ring_buffer_event_discard can discard any event in the ring buffer. + * it is up to the caller to protect against a reader from + * consuming it or a writer from wrapping and replacing it. + * + * No external protection is needed if this is called before + * the event is commited. But in that case it would be better to + * use ring_buffer_discard_commit. + * + * Note, if an event that has not been committed is discarded + * with ring_buffer_event_discard, it must still be committed. + */ void ring_buffer_event_discard(struct ring_buffer_event *event); +/* + * ring_buffer_discard_commit will remove an event that has not + * ben committed yet. If this is used, then ring_buffer_unlock_commit + * must not be called on the discarded event. This function + * will try to remove the event from the ring buffer completely + * if another event has not been written after it. + * + * Example use: + * + * if (some_condition) + * ring_buffer_discard_commit(buffer, event); + * else + * ring_buffer_unlock_commit(buffer, event); + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event); + /* * size is in bytes for each per CPU buffer. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 74a11808c28..f935bd5ec3e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -205,27 +205,6 @@ static void rb_event_set_padding(struct ring_buffer_event *event) event->time_delta = 0; } -/** - * ring_buffer_event_discard - discard an event in the ring buffer - * @buffer: the ring buffer - * @event: the event to discard - * - * Sometimes a event that is in the ring buffer needs to be ignored. - * This function lets the user discard an event in the ring buffer - * and then that event will not be read later. - * - * Note, it is up to the user to be careful with this, and protect - * against races. If the user discards an event that has been consumed - * it is possible that it could corrupt the ring buffer. - */ -void ring_buffer_event_discard(struct ring_buffer_event *event) -{ - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; -} - static unsigned rb_event_data_length(struct ring_buffer_event *event) { @@ -1570,6 +1549,110 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +/** + * ring_buffer_event_discard - discard any event in the ring buffer + * @event: the event to discard + * + * Sometimes a event that is in the ring buffer needs to be ignored. + * This function lets the user discard an event in the ring buffer + * and then that event will not be read later. + * + * Note, it is up to the user to be careful with this, and protect + * against races. If the user discards an event that has been consumed + * it is possible that it could corrupt the ring buffer. + */ +void ring_buffer_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} +EXPORT_SYMBOL_GPL(ring_buffer_event_discard); + +/** + * ring_buffer_commit_discard - discard an event that has not been committed + * @buffer: the ring buffer + * @event: non committed event to discard + * + * This is similar to ring_buffer_event_discard but must only be + * performed on an event that has not been committed yet. The difference + * is that this will also try to free the event from the ring buffer + * if another event has not been added behind it. + * + * If another event has been added behind it, it will set the event + * up as discarded, and perform the commit. + * + * If this function is called, do not call ring_buffer_unlock_commit on + * the event. + */ +void ring_buffer_discard_commit(struct ring_buffer *buffer, + struct ring_buffer_event *event) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long new_index, old_index; + struct buffer_page *bpage; + unsigned long index; + unsigned long addr; + int cpu; + + /* The event is discarded regardless */ + ring_buffer_event_discard(event); + + /* + * This must only be called if the event has not been + * committed yet. Thus we can assume that preemption + * is still disabled. + */ + RB_WARN_ON(buffer, !preempt_count()); + + cpu = smp_processor_id(); + cpu_buffer = buffer->buffers[cpu]; + + new_index = rb_event_index(event); + old_index = new_index + rb_event_length(event); + addr = (unsigned long)event; + addr &= PAGE_MASK; + + bpage = cpu_buffer->tail_page; + + if (bpage == (void *)addr && rb_page_write(bpage) == old_index) { + /* + * This is on the tail page. It is possible that + * a write could come in and move the tail page + * and write to the next page. That is fine + * because we just shorten what is on this page. + */ + index = local_cmpxchg(&bpage->write, old_index, new_index); + if (index == old_index) + goto out; + } + + /* + * The commit is still visible by the reader, so we + * must increment entries. + */ + cpu_buffer->entries++; + out: + /* + * If a write came in and pushed the tail page + * we still need to update the commit pointer + * if we were the commit. + */ + if (rb_is_commit(cpu_buffer, event)) + rb_set_commit_to_write(cpu_buffer); + + /* + * Only the last preempt count needs to restore preemption. + */ + if (preempt_count() == 1) + ftrace_preempt_enable(per_cpu(rb_need_resched, cpu)); + else + preempt_enable_no_resched_notrace(); + +} +EXPORT_SYMBOL_GPL(ring_buffer_discard_commit); + /** * ring_buffer_write - write data to the buffer without reserving * @buffer: The ring buffer to write to. From 77d9f465d46fd67cdb82ee5e1ab99dd57a17c486 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 2 Apr 2009 01:16:59 -0400 Subject: [PATCH 0357/2061] tracing/filters: use ring_buffer_discard_commit for discarded events The ring_buffer_discard_commit makes better usage of the ring_buffer when an event has been discarded. It tries to remove it completely if possible. This patch converts the trace event filtering to use ring_buffer_discard_commit instead of the ring_buffer_event_discard. Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 9 +++++++-- kernel/trace/trace.h | 1 + kernel/trace/trace_events_stage_3.h | 6 +++--- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c209d214169..d880ab2772c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -884,13 +884,18 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { - return __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); + __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); +} + +void trace_current_buffer_discard_commit(struct ring_buffer_event *event) +{ + ring_buffer_discard_commit(global_trace.buffer, event); } void diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 3cf856fa597..dfefffd7ae3 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -497,6 +497,7 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 9d2fa78cecc..d2f34bf30e5 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -223,9 +223,9 @@ static void ftrace_raw_event_##call(proto) \ assign; \ \ if (call->preds && !filter_match_preds(call, entry)) \ - ring_buffer_event_discard(event); \ - \ - trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ + trace_current_buffer_discard_commit(event); \ + else \ + trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ \ } \ \ From 5f77a88b3f8268b11940b51d2e03d26a663ceb90 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:14:01 -0500 Subject: [PATCH 0358/2061] tracing/infrastructure: separate event tracer from event support Add a new config option, CONFIG_EVENT_TRACING that gets selected when CONFIG_TRACING is selected and adds everything needed by the stuff in trace_export - basically all the event tracing support needed by e.g. bprint, minus the actual events, which are only included if CONFIG_EVENT_TRACER is selected. So CONFIG_EVENT_TRACER can be used to turn on or off the generated events (what I think of as the 'event tracer'), while CONFIG_EVENT_TRACING turns on or off the base event tracing support used by both the event tracer and the other things such as bprint that can't be configured out. Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178441.10295.34.camel@tropicana> Signed-off-by: Ingo Molnar --- include/asm-generic/vmlinux.lds.h | 2 +- kernel/trace/Kconfig | 4 ++++ kernel/trace/Makefile | 6 +++--- 3 files changed, 8 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h index 7fa660fd449..7e9b1e9f711 100644 --- a/include/asm-generic/vmlinux.lds.h +++ b/include/asm-generic/vmlinux.lds.h @@ -61,7 +61,7 @@ #define BRANCH_PROFILE() #endif -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #define FTRACE_EVENTS() VMLINUX_SYMBOL(__start_ftrace_events) = .; \ *(_ftrace_events) \ VMLINUX_SYMBOL(__stop_ftrace_events) = .; diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 23b96ebbf89..644606e899f 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -48,6 +48,9 @@ config FTRACE_NMI_ENTER depends on HAVE_FTRACE_NMI_ENTER default y +config EVENT_TRACING + bool + config TRACING bool select DEBUG_FS @@ -56,6 +59,7 @@ config TRACING select TRACEPOINTS select NOP_TRACER select BINARY_PRINTF + select EVENT_TRACING # # Minimum requirements an architecture has to meet for us to diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..3ad367e7c97 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -40,11 +40,11 @@ obj-$(CONFIG_POWER_TRACER) += trace_power.o obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o -obj-$(CONFIG_EVENT_TRACER) += trace_events.o +obj-$(CONFIG_EVENT_TRACING) += trace_events.o obj-$(CONFIG_EVENT_TRACER) += events.o -obj-$(CONFIG_EVENT_TRACER) += trace_export.o +obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o -obj-$(CONFIG_EVENT_TRACER) += trace_events_filter.o +obj-$(CONFIG_EVENT_TRACING) += trace_events_filter.o libftrace-y := ftrace.o From eb02ce017dd83985041a7e54c6449f92d53b026f Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Wed, 8 Apr 2009 03:15:54 -0500 Subject: [PATCH 0359/2061] tracing/filters: use ring_buffer_discard_commit() in filter_check_discard() This patch changes filter_check_discard() to make use of the new ring_buffer_discard_commit() function and modifies the current users to call the old commit function in the non-discard case. It also introduces a version of filter_check_discard() that uses the global trace buffer (filter_current_check_discard()) for those cases. v2 changes: - fix compile error noticed by Ingo Molnar Signed-off-by: Tom Zanussi Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <1239178554.10295.36.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/kmemtrace.c | 10 +++---- kernel/trace/trace.c | 45 +++++++++++++++-------------- kernel/trace/trace.h | 14 +++++++-- kernel/trace/trace_branch.c | 5 ++-- kernel/trace/trace_events_stage_3.h | 5 +--- kernel/trace/trace_hw_branches.c | 4 +-- kernel/trace/trace_power.c | 8 ++--- 7 files changed, 48 insertions(+), 43 deletions(-) diff --git a/kernel/trace/kmemtrace.c b/kernel/trace/kmemtrace.c index 9419ad10541..86cdf671d7e 100644 --- a/kernel/trace/kmemtrace.c +++ b/kernel/trace/kmemtrace.c @@ -63,9 +63,8 @@ static inline void kmemtrace_alloc(enum kmemtrace_type_id type_id, entry->gfp_flags = gfp_flags; entry->node = node; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } @@ -90,9 +89,8 @@ static inline void kmemtrace_free(enum kmemtrace_type_id type_id, entry->call_site = call_site; entry->ptr = ptr; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); trace_wake_up(); } diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d880ab2772c..c0047fcf707 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -171,6 +171,12 @@ static struct trace_array global_trace; static DEFINE_PER_CPU(struct trace_array_cpu, global_trace_cpu); +int filter_current_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer_event *event) +{ + return filter_check_discard(call, rec, global_trace.buffer, event); +} + cycle_t ftrace_now(int cpu) { u64 ts; @@ -919,9 +925,8 @@ trace_function(struct trace_array *tr, entry->ip = ip; entry->parent_ip = parent_ip; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -943,8 +948,8 @@ static int __trace_graph_entry(struct trace_array *tr, return 0; entry = ring_buffer_event_data(event); entry->graph_ent = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); return 1; } @@ -967,8 +972,8 @@ static void __trace_graph_return(struct trace_array *tr, return; entry = ring_buffer_event_data(event); entry->ret = *trace; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(global_trace.buffer, event); + if (!filter_current_check_discard(call, entry, event)) + ring_buffer_unlock_commit(global_trace.buffer, event); } #endif @@ -1004,8 +1009,8 @@ static void __ftrace_trace_stack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1052,8 +1057,8 @@ static void ftrace_trace_userstack(struct trace_array *tr, trace.entries = entry->caller; save_stack_trace_user(&trace); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); #endif } @@ -1114,9 +1119,8 @@ tracing_sched_switch_trace(struct trace_array *tr, entry->next_state = next->state; entry->next_cpu = task_cpu(next); - filter_check_discard(call, entry, event); - - trace_buffer_unlock_commit(tr, event, flags, pc); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, flags, pc); } void @@ -1142,9 +1146,8 @@ tracing_sched_wakeup_trace(struct trace_array *tr, entry->next_state = wakee->state; entry->next_cpu = task_cpu(wakee); - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); ftrace_trace_stack(tr, flags, 6, pc); ftrace_trace_userstack(tr, flags, pc); } @@ -1285,8 +1288,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) entry->fmt = fmt; memcpy(entry->buf, trace_buf, sizeof(u32) * len); - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); @@ -1341,8 +1344,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) memcpy(&entry->buf, trace_buf, len); entry->buf[len] = 0; - filter_check_discard(call, entry, event); - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out_unlock: __raw_spin_unlock(&trace_buf_lock); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index dfefffd7ae3..9729d14767d 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -866,13 +866,21 @@ extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); -static inline void +static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, + struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) - ring_buffer_event_discard(event); + if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + ring_buffer_discard_commit(buffer, event); + return 1; + } + + return 0; } #define __common_field(type, item) \ diff --git a/kernel/trace/trace_branch.c b/kernel/trace/trace_branch.c index c95c25d838e..8e64e604f5a 100644 --- a/kernel/trace/trace_branch.c +++ b/kernel/trace/trace_branch.c @@ -74,9 +74,8 @@ probe_likely_condition(struct ftrace_branch_data *f, int val, int expect) entry->line = f->line; entry->correct = val == expect; - filter_check_discard(call, entry, event); - - ring_buffer_unlock_commit(tr->buffer, event); + if (!filter_check_discard(call, entry, tr->buffer, event)) + ring_buffer_unlock_commit(tr->buffer, event); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index d2f34bf30e5..b2b298269eb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -222,11 +222,8 @@ static void ftrace_raw_event_##call(proto) \ \ assign; \ \ - if (call->preds && !filter_match_preds(call, entry)) \ - trace_current_buffer_discard_commit(event); \ - else \ + if (!filter_current_check_discard(call, entry, event)) \ trace_nowake_buffer_unlock_commit(event, irq_flags, pc); \ - \ } \ \ static int ftrace_raw_reg_event_##call(void) \ diff --git a/kernel/trace/trace_hw_branches.c b/kernel/trace/trace_hw_branches.c index e6b275b22ac..8683d50a753 100644 --- a/kernel/trace/trace_hw_branches.c +++ b/kernel/trace/trace_hw_branches.c @@ -195,8 +195,8 @@ void trace_hw_branch(u64 from, u64 to) entry->ent.type = TRACE_HW_BRANCHES; entry->from = from; entry->to = to; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: atomic_dec(&tr->data[cpu]->disabled); diff --git a/kernel/trace/trace_power.c b/kernel/trace/trace_power.c index 8ce7d7d62c0..810a5b7cf1c 100644 --- a/kernel/trace/trace_power.c +++ b/kernel/trace/trace_power.c @@ -55,8 +55,8 @@ static void probe_power_end(struct power_trace *it) goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } @@ -87,8 +87,8 @@ static void probe_power_mark(struct power_trace *it, unsigned int type, goto out; entry = ring_buffer_event_data(event); entry->state_data = *it; - filter_check_discard(call, entry, event); - trace_buffer_unlock_commit(tr, event, 0, 0); + if (!filter_check_discard(call, entry, tr->buffer, event)) + trace_buffer_unlock_commit(tr, event, 0, 0); out: preempt_enable(); } From 0a19e53c1514ad8e9c3cbab40c6c3f52c86f403d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Mon, 13 Apr 2009 03:17:50 -0500 Subject: [PATCH 0360/2061] tracing/filters: allow on-the-fly filter switching This patch allows event filters to be safely removed or switched on-the-fly while avoiding the use of rcu or the suspension of tracing of previous versions. It does it by adding a new filter_pred_none() predicate function which does nothing and by never deallocating either the predicates or any of the filter_pred members used in matching; the predicate lists are allocated and initialized during ftrace_event_calls initialization. Whenever a filter is removed or replaced, the filter_pred_* functions currently in use by the affected ftrace_event_call are immediately switched over to to the filter_pred_none() function, while the rest of the filter_pred members are left intact, allowing any currently executing filter_pred_* functions to finish up, using the values they're currently using. In the case of filter replacement, the new predicate values are copied into the old predicates after the above step, and the filter_pred_none() functions are replaced by the filter_pred_* functions for the new filter. In this case, it is possible though very unlikely that a previous filter_pred_* is still running even after the filter_pred_none() switch and the switch to the new filter_pred_*. In that case, however, because nothing has been deallocated in the filter_pred, the worst that can happen is that the old filter_pred_* function sees the new values and as a result produces either a false positive or a false negative, depending on the values it finds. So one downside to this method is that rarely, it can produce a bad match during the filter switch, but it should be possible to live with that, IMHO. The other downside is that at least in this patch the predicate lists are always pre-allocated, taking up memory from the start. They could probably be allocated on first-use, and de-allocated when tracing is completely stopped - if this patch makes sense, I could create another one to do that later on. Oh, and it also places a restriction on the size of __arrays in events, currently set to 128, since they can't be larger than the now embedded str_val arrays in the filter_pred struct. Signed-off-by: Tom Zanussi Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239610670.6660.49.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 14 +- kernel/trace/trace_events.c | 9 +- kernel/trace/trace_events_filter.c | 254 +++++++++++++++------------- kernel/trace/trace_events_stage_2.h | 1 + kernel/trace/trace_events_stage_3.h | 1 + kernel/trace/trace_export.c | 1 + 6 files changed, 151 insertions(+), 129 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 9729d14767d..b05b6ac982a 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -813,6 +813,7 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; + int n_preds; struct filter_pred **preds; #ifdef CONFIG_EVENT_PROFILE @@ -826,6 +827,7 @@ struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; + int n_preds; struct filter_pred **preds; }; @@ -834,7 +836,8 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 struct filter_pred; @@ -843,7 +846,7 @@ typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); struct filter_pred { filter_pred_fn_t fn; u64 val; - char *str_val; + char str_val[MAX_FILTER_STR_VAL]; int str_len; char *field_name; int offset; @@ -855,13 +858,14 @@ struct filter_pred { int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size); +extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, +extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); -extern void filter_free_preds(struct ftrace_event_call *call); +extern void filter_disable_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, @@ -875,7 +879,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 789e14eb09a..ead68ac9919 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -481,7 +481,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, s); + filter_print_preds(call->preds, call->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -516,7 +516,7 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, } if (pred->clear) { - filter_free_preds(call); + filter_disable_preds(call); filter_free_pred(pred); return cnt; } @@ -527,6 +527,8 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return err; } + filter_free_pred(pred); + *ppos += cnt; return cnt; @@ -549,7 +551,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, s); + filter_print_preds(system->preds, system->n_preds, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -712,6 +714,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); system->preds = NULL; + system->n_preds = 0; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 9f8ecca34a5..de42dad42a8 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -82,25 +82,27 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } +static int filter_pred_none(struct filter_pred *pred, void *event) +{ + return 0; +} + /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) { - pred = call->preds[i]; - if (and_failed && !pred->or) - continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; - } else - break; + for (i = 0; i < call->n_preds; i++) { + pred = call->preds[i]; + if (and_failed && !pred->or) + continue; + matched = pred->fn(pred, rec); + if (!matched && !pred->or) { + and_failed = 1; + continue; + } else if (matched && pred->or) + return 1; } if (and_failed) @@ -109,31 +111,29 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } -void filter_print_preds(struct filter_pred **preds, struct trace_seq *s) +void filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; int i; - if (!preds) { + if (!n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (preds[i]) { - pred = preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_val) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } else - break; + for (i = 0; i < n_preds; i++) { + pred = preds[i]; + field_name = pred->field_name; + if (i) + trace_seq_printf(s, pred->or ? "|| " : "&& "); + trace_seq_printf(s, "%s ", field_name); + trace_seq_printf(s, pred->not ? "!= " : "== "); + if (pred->str_len) + trace_seq_printf(s, "%s\n", pred->str_val); + else + trace_seq_printf(s, "%llu\n", pred->val); } } @@ -156,20 +156,69 @@ void filter_free_pred(struct filter_pred *pred) return; kfree(pred->field_name); - kfree(pred->str_val); kfree(pred); } -void filter_free_preds(struct ftrace_event_call *call) +static void filter_clear_pred(struct filter_pred *pred) +{ + kfree(pred->field_name); + pred->field_name = NULL; + pred->str_len = 0; +} + +static int filter_set_pred(struct filter_pred *dest, + struct filter_pred *src, + filter_pred_fn_t fn) +{ + *dest = *src; + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + dest->fn = fn; + + return 0; +} + +void filter_disable_preds(struct ftrace_event_call *call) { int i; - if (call->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) - filter_free_pred(call->preds[i]); - kfree(call->preds); - call->preds = NULL; + call->n_preds = 0; + + for (i = 0; i < MAX_FILTER_PRED; i++) + call->preds[i]->fn = filter_pred_none; +} + +int init_preds(struct ftrace_event_call *call) +{ + struct filter_pred *pred; + int i; + + call->n_preds = 0; + + call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!call->preds) + return -ENOMEM; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + goto oom; + pred->fn = filter_pred_none; + call->preds[i] = pred; } + + return 0; + +oom: + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (call->preds[i]) + filter_free_pred(call->preds[i]); + } + kfree(call->preds); + call->preds = NULL; + + return -ENOMEM; } void filter_free_subsystem_preds(struct event_subsystem *system) @@ -177,11 +226,12 @@ void filter_free_subsystem_preds(struct event_subsystem *system) struct ftrace_event_call *call = __start_ftrace_events; int i; - if (system->preds) { - for (i = 0; i < MAX_FILTER_PRED; i++) + if (system->n_preds) { + for (i = 0; i < system->n_preds; i++) filter_free_pred(system->preds[i]); kfree(system->preds); system->preds = NULL; + system->n_preds = 0; } events_for_each(call) { @@ -189,33 +239,31 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_free_preds(call); + filter_disable_preds(call); } } static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) + struct filter_pred *pred, + filter_pred_fn_t fn) { - int i; + int idx, err; - if (call->preds && !pred->compound) - filter_free_preds(call); + if (call->n_preds && !pred->compound) + filter_disable_preds(call); - if (!call->preds) { - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), - GFP_KERNEL); - if (!call->preds) - return -ENOMEM; - } + if (call->n_preds == MAX_FILTER_PRED) + return -ENOSPC; - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!call->preds[i]) { - call->preds[i] = pred; - return 0; - } - } + idx = call->n_preds; + filter_clear_pred(call->preds[idx]); + err = filter_set_pred(call->preds[idx], pred, fn); + if (err) + return err; - return -ENOSPC; + call->n_preds++; + + return 0; } static int is_string_field(const char *type) @@ -229,98 +277,66 @@ static int is_string_field(const char *type) int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) { struct ftrace_event_field *field; + filter_pred_fn_t fn; field = find_event_field(call, pred->field_name); if (!field) return -EINVAL; + pred->fn = filter_pred_none; pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_val) + if (!pred->str_len) return -EINVAL; - pred->fn = filter_pred_string; + fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred); + return __filter_add_pred(call, pred, fn); } else { - if (pred->str_val) + if (pred->str_len) return -EINVAL; } switch (field->size) { case 8: - pred->fn = filter_pred_64; + fn = filter_pred_64; break; case 4: - pred->fn = filter_pred_32; + fn = filter_pred_32; break; case 2: - pred->fn = filter_pred_16; + fn = filter_pred_16; break; case 1: - pred->fn = filter_pred_8; + fn = filter_pred_8; break; default: return -EINVAL; } - return __filter_add_pred(call, pred); -} - -static struct filter_pred *copy_pred(struct filter_pred *pred) -{ - struct filter_pred *new_pred = kmalloc(sizeof(*pred), GFP_KERNEL); - if (!new_pred) - return NULL; - - memcpy(new_pred, pred, sizeof(*pred)); - - if (pred->field_name) { - new_pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!new_pred->field_name) { - kfree(new_pred); - return NULL; - } - } - - if (pred->str_val) { - new_pred->str_val = kstrdup(pred->str_val, GFP_KERNEL); - if (!new_pred->str_val) { - filter_free_pred(new_pred); - return NULL; - } - } - - return new_pred; + return __filter_add_pred(call, pred, fn); } int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { struct ftrace_event_call *call = __start_ftrace_events; - struct filter_pred *event_pred; - int i; - if (system->preds && !pred->compound) + if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); - if (!system->preds) { + if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); if (!system->preds) return -ENOMEM; } - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (!system->preds[i]) { - system->preds[i] = pred; - break; - } - } - - if (i == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) return -ENOSPC; + system->preds[system->n_preds] = pred; + events_for_each(call) { int err; @@ -333,22 +349,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (!find_event_field(call, pred->field_name)) continue; - event_pred = copy_pred(pred); - if (!event_pred) - goto oom; - - err = filter_add_pred(call, event_pred); - if (err) - filter_free_pred(event_pred); - if (err == -ENOMEM) - goto oom; + err = filter_add_pred(call, pred); + if (err == -ENOMEM) { + system->preds[system->n_preds] = NULL; + return err; + } } - return 0; + system->n_preds++; -oom: - system->preds[i] = NULL; - return -ENOMEM; + return 0; } int filter_parse(char **pbuf, struct filter_pred *pred) @@ -410,7 +420,8 @@ int filter_parse(char **pbuf, struct filter_pred *pred) } } - if (!val_str) { + if (!val_str || !strlen(val_str) + || strlen(val_str) >= MAX_FILTER_STR_VAL) { pred->field_name = NULL; return -EINVAL; } @@ -419,11 +430,12 @@ int filter_parse(char **pbuf, struct filter_pred *pred) if (!pred->field_name) return -ENOMEM; + pred->str_len = 0; pred->val = simple_strtoull(val_str, &tmp, 0); if (tmp == val_str) { - pred->str_val = kstrdup(val_str, GFP_KERNEL); - if (!pred->str_val) - return -ENOMEM; + strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); + pred->str_len = strlen(val_str); + pred->str_val[pred->str_len] = '\0'; } else if (*tmp != '\0') return -EINVAL; diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 02fb710193e..59cfd7dfe68 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -140,6 +140,7 @@ ftrace_format_##call(struct trace_seq *s) \ #undef __array #define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ sizeof(field.item)); \ diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index b2b298269eb..5bb1b7ffbdb 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -255,6 +255,7 @@ static int ftrace_raw_init_event_##call(void) \ return -ENODEV; \ event_##call.id = id; \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ \ diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 77c494f5e1d..48fc02fe73a 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -122,6 +122,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ static int ftrace_raw_init_event_##call(void) \ { \ INIT_LIST_HEAD(&event_##call.fields); \ + init_preds(&event_##call); \ return 0; \ } \ From 6e837fb152410e571a81aaadbd9884f0bc46a55e Mon Sep 17 00:00:00 2001 From: Etienne Basset Date: Wed, 8 Apr 2009 20:39:40 +0200 Subject: [PATCH 0361/2061] smack: implement logging V3 This patch creates auditing functions usable by LSM to audit security events. It provides standard dumping of FS, NET, task etc ... events (code borrowed from SELinux) and provides 2 callbacks to define LSM specific auditing, which should be flexible enough to convert SELinux too. Signed-off-by: Etienne Basset Acked-by: Casey Schaufler cked-by: Eric Paris Signed-off-by: James Morris --- include/linux/lsm_audit.h | 111 +++++++++++ security/lsm_audit.c | 386 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 497 insertions(+) create mode 100644 include/linux/lsm_audit.h create mode 100644 security/lsm_audit.c diff --git a/include/linux/lsm_audit.h b/include/linux/lsm_audit.h new file mode 100644 index 00000000000..e461b2c3d71 --- /dev/null +++ b/include/linux/lsm_audit.h @@ -0,0 +1,111 @@ +/* + * Common LSM logging functions + * Heavily borrowed from selinux/avc.h + * + * Author : Etienne BASSET + * + * All credits to : Stephen Smalley, + * All BUGS to : Etienne BASSET + */ +#ifndef _LSM_COMMON_LOGGING_ +#define _LSM_COMMON_LOGGING_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +/* Auxiliary data to use in generating the audit record. */ +struct common_audit_data { + char type; +#define LSM_AUDIT_DATA_FS 1 +#define LSM_AUDIT_DATA_NET 2 +#define LSM_AUDIT_DATA_CAP 3 +#define LSM_AUDIT_DATA_IPC 4 +#define LSM_AUDIT_DATA_TASK 5 +#define LSM_AUDIT_DATA_KEY 6 + struct task_struct *tsk; + union { + struct { + struct path path; + struct inode *inode; + } fs; + struct { + int netif; + struct sock *sk; + u16 family; + __be16 dport; + __be16 sport; + union { + struct { + __be32 daddr; + __be32 saddr; + } v4; + struct { + struct in6_addr daddr; + struct in6_addr saddr; + } v6; + } fam; + } net; + int cap; + int ipc_id; + struct task_struct *tsk; +#ifdef CONFIG_KEYS + struct { + key_serial_t key; + char *key_desc; + } key_struct; +#endif + } u; + const char *function; + /* this union contains LSM specific data */ + union { + /* SMACK data */ + struct smack_audit_data { + char *subject; + char *object; + char *request; + int result; + } smack_audit_data; + /* SELinux data */ + struct { + u32 ssid; + u32 tsid; + u16 tclass; + u32 requested; + u32 audited; + struct av_decision *avd; + int result; + } selinux_audit_data; + } lsm_priv; + /* these callback will be implemented by a specific LSM */ + void (*lsm_pre_audit)(struct audit_buffer *, void *); + void (*lsm_post_audit)(struct audit_buffer *, void *); +}; + +#define v4info fam.v4 +#define v6info fam.v6 + +int ipv4_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto); + +int ipv6_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto); + +/* Initialize an LSM audit data structure. */ +#define COMMON_AUDIT_DATA_INIT(_d, _t) \ + { memset((_d), 0, sizeof(struct common_audit_data)); \ + (_d)->type = LSM_AUDIT_DATA_##_t; (_d)->function = __func__; } + +void common_lsm_audit(struct common_audit_data *a); + +#endif diff --git a/security/lsm_audit.c b/security/lsm_audit.c new file mode 100644 index 00000000000..94b868494b3 --- /dev/null +++ b/security/lsm_audit.c @@ -0,0 +1,386 @@ +/* + * common LSM auditing functions + * + * Based on code written for SELinux by : + * Stephen Smalley, + * James Morris + * Author : Etienne Basset, + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/** + * ipv4_skb_to_auditdata : fill auditdata from skb + * @skb : the skb + * @ad : the audit data to fill + * @proto : the layer 4 protocol + * + * return 0 on success + */ +int ipv4_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto) +{ + int ret = 0; + struct iphdr *ih; + + ih = ip_hdr(skb); + if (ih == NULL) + return -EINVAL; + + ad->u.net.v4info.saddr = ih->saddr; + ad->u.net.v4info.daddr = ih->daddr; + + if (proto) + *proto = ih->protocol; + /* non initial fragment */ + if (ntohs(ih->frag_off) & IP_OFFSET) + return 0; + + switch (ih->protocol) { + case IPPROTO_TCP: { + struct tcphdr *th = tcp_hdr(skb); + if (th == NULL) + break; + + ad->u.net.sport = th->source; + ad->u.net.dport = th->dest; + break; + } + case IPPROTO_UDP: { + struct udphdr *uh = udp_hdr(skb); + if (uh == NULL) + break; + + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; + break; + } + case IPPROTO_DCCP: { + struct dccp_hdr *dh = dccp_hdr(skb); + if (dh == NULL) + break; + + ad->u.net.sport = dh->dccph_sport; + ad->u.net.dport = dh->dccph_dport; + break; + } + case IPPROTO_SCTP: { + struct sctphdr *sh = sctp_hdr(skb); + if (sh == NULL) + break; + ad->u.net.sport = sh->source; + ad->u.net.dport = sh->dest; + break; + } + default: + ret = -EINVAL; + } + return ret; +} +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +/** + * ipv6_skb_to_auditdata : fill auditdata from skb + * @skb : the skb + * @ad : the audit data to fill + * @proto : the layer 4 protocol + * + * return 0 on success + */ +int ipv6_skb_to_auditdata(struct sk_buff *skb, + struct common_audit_data *ad, u8 *proto) +{ + int offset, ret = 0; + struct ipv6hdr *ip6; + u8 nexthdr; + + ip6 = ipv6_hdr(skb); + if (ip6 == NULL) + return -EINVAL; + ipv6_addr_copy(&ad->u.net.v6info.saddr, &ip6->saddr); + ipv6_addr_copy(&ad->u.net.v6info.daddr, &ip6->daddr); + ret = 0; + /* IPv6 can have several extension header before the Transport header + * skip them */ + offset = skb_network_offset(skb); + offset += sizeof(*ip6); + nexthdr = ip6->nexthdr; + offset = ipv6_skip_exthdr(skb, offset, &nexthdr); + if (offset < 0) + return 0; + if (proto) + *proto = nexthdr; + switch (nexthdr) { + case IPPROTO_TCP: { + struct tcphdr _tcph, *th; + + th = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph); + if (th == NULL) + break; + + ad->u.net.sport = th->source; + ad->u.net.dport = th->dest; + break; + } + case IPPROTO_UDP: { + struct udphdr _udph, *uh; + + uh = skb_header_pointer(skb, offset, sizeof(_udph), &_udph); + if (uh == NULL) + break; + + ad->u.net.sport = uh->source; + ad->u.net.dport = uh->dest; + break; + } + case IPPROTO_DCCP: { + struct dccp_hdr _dccph, *dh; + + dh = skb_header_pointer(skb, offset, sizeof(_dccph), &_dccph); + if (dh == NULL) + break; + + ad->u.net.sport = dh->dccph_sport; + ad->u.net.dport = dh->dccph_dport; + break; + } + case IPPROTO_SCTP: { + struct sctphdr _sctph, *sh; + + sh = skb_header_pointer(skb, offset, sizeof(_sctph), &_sctph); + if (sh == NULL) + break; + ad->u.net.sport = sh->source; + ad->u.net.dport = sh->dest; + break; + } + default: + ret = -EINVAL; + } + return ret; +} +#endif + + +static inline void print_ipv6_addr(struct audit_buffer *ab, + struct in6_addr *addr, __be16 port, + char *name1, char *name2) +{ + if (!ipv6_addr_any(addr)) + audit_log_format(ab, " %s=%pI6", name1, addr); + if (port) + audit_log_format(ab, " %s=%d", name2, ntohs(port)); +} + +static inline void print_ipv4_addr(struct audit_buffer *ab, __be32 addr, + __be16 port, char *name1, char *name2) +{ + if (addr) + audit_log_format(ab, " %s=%pI4", name1, &addr); + if (port) + audit_log_format(ab, " %s=%d", name2, ntohs(port)); +} + +/** + * dump_common_audit_data - helper to dump common audit data + * @a : common audit data + * + */ +static void dump_common_audit_data(struct audit_buffer *ab, + struct common_audit_data *a) +{ + struct inode *inode = NULL; + struct task_struct *tsk = current; + + if (a->tsk) + tsk = a->tsk; + if (tsk && tsk->pid) { + audit_log_format(ab, " pid=%d comm=", tsk->pid); + audit_log_untrustedstring(ab, tsk->comm); + } + + switch (a->type) { + case LSM_AUDIT_DATA_IPC: + audit_log_format(ab, " key=%d ", a->u.ipc_id); + break; + case LSM_AUDIT_DATA_CAP: + audit_log_format(ab, " capability=%d ", a->u.cap); + break; + case LSM_AUDIT_DATA_FS: + if (a->u.fs.path.dentry) { + struct dentry *dentry = a->u.fs.path.dentry; + if (a->u.fs.path.mnt) { + audit_log_d_path(ab, "path=", &a->u.fs.path); + } else { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, + dentry->d_name.name); + } + inode = dentry->d_inode; + } else if (a->u.fs.inode) { + struct dentry *dentry; + inode = a->u.fs.inode; + dentry = d_find_alias(inode); + if (dentry) { + audit_log_format(ab, " name="); + audit_log_untrustedstring(ab, + dentry->d_name.name); + dput(dentry); + } + } + if (inode) + audit_log_format(ab, " dev=%s ino=%lu", + inode->i_sb->s_id, + inode->i_ino); + break; + case LSM_AUDIT_DATA_TASK: + tsk = a->u.tsk; + if (tsk && tsk->pid) { + audit_log_format(ab, " pid=%d comm=", tsk->pid); + audit_log_untrustedstring(ab, tsk->comm); + } + break; + case LSM_AUDIT_DATA_NET: + if (a->u.net.sk) { + struct sock *sk = a->u.net.sk; + struct unix_sock *u; + int len = 0; + char *p = NULL; + + switch (sk->sk_family) { + case AF_INET: { + struct inet_sock *inet = inet_sk(sk); + + print_ipv4_addr(ab, inet->rcv_saddr, + inet->sport, + "laddr", "lport"); + print_ipv4_addr(ab, inet->daddr, + inet->dport, + "faddr", "fport"); + break; + } + case AF_INET6: { + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *inet6 = inet6_sk(sk); + + print_ipv6_addr(ab, &inet6->rcv_saddr, + inet->sport, + "laddr", "lport"); + print_ipv6_addr(ab, &inet6->daddr, + inet->dport, + "faddr", "fport"); + break; + } + case AF_UNIX: + u = unix_sk(sk); + if (u->dentry) { + struct path path = { + .dentry = u->dentry, + .mnt = u->mnt + }; + audit_log_d_path(ab, "path=", &path); + break; + } + if (!u->addr) + break; + len = u->addr->len-sizeof(short); + p = &u->addr->name->sun_path[0]; + audit_log_format(ab, " path="); + if (*p) + audit_log_untrustedstring(ab, p); + else + audit_log_n_hex(ab, p, len); + break; + } + } + + switch (a->u.net.family) { + case AF_INET: + print_ipv4_addr(ab, a->u.net.v4info.saddr, + a->u.net.sport, + "saddr", "src"); + print_ipv4_addr(ab, a->u.net.v4info.daddr, + a->u.net.dport, + "daddr", "dest"); + break; + case AF_INET6: + print_ipv6_addr(ab, &a->u.net.v6info.saddr, + a->u.net.sport, + "saddr", "src"); + print_ipv6_addr(ab, &a->u.net.v6info.daddr, + a->u.net.dport, + "daddr", "dest"); + break; + } + if (a->u.net.netif > 0) { + struct net_device *dev; + + /* NOTE: we always use init's namespace */ + dev = dev_get_by_index(&init_net, a->u.net.netif); + if (dev) { + audit_log_format(ab, " netif=%s", dev->name); + dev_put(dev); + } + } + break; +#ifdef CONFIG_KEYS + case LSM_AUDIT_DATA_KEY: + audit_log_format(ab, " key_serial=%u", a->u.key_struct.key); + if (a->u.key_struct.key_desc) { + audit_log_format(ab, " key_desc="); + audit_log_untrustedstring(ab, a->u.key_struct.key_desc); + } + break; +#endif + } /* switch (a->type) */ +} + +/** + * common_lsm_audit - generic LSM auditing function + * @a: auxiliary audit data + * + * setup the audit buffer for common security information + * uses callback to print LSM specific information + */ +void common_lsm_audit(struct common_audit_data *a) +{ + struct audit_buffer *ab; + + if (a == NULL) + return; + /* we use GFP_ATOMIC so we won't sleep */ + ab = audit_log_start(current->audit_context, GFP_ATOMIC, AUDIT_AVC); + + if (ab == NULL) + return; + + if (a->lsm_pre_audit) + a->lsm_pre_audit(ab, a); + + dump_common_audit_data(ab, a); + + if (a->lsm_post_audit) + a->lsm_post_audit(ab, a); + + audit_log_end(ab); +} From ecfcc53fef3c357574bb6143dce6631e6d56295c Mon Sep 17 00:00:00 2001 From: Etienne Basset Date: Wed, 8 Apr 2009 20:40:06 +0200 Subject: [PATCH 0362/2061] smack: implement logging V3 the following patch, add logging of Smack security decisions. This is of course very useful to understand what your current smack policy does. As suggested by Casey, it also now forbids labels with ', " or \ It introduces a '/smack/logging' switch : 0: no logging 1: log denied (default) 2: log accepted 3: log denied&accepted Signed-off-by: Etienne Basset Acked-by: Casey Schaufler Acked-by: Eric Paris Signed-off-by: James Morris --- Documentation/Smack.txt | 20 +- security/Makefile | 3 + security/smack/smack.h | 108 +++++++++- security/smack/smack_access.c | 141 ++++++++++-- security/smack/smack_lsm.c | 390 ++++++++++++++++++++++++++-------- security/smack/smackfs.c | 66 ++++++ 6 files changed, 617 insertions(+), 111 deletions(-) diff --git a/Documentation/Smack.txt b/Documentation/Smack.txt index 629c92e9978..34614b4c708 100644 --- a/Documentation/Smack.txt +++ b/Documentation/Smack.txt @@ -184,8 +184,9 @@ length. Single character labels using special characters, that being anything other than a letter or digit, are reserved for use by the Smack development team. Smack labels are unstructured, case sensitive, and the only operation ever performed on them is comparison for equality. Smack labels cannot -contain unprintable characters or the "/" (slash) character. Smack labels -cannot begin with a '-', which is reserved for special options. +contain unprintable characters, the "/" (slash), the "\" (backslash), the "'" +(quote) and '"' (double-quote) characters. +Smack labels cannot begin with a '-', which is reserved for special options. There are some predefined labels: @@ -523,3 +524,18 @@ Smack supports some mount options: These mount options apply to all file system types. +Smack auditing + +If you want Smack auditing of security events, you need to set CONFIG_AUDIT +in your kernel configuration. +By default, all denied events will be audited. You can change this behavior by +writing a single character to the /smack/logging file : +0 : no logging +1 : log denied (default) +2 : log accepted +3 : log denied & accepted + +Events are logged as 'key=value' pairs, for each event you at least will get +the subjet, the object, the rights requested, the action, the kernel function +that triggered the event, plus other pairs depending on the type of event +audited. diff --git a/security/Makefile b/security/Makefile index fa77021d977..c67557cdaa8 100644 --- a/security/Makefile +++ b/security/Makefile @@ -16,6 +16,9 @@ obj-$(CONFIG_SECURITYFS) += inode.o # Must precede capability.o in order to stack properly. obj-$(CONFIG_SECURITY_SELINUX) += selinux/built-in.o obj-$(CONFIG_SECURITY_SMACK) += smack/built-in.o +ifeq ($(CONFIG_AUDIT),y) +obj-$(CONFIG_SECURITY_SMACK) += lsm_audit.o +endif obj-$(CONFIG_SECURITY_TOMOYO) += tomoyo/built-in.o obj-$(CONFIG_SECURITY_ROOTPLUG) += root_plug.o obj-$(CONFIG_CGROUP_DEVICE) += device_cgroup.o diff --git a/security/smack/smack.h b/security/smack/smack.h index 42ef313f985..243bec175be 100644 --- a/security/smack/smack.h +++ b/security/smack/smack.h @@ -20,6 +20,7 @@ #include #include #include +#include /* * Why 23? CIPSO is constrained to 30, so a 32 byte buffer is @@ -178,6 +179,20 @@ struct smack_known { #define MAY_READWRITE (MAY_READ | MAY_WRITE) #define MAY_NOT 0 +/* + * Number of access types used by Smack (rwxa) + */ +#define SMK_NUM_ACCESS_TYPE 4 + +/* + * Smack audit data; is empty if CONFIG_AUDIT not set + * to save some stack + */ +struct smk_audit_info { +#ifdef CONFIG_AUDIT + struct common_audit_data a; +#endif +}; /* * These functions are in smack_lsm.c */ @@ -186,8 +201,8 @@ struct inode_smack *new_inode_smack(char *); /* * These functions are in smack_access.c */ -int smk_access(char *, char *, int); -int smk_curacc(char *, u32); +int smk_access(char *, char *, int, struct smk_audit_info *); +int smk_curacc(char *, u32, struct smk_audit_info *); int smack_to_cipso(const char *, struct smack_cipso *); void smack_from_cipso(u32, char *, char *); char *smack_from_secid(const u32); @@ -237,4 +252,93 @@ static inline char *smk_of_inode(const struct inode *isp) return sip->smk_inode; } +/* + * logging functions + */ +#define SMACK_AUDIT_DENIED 0x1 +#define SMACK_AUDIT_ACCEPT 0x2 +extern int log_policy; + +void smack_log(char *subject_label, char *object_label, + int request, + int result, struct smk_audit_info *auditdata); + +#ifdef CONFIG_AUDIT + +/* + * some inline functions to set up audit data + * they do nothing if CONFIG_AUDIT is not set + * + */ +static inline void smk_ad_init(struct smk_audit_info *a, const char *func, + char type) +{ + memset(a, 0, sizeof(*a)); + a->a.type = type; + a->a.function = func; +} + +static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a, + struct task_struct *t) +{ + a->a.u.tsk = t; +} +static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a, + struct dentry *d) +{ + a->a.u.fs.path.dentry = d; +} +static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a, + struct vfsmount *m) +{ + a->a.u.fs.path.mnt = m; +} +static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a, + struct inode *i) +{ + a->a.u.fs.inode = i; +} +static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a, + struct path p) +{ + a->a.u.fs.path = p; +} +static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a, + struct sock *sk) +{ + a->a.u.net.sk = sk; +} + +#else /* no AUDIT */ + +static inline void smk_ad_init(struct smk_audit_info *a, const char *func, + char type) +{ +} +static inline void smk_ad_setfield_u_tsk(struct smk_audit_info *a, + struct task_struct *t) +{ +} +static inline void smk_ad_setfield_u_fs_path_dentry(struct smk_audit_info *a, + struct dentry *d) +{ +} +static inline void smk_ad_setfield_u_fs_path_mnt(struct smk_audit_info *a, + struct vfsmount *m) +{ +} +static inline void smk_ad_setfield_u_fs_inode(struct smk_audit_info *a, + struct inode *i) +{ +} +static inline void smk_ad_setfield_u_fs_path(struct smk_audit_info *a, + struct path p) +{ +} +static inline void smk_ad_setfield_u_net_sk(struct smk_audit_info *a, + struct sock *sk) +{ +} +#endif + #endif /* _SECURITY_SMACK_H */ diff --git a/security/smack/smack_access.c b/security/smack/smack_access.c index ac0a2707f6d..513dc1aa16d 100644 --- a/security/smack/smack_access.c +++ b/security/smack/smack_access.c @@ -59,11 +59,18 @@ LIST_HEAD(smack_known_list); */ static u32 smack_next_secid = 10; +/* + * what events do we log + * can be overwritten at run-time by /smack/logging + */ +int log_policy = SMACK_AUDIT_DENIED; + /** * smk_access - determine if a subject has a specific access to an object * @subject_label: a pointer to the subject's Smack label * @object_label: a pointer to the object's Smack label * @request: the access requested, in "MAY" format + * @a : a pointer to the audit data * * This function looks up the subject/object pair in the * access rule list and returns 0 if the access is permitted, @@ -78,10 +85,12 @@ static u32 smack_next_secid = 10; * will be on the list, so checking the pointers may be a worthwhile * optimization. */ -int smk_access(char *subject_label, char *object_label, int request) +int smk_access(char *subject_label, char *object_label, int request, + struct smk_audit_info *a) { u32 may = MAY_NOT; struct smack_rule *srp; + int rc = 0; /* * Hardcoded comparisons. @@ -89,8 +98,10 @@ int smk_access(char *subject_label, char *object_label, int request) * A star subject can't access any object. */ if (subject_label == smack_known_star.smk_known || - strcmp(subject_label, smack_known_star.smk_known) == 0) - return -EACCES; + strcmp(subject_label, smack_known_star.smk_known) == 0) { + rc = -EACCES; + goto out_audit; + } /* * An internet object can be accessed by any subject. * Tasks cannot be assigned the internet label. @@ -100,20 +111,20 @@ int smk_access(char *subject_label, char *object_label, int request) subject_label == smack_known_web.smk_known || strcmp(object_label, smack_known_web.smk_known) == 0 || strcmp(subject_label, smack_known_web.smk_known) == 0) - return 0; + goto out_audit; /* * A star object can be accessed by any subject. */ if (object_label == smack_known_star.smk_known || strcmp(object_label, smack_known_star.smk_known) == 0) - return 0; + goto out_audit; /* * An object can be accessed in any way by a subject * with the same label. */ if (subject_label == object_label || strcmp(subject_label, object_label) == 0) - return 0; + goto out_audit; /* * A hat subject can read any object. * A floor object can be read by any subject. @@ -121,10 +132,10 @@ int smk_access(char *subject_label, char *object_label, int request) if ((request & MAY_ANYREAD) == request) { if (object_label == smack_known_floor.smk_known || strcmp(object_label, smack_known_floor.smk_known) == 0) - return 0; + goto out_audit; if (subject_label == smack_known_hat.smk_known || strcmp(subject_label, smack_known_hat.smk_known) == 0) - return 0; + goto out_audit; } /* * Beyond here an explicit relationship is required. @@ -148,28 +159,36 @@ int smk_access(char *subject_label, char *object_label, int request) * This is a bit map operation. */ if ((request & may) == request) - return 0; + goto out_audit; - return -EACCES; + rc = -EACCES; +out_audit: +#ifdef CONFIG_AUDIT + if (a) + smack_log(subject_label, object_label, request, rc, a); +#endif + return rc; } /** * smk_curacc - determine if current has a specific access to an object * @obj_label: a pointer to the object's Smack label * @mode: the access requested, in "MAY" format + * @a : common audit data * * This function checks the current subject label/object label pair * in the access rule list and returns 0 if the access is permitted, * non zero otherwise. It allows that current may have the capability * to override the rules. */ -int smk_curacc(char *obj_label, u32 mode) +int smk_curacc(char *obj_label, u32 mode, struct smk_audit_info *a) { int rc; + char *sp = current_security(); - rc = smk_access(current_security(), obj_label, mode); + rc = smk_access(sp, obj_label, mode, NULL); if (rc == 0) - return 0; + goto out_audit; /* * Return if a specific label has been designated as the @@ -177,14 +196,105 @@ int smk_curacc(char *obj_label, u32 mode) * have that label. */ if (smack_onlycap != NULL && smack_onlycap != current->cred->security) - return rc; + goto out_audit; if (capable(CAP_MAC_OVERRIDE)) return 0; +out_audit: +#ifdef CONFIG_AUDIT + if (a) + smack_log(sp, obj_label, mode, rc, a); +#endif return rc; } +#ifdef CONFIG_AUDIT +/** + * smack_str_from_perm : helper to transalate an int to a + * readable string + * @string : the string to fill + * @access : the int + * + */ +static inline void smack_str_from_perm(char *string, int access) +{ + int i = 0; + if (access & MAY_READ) + string[i++] = 'r'; + if (access & MAY_WRITE) + string[i++] = 'w'; + if (access & MAY_EXEC) + string[i++] = 'x'; + if (access & MAY_APPEND) + string[i++] = 'a'; + string[i] = '\0'; +} +/** + * smack_log_callback - SMACK specific information + * will be called by generic audit code + * @ab : the audit_buffer + * @a : audit_data + * + */ +static void smack_log_callback(struct audit_buffer *ab, void *a) +{ + struct common_audit_data *ad = a; + struct smack_audit_data *sad = &ad->lsm_priv.smack_audit_data; + audit_log_format(ab, "lsm=SMACK fn=%s action=%s", ad->function, + sad->result ? "denied" : "granted"); + audit_log_format(ab, " subject="); + audit_log_untrustedstring(ab, sad->subject); + audit_log_format(ab, " object="); + audit_log_untrustedstring(ab, sad->object); + audit_log_format(ab, " requested=%s", sad->request); +} + +/** + * smack_log - Audit the granting or denial of permissions. + * @subject_label : smack label of the requester + * @object_label : smack label of the object being accessed + * @request: requested permissions + * @result: result from smk_access + * @a: auxiliary audit data + * + * Audit the granting or denial of permissions in accordance + * with the policy. + */ +void smack_log(char *subject_label, char *object_label, int request, + int result, struct smk_audit_info *ad) +{ + char request_buffer[SMK_NUM_ACCESS_TYPE + 1]; + struct smack_audit_data *sad; + struct common_audit_data *a = &ad->a; + + /* check if we have to log the current event */ + if (result != 0 && (log_policy & SMACK_AUDIT_DENIED) == 0) + return; + if (result == 0 && (log_policy & SMACK_AUDIT_ACCEPT) == 0) + return; + + if (a->function == NULL) + a->function = "unknown"; + + /* end preparing the audit data */ + sad = &a->lsm_priv.smack_audit_data; + smack_str_from_perm(request_buffer, request); + sad->subject = subject_label; + sad->object = object_label; + sad->request = request_buffer; + sad->result = result; + a->lsm_pre_audit = smack_log_callback; + + common_lsm_audit(a); +} +#else /* #ifdef CONFIG_AUDIT */ +void smack_log(char *subject_label, char *object_label, int request, + int result, struct smk_audit_info *ad) +{ +} +#endif + static DEFINE_MUTEX(smack_known_lock); /** @@ -209,7 +319,8 @@ struct smack_known *smk_import_entry(const char *string, int len) if (found) smack[i] = '\0'; else if (i >= len || string[i] > '~' || string[i] <= ' ' || - string[i] == '/') { + string[i] == '/' || string[i] == '"' || + string[i] == '\\' || string[i] == '\'') { smack[i] = '\0'; found = 1; } else diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 921514902ec..f557767911c 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -30,7 +30,6 @@ #include #include #include - #include "smack.h" #define task_security(task) (task_cred_xxx((task), security)) @@ -103,14 +102,24 @@ struct inode_smack *new_inode_smack(char *smack) static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) { int rc; + struct smk_audit_info ad; + char *sp, *tsp; rc = cap_ptrace_may_access(ctp, mode); if (rc != 0) return rc; - rc = smk_access(current_security(), task_security(ctp), MAY_READWRITE); + sp = current_security(); + tsp = task_security(ctp); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, ctp); + + /* we won't log here, because rc can be overriden */ + rc = smk_access(sp, tsp, MAY_READWRITE, NULL); if (rc != 0 && capable(CAP_MAC_OVERRIDE)) - return 0; + rc = 0; + + smack_log(sp, tsp, MAY_READWRITE, rc, &ad); return rc; } @@ -125,14 +134,24 @@ static int smack_ptrace_may_access(struct task_struct *ctp, unsigned int mode) static int smack_ptrace_traceme(struct task_struct *ptp) { int rc; + struct smk_audit_info ad; + char *sp, *tsp; rc = cap_ptrace_traceme(ptp); if (rc != 0) return rc; - rc = smk_access(task_security(ptp), current_security(), MAY_READWRITE); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, ptp); + + sp = current_security(); + tsp = task_security(ptp); + /* we won't log here, because rc can be overriden */ + rc = smk_access(tsp, sp, MAY_READWRITE, NULL); if (rc != 0 && has_capability(ptp, CAP_MAC_OVERRIDE)) - return 0; + rc = 0; + + smack_log(tsp, sp, MAY_READWRITE, rc, &ad); return rc; } @@ -327,8 +346,14 @@ static int smack_sb_kern_mount(struct super_block *sb, int flags, void *data) static int smack_sb_statfs(struct dentry *dentry) { struct superblock_smack *sbp = dentry->d_sb->s_security; + int rc; + struct smk_audit_info ad; - return smk_curacc(sbp->smk_floor, MAY_READ); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + + rc = smk_curacc(sbp->smk_floor, MAY_READ, &ad); + return rc; } /** @@ -346,8 +371,12 @@ static int smack_sb_mount(char *dev_name, struct path *path, char *type, unsigned long flags, void *data) { struct superblock_smack *sbp = path->mnt->mnt_sb->s_security; + struct smk_audit_info ad; - return smk_curacc(sbp->smk_floor, MAY_WRITE); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path(&ad, *path); + + return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad); } /** @@ -361,10 +390,14 @@ static int smack_sb_mount(char *dev_name, struct path *path, static int smack_sb_umount(struct vfsmount *mnt, int flags) { struct superblock_smack *sbp; + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, mnt->mnt_mountpoint); + smk_ad_setfield_u_fs_path_mnt(&ad, mnt); sbp = mnt->mnt_sb->s_security; - - return smk_curacc(sbp->smk_floor, MAY_WRITE); + return smk_curacc(sbp->smk_floor, MAY_WRITE, &ad); } /* @@ -441,15 +474,20 @@ static int smack_inode_init_security(struct inode *inode, struct inode *dir, static int smack_inode_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_dentry) { - int rc; char *isp; + struct smk_audit_info ad; + int rc; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry); isp = smk_of_inode(old_dentry->d_inode); - rc = smk_curacc(isp, MAY_WRITE); + rc = smk_curacc(isp, MAY_WRITE, &ad); if (rc == 0 && new_dentry->d_inode != NULL) { isp = smk_of_inode(new_dentry->d_inode); - rc = smk_curacc(isp, MAY_WRITE); + smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry); + rc = smk_curacc(isp, MAY_WRITE, &ad); } return rc; @@ -466,18 +504,24 @@ static int smack_inode_link(struct dentry *old_dentry, struct inode *dir, static int smack_inode_unlink(struct inode *dir, struct dentry *dentry) { struct inode *ip = dentry->d_inode; + struct smk_audit_info ad; int rc; + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + /* * You need write access to the thing you're unlinking */ - rc = smk_curacc(smk_of_inode(ip), MAY_WRITE); - if (rc == 0) + rc = smk_curacc(smk_of_inode(ip), MAY_WRITE, &ad); + if (rc == 0) { /* * You also need write access to the containing directory */ - rc = smk_curacc(smk_of_inode(dir), MAY_WRITE); - + smk_ad_setfield_u_fs_path_dentry(&ad, NULL); + smk_ad_setfield_u_fs_inode(&ad, dir); + rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad); + } return rc; } @@ -491,17 +535,24 @@ static int smack_inode_unlink(struct inode *dir, struct dentry *dentry) */ static int smack_inode_rmdir(struct inode *dir, struct dentry *dentry) { + struct smk_audit_info ad; int rc; + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + /* * You need write access to the thing you're removing */ - rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE); - if (rc == 0) + rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad); + if (rc == 0) { /* * You also need write access to the containing directory */ - rc = smk_curacc(smk_of_inode(dir), MAY_WRITE); + smk_ad_setfield_u_fs_path_dentry(&ad, NULL); + smk_ad_setfield_u_fs_inode(&ad, dir); + rc = smk_curacc(smk_of_inode(dir), MAY_WRITE, &ad); + } return rc; } @@ -525,15 +576,19 @@ static int smack_inode_rename(struct inode *old_inode, { int rc; char *isp; + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, old_dentry); isp = smk_of_inode(old_dentry->d_inode); - rc = smk_curacc(isp, MAY_READWRITE); + rc = smk_curacc(isp, MAY_READWRITE, &ad); if (rc == 0 && new_dentry->d_inode != NULL) { isp = smk_of_inode(new_dentry->d_inode); - rc = smk_curacc(isp, MAY_READWRITE); + smk_ad_setfield_u_fs_path_dentry(&ad, new_dentry); + rc = smk_curacc(isp, MAY_READWRITE, &ad); } - return rc; } @@ -548,13 +603,15 @@ static int smack_inode_rename(struct inode *old_inode, */ static int smack_inode_permission(struct inode *inode, int mask) { + struct smk_audit_info ad; /* * No permission to check. Existence test. Yup, it's there. */ if (mask == 0) return 0; - - return smk_curacc(smk_of_inode(inode), mask); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_inode(&ad, inode); + return smk_curacc(smk_of_inode(inode), mask, &ad); } /** @@ -566,13 +623,16 @@ static int smack_inode_permission(struct inode *inode, int mask) */ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr) { + struct smk_audit_info ad; /* * Need to allow for clearing the setuid bit. */ if (iattr->ia_valid & ATTR_FORCE) return 0; + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); - return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE); + return smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad); } /** @@ -584,7 +644,12 @@ static int smack_inode_setattr(struct dentry *dentry, struct iattr *iattr) */ static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry) { - return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ); + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + smk_ad_setfield_u_fs_path_mnt(&ad, mnt); + return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad); } /** @@ -602,6 +667,7 @@ static int smack_inode_getattr(struct vfsmount *mnt, struct dentry *dentry) static int smack_inode_setxattr(struct dentry *dentry, const char *name, const void *value, size_t size, int flags) { + struct smk_audit_info ad; int rc = 0; if (strcmp(name, XATTR_NAME_SMACK) == 0 || @@ -615,8 +681,11 @@ static int smack_inode_setxattr(struct dentry *dentry, const char *name, } else rc = cap_inode_setxattr(dentry, name, value, size, flags); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + if (rc == 0) - rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE); + rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad); return rc; } @@ -671,7 +740,12 @@ static void smack_inode_post_setxattr(struct dentry *dentry, const char *name, */ static int smack_inode_getxattr(struct dentry *dentry, const char *name) { - return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ); + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); + + return smk_curacc(smk_of_inode(dentry->d_inode), MAY_READ, &ad); } /* @@ -685,6 +759,7 @@ static int smack_inode_getxattr(struct dentry *dentry, const char *name) */ static int smack_inode_removexattr(struct dentry *dentry, const char *name) { + struct smk_audit_info ad; int rc = 0; if (strcmp(name, XATTR_NAME_SMACK) == 0 || @@ -695,8 +770,10 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) } else rc = cap_inode_removexattr(dentry, name); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, dentry); if (rc == 0) - rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE); + rc = smk_curacc(smk_of_inode(dentry->d_inode), MAY_WRITE, &ad); return rc; } @@ -855,12 +932,16 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { int rc = 0; + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path(&ad, file->f_path); if (_IOC_DIR(cmd) & _IOC_WRITE) - rc = smk_curacc(file->f_security, MAY_WRITE); + rc = smk_curacc(file->f_security, MAY_WRITE, &ad); if (rc == 0 && (_IOC_DIR(cmd) & _IOC_READ)) - rc = smk_curacc(file->f_security, MAY_READ); + rc = smk_curacc(file->f_security, MAY_READ, &ad); return rc; } @@ -874,7 +955,11 @@ static int smack_file_ioctl(struct file *file, unsigned int cmd, */ static int smack_file_lock(struct file *file, unsigned int cmd) { - return smk_curacc(file->f_security, MAY_WRITE); + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path_dentry(&ad, file->f_path.dentry); + return smk_curacc(file->f_security, MAY_WRITE, &ad); } /** @@ -888,8 +973,12 @@ static int smack_file_lock(struct file *file, unsigned int cmd) static int smack_file_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { + struct smk_audit_info ad; int rc; + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_FS); + smk_ad_setfield_u_fs_path(&ad, file->f_path); + switch (cmd) { case F_DUPFD: case F_GETFD: @@ -897,7 +986,7 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, case F_GETLK: case F_GETOWN: case F_GETSIG: - rc = smk_curacc(file->f_security, MAY_READ); + rc = smk_curacc(file->f_security, MAY_READ, &ad); break; case F_SETFD: case F_SETFL: @@ -905,10 +994,10 @@ static int smack_file_fcntl(struct file *file, unsigned int cmd, case F_SETLKW: case F_SETOWN: case F_SETSIG: - rc = smk_curacc(file->f_security, MAY_WRITE); + rc = smk_curacc(file->f_security, MAY_WRITE, &ad); break; default: - rc = smk_curacc(file->f_security, MAY_READWRITE); + rc = smk_curacc(file->f_security, MAY_READWRITE, &ad); } return rc; @@ -943,14 +1032,21 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, { struct file *file; int rc; + char *tsp = tsk->cred->security; + struct smk_audit_info ad; /* * struct fown_struct is never outside the context of a struct file */ file = container_of(fown, struct file, f_owner); - rc = smk_access(file->f_security, tsk->cred->security, MAY_WRITE); + /* we don't log here as rc can be overriden */ + rc = smk_access(file->f_security, tsp, MAY_WRITE, NULL); if (rc != 0 && has_capability(tsk, CAP_MAC_OVERRIDE)) - return 0; + rc = 0; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, tsk); + smack_log(file->f_security, tsp, MAY_WRITE, rc, &ad); return rc; } @@ -963,7 +1059,10 @@ static int smack_file_send_sigiotask(struct task_struct *tsk, static int smack_file_receive(struct file *file) { int may = 0; + struct smk_audit_info ad; + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_fs_path(&ad, file->f_path); /* * This code relies on bitmasks. */ @@ -972,7 +1071,7 @@ static int smack_file_receive(struct file *file) if (file->f_mode & FMODE_WRITE) may |= MAY_WRITE; - return smk_curacc(file->f_security, may); + return smk_curacc(file->f_security, may, &ad); } /* @@ -1051,6 +1150,22 @@ static int smack_kernel_create_files_as(struct cred *new, return 0; } +/** + * smk_curacc_on_task - helper to log task related access + * @p: the task object + * @access : the access requested + * + * Return 0 if access is permitted + */ +static int smk_curacc_on_task(struct task_struct *p, int access) +{ + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, p); + return smk_curacc(task_security(p), access, &ad); +} + /** * smack_task_setpgid - Smack check on setting pgid * @p: the task object @@ -1060,7 +1175,7 @@ static int smack_kernel_create_files_as(struct cred *new, */ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) { - return smk_curacc(task_security(p), MAY_WRITE); + return smk_curacc_on_task(p, MAY_WRITE); } /** @@ -1071,7 +1186,7 @@ static int smack_task_setpgid(struct task_struct *p, pid_t pgid) */ static int smack_task_getpgid(struct task_struct *p) { - return smk_curacc(task_security(p), MAY_READ); + return smk_curacc_on_task(p, MAY_READ); } /** @@ -1082,7 +1197,7 @@ static int smack_task_getpgid(struct task_struct *p) */ static int smack_task_getsid(struct task_struct *p) { - return smk_curacc(task_security(p), MAY_READ); + return smk_curacc_on_task(p, MAY_READ); } /** @@ -1110,7 +1225,7 @@ static int smack_task_setnice(struct task_struct *p, int nice) rc = cap_task_setnice(p, nice); if (rc == 0) - rc = smk_curacc(task_security(p), MAY_WRITE); + rc = smk_curacc_on_task(p, MAY_WRITE); return rc; } @@ -1127,7 +1242,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) rc = cap_task_setioprio(p, ioprio); if (rc == 0) - rc = smk_curacc(task_security(p), MAY_WRITE); + rc = smk_curacc_on_task(p, MAY_WRITE); return rc; } @@ -1139,7 +1254,7 @@ static int smack_task_setioprio(struct task_struct *p, int ioprio) */ static int smack_task_getioprio(struct task_struct *p) { - return smk_curacc(task_security(p), MAY_READ); + return smk_curacc_on_task(p, MAY_READ); } /** @@ -1157,7 +1272,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, rc = cap_task_setscheduler(p, policy, lp); if (rc == 0) - rc = smk_curacc(task_security(p), MAY_WRITE); + rc = smk_curacc_on_task(p, MAY_WRITE); return rc; } @@ -1169,7 +1284,7 @@ static int smack_task_setscheduler(struct task_struct *p, int policy, */ static int smack_task_getscheduler(struct task_struct *p) { - return smk_curacc(task_security(p), MAY_READ); + return smk_curacc_on_task(p, MAY_READ); } /** @@ -1180,7 +1295,7 @@ static int smack_task_getscheduler(struct task_struct *p) */ static int smack_task_movememory(struct task_struct *p) { - return smk_curacc(task_security(p), MAY_WRITE); + return smk_curacc_on_task(p, MAY_WRITE); } /** @@ -1198,18 +1313,23 @@ static int smack_task_movememory(struct task_struct *p) static int smack_task_kill(struct task_struct *p, struct siginfo *info, int sig, u32 secid) { + struct smk_audit_info ad; + + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, p); /* * Sending a signal requires that the sender * can write the receiver. */ if (secid == 0) - return smk_curacc(task_security(p), MAY_WRITE); + return smk_curacc(task_security(p), MAY_WRITE, &ad); /* * If the secid isn't 0 we're dealing with some USB IO * specific behavior. This is not clean. For one thing * we can't take privilege into account. */ - return smk_access(smack_from_secid(secid), task_security(p), MAY_WRITE); + return smk_access(smack_from_secid(secid), task_security(p), + MAY_WRITE, &ad); } /** @@ -1220,11 +1340,15 @@ static int smack_task_kill(struct task_struct *p, struct siginfo *info, */ static int smack_task_wait(struct task_struct *p) { + struct smk_audit_info ad; + char *sp = current_security(); + char *tsp = task_security(p); int rc; - rc = smk_access(current_security(), task_security(p), MAY_WRITE); + /* we don't log here, we can be overriden */ + rc = smk_access(sp, tsp, MAY_WRITE, NULL); if (rc == 0) - return 0; + goto out_log; /* * Allow the operation to succeed if either task @@ -1238,8 +1362,12 @@ static int smack_task_wait(struct task_struct *p) * the smack value. */ if (capable(CAP_MAC_OVERRIDE) || has_capability(p, CAP_MAC_OVERRIDE)) - return 0; - + rc = 0; + /* we log only if we didn't get overriden */ + out_log: + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_TASK); + smk_ad_setfield_u_tsk(&ad, p); + smack_log(sp, tsp, MAY_WRITE, rc, &ad); return rc; } @@ -1455,12 +1583,19 @@ static int smack_netlabel_send(struct sock *sk, struct sockaddr_in *sap) int sk_lbl; char *hostsp; struct socket_smack *ssp = sk->sk_security; + struct smk_audit_info ad; rcu_read_lock(); hostsp = smack_host_label(sap); if (hostsp != NULL) { sk_lbl = SMACK_UNLABELED_SOCKET; - rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE); +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); + ad.a.u.net.family = sap->sin_family; + ad.a.u.net.dport = sap->sin_port; + ad.a.u.net.v4info.daddr = sap->sin_addr.s_addr; +#endif + rc = smk_access(ssp->smk_out, hostsp, MAY_WRITE, &ad); } else { sk_lbl = SMACK_CIPSO_SOCKET; rc = 0; @@ -1655,6 +1790,25 @@ static void smack_shm_free_security(struct shmid_kernel *shp) isp->security = NULL; } +/** + * smk_curacc_shm : check if current has access on shm + * @shp : the object + * @access : access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smk_curacc_shm(struct shmid_kernel *shp, int access) +{ + char *ssp = smack_of_shm(shp); + struct smk_audit_info ad; + +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC); + ad.a.u.ipc_id = shp->shm_perm.id; +#endif + return smk_curacc(ssp, access, &ad); +} + /** * smack_shm_associate - Smack access check for shm * @shp: the object @@ -1664,11 +1818,10 @@ static void smack_shm_free_security(struct shmid_kernel *shp) */ static int smack_shm_associate(struct shmid_kernel *shp, int shmflg) { - char *ssp = smack_of_shm(shp); int may; may = smack_flags_to_may(shmflg); - return smk_curacc(ssp, may); + return smk_curacc_shm(shp, may); } /** @@ -1680,7 +1833,6 @@ static int smack_shm_associate(struct shmid_kernel *shp, int shmflg) */ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd) { - char *ssp; int may; switch (cmd) { @@ -1703,9 +1855,7 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd) default: return -EINVAL; } - - ssp = smack_of_shm(shp); - return smk_curacc(ssp, may); + return smk_curacc_shm(shp, may); } /** @@ -1719,11 +1869,10 @@ static int smack_shm_shmctl(struct shmid_kernel *shp, int cmd) static int smack_shm_shmat(struct shmid_kernel *shp, char __user *shmaddr, int shmflg) { - char *ssp = smack_of_shm(shp); int may; may = smack_flags_to_may(shmflg); - return smk_curacc(ssp, may); + return smk_curacc_shm(shp, may); } /** @@ -1764,6 +1913,25 @@ static void smack_sem_free_security(struct sem_array *sma) isp->security = NULL; } +/** + * smk_curacc_sem : check if current has access on sem + * @sma : the object + * @access : access requested + * + * Returns 0 if current has the requested access, error code otherwise + */ +static int smk_curacc_sem(struct sem_array *sma, int access) +{ + char *ssp = smack_of_sem(sma); + struct smk_audit_info ad; + +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC); + ad.a.u.ipc_id = sma->sem_perm.id; +#endif + return smk_curacc(ssp, access, &ad); +} + /** * smack_sem_associate - Smack access check for sem * @sma: the object @@ -1773,11 +1941,10 @@ static void smack_sem_free_security(struct sem_array *sma) */ static int smack_sem_associate(struct sem_array *sma, int semflg) { - char *ssp = smack_of_sem(sma); int may; may = smack_flags_to_may(semflg); - return smk_curacc(ssp, may); + return smk_curacc_sem(sma, may); } /** @@ -1789,7 +1956,6 @@ static int smack_sem_associate(struct sem_array *sma, int semflg) */ static int smack_sem_semctl(struct sem_array *sma, int cmd) { - char *ssp; int may; switch (cmd) { @@ -1818,8 +1984,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd) return -EINVAL; } - ssp = smack_of_sem(sma); - return smk_curacc(ssp, may); + return smk_curacc_sem(sma, may); } /** @@ -1836,9 +2001,7 @@ static int smack_sem_semctl(struct sem_array *sma, int cmd) static int smack_sem_semop(struct sem_array *sma, struct sembuf *sops, unsigned nsops, int alter) { - char *ssp = smack_of_sem(sma); - - return smk_curacc(ssp, MAY_READWRITE); + return smk_curacc_sem(sma, MAY_READWRITE); } /** @@ -1879,6 +2042,25 @@ static char *smack_of_msq(struct msg_queue *msq) return (char *)msq->q_perm.security; } +/** + * smk_curacc_msq : helper to check if current has access on msq + * @msq : the msq + * @access : access requested + * + * return 0 if current has access, error otherwise + */ +static int smk_curacc_msq(struct msg_queue *msq, int access) +{ + char *msp = smack_of_msq(msq); + struct smk_audit_info ad; + +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC); + ad.a.u.ipc_id = msq->q_perm.id; +#endif + return smk_curacc(msp, access, &ad); +} + /** * smack_msg_queue_associate - Smack access check for msg_queue * @msq: the object @@ -1888,11 +2070,10 @@ static char *smack_of_msq(struct msg_queue *msq) */ static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg) { - char *msp = smack_of_msq(msq); int may; may = smack_flags_to_may(msqflg); - return smk_curacc(msp, may); + return smk_curacc_msq(msq, may); } /** @@ -1904,7 +2085,6 @@ static int smack_msg_queue_associate(struct msg_queue *msq, int msqflg) */ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd) { - char *msp; int may; switch (cmd) { @@ -1926,8 +2106,7 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd) return -EINVAL; } - msp = smack_of_msq(msq); - return smk_curacc(msp, may); + return smk_curacc_msq(msq, may); } /** @@ -1941,11 +2120,10 @@ static int smack_msg_queue_msgctl(struct msg_queue *msq, int cmd) static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, int msqflg) { - char *msp = smack_of_msq(msq); - int rc; + int may; - rc = smack_flags_to_may(msqflg); - return smk_curacc(msp, rc); + may = smack_flags_to_may(msqflg); + return smk_curacc_msq(msq, may); } /** @@ -1961,9 +2139,7 @@ static int smack_msg_queue_msgsnd(struct msg_queue *msq, struct msg_msg *msg, static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, struct task_struct *target, long type, int mode) { - char *msp = smack_of_msq(msq); - - return smk_curacc(msp, MAY_READWRITE); + return smk_curacc_msq(msq, MAY_READWRITE); } /** @@ -1976,10 +2152,14 @@ static int smack_msg_queue_msgrcv(struct msg_queue *msq, struct msg_msg *msg, static int smack_ipc_permission(struct kern_ipc_perm *ipp, short flag) { char *isp = ipp->security; - int may; + int may = smack_flags_to_may(flag); + struct smk_audit_info ad; - may = smack_flags_to_may(flag); - return smk_curacc(isp, may); +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_IPC); + ad.a.u.ipc_id = ipp->id; +#endif + return smk_curacc(isp, may, &ad); } /** @@ -2238,8 +2418,12 @@ static int smack_unix_stream_connect(struct socket *sock, { struct inode *sp = SOCK_INODE(sock); struct inode *op = SOCK_INODE(other); + struct smk_audit_info ad; - return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_READWRITE); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); + smk_ad_setfield_u_net_sk(&ad, other->sk); + return smk_access(smk_of_inode(sp), smk_of_inode(op), + MAY_READWRITE, &ad); } /** @@ -2254,8 +2438,11 @@ static int smack_unix_may_send(struct socket *sock, struct socket *other) { struct inode *sp = SOCK_INODE(sock); struct inode *op = SOCK_INODE(other); + struct smk_audit_info ad; - return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE); + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); + smk_ad_setfield_u_net_sk(&ad, other->sk); + return smk_access(smk_of_inode(sp), smk_of_inode(op), MAY_WRITE, &ad); } /** @@ -2370,7 +2557,7 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) char smack[SMK_LABELLEN]; char *csp; int rc; - + struct smk_audit_info ad; if (sk->sk_family != PF_INET && sk->sk_family != PF_INET6) return 0; @@ -2388,13 +2575,19 @@ static int smack_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) netlbl_secattr_destroy(&secattr); +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); + ad.a.u.net.family = sk->sk_family; + ad.a.u.net.netif = skb->iif; + ipv4_skb_to_auditdata(skb, &ad.a, NULL); +#endif /* * Receiving a packet requires that the other end * be able to write here. Read access is not required. * This is the simplist possible security model * for networking. */ - rc = smk_access(csp, ssp->smk_in, MAY_WRITE); + rc = smk_access(csp, ssp->smk_in, MAY_WRITE, &ad); if (rc != 0) netlbl_skbuff_err(skb, rc, 0); return rc; @@ -2523,6 +2716,7 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, struct iphdr *hdr; char smack[SMK_LABELLEN]; int rc; + struct smk_audit_info ad; /* handle mapped IPv4 packets arriving via IPv6 sockets */ if (family == PF_INET6 && skb->protocol == htons(ETH_P_IP)) @@ -2536,11 +2730,17 @@ static int smack_inet_conn_request(struct sock *sk, struct sk_buff *skb, strncpy(smack, smack_known_huh.smk_known, SMK_MAXLEN); netlbl_secattr_destroy(&secattr); +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_NET); + ad.a.u.net.family = family; + ad.a.u.net.netif = skb->iif; + ipv4_skb_to_auditdata(skb, &ad.a, NULL); +#endif /* * Receiving a packet requires that the other end be able to write * here. Read access is not required. */ - rc = smk_access(smack, ssp->smk_in, MAY_WRITE); + rc = smk_access(smack, ssp->smk_in, MAY_WRITE, &ad); if (rc != 0) return rc; @@ -2642,6 +2842,7 @@ static int smack_key_permission(key_ref_t key_ref, const struct cred *cred, key_perm_t perm) { struct key *keyp; + struct smk_audit_info ad; keyp = key_ref_to_ptr(key_ref); if (keyp == NULL) @@ -2657,8 +2858,13 @@ static int smack_key_permission(key_ref_t key_ref, */ if (cred->security == NULL) return -EACCES; - - return smk_access(cred->security, keyp->security, MAY_READWRITE); +#ifdef CONFIG_AUDIT + smk_ad_init(&ad, __func__, LSM_AUDIT_DATA_KEY); + ad.a.u.key_struct.key = keyp->serial; + ad.a.u.key_struct.key_desc = keyp->description; +#endif + return smk_access(cred->security, keyp->security, + MAY_READWRITE, &ad); } #endif /* CONFIG_KEYS */ diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index e03a7e19c73..904af348328 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -41,6 +41,7 @@ enum smk_inos { SMK_AMBIENT = 7, /* internet ambient label */ SMK_NETLBLADDR = 8, /* single label hosts */ SMK_ONLYCAP = 9, /* the only "capable" label */ + SMK_LOGGING = 10, /* logging */ }; /* @@ -1191,6 +1192,69 @@ static const struct file_operations smk_onlycap_ops = { .write = smk_write_onlycap, }; +/** + * smk_read_logging - read() for /smack/logging + * @filp: file pointer, not actually used + * @buf: where to put the result + * @cn: maximum to send along + * @ppos: where to start + * + * Returns number of bytes read or error code, as appropriate + */ +static ssize_t smk_read_logging(struct file *filp, char __user *buf, + size_t count, loff_t *ppos) +{ + char temp[32]; + ssize_t rc; + + if (*ppos != 0) + return 0; + + sprintf(temp, "%d\n", log_policy); + rc = simple_read_from_buffer(buf, count, ppos, temp, strlen(temp)); + return rc; +} + +/** + * smk_write_logging - write() for /smack/logging + * @file: file pointer, not actually used + * @buf: where to get the data from + * @count: bytes sent + * @ppos: where to start + * + * Returns number of bytes written or error code, as appropriate + */ +static ssize_t smk_write_logging(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + char temp[32]; + int i; + + if (!capable(CAP_MAC_ADMIN)) + return -EPERM; + + if (count >= sizeof(temp) || count == 0) + return -EINVAL; + + if (copy_from_user(temp, buf, count) != 0) + return -EFAULT; + + temp[count] = '\0'; + + if (sscanf(temp, "%d", &i) != 1) + return -EINVAL; + if (i < 0 || i > 3) + return -EINVAL; + log_policy = i; + return count; +} + + + +static const struct file_operations smk_logging_ops = { + .read = smk_read_logging, + .write = smk_write_logging, +}; /** * smk_fill_super - fill the /smackfs superblock * @sb: the empty superblock @@ -1221,6 +1285,8 @@ static int smk_fill_super(struct super_block *sb, void *data, int silent) {"netlabel", &smk_netlbladdr_ops, S_IRUGO|S_IWUSR}, [SMK_ONLYCAP] = {"onlycap", &smk_onlycap_ops, S_IRUGO|S_IWUSR}, + [SMK_LOGGING] = + {"logging", &smk_logging_ops, S_IRUGO|S_IWUSR}, /* last one */ {""} }; From 7ba5c840e64d4a967379f1ae3eca73278180b11d Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:17 -0700 Subject: [PATCH 0363/2061] rcu: Add __rcu_pending tracing to hierarchical RCU Add tracing to __rcu_pending() to provide information on why RCU processing was kicked off. This is helpful for debugging hierarchical RCU, and might also be helpful in learning how hierarchical RCU operates. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <1239683479943-git-send-email-> Signed-off-by: Ingo Molnar --- include/linux/rcutree.h | 9 +++++- kernel/rcutree.c | 25 ++++++++++++---- kernel/rcutree_trace.c | 64 ++++++++++++++++++++++++++++++++++++++++- 3 files changed, 90 insertions(+), 8 deletions(-) diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 58b2aa5312b..5a5153806c4 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -161,8 +161,15 @@ struct rcu_data { unsigned long offline_fqs; /* Kicked due to being offline. */ unsigned long resched_ipi; /* Sent a resched IPI. */ - /* 5) For future __rcu_pending statistics. */ + /* 5) __rcu_pending() statistics. */ long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rp_qs_pending; + long n_rp_cb_ready; + long n_rp_cpu_needs_gp; + long n_rp_gp_completed; + long n_rp_gp_started; + long n_rp_need_fqs; + long n_rp_need_nothing; int cpu; }; diff --git a/kernel/rcutree.c b/kernel/rcutree.c index d2a372fb0b9..0dccfbba6d2 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1259,31 +1259,44 @@ static int __rcu_pending(struct rcu_state *rsp, struct rcu_data *rdp) check_cpu_stall(rsp, rdp); /* Is the RCU core waiting for a quiescent state from this CPU? */ - if (rdp->qs_pending) + if (rdp->qs_pending) { + rdp->n_rp_qs_pending++; return 1; + } /* Does this CPU have callbacks ready to invoke? */ - if (cpu_has_callbacks_ready_to_invoke(rdp)) + if (cpu_has_callbacks_ready_to_invoke(rdp)) { + rdp->n_rp_cb_ready++; return 1; + } /* Has RCU gone idle with this CPU needing another grace period? */ - if (cpu_needs_another_gp(rsp, rdp)) + if (cpu_needs_another_gp(rsp, rdp)) { + rdp->n_rp_cpu_needs_gp++; return 1; + } /* Has another RCU grace period completed? */ - if (ACCESS_ONCE(rsp->completed) != rdp->completed) /* outside of lock */ + if (ACCESS_ONCE(rsp->completed) != rdp->completed) { /* outside lock */ + rdp->n_rp_gp_completed++; return 1; + } /* Has a new RCU grace period started? */ - if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) /* outside of lock */ + if (ACCESS_ONCE(rsp->gpnum) != rdp->gpnum) { /* outside lock */ + rdp->n_rp_gp_started++; return 1; + } /* Has an RCU GP gone long enough to send resched IPIs &c? */ if (ACCESS_ONCE(rsp->completed) != ACCESS_ONCE(rsp->gpnum) && - ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) + ((long)(ACCESS_ONCE(rsp->jiffies_force_qs) - jiffies) < 0)) { + rdp->n_rp_need_fqs++; return 1; + } /* nothing to do */ + rdp->n_rp_need_nothing++; return 0; } diff --git a/kernel/rcutree_trace.c b/kernel/rcutree_trace.c index 4b1875ba940..fe1dcdbf1ca 100644 --- a/kernel/rcutree_trace.c +++ b/kernel/rcutree_trace.c @@ -213,7 +213,63 @@ static struct file_operations rcugp_fops = { .release = single_release, }; -static struct dentry *rcudir, *datadir, *datadir_csv, *hierdir, *gpdir; +static void print_one_rcu_pending(struct seq_file *m, struct rcu_data *rdp) +{ + seq_printf(m, "%3d%cnp=%ld " + "qsp=%ld cbr=%ld cng=%ld gpc=%ld gps=%ld nf=%ld nn=%ld\n", + rdp->cpu, + cpu_is_offline(rdp->cpu) ? '!' : ' ', + rdp->n_rcu_pending, + rdp->n_rp_qs_pending, + rdp->n_rp_cb_ready, + rdp->n_rp_cpu_needs_gp, + rdp->n_rp_gp_completed, + rdp->n_rp_gp_started, + rdp->n_rp_need_fqs, + rdp->n_rp_need_nothing); +} + +static void print_rcu_pendings(struct seq_file *m, struct rcu_state *rsp) +{ + int cpu; + struct rcu_data *rdp; + + for_each_possible_cpu(cpu) { + rdp = rsp->rda[cpu]; + if (rdp->beenonline) + print_one_rcu_pending(m, rdp); + } +} + +static int show_rcu_pending(struct seq_file *m, void *unused) +{ + seq_puts(m, "rcu:\n"); + print_rcu_pendings(m, &rcu_state); + seq_puts(m, "rcu_bh:\n"); + print_rcu_pendings(m, &rcu_bh_state); + return 0; +} + +static int rcu_pending_open(struct inode *inode, struct file *file) +{ + return single_open(file, show_rcu_pending, NULL); +} + +static struct file_operations rcu_pending_fops = { + .owner = THIS_MODULE, + .open = rcu_pending_open, + .read = seq_read, + .llseek = seq_lseek, + .release = single_release, +}; + +static struct dentry *rcudir; +static struct dentry *datadir; +static struct dentry *datadir_csv; +static struct dentry *gpdir; +static struct dentry *hierdir; +static struct dentry *rcu_pendingdir; + static int __init rcuclassic_trace_init(void) { rcudir = debugfs_create_dir("rcu", NULL); @@ -238,6 +294,11 @@ static int __init rcuclassic_trace_init(void) NULL, &rcuhier_fops); if (!hierdir) goto free_out; + + rcu_pendingdir = debugfs_create_file("rcu_pending", 0444, rcudir, + NULL, &rcu_pending_fops); + if (!rcu_pendingdir) + goto free_out; return 0; free_out: if (datadir) @@ -257,6 +318,7 @@ static void __exit rcuclassic_trace_cleanup(void) debugfs_remove(datadir_csv); debugfs_remove(gpdir); debugfs_remove(hierdir); + debugfs_remove(rcu_pendingdir); debugfs_remove(rcudir); } From 6fd9b3a40b82081d9e6490b0d7cd656e9a78a134 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 13 Apr 2009 21:31:18 -0700 Subject: [PATCH 0364/2061] rcu: Update RCU tracing documentation for __rcu_pending This patch updates the RCU documentation to reflect the changes in tracing made in the previous patch in the set. Located-by: Anton Blanchard Tested-by: Anton Blanchard Signed-off-by: Paul E. McKenney Cc: anton@samba.org Cc: akpm@linux-foundation.org Cc: dipankar@in.ibm.com Cc: manfred@colorfullife.com Cc: cl@linux-foundation.org Cc: josht@linux.vnet.ibm.com Cc: schamp@sgi.com Cc: niv@us.ibm.com Cc: dvhltc@us.ibm.com Cc: ego@in.ibm.com Cc: laijs@cn.fujitsu.com Cc: rostedt@goodmis.org Cc: peterz@infradead.org Cc: penberg@cs.helsinki.fi Cc: andi@firstfloor.org Cc: "Paul E. McKenney" LKML-Reference: <12396834792865-git-send-email-> Signed-off-by: Ingo Molnar --- Documentation/RCU/trace.txt | 102 ++++++++++++++++++++++++++++-------- 1 file changed, 80 insertions(+), 22 deletions(-) diff --git a/Documentation/RCU/trace.txt b/Documentation/RCU/trace.txt index 068848240a8..02cced183b2 100644 --- a/Documentation/RCU/trace.txt +++ b/Documentation/RCU/trace.txt @@ -192,23 +192,24 @@ rcu/rcuhier (which displays the struct rcu_node hierarchy). The output of "cat rcu/rcudata" looks as follows: rcu: - 0 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=1 rp=3c2a dt=23301/73 dn=2 df=1882 of=0 ri=2126 ql=2 b=10 - 1 c=4011 g=4012 pq=1 pqc=4011 qp=0 rpfq=3 rp=39a6 dt=78073/1 dn=2 df=1402 of=0 ri=1875 ql=46 b=10 - 2 c=4010 g=4010 pq=1 pqc=4010 qp=0 rpfq=-5 rp=1d12 dt=16646/0 dn=2 df=3140 of=0 ri=2080 ql=0 b=10 - 3 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=2b50 dt=21159/1 dn=2 df=2230 of=0 ri=1923 ql=72 b=10 - 4 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1644 dt=5783/1 dn=2 df=3348 of=0 ri=2805 ql=7 b=10 - 5 c=4012 g=4013 pq=0 pqc=4011 qp=1 rpfq=3 rp=1aac dt=5879/1 dn=2 df=3140 of=0 ri=2066 ql=10 b=10 - 6 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=ed8 dt=5847/1 dn=2 df=3797 of=0 ri=1266 ql=10 b=10 - 7 c=4012 g=4013 pq=1 pqc=4012 qp=1 rpfq=3 rp=1fa2 dt=6199/1 dn=2 df=2795 of=0 ri=2162 ql=28 b=10 +rcu: + 0 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=10951/1 dn=0 df=1101 of=0 ri=36 ql=0 b=10 + 1 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=16117/1 dn=0 df=1015 of=0 ri=0 ql=0 b=10 + 2 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1445/1 dn=0 df=1839 of=0 ri=0 ql=0 b=10 + 3 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=6681/1 dn=0 df=1545 of=0 ri=0 ql=0 b=10 + 4 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=1003/1 dn=0 df=1992 of=0 ri=0 ql=0 b=10 + 5 c=17829 g=17830 pq=1 pqc=17829 qp=1 dt=3887/1 dn=0 df=3331 of=0 ri=4 ql=2 b=10 + 6 c=17829 g=17829 pq=1 pqc=17829 qp=0 dt=859/1 dn=0 df=3224 of=0 ri=0 ql=0 b=10 + 7 c=17829 g=17830 pq=0 pqc=17829 qp=1 dt=3761/1 dn=0 df=1818 of=0 ri=0 ql=2 b=10 rcu_bh: - 0 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-145 rp=21d6 dt=23301/73 dn=2 df=0 of=0 ri=0 ql=0 b=10 - 1 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-170 rp=20ce dt=78073/1 dn=2 df=26 of=0 ri=5 ql=0 b=10 - 2 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-83 rp=fbd dt=16646/0 dn=2 df=28 of=0 ri=4 ql=0 b=10 - 3 c=-268 g=-268 pq=1 pqc=-268 qp=0 rpfq=-105 rp=178c dt=21159/1 dn=2 df=28 of=0 ri=2 ql=0 b=10 - 4 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-30 rp=b54 dt=5783/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 - 5 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-29 rp=df5 dt=5879/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 - 6 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-28 rp=788 dt=5847/1 dn=2 df=32 of=0 ri=0 ql=0 b=10 - 7 c=-268 g=-268 pq=1 pqc=-268 qp=1 rpfq=-53 rp=1098 dt=6199/1 dn=2 df=30 of=0 ri=3 ql=0 b=10 + 0 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=10951/1 dn=0 df=0 of=0 ri=0 ql=0 b=10 + 1 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=16117/1 dn=0 df=13 of=0 ri=0 ql=0 b=10 + 2 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1445/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 3 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=6681/1 dn=0 df=9 of=0 ri=0 ql=0 b=10 + 4 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=1003/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 5 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3887/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 6 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=859/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 + 7 c=-275 g=-275 pq=1 pqc=-275 qp=0 dt=3761/1 dn=0 df=15 of=0 ri=0 ql=0 b=10 The first section lists the rcu_data structures for rcu, the second for rcu_bh. Each section has one line per CPU, or eight for this 8-CPU system. @@ -253,12 +254,6 @@ o "pqc" indicates which grace period the last-observed quiescent o "qp" indicates that RCU still expects a quiescent state from this CPU. -o "rpfq" is the number of rcu_pending() calls on this CPU required - to induce this CPU to invoke force_quiescent_state(). - -o "rp" is low-order four hex digits of the count of how many times - rcu_pending() has been invoked on this CPU. - o "dt" is the current value of the dyntick counter that is incremented when entering or leaving dynticks idle state, either by the scheduler or by irq. The number after the "/" is the interrupt @@ -305,6 +300,9 @@ o "b" is the batch limit for this CPU. If more than this number of RCU callbacks is ready to invoke, then the remainder will be deferred. +There is also an rcu/rcudata.csv file with the same information in +comma-separated-variable spreadsheet format. + The output of "cat rcu/rcugp" looks as follows: @@ -411,3 +409,63 @@ o Each element of the form "1/1 0:127 ^0" represents one struct For example, the first entry at the lowest level shows "^0", indicating that it corresponds to bit zero in the first entry at the middle level. + + +The output of "cat rcu/rcu_pending" looks as follows: + +rcu: + 0 np=255892 qsp=53936 cbr=0 cng=14417 gpc=10033 gps=24320 nf=6445 nn=146741 + 1 np=261224 qsp=54638 cbr=0 cng=25723 gpc=16310 gps=2849 nf=5912 nn=155792 + 2 np=237496 qsp=49664 cbr=0 cng=2762 gpc=45478 gps=1762 nf=1201 nn=136629 + 3 np=236249 qsp=48766 cbr=0 cng=286 gpc=48049 gps=1218 nf=207 nn=137723 + 4 np=221310 qsp=46850 cbr=0 cng=26 gpc=43161 gps=4634 nf=3529 nn=123110 + 5 np=237332 qsp=48449 cbr=0 cng=54 gpc=47920 gps=3252 nf=201 nn=137456 + 6 np=219995 qsp=46718 cbr=0 cng=50 gpc=42098 gps=6093 nf=4202 nn=120834 + 7 np=249893 qsp=49390 cbr=0 cng=72 gpc=38400 gps=17102 nf=41 nn=144888 +rcu_bh: + 0 np=146741 qsp=1419 cbr=0 cng=6 gpc=0 gps=0 nf=2 nn=145314 + 1 np=155792 qsp=12597 cbr=0 cng=0 gpc=4 gps=8 nf=3 nn=143180 + 2 np=136629 qsp=18680 cbr=0 cng=0 gpc=7 gps=6 nf=0 nn=117936 + 3 np=137723 qsp=2843 cbr=0 cng=0 gpc=10 gps=7 nf=0 nn=134863 + 4 np=123110 qsp=12433 cbr=0 cng=0 gpc=4 gps=2 nf=0 nn=110671 + 5 np=137456 qsp=4210 cbr=0 cng=0 gpc=6 gps=5 nf=0 nn=133235 + 6 np=120834 qsp=9902 cbr=0 cng=0 gpc=6 gps=3 nf=2 nn=110921 + 7 np=144888 qsp=26336 cbr=0 cng=0 gpc=8 gps=2 nf=0 nn=118542 + +As always, this is once again split into "rcu" and "rcu_bh" portions. +The fields are as follows: + +o "np" is the number of times that __rcu_pending() has been invoked + for the corresponding flavor of RCU. + +o "qsp" is the number of times that the RCU was waiting for a + quiescent state from this CPU. + +o "cbr" is the number of times that this CPU had RCU callbacks + that had passed through a grace period, and were thus ready + to be invoked. + +o "cng" is the number of times that this CPU needed another + grace period while RCU was idle. + +o "gpc" is the number of times that an old grace period had + completed, but this CPU was not yet aware of it. + +o "gps" is the number of times that a new grace period had started, + but this CPU was not yet aware of it. + +o "nf" is the number of times that this CPU suspected that the + current grace period had run for too long, and thus needed to + be forced. + + Please note that "forcing" consists of sending resched IPIs + to holdout CPUs. If that CPU really still is in an old RCU + read-side critical section, then we really do have to wait for it. + The assumption behing "forcing" is that the CPU is not still in + an old RCU read-side critical section, but has not yet responded + for some other reason. + +o "nn" is the number of times that this CPU needed nothing. Alert + readers will note that the rcu "nn" number for a given CPU very + closely matches the rcu_bh "np" number for that same CPU. This + is due to short-circuit evaluation in rcu_pending(). From 66aa230e437d89ca56224135f617e2d8e391a3ef Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 12:54:29 +0530 Subject: [PATCH 0365/2061] x86: page_types.h unification of declarations Impact: unification of declarations, cleanup Unification of declarations: moved init_memory_mapping, initmem_init and free_initmem from page_XX_types.h to page_types.h Signed-off-by: Jaswinder Singh Rajput Acked-by: Pekka Enberg Cc: Andrew Morton LKML-Reference: <1239693869.3033.31.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/page_32_types.h | 4 ---- arch/x86/include/asm/page_64_types.h | 6 ------ arch/x86/include/asm/page_types.h | 6 ++++++ 3 files changed, 6 insertions(+), 10 deletions(-) diff --git a/arch/x86/include/asm/page_32_types.h b/arch/x86/include/asm/page_32_types.h index 0f915ae649a..6f1b7331313 100644 --- a/arch/x86/include/asm/page_32_types.h +++ b/arch/x86/include/asm/page_32_types.h @@ -54,10 +54,6 @@ extern unsigned int __VMALLOC_RESERVE; extern int sysctl_legacy_va_layout; extern void find_low_pfn_range(void); -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); -extern void initmem_init(unsigned long, unsigned long); -extern void free_initmem(void); extern void setup_bootmem_allocator(void); #endif /* !__ASSEMBLY__ */ diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b7024..3f587188ae6 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -71,12 +71,6 @@ extern unsigned long __phys_addr(unsigned long); #define vmemmap ((struct page *)VMEMMAP_START) -extern unsigned long init_memory_mapping(unsigned long start, - unsigned long end); - -extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); -extern void free_initmem(void); - extern void init_extra_mapping_uc(unsigned long phys, unsigned long size); extern void init_extra_mapping_wb(unsigned long phys, unsigned long size); diff --git a/arch/x86/include/asm/page_types.h b/arch/x86/include/asm/page_types.h index 826ad37006a..6473f5ccff8 100644 --- a/arch/x86/include/asm/page_types.h +++ b/arch/x86/include/asm/page_types.h @@ -46,6 +46,12 @@ extern int devmem_is_allowed(unsigned long pagenr); extern unsigned long max_low_pfn_mapped; extern unsigned long max_pfn_mapped; +extern unsigned long init_memory_mapping(unsigned long start, + unsigned long end); + +extern void initmem_init(unsigned long start_pfn, unsigned long end_pfn); +extern void free_initmem(void); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_X86_PAGE_DEFS_H */ From e7d43a74cb07cbc4b8e9b5e4a914816b33fb0719 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 13:18:28 +0530 Subject: [PATCH 0366/2061] x86: avoid multiple declaration of kstack_depth_to_print Impact: cleanup asm/stacktrace.h is more appropriate so removing other 2 declarations. Signed-off-by: Jaswinder Singh Rajput Cc: Neil Horman LKML-Reference: <1239695308.3033.34.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/traps.h | 1 - arch/x86/kernel/dumpstack.h | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..9aa3ab26205 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -74,7 +74,6 @@ static inline int get_si_code(unsigned long condition) } extern int panic_on_unrecovered_nmi; -extern int kstack_depth_to_print; void math_error(void __user *); void math_emulate(struct math_emu_info *); diff --git a/arch/x86/kernel/dumpstack.h b/arch/x86/kernel/dumpstack.h index da87590b869..81086c227ab 100644 --- a/arch/x86/kernel/dumpstack.h +++ b/arch/x86/kernel/dumpstack.h @@ -29,7 +29,6 @@ show_stack_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *sp, unsigned long bp, char *log_lvl); extern unsigned int code_bytes; -extern int kstack_depth_to_print; /* The form of the top of the frame on the stack */ struct stack_frame { From f711f6090a81cbd396b63de90f415d33f563af9b Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:30 +0530 Subject: [PATCH 0367/2061] sched: Nominate idle load balancer from a semi-idle package. Currently the nomination of idle-load balancer is done by choosing the first idle cpu in the nohz.cpu_mask. This may not be power-efficient, since such an idle cpu could come from a completely idle core/package thereby preventing the whole core/package from being in a low-power state. For eg, consider a quad-core dual package system. The cpu numbering need not be sequential and can something like [0, 2, 4, 6] and [1, 3, 5, 7]. With sched_mc/smt_power_savings and the power-aware IRQ balance, we try to keep as fewer Packages/Cores active. But the current idle load balancer logic goes against this by choosing the first_cpu in the nohz.cpu_mask and not taking the system topology into consideration. Improve the algorithm to nominate the idle load balancer from a semi idle cores/packages thereby increasing the probability of the cores/packages being in deeper sleep states for longer duration. The algorithm is activated only when sched_mc/smt_power_savings != 0. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045530.7645.12175.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 127 +++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 118 insertions(+), 9 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..b0fefa300b4 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4240,10 +4240,126 @@ static void active_load_balance(struct rq *busiest_rq, int busiest_cpu) static struct { atomic_t load_balancer; cpumask_var_t cpu_mask; + cpumask_var_t ilb_grp_nohz_mask; } nohz ____cacheline_aligned = { .load_balancer = ATOMIC_INIT(-1), }; +#if defined(CONFIG_SCHED_MC) || defined(CONFIG_SCHED_SMT) +/** + * lowest_flag_domain - Return lowest sched_domain containing flag. + * @cpu: The cpu whose lowest level of sched domain is to + * be returned. + * @flag: The flag to check for the lowest sched_domain + * for the given cpu. + * + * Returns the lowest sched_domain of a cpu which contains the given flag. + */ +static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) +{ + struct sched_domain *sd; + + for_each_domain(cpu, sd) + if (sd && (sd->flags & flag)) + break; + + return sd; +} + +/** + * for_each_flag_domain - Iterates over sched_domains containing the flag. + * @cpu: The cpu whose domains we're iterating over. + * @sd: variable holding the value of the power_savings_sd + * for cpu. + * @flag: The flag to filter the sched_domains to be iterated. + * + * Iterates over all the scheduler domains for a given cpu that has the 'flag' + * set, starting from the lowest sched_domain to the highest. + */ +#define for_each_flag_domain(cpu, sd, flag) \ + for (sd = lowest_flag_domain(cpu, flag); \ + (sd && (sd->flags & flag)); sd = sd->parent) + +/** + * is_semi_idle_group - Checks if the given sched_group is semi-idle. + * @ilb_group: group to be checked for semi-idleness + * + * Returns: 1 if the group is semi-idle. 0 otherwise. + * + * We define a sched_group to be semi idle if it has atleast one idle-CPU + * and atleast one non-idle CPU. This helper function checks if the given + * sched_group is semi-idle or not. + */ +static inline int is_semi_idle_group(struct sched_group *ilb_group) +{ + cpumask_and(nohz.ilb_grp_nohz_mask, nohz.cpu_mask, + sched_group_cpus(ilb_group)); + + /* + * A sched_group is semi-idle when it has atleast one busy cpu + * and atleast one idle cpu. + */ + if (cpumask_empty(nohz.ilb_grp_nohz_mask)) + return 0; + + if (cpumask_equal(nohz.ilb_grp_nohz_mask, sched_group_cpus(ilb_group))) + return 0; + + return 1; +} +/** + * find_new_ilb - Finds the optimum idle load balancer for nomination. + * @cpu: The cpu which is nominating a new idle_load_balancer. + * + * Returns: Returns the id of the idle load balancer if it exists, + * Else, returns >= nr_cpu_ids. + * + * This algorithm picks the idle load balancer such that it belongs to a + * semi-idle powersavings sched_domain. The idea is to try and avoid + * completely idle packages/cores just for the purpose of idle load balancing + * when there are other idle cpu's which are better suited for that job. + */ +static int find_new_ilb(int cpu) +{ + struct sched_domain *sd; + struct sched_group *ilb_group; + + /* + * Have idle load balancer selection from semi-idle packages only + * when power-aware load balancing is enabled + */ + if (!(sched_smt_power_savings || sched_mc_power_savings)) + goto out_done; + + /* + * Optimize for the case when we have no idle CPUs or only one + * idle CPU. Don't walk the sched_domain hierarchy in such cases + */ + if (cpumask_weight(nohz.cpu_mask) < 2) + goto out_done; + + for_each_flag_domain(cpu, sd, SD_POWERSAVINGS_BALANCE) { + ilb_group = sd->groups; + + do { + if (is_semi_idle_group(ilb_group)) + return cpumask_first(nohz.ilb_grp_nohz_mask); + + ilb_group = ilb_group->next; + + } while (ilb_group != sd->groups); + } + +out_done: + return cpumask_first(nohz.cpu_mask); +} +#else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ +static inline int find_new_ilb(int call_cpu) +{ + return first_cpu(nohz.cpu_mask); +} +#endif + /* * This routine will try to nominate the ilb (idle load balancing) * owner among the cpus whose ticks are stopped. ilb owner will do the idle @@ -4468,15 +4584,7 @@ static inline void trigger_load_balance(struct rq *rq, int cpu) } if (atomic_read(&nohz.load_balancer) == -1) { - /* - * simple selection for now: Nominate the - * first cpu in the nohz list to be the next - * ilb owner. - * - * TBD: Traverse the sched domains and nominate - * the nearest cpu in the nohz.cpu_mask. - */ - int ilb = cpumask_first(nohz.cpu_mask); + int ilb = find_new_ilb(cpu); if (ilb < nr_cpu_ids) resched_cpu(ilb); @@ -9051,6 +9159,7 @@ void __init sched_init(void) #ifdef CONFIG_SMP #ifdef CONFIG_NO_HZ alloc_bootmem_cpumask_var(&nohz.cpu_mask); + alloc_bootmem_cpumask_var(&nohz.ilb_grp_nohz_mask); #endif alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ From e790fb0ba64bfec158e1219d899cb588275d12ab Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 14 Apr 2009 10:25:35 +0530 Subject: [PATCH 0368/2061] sched: Nominate a power-efficient ilb in select_nohz_balancer() The CPU that first goes idle becomes the idle-load-balancer and remains that until either it picks up a task or till all the CPUs of the system goes idle. Optimize this further to allow it to relinquish it's post once all it's siblings in the power-aware sched_domain go idle, thereby allowing the whole package-core to go idle. While relinquising the post, nominate another an idle-load balancer from a semi-idle core/package. Signed-off-by: Gautham R Shenoy Acked-by: Peter Zijlstra LKML-Reference: <20090414045535.7645.31641.stgit@sofia.in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index b0fefa300b4..36d213bca47 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4414,8 +4414,24 @@ int select_nohz_load_balancer(int stop_tick) /* make me the ilb owner */ if (atomic_cmpxchg(&nohz.load_balancer, -1, cpu) == -1) return 1; - } else if (atomic_read(&nohz.load_balancer) == cpu) + } else if (atomic_read(&nohz.load_balancer) == cpu) { + int new_ilb; + + if (!(sched_smt_power_savings || + sched_mc_power_savings)) + return 1; + /* + * Check to see if there is a more power-efficient + * ilb. + */ + new_ilb = find_new_ilb(cpu); + if (new_ilb < nr_cpu_ids && new_ilb != cpu) { + atomic_set(&nohz.load_balancer, -1); + resched_cpu(new_ilb); + return 0; + } return 1; + } } else { if (!cpumask_test_cpu(cpu, nohz.cpu_mask)) return 0; From 6424fb38667fffbbb1b90be0ffd9a0c540db6a4b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 13 Apr 2009 23:51:46 -0700 Subject: [PATCH 0369/2061] x86: remove (null) in /sys kernel_page_tables Impact: cleanup %p prints out 0x000000000000000 as (null) so use %lx instead. Signed-off-by: Yinghai Lu LKML-Reference: <49E43282.1090607@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/dump_pagetables.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/dump_pagetables.c b/arch/x86/mm/dump_pagetables.c index e7277cbcfb4..a725b7f760a 100644 --- a/arch/x86/mm/dump_pagetables.c +++ b/arch/x86/mm/dump_pagetables.c @@ -161,13 +161,14 @@ static void note_page(struct seq_file *m, struct pg_state *st, st->current_address >= st->marker[1].start_address) { const char *unit = units; unsigned long delta; + int width = sizeof(unsigned long) * 2; /* * Now print the actual finished series */ - seq_printf(m, "0x%p-0x%p ", - (void *)st->start_address, - (void *)st->current_address); + seq_printf(m, "0x%0*lx-0x%0*lx ", + width, st->start_address, + width, st->current_address); delta = (st->current_address - st->start_address) >> 10; while (!(delta & 1023) && unit[1]) { From 7e05575c422d45f393c2d9b5900e97a30bf69bea Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 12:12:29 +0900 Subject: [PATCH 0370/2061] x86: calgary: remove IOMMU_DEBUG CONFIG_IOMMU_DEBUG has depends on CONFIG_GART_IOMMU: config IOMMU_DEBUG bool "Enable IOMMU debugging" depends on GART_IOMMU && DEBUG_KERNEL depends on X86_64 So it's not useful to have CONFIG_IOMMU_DEBUG in Calgary IOMMU code, which does the extra checking of the bitmap space management. And Calgary uses the iommu helper for the bitmap space management now so it would be better to have the extra checking feature in the iommu helper rather than Calgary code (if necessary). Signed-off-by: FUJITA Tomonori Acked-by: Muli Ben-Yehuda Cc: Joerg Roedel Cc: alexisb@us.ibm.com LKML-Reference: <20090414120827G.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/kernel/pci-calgary_64.c | 54 ++------------------------------ 1 file changed, 2 insertions(+), 52 deletions(-) diff --git a/arch/x86/kernel/pci-calgary_64.c b/arch/x86/kernel/pci-calgary_64.c index 755c21e906f..971a3bec47a 100644 --- a/arch/x86/kernel/pci-calgary_64.c +++ b/arch/x86/kernel/pci-calgary_64.c @@ -186,37 +186,6 @@ static struct cal_chipset_ops calioc2_chip_ops = { static struct calgary_bus_info bus_info[MAX_PHB_BUS_NUM] = { { NULL, 0, 0 }, }; -/* enable this to stress test the chip's TCE cache */ -#ifdef CONFIG_IOMMU_DEBUG -static int debugging = 1; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - unsigned long idx = start; - - BUG_ON(start >= end); - - while (idx < end) { - if (!!test_bit(idx, bitmap) != expected) - return idx; - ++idx; - } - - /* all bits have the expected value */ - return ~0UL; -} -#else /* debugging is disabled */ -static int debugging; - -static inline unsigned long verify_bit_range(unsigned long* bitmap, - int expected, unsigned long start, unsigned long end) -{ - return ~0UL; -} - -#endif /* CONFIG_IOMMU_DEBUG */ - static inline int translation_enabled(struct iommu_table *tbl) { /* only PHBs with translation enabled have an IOMMU table */ @@ -228,7 +197,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, { unsigned long index; unsigned long end; - unsigned long badbit; unsigned long flags; index = start_addr >> PAGE_SHIFT; @@ -243,14 +211,6 @@ static void iommu_range_reserve(struct iommu_table *tbl, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 0, index, end); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: entry already allocated at " - "0x%lx tbl %p dma 0x%lx npages %u\n", - badbit, tbl, start_addr, npages); - } - iommu_area_reserve(tbl->it_map, index, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -326,7 +286,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, unsigned int npages) { unsigned long entry; - unsigned long badbit; unsigned long badend; unsigned long flags; @@ -346,14 +305,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, spin_lock_irqsave(&tbl->it_lock, flags); - badbit = verify_bit_range(tbl->it_map, 1, entry, entry + npages); - if (badbit != ~0UL) { - if (printk_ratelimit()) - printk(KERN_ERR "Calgary: bit is off at 0x%lx " - "tbl %p dma 0x%Lx entry 0x%lx npages %u\n", - badbit, tbl, dma_addr, entry, npages); - } - iommu_area_free(tbl->it_map, entry, npages); spin_unlock_irqrestore(&tbl->it_lock, flags); @@ -1488,9 +1439,8 @@ void __init detect_calgary(void) iommu_detected = 1; calgary_detected = 1; printk(KERN_INFO "PCI-DMA: Calgary IOMMU detected.\n"); - printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d, " - "CONFIG_IOMMU_DEBUG is %s.\n", specified_table_size, - debugging ? "enabled" : "disabled"); + printk(KERN_INFO "PCI-DMA: Calgary TCE table spec is %d\n", + specified_table_size); /* swiotlb for devices that aren't behind the Calgary. */ if (max_pfn > MAX_DMA32_PFN) From ea20d9293ce423a39717ed4375393129a2e701f9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 08:54:16 -0400 Subject: [PATCH 0371/2061] tracing: consolidate trace and trace_event headers Impact: clean up Neil Horman (et. al.) criticized the way the trace events were broken up into two files. The reason for that was that ftrace needed to separate out the declarations from where the #include was used. It then dawned on me that the tracepoint.h header only needs to define the TRACE_EVENT macro if it is not already defined. The solution is simply to test if TRACE_EVENT is defined, and if it is not then the linux/tracepoint.h header can define it. This change consolidates all the .h and _event_types.h into the .h file. Reported-by: Neil Horman Reported-by: Theodore Tso Reported-by: Jiaying Zhang Cc: Zhaolei Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Jason Baron Cc: Mathieu Desnoyers Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 9 +- include/trace/irq.h | 51 ++++- include/trace/irq_event_types.h | 55 ----- include/trace/kmem.h | 189 +++++++++++++++- include/trace/lockdep.h | 52 ++++- include/trace/lockdep_event_types.h | 57 ----- include/trace/sched.h | 333 ++++++++++++++++++++++++++- include/trace/sched_event_types.h | 337 ---------------------------- include/trace/skb.h | 36 ++- include/trace/skb_event_types.h | 38 ---- include/trace/trace_event_types.h | 7 - kernel/trace/events.c | 1 + kernel/trace/trace_events_stage_1.h | 4 +- kernel/trace/trace_events_stage_2.h | 8 +- kernel/trace/trace_events_stage_3.h | 4 +- 15 files changed, 663 insertions(+), 518 deletions(-) delete mode 100644 include/trace/irq_event_types.h delete mode 100644 include/trace/lockdep_event_types.h delete mode 100644 include/trace/sched_event_types.h delete mode 100644 include/trace/skb_event_types.h delete mode 100644 include/trace/trace_event_types.h diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index d35a7ee7611..4353f3f7e62 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -31,6 +31,8 @@ struct tracepoint { * Keep in sync with vmlinux.lds.h. */ +#ifndef DECLARE_TRACE + #define TP_PROTO(args...) args #define TP_ARGS(args...) args @@ -114,6 +116,7 @@ static inline void tracepoint_update_probe_range(struct tracepoint *begin, struct tracepoint *end) { } #endif /* CONFIG_TRACEPOINTS */ +#endif /* DECLARE_TRACE */ /* * Connect a probe to a tracepoint. @@ -154,10 +157,13 @@ static inline void tracepoint_synchronize_unregister(void) } #define PARAMS(args...) args + +#ifndef TRACE_FORMAT #define TRACE_FORMAT(name, proto, args, fmt) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif - +#ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: * @@ -262,5 +268,6 @@ static inline void tracepoint_synchronize_unregister(void) #define TRACE_EVENT(name, proto, args, struct, assign, print) \ DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) +#endif #endif diff --git a/include/trace/irq.h b/include/trace/irq.h index ff5d4495dc3..04ab4c65222 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -1,9 +1,54 @@ -#ifndef _TRACE_IRQ_H +#if !defined(_TRACE_IRQ_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_IRQ_H -#include #include +#include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM irq + +/* + * Tracepoint for entry of interrupt handler: + */ +TRACE_FORMAT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), + TP_FMT("irq=%d handler=%s", irq, action->name) + ); + +/* + * Tracepoint for return of an interrupt handler: + */ +TRACE_EVENT(irq_handler_exit, + + TP_PROTO(int irq, struct irqaction *action, int ret), + + TP_ARGS(irq, action, ret), + + TP_STRUCT__entry( + __field( int, irq ) + __field( int, ret ) + ), + + TP_fast_assign( + __entry->irq = irq; + __entry->ret = ret; + ), + + TP_printk("irq=%d return=%s", + __entry->irq, __entry->ret ? "handled" : "unhandled") +); + +TRACE_FORMAT(softirq_entry, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); + +TRACE_FORMAT(softirq_exit, + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), + TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) + ); #endif diff --git a/include/trace/irq_event_types.h b/include/trace/irq_event_types.h deleted file mode 100644 index 85964ebd47e..00000000000 --- a/include/trace/irq_event_types.h +++ /dev/null @@ -1,55 +0,0 @@ - -/* use instead */ -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM irq - -/* - * Tracepoint for entry of interrupt handler: - */ -TRACE_FORMAT(irq_handler_entry, - TP_PROTO(int irq, struct irqaction *action), - TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); - -/* - * Tracepoint for return of an interrupt handler: - */ -TRACE_EVENT(irq_handler_exit, - - TP_PROTO(int irq, struct irqaction *action, int ret), - - TP_ARGS(irq, action, ret), - - TP_STRUCT__entry( - __field( int, irq ) - __field( int, ret ) - ), - - TP_fast_assign( - __entry->irq = irq; - __entry->ret = ret; - ), - - TP_printk("irq=%d return=%s", - __entry->irq, __entry->ret ? "handled" : "unhandled") -); - -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -TRACE_FORMAT(softirq_exit, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); - -#undef TRACE_SYSTEM diff --git a/include/trace/kmem.h b/include/trace/kmem.h index 46efc2423f0..d7d12189e5c 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -1,9 +1,192 @@ -#ifndef _TRACE_KMEM_H +#if !defined(_TRACE_KMEM_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_KMEM_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM kmem -#endif /* _TRACE_KMEM_H */ +TRACE_EVENT(kmalloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmem_cache_alloc, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags) +); + +TRACE_EVENT(kmalloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kmem_cache_alloc_node, + + TP_PROTO(unsigned long call_site, + const void *ptr, + size_t bytes_req, + size_t bytes_alloc, + gfp_t gfp_flags, + int node), + + TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + __field( size_t, bytes_req ) + __field( size_t, bytes_alloc ) + __field( gfp_t, gfp_flags ) + __field( int, node ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + __entry->bytes_req = bytes_req; + __entry->bytes_alloc = bytes_alloc; + __entry->gfp_flags = gfp_flags; + __entry->node = node; + ), + + TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", + __entry->call_site, + __entry->ptr, + __entry->bytes_req, + __entry->bytes_alloc, + __entry->gfp_flags, + __entry->node) +); + +TRACE_EVENT(kfree, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +TRACE_EVENT(kmem_cache_free, + + TP_PROTO(unsigned long call_site, const void *ptr), + + TP_ARGS(call_site, ptr), + + TP_STRUCT__entry( + __field( unsigned long, call_site ) + __field( const void *, ptr ) + ), + + TP_fast_assign( + __entry->call_site = call_site; + __entry->ptr = ptr; + ), + + TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) +); + +#endif diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 5ca67df87f2..8ee7900b38c 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -1,9 +1,57 @@ -#ifndef _TRACE_LOCKDEP_H +#if !defined(_TRACE_LOCKDEP_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_LOCKDEP_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM lock + +#ifdef CONFIG_LOCKDEP + +TRACE_FORMAT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, + int trylock, int read, int check, + struct lockdep_map *next_lock, unsigned long ip), + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + TP_FMT("%s%s%s", trylock ? "try " : "", + read ? "read " : "", lock->name) + ); + +TRACE_FORMAT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), + TP_FMT("%s", lock->name) + ); + +#ifdef CONFIG_LOCK_STAT + +TRACE_FORMAT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), + TP_FMT("%s", lock->name) + ); + +TRACE_EVENT(lock_acquired, + TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), + + TP_ARGS(lock, ip, waittime), + + TP_STRUCT__entry( + __field(const char *, name) + __field(unsigned long, wait_usec) + __field(unsigned long, wait_nsec_rem) + ), + TP_fast_assign( + __entry->name = lock->name; + __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); + __entry->wait_usec = (unsigned long) waittime; + ), + TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + __entry->wait_nsec_rem) +); #endif +#endif + +#endif /* _TRACE_LOCKDEP_H */ diff --git a/include/trace/lockdep_event_types.h b/include/trace/lockdep_event_types.h deleted file mode 100644 index 863f1e4583a..00000000000 --- a/include/trace/lockdep_event_types.h +++ /dev/null @@ -1,57 +0,0 @@ - -#ifndef TRACE_FORMAT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM lock - -#ifdef CONFIG_LOCKDEP - -TRACE_FORMAT(lock_acquire, - TP_PROTO(struct lockdep_map *lock, unsigned int subclass, - int trylock, int read, int check, - struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); - -TRACE_FORMAT(lock_release, - TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), - TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); - -#ifdef CONFIG_LOCK_STAT - -TRACE_FORMAT(lock_contended, - TP_PROTO(struct lockdep_map *lock, unsigned long ip), - TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); - -TRACE_EVENT(lock_acquired, - TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), - - TP_ARGS(lock, ip, waittime), - - TP_STRUCT__entry( - __field(const char *, name) - __field(unsigned long, wait_usec) - __field(unsigned long, wait_nsec_rem) - ), - TP_fast_assign( - __entry->name = lock->name; - __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); - __entry->wait_usec = (unsigned long) waittime; - ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, - __entry->wait_nsec_rem) -); - -#endif -#endif - -#undef TRACE_SYSTEM diff --git a/include/trace/sched.h b/include/trace/sched.h index 4e372a1a29b..5b1cf4a2846 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -1,9 +1,336 @@ -#ifndef _TRACE_SCHED_H +#if !defined(_TRACE_SCHED_H) || defined(TRACE_HEADER_MULTI_READ) #define _TRACE_SCHED_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM sched -#endif +/* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +TRACE_EVENT(sched_kthread_stop, + + TP_PROTO(struct task_struct *t), + + TP_ARGS(t), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, t->comm, TASK_COMM_LEN); + __entry->pid = t->pid; + ), + + TP_printk("task %s:%d", __entry->comm, __entry->pid) +); + +/* + * Tracepoint for the return value of the kthread stopping: + */ +TRACE_EVENT(sched_kthread_stop_ret, + + TP_PROTO(int ret), + + TP_ARGS(ret), + + TP_STRUCT__entry( + __field( int, ret ) + ), + + TP_fast_assign( + __entry->ret = ret; + ), + + TP_printk("ret %d", __entry->ret) +); + +/* + * Tracepoint for waiting on task to unschedule: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wait_task, + + TP_PROTO(struct rq *rq, struct task_struct *p), + + TP_ARGS(rq, p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for waking up a task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for waking up a new task: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_wakeup_new, + + TP_PROTO(struct rq *rq, struct task_struct *p, int success), + + TP_ARGS(rq, p, success), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, success ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->success = success; + ), + + TP_printk("task %s:%d [%d] success=%d", + __entry->comm, __entry->pid, __entry->prio, + __entry->success) +); + +/* + * Tracepoint for task switches, performed by the scheduler: + * + * (NOTE: the 'rq' argument is not used by generic trace events, + * but used by the latency tracer plugin. ) + */ +TRACE_EVENT(sched_switch, + + TP_PROTO(struct rq *rq, struct task_struct *prev, + struct task_struct *next), + + TP_ARGS(rq, prev, next), + + TP_STRUCT__entry( + __array( char, prev_comm, TASK_COMM_LEN ) + __field( pid_t, prev_pid ) + __field( int, prev_prio ) + __array( char, next_comm, TASK_COMM_LEN ) + __field( pid_t, next_pid ) + __field( int, next_prio ) + ), + + TP_fast_assign( + memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); + __entry->prev_pid = prev->pid; + __entry->prev_prio = prev->prio; + memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); + __entry->next_pid = next->pid; + __entry->next_prio = next->prio; + ), + + TP_printk("task %s:%d [%d] ==> %s:%d [%d]", + __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, + __entry->next_comm, __entry->next_pid, __entry->next_prio) +); + +/* + * Tracepoint for a task being migrated: + */ +TRACE_EVENT(sched_migrate_task, + + TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + + TP_ARGS(p, orig_cpu, dest_cpu), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + __field( int, orig_cpu ) + __field( int, dest_cpu ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + __entry->orig_cpu = orig_cpu; + __entry->dest_cpu = dest_cpu; + ), + + TP_printk("task %s:%d [%d] from: %d to: %d", + __entry->comm, __entry->pid, __entry->prio, + __entry->orig_cpu, __entry->dest_cpu) +); + +/* + * Tracepoint for freeing a task: + */ +TRACE_EVENT(sched_process_free, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a task exiting: + */ +TRACE_EVENT(sched_process_exit, + + TP_PROTO(struct task_struct *p), + + TP_ARGS(p), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->prio = p->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for a waiting task: + */ +TRACE_EVENT(sched_process_wait, + + TP_PROTO(struct pid *pid), + + TP_ARGS(pid), + + TP_STRUCT__entry( + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + __field( int, prio ) + ), + + TP_fast_assign( + memcpy(__entry->comm, current->comm, TASK_COMM_LEN); + __entry->pid = pid_nr(pid); + __entry->prio = current->prio; + ), + + TP_printk("task %s:%d [%d]", + __entry->comm, __entry->pid, __entry->prio) +); + +/* + * Tracepoint for do_fork: + */ +TRACE_EVENT(sched_process_fork, + + TP_PROTO(struct task_struct *parent, struct task_struct *child), + + TP_ARGS(parent, child), + + TP_STRUCT__entry( + __array( char, parent_comm, TASK_COMM_LEN ) + __field( pid_t, parent_pid ) + __array( char, child_comm, TASK_COMM_LEN ) + __field( pid_t, child_pid ) + ), + + TP_fast_assign( + memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); + __entry->parent_pid = parent->pid; + memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); + __entry->child_pid = child->pid; + ), + + TP_printk("parent %s:%d child %s:%d", + __entry->parent_comm, __entry->parent_pid, + __entry->child_comm, __entry->child_pid) +); + +/* + * Tracepoint for sending a signal: + */ +TRACE_EVENT(sched_signal_send, + + TP_PROTO(int sig, struct task_struct *p), + + TP_ARGS(sig, p), + + TP_STRUCT__entry( + __field( int, sig ) + __array( char, comm, TASK_COMM_LEN ) + __field( pid_t, pid ) + ), + + TP_fast_assign( + memcpy(__entry->comm, p->comm, TASK_COMM_LEN); + __entry->pid = p->pid; + __entry->sig = sig; + ), + + TP_printk("sig: %d task %s:%d", + __entry->sig, __entry->comm, __entry->pid) +); + +#endif /* _TRACE_SCHED_H */ diff --git a/include/trace/sched_event_types.h b/include/trace/sched_event_types.h deleted file mode 100644 index 63547dc1125..00000000000 --- a/include/trace/sched_event_types.h +++ /dev/null @@ -1,337 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM sched - -/* - * Tracepoint for calling kthread_stop, performed to end a kthread: - */ -TRACE_EVENT(sched_kthread_stop, - - TP_PROTO(struct task_struct *t), - - TP_ARGS(t), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, t->comm, TASK_COMM_LEN); - __entry->pid = t->pid; - ), - - TP_printk("task %s:%d", __entry->comm, __entry->pid) -); - -/* - * Tracepoint for the return value of the kthread stopping: - */ -TRACE_EVENT(sched_kthread_stop_ret, - - TP_PROTO(int ret), - - TP_ARGS(ret), - - TP_STRUCT__entry( - __field( int, ret ) - ), - - TP_fast_assign( - __entry->ret = ret; - ), - - TP_printk("ret %d", __entry->ret) -); - -/* - * Tracepoint for waiting on task to unschedule: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wait_task, - - TP_PROTO(struct rq *rq, struct task_struct *p), - - TP_ARGS(rq, p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for waking up a task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for waking up a new task: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_wakeup_new, - - TP_PROTO(struct rq *rq, struct task_struct *p, int success), - - TP_ARGS(rq, p, success), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, success ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->success = success; - ), - - TP_printk("task %s:%d [%d] success=%d", - __entry->comm, __entry->pid, __entry->prio, - __entry->success) -); - -/* - * Tracepoint for task switches, performed by the scheduler: - * - * (NOTE: the 'rq' argument is not used by generic trace events, - * but used by the latency tracer plugin. ) - */ -TRACE_EVENT(sched_switch, - - TP_PROTO(struct rq *rq, struct task_struct *prev, - struct task_struct *next), - - TP_ARGS(rq, prev, next), - - TP_STRUCT__entry( - __array( char, prev_comm, TASK_COMM_LEN ) - __field( pid_t, prev_pid ) - __field( int, prev_prio ) - __array( char, next_comm, TASK_COMM_LEN ) - __field( pid_t, next_pid ) - __field( int, next_prio ) - ), - - TP_fast_assign( - memcpy(__entry->next_comm, next->comm, TASK_COMM_LEN); - __entry->prev_pid = prev->pid; - __entry->prev_prio = prev->prio; - memcpy(__entry->prev_comm, prev->comm, TASK_COMM_LEN); - __entry->next_pid = next->pid; - __entry->next_prio = next->prio; - ), - - TP_printk("task %s:%d [%d] ==> %s:%d [%d]", - __entry->prev_comm, __entry->prev_pid, __entry->prev_prio, - __entry->next_comm, __entry->next_pid, __entry->next_prio) -); - -/* - * Tracepoint for a task being migrated: - */ -TRACE_EVENT(sched_migrate_task, - - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), - - TP_ARGS(p, orig_cpu, dest_cpu), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - __field( int, orig_cpu ) - __field( int, dest_cpu ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; - __entry->dest_cpu = dest_cpu; - ), - - TP_printk("task %s:%d [%d] from: %d to: %d", - __entry->comm, __entry->pid, __entry->prio, - __entry->orig_cpu, __entry->dest_cpu) -); - -/* - * Tracepoint for freeing a task: - */ -TRACE_EVENT(sched_process_free, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a task exiting: - */ -TRACE_EVENT(sched_process_exit, - - TP_PROTO(struct task_struct *p), - - TP_ARGS(p), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->prio = p->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for a waiting task: - */ -TRACE_EVENT(sched_process_wait, - - TP_PROTO(struct pid *pid), - - TP_ARGS(pid), - - TP_STRUCT__entry( - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - __field( int, prio ) - ), - - TP_fast_assign( - memcpy(__entry->comm, current->comm, TASK_COMM_LEN); - __entry->pid = pid_nr(pid); - __entry->prio = current->prio; - ), - - TP_printk("task %s:%d [%d]", - __entry->comm, __entry->pid, __entry->prio) -); - -/* - * Tracepoint for do_fork: - */ -TRACE_EVENT(sched_process_fork, - - TP_PROTO(struct task_struct *parent, struct task_struct *child), - - TP_ARGS(parent, child), - - TP_STRUCT__entry( - __array( char, parent_comm, TASK_COMM_LEN ) - __field( pid_t, parent_pid ) - __array( char, child_comm, TASK_COMM_LEN ) - __field( pid_t, child_pid ) - ), - - TP_fast_assign( - memcpy(__entry->parent_comm, parent->comm, TASK_COMM_LEN); - __entry->parent_pid = parent->pid; - memcpy(__entry->child_comm, child->comm, TASK_COMM_LEN); - __entry->child_pid = child->pid; - ), - - TP_printk("parent %s:%d child %s:%d", - __entry->parent_comm, __entry->parent_pid, - __entry->child_comm, __entry->child_pid) -); - -/* - * Tracepoint for sending a signal: - */ -TRACE_EVENT(sched_signal_send, - - TP_PROTO(int sig, struct task_struct *p), - - TP_ARGS(sig, p), - - TP_STRUCT__entry( - __field( int, sig ) - __array( char, comm, TASK_COMM_LEN ) - __field( pid_t, pid ) - ), - - TP_fast_assign( - memcpy(__entry->comm, p->comm, TASK_COMM_LEN); - __entry->pid = p->pid; - __entry->sig = sig; - ), - - TP_printk("sig: %d task %s:%d", - __entry->sig, __entry->comm, __entry->pid) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/skb.h b/include/trace/skb.h index d2de7174a6e..e6fd281f7f8 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -1,9 +1,37 @@ -#ifndef _TRACE_SKB_H_ -#define _TRACE_SKB_H_ +#if !defined(_TRACE_SKB_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_SKB_H #include #include -#include +#undef TRACE_SYSTEM +#define TRACE_SYSTEM skb -#endif +/* + * Tracepoint for free an sk_buff: + */ +TRACE_EVENT(kfree_skb, + + TP_PROTO(struct sk_buff *skb, void *location), + + TP_ARGS(skb, location), + + TP_STRUCT__entry( + __field( void *, skbaddr ) + __field( unsigned short, protocol ) + __field( void *, location ) + ), + + TP_fast_assign( + __entry->skbaddr = skb; + if (skb) { + __entry->protocol = ntohs(skb->protocol); + } + __entry->location = location; + ), + + TP_printk("skbaddr=%p protocol=%u location=%p", + __entry->skbaddr, __entry->protocol, __entry->location) +); + +#endif /* _TRACE_SKB_H */ diff --git a/include/trace/skb_event_types.h b/include/trace/skb_event_types.h deleted file mode 100644 index 4a1c504c0e1..00000000000 --- a/include/trace/skb_event_types.h +++ /dev/null @@ -1,38 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM skb - -/* - * Tracepoint for free an sk_buff: - */ -TRACE_EVENT(kfree_skb, - - TP_PROTO(struct sk_buff *skb, void *location), - - TP_ARGS(skb, location), - - TP_STRUCT__entry( - __field( void *, skbaddr ) - __field( unsigned short, protocol ) - __field( void *, location ) - ), - - TP_fast_assign( - __entry->skbaddr = skb; - if (skb) { - __entry->protocol = ntohs(skb->protocol); - } - __entry->location = location; - ), - - TP_printk("skbaddr=%p protocol=%u location=%p", - __entry->skbaddr, __entry->protocol, __entry->location) -); - -#undef TRACE_SYSTEM diff --git a/include/trace/trace_event_types.h b/include/trace/trace_event_types.h deleted file mode 100644 index 552a50e169a..00000000000 --- a/include/trace/trace_event_types.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/_event_types.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/events.c b/kernel/trace/events.c index 246f2aa6dc4..5a35a914f0e 100644 --- a/kernel/trace/events.c +++ b/kernel/trace/events.c @@ -8,6 +8,7 @@ #include "trace_output.h" +#define TRACE_HEADER_MULTI_READ #include "trace_events_stage_1.h" #include "trace_events_stage_2.h" #include "trace_events_stage_3.h" diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h index 38985f9b379..475f46a047a 100644 --- a/kernel/trace/trace_events_stage_1.h +++ b/kernel/trace/trace_events_stage_1.h @@ -1,7 +1,7 @@ /* * Stage 1 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * struct ftrace_raw_ { * struct trace_entry ent; @@ -36,4 +36,4 @@ }; \ static struct ftrace_event_call event_##name -#include +#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h index 59cfd7dfe68..aa4a67a0656 100644 --- a/kernel/trace/trace_events_stage_2.h +++ b/kernel/trace/trace_events_stage_2.h @@ -1,7 +1,7 @@ /* * Stage 2 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * enum print_line_t * ftrace_raw_output_(struct trace_iterator *iter, int flags) @@ -64,7 +64,7 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ return TRACE_TYPE_HANDLED; \ } -#include +#include /* * Setup the showing format of trace point. @@ -128,7 +128,7 @@ ftrace_format_##call(struct trace_seq *s) \ return ret; \ } -#include +#include #undef __field #define __field(type, item) \ @@ -167,4 +167,4 @@ ftrace_define_fields_##call(void) \ return ret; \ } -#include +#include diff --git a/kernel/trace/trace_events_stage_3.h b/kernel/trace/trace_events_stage_3.h index 5bb1b7ffbdb..45c04e1f38d 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/kernel/trace/trace_events_stage_3.h @@ -1,7 +1,7 @@ /* * Stage 3 of the trace events. * - * Override the macros in to include the following: + * Override the macros in to include the following: * * static void ftrace_event_(proto) * { @@ -272,7 +272,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT From 78ddb08feb7d4fbe3c0a9931804c51ee58be4023 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Tue, 14 Apr 2009 16:53:05 +0200 Subject: [PATCH 0372/2061] wait: don't use __wake_up_common() '777c6c5 wait: prevent exclusive waiter starvation' made __wake_up_common() global to be used from abort_exclusive_wait(). It was needed to do a wake-up with the waitqueue lock held while passing down a key to the wake-up function. Since '4ede816 epoll keyed wakeups: add __wake_up_locked_key() and __wake_up_sync_key()' there is an appropriate wrapper for this case: __wake_up_locked_key(). Use it here and make __wake_up_common() private to the scheduler again. Signed-off-by: Johannes Weiner Cc: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <1239720785-19661-1-git-send-email-hannes@cmpxchg.org> Signed-off-by: Ingo Molnar --- include/linux/wait.h | 2 -- kernel/sched.c | 2 +- kernel/wait.c | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/include/linux/wait.h b/include/linux/wait.h index 5d631c17eae..67d4e89b62d 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -132,8 +132,6 @@ static inline void __remove_wait_queue(wait_queue_head_t *head, list_del(&old->task_list); } -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, - int nr_exclusive, int sync, void *key); void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr, void *key); void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key); void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr, diff --git a/kernel/sched.c b/kernel/sched.c index 36d213bca47..92b4b56ad09 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5345,7 +5345,7 @@ EXPORT_SYMBOL(default_wake_function); * started to run but is not in state TASK_RUNNING. try_to_wake_up() returns * zero in this (rare) case, and we handle it by continuing to scan the queue. */ -void __wake_up_common(wait_queue_head_t *q, unsigned int mode, +static void __wake_up_common(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, int sync, void *key) { wait_queue_t *curr, *next; diff --git a/kernel/wait.c b/kernel/wait.c index 42a2dbc181c..ea7c3b4275c 100644 --- a/kernel/wait.c +++ b/kernel/wait.c @@ -154,7 +154,7 @@ void abort_exclusive_wait(wait_queue_head_t *q, wait_queue_t *wait, if (!list_empty(&wait->task_list)) list_del_init(&wait->task_list); else if (waitqueue_active(q)) - __wake_up_common(q, mode, 1, 0, key); + __wake_up_locked_key(q, mode, key); spin_unlock_irqrestore(&q->lock, flags); } EXPORT_SYMBOL(abort_exclusive_wait); From 56449f437add737a1e5e1cb7e00f63ac8ead1938 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 14 Apr 2009 11:24:36 +0200 Subject: [PATCH 0373/2061] tracing: make the trace clocks available generally Jeremy Fitzhardinge reported this build failure: LD .tmp_vmlinux1 arch/x86/kernel/built-in.o: In function `ds_take_timestamp': git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' git/linux/arch/x86/kernel/ds.c:1380: undefined reference to `trace_clock_global' Which is due to !CONFIG_TRACING && CONFIG_X86_DS=y. Expose the trace clock code to CONFIG_X86_DS as well. [ Unfortunately librarizing doesnt work well - ancient architectures with no raw_local_irq_save() primitive break the build. ] Reported-by: Jeremy Fitzhardinge LKML-Reference: <49E4413F.7070700@goop.org> Signed-off-by: Ingo Molnar --- kernel/Makefile | 1 + kernel/trace/Makefile | 7 ++++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/kernel/Makefile b/kernel/Makefile index bab1dffe37e..c8e1be5f0b0 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -92,6 +92,7 @@ obj-$(CONFIG_LATENCYTOP) += latencytop.o obj-$(CONFIG_HAVE_GENERIC_DMA_COHERENT) += dma-coherent.o obj-$(CONFIG_FUNCTION_TRACER) += trace/ obj-$(CONFIG_TRACING) += trace/ +obj-$(CONFIG_X86_DS) += trace/ obj-$(CONFIG_SMP) += sched_cpupri.o obj-$(CONFIG_SLOW_WORK) += slow-work.o diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 2630f5121ec..ecc671e9f14 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -15,11 +15,16 @@ ifdef CONFIG_TRACING_BRANCHES KBUILD_CFLAGS += -DDISABLE_BRANCH_PROFILING endif +# +# Make the trace clocks available generally: it's infrastructure +# relied on by ptrace for example: +# +obj-y += trace_clock.o + obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o obj-$(CONFIG_TRACING) += trace.o -obj-$(CONFIG_TRACING) += trace_clock.o obj-$(CONFIG_TRACING) += trace_output.o obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o From 72c6a9870f901045f2464c3dc6ee8914bfdc07aa Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 17:33:57 +0200 Subject: [PATCH 0374/2061] rculist.h: introduce list_entry_rcu() and list_first_entry_rcu() I've run into the situation where I need to use list_first_entry with rcu-guarded list. This patch introduces this. Also simplify list_for_each_entry_rcu() to use new list_entry_rcu() instead of list_entry(). Signed-off-by: Jiri Pirko Reviewed-by: Paul E. McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414153356.GC3999@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/rculist.h | 30 ++++++++++++++++++++++++++++-- 1 file changed, 28 insertions(+), 2 deletions(-) diff --git a/include/linux/rculist.h b/include/linux/rculist.h index e649bd3f2c9..5710f43bbc9 100644 --- a/include/linux/rculist.h +++ b/include/linux/rculist.h @@ -198,6 +198,32 @@ static inline void list_splice_init_rcu(struct list_head *list, at->prev = last; } +/** + * list_entry_rcu - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_entry_rcu(ptr, type, member) \ + container_of(rcu_dereference(ptr), type, member) + +/** + * list_first_entry_rcu - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + * + * This primitive may safely run concurrently with the _rcu list-mutation + * primitives such as list_add_rcu() as long as it's guarded by rcu_read_lock(). + */ +#define list_first_entry_rcu(ptr, type, member) \ + list_entry_rcu((ptr)->next, type, member) + #define __list_for_each_rcu(pos, head) \ for (pos = rcu_dereference((head)->next); \ pos != (head); \ @@ -214,9 +240,9 @@ static inline void list_splice_init_rcu(struct list_head *list, * as long as the traversal is guarded by rcu_read_lock(). */ #define list_for_each_entry_rcu(pos, head, member) \ - for (pos = list_entry(rcu_dereference((head)->next), typeof(*pos), member); \ + for (pos = list_entry_rcu((head)->next, typeof(*pos), member); \ prefetch(pos->member.next), &pos->member != (head); \ - pos = list_entry(rcu_dereference(pos->member.next), typeof(*pos), member)) + pos = list_entry_rcu(pos->member.next, typeof(*pos), member)) /** From a8d154b009168337494fbf345671bab74d3e4b8b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 09:36:00 -0400 Subject: [PATCH 0375/2061] tracing: create automated trace defines This patch lowers the number of places a developer must modify to add new tracepoints. The current method to add a new tracepoint into an existing system is to write the trace point macro in the trace header with one of the macros TRACE_EVENT, TRACE_FORMAT or DECLARE_TRACE, then they must add the same named item into the C file with the macro DEFINE_TRACE(name) and then add the trace point. This change cuts out the needing to add the DEFINE_TRACE(name). Every file that uses the tracepoint must still include the trace/.h file, but the one C file must also add a define before the including of that file. #define CREATE_TRACE_POINTS #include This will cause the trace/mytrace.h file to also produce the C code necessary to implement the trace point. Note, if more than one trace/.h is used to create the C code it is best to list them all together. #define CREATE_TRACE_POINTS #include #include #include Thanks to Mathieu Desnoyers and Christoph Hellwig for coming up with the cleaner solution of the define above the includes over my first design to have the C code include a "special" header. This patch converts sched, irq and lockdep and skb to use this new method. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 75 ++++++++++++++++++++++++++++++++++++ include/trace/irq.h | 5 ++- include/trace/kmem.h | 4 +- include/trace/lockdep.h | 3 ++ include/trace/sched.h | 3 ++ include/trace/skb.h | 3 ++ kernel/exit.c | 4 -- kernel/fork.c | 2 - kernel/irq/handle.c | 7 ++-- kernel/kthread.c | 3 -- kernel/lockdep.c | 12 ++---- kernel/sched.c | 10 ++--- kernel/signal.c | 2 - kernel/softirq.c | 3 -- mm/util.c | 11 ++---- net/core/net-traces.c | 4 +- 16 files changed, 105 insertions(+), 46 deletions(-) create mode 100644 include/trace/define_trace.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h new file mode 100644 index 00000000000..de9dc7d8508 --- /dev/null +++ b/include/trace/define_trace.h @@ -0,0 +1,75 @@ +/* + * Trace files that want to automate creationg of all tracepoints defined + * in their file should include this file. The following are macros that the + * trace file may define: + * + * TRACE_SYSTEM defines the system the tracepoint is for + * + * TRACE_INCLUDE_FILE if the file name is something other than TRACE_SYSTEM.h + * This macro may be defined to tell define_trace.h what file to include. + * Note, leave off the ".h". + * + * TRACE_INCLUDE_PATH if the path is something other than core kernel include/trace + * then this macro can define the path to use. Note, the path is relative to + * define_trace.h, not the file including it. Full path names for out of tree + * modules must be used. + */ + +#ifdef CREATE_TRACE_POINTS + +/* Prevent recursion */ +#undef CREATE_TRACE_POINTS + +#include + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + DEFINE_TRACE(name) + +#undef TRACE_FORMAT +#define TRACE_FORMAT(name, proto, args, print) \ + DEFINE_TRACE(name) + +#undef DECLARE_TRACE +#define DECLARE_TRACE(name, proto, args) \ + DEFINE_TRACE(name) + +#undef TRACE_INCLUDE +#undef __TRACE_INCLUDE + +#ifndef TRACE_INCLUDE_FILE +# define TRACE_INCLUDE_FILE TRACE_SYSTEM +# define UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifndef TRACE_INCLUDE_PATH +# define __TRACE_INCLUDE(system) +# define UNDEF_TRACE_INCLUDE_FILE +#else +# define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) +#endif + +# define TRACE_INCLUDE(system) __TRACE_INCLUDE(system) + +/* Let the trace headers be reread */ +#define TRACE_HEADER_MULTI_READ + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef TRACE_HEADER_MULTI_READ + +/* Only undef what we defined in this file */ +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +#ifdef UNDEF_TRACE_INCLUDE_FILE +# undef TRACE_INCLUDE_PATH +# undef UNDEF_TRACE_INCLUDE_FILE +#endif + +/* We may be processing more files */ +#define CREATE_TRACE_POINTS + +#endif /* CREATE_TRACE_POINTS */ diff --git a/include/trace/irq.h b/include/trace/irq.h index 04ab4c65222..75e3468e449 100644 --- a/include/trace/irq.h +++ b/include/trace/irq.h @@ -51,4 +51,7 @@ TRACE_FORMAT(softirq_exit, TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) ); -#endif +#endif /* _TRACE_IRQ_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/kmem.h b/include/trace/kmem.h index d7d12189e5c..c22c42f980b 100644 --- a/include/trace/kmem.h +++ b/include/trace/kmem.h @@ -188,5 +188,7 @@ TRACE_EVENT(kmem_cache_free, TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) ); +#endif /* _TRACE_KMEM_H */ -#endif +/* This part must be outside protection */ +#include diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 8ee7900b38c..4d301e758de 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -55,3 +55,6 @@ TRACE_EVENT(lock_acquired, #endif #endif /* _TRACE_LOCKDEP_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/sched.h b/include/trace/sched.h index 5b1cf4a2846..ffa1cab586b 100644 --- a/include/trace/sched.h +++ b/include/trace/sched.h @@ -334,3 +334,6 @@ TRACE_EVENT(sched_signal_send, ); #endif /* _TRACE_SCHED_H */ + +/* This part must be outside protection */ +#include diff --git a/include/trace/skb.h b/include/trace/skb.h index e6fd281f7f8..1e8fabb57c0 100644 --- a/include/trace/skb.h +++ b/include/trace/skb.h @@ -35,3 +35,6 @@ TRACE_EVENT(kfree_skb, ); #endif /* _TRACE_SKB_H */ + +/* This part must be outside protection */ +#include diff --git a/kernel/exit.c b/kernel/exit.c index abf9cf3b95c..2fe9d2c7eee 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -56,10 +56,6 @@ #include #include "cred-internals.h" -DEFINE_TRACE(sched_process_free); -DEFINE_TRACE(sched_process_exit); -DEFINE_TRACE(sched_process_wait); - static void exit_mm(struct task_struct * tsk); static void __unhash_process(struct task_struct *p) diff --git a/kernel/fork.c b/kernel/fork.c index b9e2edd0072..4bebf263923 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -83,8 +83,6 @@ DEFINE_PER_CPU(unsigned long, process_counts) = 0; __cacheline_aligned DEFINE_RWLOCK(tasklist_lock); /* outer */ -DEFINE_TRACE(sched_process_fork); - int nr_processes(void) { int cpu; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index d82142be8dd..983d8be8dff 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -17,9 +17,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + #include "internals.h" /* @@ -348,9 +350,6 @@ static void warn_no_thread(unsigned int irq, struct irqaction *action) "but no thread function available.", irq, action->name); } -DEFINE_TRACE(irq_handler_entry); -DEFINE_TRACE(irq_handler_exit); - /** * handle_IRQ_event - irq action chain handler * @irq: the interrupt number diff --git a/kernel/kthread.c b/kernel/kthread.c index 4ebaf8519ab..e1c76924545 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -21,9 +21,6 @@ static DEFINE_SPINLOCK(kthread_create_lock); static LIST_HEAD(kthread_create_list); struct task_struct *kthreadd_task; -DEFINE_TRACE(sched_kthread_stop); -DEFINE_TRACE(sched_kthread_stop_ret); - struct kthread_create_info { /* Information passed to kthread() from kthreadd. */ diff --git a/kernel/lockdep.c b/kernel/lockdep.c index c4582a6ea95..257f21a76c5 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -42,12 +42,14 @@ #include #include #include -#include #include #include "lockdep_internals.h" +#define CREATE_TRACE_POINTS +#include + #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; module_param(prove_locking, int, 0644); @@ -2929,8 +2931,6 @@ void lock_set_class(struct lockdep_map *lock, const char *name, } EXPORT_SYMBOL_GPL(lock_set_class); -DEFINE_TRACE(lock_acquire); - /* * We are not always called with irqs disabled - do that here, * and also avoid lockdep recursion: @@ -2957,8 +2957,6 @@ void lock_acquire(struct lockdep_map *lock, unsigned int subclass, } EXPORT_SYMBOL_GPL(lock_acquire); -DEFINE_TRACE(lock_release); - void lock_release(struct lockdep_map *lock, int nested, unsigned long ip) { @@ -3061,8 +3059,6 @@ found_it: put_lock_stats(stats); } -DEFINE_TRACE(lock_acquired); - static void __lock_acquired(struct lockdep_map *lock, unsigned long ip) { @@ -3118,8 +3114,6 @@ found_it: lock->ip = ip; } -DEFINE_TRACE(lock_contended); - void lock_contended(struct lockdep_map *lock, unsigned long ip) { unsigned long flags; diff --git a/kernel/sched.c b/kernel/sched.c index 5724508c3b6..e6d4518d47e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -72,13 +72,15 @@ #include #include #include -#include #include #include #include "sched_cpupri.h" +#define CREATE_TRACE_POINTS +#include + /* * Convert user-nice values [ -20 ... 0 ... 19 ] * to static priority [ MAX_RT_PRIO..MAX_PRIO-1 ], @@ -118,12 +120,6 @@ */ #define RUNTIME_INF ((u64)~0ULL) -DEFINE_TRACE(sched_wait_task); -DEFINE_TRACE(sched_wakeup); -DEFINE_TRACE(sched_wakeup_new); -DEFINE_TRACE(sched_switch); -DEFINE_TRACE(sched_migrate_task); - #ifdef CONFIG_SMP static void double_rq_lock(struct rq *rq1, struct rq *rq2); diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..1d5703ff003 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -41,8 +41,6 @@ static struct kmem_cache *sigqueue_cachep; -DEFINE_TRACE(sched_signal_send); - static void __user *sig_handler(struct task_struct *t, int sig) { return t->sighand->action[sig - 1].sa.sa_handler; diff --git a/kernel/softirq.c b/kernel/softirq.c index 2fecefacdc5..a2d9b458ac2 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -186,9 +186,6 @@ EXPORT_SYMBOL(local_bh_enable_ip); */ #define MAX_SOFTIRQ_RESTART 10 -DEFINE_TRACE(softirq_entry); -DEFINE_TRACE(softirq_exit); - asmlinkage void __do_softirq(void) { struct softirq_action *h; diff --git a/mm/util.c b/mm/util.c index 2599e83eea1..0e74a22791c 100644 --- a/mm/util.c +++ b/mm/util.c @@ -4,9 +4,11 @@ #include #include #include -#include #include +#define CREATE_TRACE_POINTS +#include + /** * kstrdup - allocate space for and copy an existing string * @s: the string to duplicate @@ -239,13 +241,6 @@ int __attribute__((weak)) get_user_pages_fast(unsigned long start, EXPORT_SYMBOL_GPL(get_user_pages_fast); /* Tracepoints definitions. */ -DEFINE_TRACE(kmalloc); -DEFINE_TRACE(kmem_cache_alloc); -DEFINE_TRACE(kmalloc_node); -DEFINE_TRACE(kmem_cache_alloc_node); -DEFINE_TRACE(kfree); -DEFINE_TRACE(kmem_cache_free); - EXPORT_TRACEPOINT_SYMBOL(kmalloc); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kmalloc_node); diff --git a/net/core/net-traces.c b/net/core/net-traces.c index c8fb45665e4..80177205947 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -19,11 +19,11 @@ #include #include #include -#include #include #include +#define CREATE_TRACE_POINTS +#include -DEFINE_TRACE(kfree_skb); EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); From 9504504cbab29ecb694186b1c5b15d3579c43c51 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Sat, 11 Apr 2009 12:59:57 -0400 Subject: [PATCH 0376/2061] tracing: make trace_seq operations available for core kernel In the process to make TRACE_EVENT macro work for modules, the trace_seq operations must be available for core kernel code. These operations are quite useful and can be used for other implementations. The main idea is that we create a trace_seq handle that acts very much like the seq_file handle. struct trace_seq *s = kmalloc(sizeof(*s, GFP_KERNEL); trace_seq_init(s); trace_seq_printf(s, "some data %d\n", variable); printk("%s", s->buffer); The main use is to allow a top level function call several other functions that may store printf like data into the buffer. Then at the end, the top level function can process all the data with any method it would like to. It could be passed to userspace, output via printk or even use seq_file: trace_seq_to_user(s, ubuf, cnt); seq_puts(m, s->buffer); Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 89 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 15 +------ kernel/trace/trace_output.h | 16 +------ 3 files changed, 92 insertions(+), 28 deletions(-) create mode 100644 include/linux/trace_seq.h diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h new file mode 100644 index 00000000000..28051da876d --- /dev/null +++ b/include/linux/trace_seq.h @@ -0,0 +1,89 @@ +#ifndef _LINUX_TRACE_SEQ_H +#define _LINUX_TRACE_SEQ_H + +/* + * Trace sequences are used to allow a function to call several other functions + * to create a string of data to use (up to a max of PAGE_SIZE. + */ + +struct trace_seq { + unsigned char buffer[PAGE_SIZE]; + unsigned int len; + unsigned int readpos; +}; + +static inline void +trace_seq_init(struct trace_seq *s) +{ + s->len = 0; + s->readpos = 0; +} + +/* + * Currently only defined when tracing is enabled. + */ +#ifdef CONFIG_TRACING +extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))); +extern int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); +extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); +extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt); +extern int trace_seq_puts(struct trace_seq *s, const char *str); +extern int trace_seq_putc(struct trace_seq *s, unsigned char c); +extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); +extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len); +extern void *trace_seq_reserve(struct trace_seq *s, size_t len); +extern int trace_seq_path(struct trace_seq *s, struct path *path); + +#else /* CONFIG_TRACING */ +static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) + __attribute__ ((format (printf, 2, 3))) +{ + return 0; +} +static inline int +trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) +{ + return 0; +} + +static inline void trace_print_seq(struct seq_file *m, struct trace_seq *s) +{ +} +static inline ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, + size_t cnt) +{ + return 0; +} +static inline int trace_seq_puts(struct trace_seq *s, const char *str) +{ + return 0; +} +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +{ + return 0; +} +static inline int +trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len) +{ + return 0; +} +static inline int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, + size_t len) +{ + return 0; +} +static inline void *trace_seq_reserve(struct trace_seq *s, size_t len) +{ + return NULL; +} +static inline int trace_seq_path(struct trace_seq *s, struct path *path) +{ + return 0; +} +#endif /* CONFIG_TRACING */ + +#endif /* _LINUX_TRACE_SEQ_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index b05b6ac982a..1882846b738 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -12,6 +12,8 @@ #include #include +#include + enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -423,19 +425,6 @@ struct tracer { struct tracer_stat *stats; }; -struct trace_seq { - unsigned char buffer[PAGE_SIZE]; - unsigned int len; - unsigned int readpos; -}; - -static inline void -trace_seq_init(struct trace_seq *s) -{ - s->len = 0; - s->readpos = 0; -} - #define TRACE_PIPE_ALL_CPU -1 diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 91630217fb4..5c7cbfb65c7 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -1,6 +1,7 @@ #ifndef __TRACE_EVENTS_H #define __TRACE_EVENTS_H +#include #include "trace.h" typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, @@ -20,24 +21,9 @@ trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t trace_print_printk_msg_only(struct trace_iterator *iter); -extern void trace_print_seq(struct seq_file *m, struct trace_seq *s); - -extern int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))); -extern int -trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary); extern int seq_print_ip_sym(struct trace_seq *s, unsigned long ip, unsigned long sym_flags); -extern ssize_t trace_seq_to_user(struct trace_seq *s, char __user *ubuf, - size_t cnt); -extern int trace_seq_puts(struct trace_seq *s, const char *str); -extern int trace_seq_putc(struct trace_seq *s, unsigned char c); -extern int trace_seq_putmem(struct trace_seq *s, const void *mem, size_t len); -extern int trace_seq_putmem_hex(struct trace_seq *s, const void *mem, - size_t len); -extern void *trace_seq_reserve(struct trace_seq *s, size_t len); -extern int trace_seq_path(struct trace_seq *s, struct path *path); extern int seq_print_userip_objs(const struct userstack_entry *entry, struct trace_seq *s, unsigned long sym_flags); extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, From 97f2025153499faa17267a0d4e18c7afaf73f39d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 11:20:49 -0400 Subject: [PATCH 0377/2061] tracing/events: move declarations from trace directory to core include In preparation to allowing trace events to happen in modules, we need to move some of the local declarations in the kernel/trace directory into include/linux. This patch simply moves the declarations and performs no context changes. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 146 +++++++++++++++++++++++++++++++++++ kernel/trace/trace.h | 120 +--------------------------- kernel/trace/trace_output.h | 14 ---- 3 files changed, 147 insertions(+), 133 deletions(-) create mode 100644 include/linux/ftrace_event.h diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h new file mode 100644 index 00000000000..496b76d9f9d --- /dev/null +++ b/include/linux/ftrace_event.h @@ -0,0 +1,146 @@ +#ifndef _LINUX_FTRACE_EVENT_H +#define _LINUX_FTRACE_EVENT_H + +#include +#include + + +struct trace_array; +struct tracer; + +/* + * The trace entry - the most basic unit of tracing. This is what + * is printed in the end as a single line in the trace output, such as: + * + * bash-15816 [01] 235.197585: idle_cpu <- irq_enter + */ +struct trace_entry { + unsigned char type; + unsigned char flags; + unsigned char preempt_count; + int pid; + int tgid; +}; + +/* + * Trace iterator - used by printout routines who present trace + * results to users and which routines might sleep, etc: + */ +struct trace_iterator { + struct trace_array *tr; + struct tracer *trace; + void *private; + int cpu_file; + struct mutex mutex; + struct ring_buffer_iter *buffer_iter[NR_CPUS]; + + /* The below is zeroed out in pipe_read */ + struct trace_seq seq; + struct trace_entry *ent; + int cpu; + u64 ts; + + unsigned long iter_flags; + loff_t pos; + long idx; + + cpumask_var_t started; +}; + + +typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, + int flags); +struct trace_event { + struct hlist_node node; + int type; + trace_print_func trace; + trace_print_func raw; + trace_print_func hex; + trace_print_func binary; +}; + +extern int register_ftrace_event(struct trace_event *event); +extern int unregister_ftrace_event(struct trace_event *event); + +/* Return values for print_line callback */ +enum print_line_t { + TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ + TRACE_TYPE_HANDLED = 1, + TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ + TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ +}; + + +struct ring_buffer_event * +trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, + unsigned long flags, int pc); +void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, + unsigned long flags, int pc); +void trace_current_buffer_discard_commit(struct ring_buffer_event *event); + +void tracing_record_cmdline(struct task_struct *tsk); + +struct ftrace_event_call { + char *name; + char *system; + struct dentry *dir; + int enabled; + int (*regfunc)(void); + void (*unregfunc)(void); + int id; + int (*raw_init)(void); + int (*show_format)(struct trace_seq *s); + int (*define_fields)(void); + struct list_head fields; + int n_preds; + struct filter_pred **preds; + +#ifdef CONFIG_EVENT_PROFILE + atomic_t profile_count; + int (*profile_enable)(struct ftrace_event_call *); + void (*profile_disable)(struct ftrace_event_call *); +#endif +}; + +#define MAX_FILTER_PRED 8 +#define MAX_FILTER_STR_VAL 128 + +extern int init_preds(struct ftrace_event_call *call); +extern int filter_match_preds(struct ftrace_event_call *call, void *rec); +extern int filter_current_check_discard(struct ftrace_event_call *call, + void *rec, + struct ring_buffer_event *event); + +extern int trace_define_field(struct ftrace_event_call *call, char *type, + char *name, int offset, int size); + + +/* + * The double __builtin_constant_p is because gcc will give us an error + * if we try to allocate the static variable to fmt if it is not a + * constant. Even with the outer if statement optimizing out. + */ +#define event_trace_printk(ip, fmt, args...) \ +do { \ + __trace_printk_check_format(fmt, ##args); \ + tracing_record_cmdline(current); \ + if (__builtin_constant_p(fmt)) { \ + static const char *trace_printk_fmt \ + __attribute__((section("__trace_printk_fmt"))) = \ + __builtin_constant_p(fmt) ? fmt : NULL; \ + \ + __trace_bprintk(ip, trace_printk_fmt, ##args); \ + } else \ + __trace_printk(ip, fmt, ##args); \ +} while (0) + +#define __common_field(type, item) \ + ret = trace_define_field(event_call, #type, "common_" #item, \ + offsetof(typeof(field.ent), item), \ + sizeof(field.ent.item)); \ + if (ret) \ + return ret; + +#endif /* _LINUX_FTRACE_EVENT_H */ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1882846b738..6bcdf4af9b2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -13,6 +13,7 @@ #include #include +#include enum trace_type { __TRACE_FIRST_TYPE = 0, @@ -43,20 +44,6 @@ enum trace_type { __TRACE_LAST_TYPE, }; -/* - * The trace entry - the most basic unit of tracing. This is what - * is printed in the end as a single line in the trace output, such as: - * - * bash-15816 [01] 235.197585: idle_cpu <- irq_enter - */ -struct trace_entry { - unsigned char type; - unsigned char flags; - unsigned char preempt_count; - int pid; - int tgid; -}; - /* * Function trace entry - function address and parent function addres: */ @@ -265,8 +252,6 @@ struct trace_array_cpu { char comm[TASK_COMM_LEN]; }; -struct trace_iterator; - /* * The trace array - an array of per-CPU trace arrays. This is the * highest level data structure that individual tracers deal with. @@ -341,15 +326,6 @@ extern void __ftrace_bad_type(void); __ftrace_bad_type(); \ } while (0) -/* Return values for print_line callback */ -enum print_line_t { - TRACE_TYPE_PARTIAL_LINE = 0, /* Retry after flushing the seq */ - TRACE_TYPE_HANDLED = 1, - TRACE_TYPE_UNHANDLED = 2, /* Relay to other output functions */ - TRACE_TYPE_NO_CONSUME = 3 /* Handled but ask to not consume */ -}; - - /* * An option specific to a tracer. This is a boolean value. * The bit is the bit index that sets its value on the @@ -428,31 +404,6 @@ struct tracer { #define TRACE_PIPE_ALL_CPU -1 -/* - * Trace iterator - used by printout routines who present trace - * results to users and which routines might sleep, etc: - */ -struct trace_iterator { - struct trace_array *tr; - struct tracer *trace; - void *private; - int cpu_file; - struct mutex mutex; - struct ring_buffer_iter *buffer_iter[NR_CPUS]; - - /* The below is zeroed out in pipe_read */ - struct trace_seq seq; - struct trace_entry *ent; - int cpu; - u64 ts; - - unsigned long iter_flags; - loff_t pos; - long idx; - - cpumask_var_t started; -}; - int tracer_init(struct tracer *t, struct trace_array *tr); int tracing_is_enabled(void); void trace_wake_up(void); @@ -479,15 +430,6 @@ void trace_buffer_unlock_commit(struct trace_array *tr, struct ring_buffer_event *event, unsigned long flags, int pc); -struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, - unsigned long flags, int pc); -void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, - unsigned long flags, int pc); -void trace_current_buffer_discard_commit(struct ring_buffer_event *event); - struct trace_entry *tracing_get_trace_entry(struct trace_array *tr, struct trace_array_cpu *data); @@ -510,7 +452,6 @@ void tracing_sched_switch_trace(struct trace_array *tr, struct task_struct *prev, struct task_struct *next, unsigned long flags, int pc); -void tracing_record_cmdline(struct task_struct *tsk); void tracing_sched_wakeup_trace(struct trace_array *tr, struct task_struct *wakee, @@ -790,28 +731,6 @@ struct ftrace_event_field { int size; }; -struct ftrace_event_call { - char *name; - char *system; - struct dentry *dir; - int enabled; - int (*regfunc)(void); - void (*unregfunc)(void); - int id; - int (*raw_init)(void); - int (*show_format)(struct trace_seq *s); - int (*define_fields)(void); - struct list_head fields; - int n_preds; - struct filter_pred **preds; - -#ifdef CONFIG_EVENT_PROFILE - atomic_t profile_count; - int (*profile_enable)(struct ftrace_event_call *); - void (*profile_disable)(struct ftrace_event_call *); -#endif -}; - struct event_subsystem { struct list_head list; const char *name; @@ -825,9 +744,6 @@ struct event_subsystem { (unsigned long)event < (unsigned long)__stop_ftrace_events; \ event++) -#define MAX_FILTER_PRED 8 -#define MAX_FILTER_STR_VAL 128 - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -845,9 +761,6 @@ struct filter_pred { int clear; }; -int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); -extern int init_preds(struct ftrace_event_call *call); extern void filter_free_pred(struct filter_pred *pred); extern void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s); @@ -855,13 +768,9 @@ extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); -extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern void filter_free_subsystem_preds(struct event_subsystem *system); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); -extern int filter_current_check_discard(struct ftrace_event_call *call, - void *rec, - struct ring_buffer_event *event); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -876,14 +785,6 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -#define __common_field(type, item) \ - ret = trace_define_field(event_call, #type, "common_" #item, \ - offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ - if (ret) \ - return ret; - -void event_trace_printk(unsigned long ip, const char *fmt, ...); extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; @@ -895,25 +796,6 @@ extern struct ftrace_event_call __stop_ftrace_events[]; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; -/* - * The double __builtin_constant_p is because gcc will give us an error - * if we try to allocate the static variable to fmt if it is not a - * constant. Even with the outer if statement optimizing out. - */ -#define event_trace_printk(ip, fmt, args...) \ -do { \ - __trace_printk_check_format(fmt, ##args); \ - tracing_record_cmdline(current); \ - if (__builtin_constant_p(fmt)) { \ - static const char *trace_printk_fmt \ - __attribute__((section("__trace_printk_fmt"))) = \ - __builtin_constant_p(fmt) ? fmt : NULL; \ - \ - __trace_bprintk(ip, trace_printk_fmt, ##args); \ - } else \ - __trace_printk(ip, fmt, ##args); \ -} while (0) - #undef TRACE_EVENT_FORMAT #define TRACE_EVENT_FORMAT(call, proto, args, fmt, tstruct, tpfmt) \ extern struct ftrace_event_call event_##call; diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 5c7cbfb65c7..6e220a8e570 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -4,18 +4,6 @@ #include #include "trace.h" -typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, - int flags); - -struct trace_event { - struct hlist_node node; - int type; - trace_print_func trace; - trace_print_func raw; - trace_print_func hex; - trace_print_func binary; -}; - extern enum print_line_t trace_print_bprintk_msg_only(struct trace_iterator *iter); extern enum print_line_t @@ -33,8 +21,6 @@ extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); extern struct trace_event *ftrace_find_event(int type); -extern int register_ftrace_event(struct trace_event *event); -extern int unregister_ftrace_event(struct trace_event *event); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, int flags); From f42c85e74faa422cf0bc747ed808681145448f88 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 13 Apr 2009 12:25:37 -0400 Subject: [PATCH 0378/2061] tracing/events: move the ftrace event tracing code to core This patch moves the ftrace creation into include/trace/ftrace.h and simplifies the work of developers in adding new tracepoints. Just the act of creating the trace points in include/trace and including define_trace.h will create the events in the debugfs/tracing/events directory. This patch removes the need of include/trace/trace_events.h Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 4 + .../trace/ftrace.h | 215 +++++++++++++++++- include/trace/trace_events.h | 7 - kernel/trace/Makefile | 1 - kernel/trace/events.c | 15 -- kernel/trace/trace_events_stage_1.h | 39 ---- kernel/trace/trace_events_stage_2.h | 170 -------------- 7 files changed, 218 insertions(+), 233 deletions(-) rename kernel/trace/trace_events_stage_3.h => include/trace/ftrace.h (58%) delete mode 100644 include/trace/trace_events.h delete mode 100644 kernel/trace/events.c delete mode 100644 kernel/trace/trace_events_stage_1.h delete mode 100644 kernel/trace/trace_events_stage_2.h diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index de9dc7d8508..980eb66a6e3 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,6 +56,10 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) +#ifdef CONFIG_EVENT_TRACER +#include +#endif + #undef TRACE_HEADER_MULTI_READ /* Only undef what we defined in this file */ diff --git a/kernel/trace/trace_events_stage_3.h b/include/trace/ftrace.h similarity index 58% rename from kernel/trace/trace_events_stage_3.h rename to include/trace/ftrace.h index 45c04e1f38d..955b967acd7 100644 --- a/kernel/trace/trace_events_stage_3.h +++ b/include/trace/ftrace.h @@ -1,3 +1,216 @@ +/* + * Stage 1 of the trace events. + * + * Override the macros in to include the following: + * + * struct ftrace_raw_ { + * struct trace_entry ent; + * ; + * []; + * [...] + * }; + * + * The is created by the __field(type, item) macro or + * the __array(type2, item2, len) macro. + * We simply do "type item;", and that will create the fields + * in the structure. + */ + +#include + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) type item[len]; + +#undef __field +#define __field(type, item) type item; + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ + struct ftrace_raw_##name { \ + struct trace_entry ent; \ + tstruct \ + }; \ + static struct ftrace_event_call event_##name + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 2 of the trace events. + * + * Override the macros in to include the following: + * + * enum print_line_t + * ftrace_raw_output_(struct trace_iterator *iter, int flags) + * { + * struct trace_seq *s = &iter->seq; + * struct ftrace_raw_ *field; <-- defined in stage 1 + * struct trace_entry *entry; + * int ret; + * + * entry = iter->ent; + * + * if (entry->type != event_.id) { + * WARN_ON_ONCE(1); + * return TRACE_TYPE_UNHANDLED; + * } + * + * field = (typeof(field))entry; + * + * ret = trace_seq_printf(s, "\n"); + * if (!ret) + * return TRACE_TYPE_PARTIAL_LINE; + * + * return TRACE_TYPE_HANDLED; + * } + * + * This is the method used to print the raw event to the trace + * output format. Note, this is not needed if the data is read + * in binary. + */ + +#undef __entry +#define __entry field + +#undef TP_printk +#define TP_printk(fmt, args...) fmt "\n", args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ +enum print_line_t \ +ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ +{ \ + struct trace_seq *s = &iter->seq; \ + struct ftrace_raw_##call *field; \ + struct trace_entry *entry; \ + int ret; \ + \ + entry = iter->ent; \ + \ + if (entry->type != event_##call.id) { \ + WARN_ON_ONCE(1); \ + return TRACE_TYPE_UNHANDLED; \ + } \ + \ + field = (typeof(field))entry; \ + \ + ret = trace_seq_printf(s, #call ": " print); \ + if (!ret) \ + return TRACE_TYPE_PARTIAL_LINE; \ + \ + return TRACE_TYPE_HANDLED; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Setup the showing format of trace point. + * + * int + * ftrace_format_##call(struct trace_seq *s) + * { + * struct ftrace_raw_##call field; + * int ret; + * + * ret = trace_seq_printf(s, #type " " #item ";" + * " offset:%u; size:%u;\n", + * offsetof(struct ftrace_raw_##call, item), + * sizeof(field.type)); + * + * } + */ + +#undef TP_STRUCT__entry +#define TP_STRUCT__entry(args...) args + +#undef __field +#define __field(type, item) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __array +#define __array(type, item, len) \ + ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ + "offset:%u;\tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), item), \ + (unsigned int)sizeof(field.item)); \ + if (!ret) \ + return 0; + +#undef __entry +#define __entry REC + +#undef TP_printk +#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) + +#undef TP_fast_assign +#define TP_fast_assign(args...) args + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +static int \ +ftrace_format_##call(struct trace_seq *s) \ +{ \ + struct ftrace_raw_##call field; \ + int ret; \ + \ + tstruct; \ + \ + trace_seq_printf(s, "\nprint fmt: " print); \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +#undef __field +#define __field(type, item) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef __array +#define __array(type, item, len) \ + BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ + ret = trace_define_field(event_call, #type "[" #len "]", #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item)); \ + if (ret) \ + return ret; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ +int \ +ftrace_define_fields_##call(void) \ +{ \ + struct ftrace_raw_##call field; \ + struct ftrace_event_call *event_call = &event_##call; \ + int ret; \ + \ + __common_field(unsigned char, type); \ + __common_field(unsigned char, flags); \ + __common_field(unsigned char, preempt_count); \ + __common_field(int, pid); \ + __common_field(int, tgid); \ + \ + tstruct; \ + \ + return ret; \ +} + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 3 of the trace events. * @@ -272,7 +485,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ _TRACE_PROFILE_INIT(call) \ } -#include +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) #undef _TRACE_PROFILE #undef _TRACE_PROFILE_INIT diff --git a/include/trace/trace_events.h b/include/trace/trace_events.h deleted file mode 100644 index 13d6b85668c..00000000000 --- a/include/trace/trace_events.h +++ /dev/null @@ -1,7 +0,0 @@ -/* trace/.h here */ - -#include -#include -#include -#include -#include diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index 3ad367e7c97..fb9d7f96489 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -41,7 +41,6 @@ obj-$(CONFIG_KMEMTRACE) += kmemtrace.o obj-$(CONFIG_WORKQUEUE_TRACER) += trace_workqueue.o obj-$(CONFIG_BLK_DEV_IO_TRACE) += blktrace.o obj-$(CONFIG_EVENT_TRACING) += trace_events.o -obj-$(CONFIG_EVENT_TRACER) += events.o obj-$(CONFIG_EVENT_TRACING) += trace_export.o obj-$(CONFIG_FTRACE_SYSCALLS) += trace_syscalls.o obj-$(CONFIG_EVENT_PROFILE) += trace_event_profile.o diff --git a/kernel/trace/events.c b/kernel/trace/events.c deleted file mode 100644 index 5a35a914f0e..00000000000 --- a/kernel/trace/events.c +++ /dev/null @@ -1,15 +0,0 @@ -/* - * This is the place to register all trace points as events. - */ - -#include - -#include - -#include "trace_output.h" - -#define TRACE_HEADER_MULTI_READ -#include "trace_events_stage_1.h" -#include "trace_events_stage_2.h" -#include "trace_events_stage_3.h" - diff --git a/kernel/trace/trace_events_stage_1.h b/kernel/trace/trace_events_stage_1.h deleted file mode 100644 index 475f46a047a..00000000000 --- a/kernel/trace/trace_events_stage_1.h +++ /dev/null @@ -1,39 +0,0 @@ -/* - * Stage 1 of the trace events. - * - * Override the macros in to include the following: - * - * struct ftrace_raw_ { - * struct trace_entry ent; - * ; - * []; - * [...] - * }; - * - * The is created by the __field(type, item) macro or - * the __array(type2, item2, len) macro. - * We simply do "type item;", and that will create the fields - * in the structure. - */ - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - -#undef __array -#define __array(type, item, len) type item[len]; - -#undef __field -#define __field(type, item) type item; - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ - struct ftrace_raw_##name { \ - struct trace_entry ent; \ - tstruct \ - }; \ - static struct ftrace_event_call event_##name - -#include diff --git a/kernel/trace/trace_events_stage_2.h b/kernel/trace/trace_events_stage_2.h deleted file mode 100644 index aa4a67a0656..00000000000 --- a/kernel/trace/trace_events_stage_2.h +++ /dev/null @@ -1,170 +0,0 @@ -/* - * Stage 2 of the trace events. - * - * Override the macros in to include the following: - * - * enum print_line_t - * ftrace_raw_output_(struct trace_iterator *iter, int flags) - * { - * struct trace_seq *s = &iter->seq; - * struct ftrace_raw_ *field; <-- defined in stage 1 - * struct trace_entry *entry; - * int ret; - * - * entry = iter->ent; - * - * if (entry->type != event_.id) { - * WARN_ON_ONCE(1); - * return TRACE_TYPE_UNHANDLED; - * } - * - * field = (typeof(field))entry; - * - * ret = trace_seq_printf(s, "\n"); - * if (!ret) - * return TRACE_TYPE_PARTIAL_LINE; - * - * return TRACE_TYPE_HANDLED; - * } - * - * This is the method used to print the raw event to the trace - * output format. Note, this is not needed if the data is read - * in binary. - */ - -#undef __entry -#define __entry field - -#undef TP_printk -#define TP_printk(fmt, args...) fmt "\n", args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ -enum print_line_t \ -ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ -{ \ - struct trace_seq *s = &iter->seq; \ - struct ftrace_raw_##call *field; \ - struct trace_entry *entry; \ - int ret; \ - \ - entry = iter->ent; \ - \ - if (entry->type != event_##call.id) { \ - WARN_ON_ONCE(1); \ - return TRACE_TYPE_UNHANDLED; \ - } \ - \ - field = (typeof(field))entry; \ - \ - ret = trace_seq_printf(s, #call ": " print); \ - if (!ret) \ - return TRACE_TYPE_PARTIAL_LINE; \ - \ - return TRACE_TYPE_HANDLED; \ -} - -#include - -/* - * Setup the showing format of trace point. - * - * int - * ftrace_format_##call(struct trace_seq *s) - * { - * struct ftrace_raw_##call field; - * int ret; - * - * ret = trace_seq_printf(s, #type " " #item ";" - * " offset:%u; size:%u;\n", - * offsetof(struct ftrace_raw_##call, item), - * sizeof(field.type)); - * - * } - */ - -#undef TP_STRUCT__entry -#define TP_STRUCT__entry(args...) args - -#undef __field -#define __field(type, item) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __array -#define __array(type, item, len) \ - ret = trace_seq_printf(s, "\tfield:" #type " " #item "[" #len "];\t" \ - "offset:%u;\tsize:%u;\n", \ - (unsigned int)offsetof(typeof(field), item), \ - (unsigned int)sizeof(field.item)); \ - if (!ret) \ - return 0; - -#undef __entry -#define __entry REC - -#undef TP_printk -#define TP_printk(fmt, args...) "%s, %s\n", #fmt, __stringify(args) - -#undef TP_fast_assign -#define TP_fast_assign(args...) args - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -static int \ -ftrace_format_##call(struct trace_seq *s) \ -{ \ - struct ftrace_raw_##call field; \ - int ret; \ - \ - tstruct; \ - \ - trace_seq_printf(s, "\nprint fmt: " print); \ - \ - return ret; \ -} - -#include - -#undef __field -#define __field(type, item) \ - ret = trace_define_field(event_call, #type, #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef __array -#define __array(type, item, len) \ - BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ - ret = trace_define_field(event_call, #type "[" #len "]", #item, \ - offsetof(typeof(field), item), \ - sizeof(field.item)); \ - if (ret) \ - return ret; - -#undef TRACE_EVENT -#define TRACE_EVENT(call, proto, args, tstruct, func, print) \ -int \ -ftrace_define_fields_##call(void) \ -{ \ - struct ftrace_raw_##call field; \ - struct ftrace_event_call *event_call = &event_##call; \ - int ret; \ - \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ - \ - tstruct; \ - \ - return ret; \ -} - -#include From a59fd6027218bd7c994e39d14afe0242f895144f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 13:52:20 -0400 Subject: [PATCH 0379/2061] tracing/events: convert event call sites to use a link list Impact: makes it possible to define events in modules The events are created by reading down the section that they are linked in by the macros. But this is not scalable to modules. This patch converts the manipulations to use a global link list, and on boot up it adds the items in the section to the list. This change will allow modules to add their tracing events to the list as well. Note, this change alone does not permit modules to use the TRACE_EVENT macros, but the change is needed for them to eventually do so. Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace.h | 13 +------- kernel/trace/trace_event_profile.c | 4 +-- kernel/trace/trace_events.c | 51 ++++++++++++++++++------------ kernel/trace/trace_events_filter.c | 8 ++--- 5 files changed, 39 insertions(+), 38 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 496b76d9f9d..17810853b4f 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -83,6 +83,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event); void tracing_record_cmdline(struct task_struct *tsk); struct ftrace_event_call { + struct list_head list; char *name; char *system; struct dentry *dir; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 6bcdf4af9b2..8817c18ef97 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -739,11 +739,6 @@ struct event_subsystem { struct filter_pred **preds; }; -#define events_for_each(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - struct filter_pred; typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); @@ -785,13 +780,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } -extern struct ftrace_event_call __start_ftrace_events[]; -extern struct ftrace_event_call __stop_ftrace_events[]; - -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) +extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; extern const char *__stop___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 199de9c7422..7bf2ad65eee 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -11,7 +11,7 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_enable(event); } @@ -23,7 +23,7 @@ void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; - for_each_event(event) { + list_for_each_entry(event, &ftrace_events, list) { if (event->id == event_id) return event->profile_disable(event); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index ead68ac9919..5c66aaff07c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -19,6 +19,8 @@ static DEFINE_MUTEX(event_mutex); +LIST_HEAD(ftrace_events); + int trace_define_field(struct ftrace_event_call *call, char *type, char *name, int offset, int size) { @@ -54,16 +56,14 @@ err: static void ftrace_clear_events(void) { - struct ftrace_event_call *call = (void *)__start_ftrace_events; + struct ftrace_event_call *call; - - while ((unsigned long)call < (unsigned long)__stop_ftrace_events) { + list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { call->enabled = 0; call->unregfunc(); } - call++; } } @@ -89,7 +89,7 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, static int ftrace_set_clr_event(char *buf, int set) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; char *event = NULL, *sub = NULL, *match; int ret = -EINVAL; @@ -118,7 +118,7 @@ static int ftrace_set_clr_event(char *buf, int set) } mutex_lock(&event_mutex); - for_each_event(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; @@ -224,15 +224,17 @@ ftrace_event_write(struct file *file, const char __user *ubuf, static void * t_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next = call; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; for (;;) { - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + /* * The ftrace subsystem is for showing formats only. * They can not be enabled or disabled via the event files. @@ -240,11 +242,10 @@ t_next(struct seq_file *m, void *v, loff_t *pos) if (call->regfunc) break; - call++; - next = call; + list = list->next; } - m->private = ++next; + m->private = list->next; return call; } @@ -257,22 +258,23 @@ static void *t_start(struct seq_file *m, loff_t *pos) static void * s_next(struct seq_file *m, void *v, loff_t *pos) { - struct ftrace_event_call *call = m->private; - struct ftrace_event_call *next; + struct list_head *list = m->private; + struct ftrace_event_call *call; (*pos)++; retry: - if ((unsigned long)call >= (unsigned long)__stop_ftrace_events) + if (list == &ftrace_events) return NULL; + call = list_entry(list, struct ftrace_event_call, list); + if (!call->enabled) { - call++; + list = list->next; goto retry; } - next = call; - m->private = ++next; + m->private = list->next; return call; } @@ -312,7 +314,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) if (!ret) { struct seq_file *m = file->private_data; - m->private = __start_ftrace_events; + m->private = ftrace_events.next; } return ret; } @@ -797,9 +799,17 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; } +extern struct ftrace_event_call __start_ftrace_events[]; +extern struct ftrace_event_call __stop_ftrace_events[]; + +#define for_each_event(event) \ + for (event = __start_ftrace_events; \ + (unsigned long)event < (unsigned long)__stop_ftrace_events; \ + event++) + static __init int event_trace_init(void) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; @@ -830,6 +840,7 @@ static __init int event_trace_init(void) /* The linker may leave blanks */ if (!call->name) continue; + list_add(&call->list, &ftrace_events); event_create_dir(call, d_events); } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index de42dad42a8..d30b06b02b4 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -223,7 +223,7 @@ oom: void filter_free_subsystem_preds(struct event_subsystem *system) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; int i; if (system->n_preds) { @@ -234,7 +234,7 @@ void filter_free_subsystem_preds(struct event_subsystem *system) system->n_preds = 0; } - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -320,7 +320,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { - struct ftrace_event_call *call = __start_ftrace_events; + struct ftrace_event_call *call; if (system->n_preds && !pred->compound) filter_free_subsystem_preds(system); @@ -337,7 +337,7 @@ int filter_add_subsystem_pred(struct event_subsystem *system, system->preds[system->n_preds] = pred; - events_for_each(call) { + list_for_each_entry(call, &ftrace_events, list) { int err; if (!call->define_fields) From 17c873ec280a03894bc718af817f7f24fa787ae1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 18:12:50 -0400 Subject: [PATCH 0380/2061] tracing/events: add export symbols for trace events in modules Impact: let modules add trace events The trace event code requires some functions to be exported to allow modules to use TRACE_EVENT. This patch adds EXPORT_SYMBOL_GPL to the necessary functions. Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 3 +++ kernel/trace/trace_events.c | 1 + kernel/trace/trace_events_filter.c | 2 ++ kernel/trace/trace_output.c | 3 +++ 4 files changed, 9 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index c0047fcf707..2d69b26b3cc 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -176,6 +176,7 @@ int filter_current_check_discard(struct ftrace_event_call *call, void *rec, { return filter_check_discard(call, rec, global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(filter_current_check_discard); cycle_t ftrace_now(int cpu) { @@ -886,6 +887,7 @@ trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } +EXPORT_SYMBOL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) @@ -903,6 +905,7 @@ void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_function(struct trace_array *tr, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5c66aaff07c..8b9e621b80b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -53,6 +53,7 @@ err: return -ENOMEM; } +EXPORT_SYMBOL_GPL(trace_define_field); static void ftrace_clear_events(void) { diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index d30b06b02b4..f8e5eab0424 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -110,6 +110,7 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) return 1; } +EXPORT_SYMBOL_GPL(filter_match_preds); void filter_print_preds(struct filter_pred **preds, int n_preds, struct trace_seq *s) @@ -220,6 +221,7 @@ oom: return -ENOMEM; } +EXPORT_SYMBOL_GPL(init_preds); void filter_free_subsystem_preds(struct event_subsystem *system) { diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 0e70fb07ca7..83a8abb9640 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -94,6 +94,7 @@ trace_seq_printf(struct trace_seq *s, const char *fmt, ...) return len; } +EXPORT_SYMBOL_GPL(trace_seq_printf); int trace_seq_bprintf(struct trace_seq *s, const char *fmt, const u32 *binary) { @@ -538,6 +539,7 @@ int register_ftrace_event(struct trace_event *event) return ret; } +EXPORT_SYMBOL_GPL(register_ftrace_event); /** * unregister_ftrace_event - remove a no longer used event @@ -551,6 +553,7 @@ int unregister_ftrace_event(struct trace_event *event) return 0; } +EXPORT_SYMBOL_GPL(unregister_ftrace_event); /* * Standard events From 6d723736e472f7a0cd5b62c84152fceead241328 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 10 Apr 2009 14:53:50 -0400 Subject: [PATCH 0381/2061] tracing/events: add support for modules to TRACE_EVENT Impact: allow modules to add TRACE_EVENTS on load This patch adds the final hooks to allow modules to use the TRACE_EVENT macro. A notifier and a data structure are used to link the TRACE_EVENTs defined in the module to connect them with the ftrace event tracing system. It also adds the necessary automated clean ups to the trace events when a module is removed. Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 3 + include/linux/module.h | 4 ++ include/linux/trace_seq.h | 2 + include/trace/ftrace.h | 1 + kernel/module.c | 7 ++ kernel/trace/trace_events.c | 128 ++++++++++++++++++++++++++--------- 6 files changed, 113 insertions(+), 32 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 17810853b4f..75f3ac01a87 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -7,6 +7,7 @@ struct trace_array; struct tracer; +struct dentry; /* * The trace entry - the most basic unit of tracing. This is what @@ -87,6 +88,7 @@ struct ftrace_event_call { char *name; char *system; struct dentry *dir; + struct trace_event *event; int enabled; int (*regfunc)(void); void (*unregfunc)(void); @@ -97,6 +99,7 @@ struct ftrace_event_call { struct list_head fields; int n_preds; struct filter_pred **preds; + void *mod; #ifdef CONFIG_EVENT_PROFILE atomic_t profile_count; diff --git a/include/linux/module.h b/include/linux/module.h index 627ac082e2a..6155fa44168 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -337,6 +337,10 @@ struct module const char **trace_bprintk_fmt_start; unsigned int num_trace_bprintk_fmt; #endif +#ifdef CONFIG_EVENT_TRACING + struct ftrace_event_call *trace_events; + unsigned int num_trace_events; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 28051da876d..15ca2c71af1 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -1,6 +1,8 @@ #ifndef _LINUX_TRACE_SEQ_H #define _LINUX_TRACE_SEQ_H +#include + /* * Trace sequences are used to allow a function to call several other functions * to create a string of data to use (up to a max of PAGE_SIZE. diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 955b967acd7..60c5323bee6 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -477,6 +477,7 @@ __attribute__((__aligned__(4))) \ __attribute__((section("_ftrace_events"))) event_##call = { \ .name = #call, \ .system = __stringify(TRACE_SYSTEM), \ + .event = &ftrace_event_type_##call, \ .raw_init = ftrace_raw_init_event_##call, \ .regfunc = ftrace_raw_reg_event_##call, \ .unregfunc = ftrace_raw_unreg_event_##call, \ diff --git a/kernel/module.c b/kernel/module.c index e797812a4d9..a0394706f10 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -18,6 +18,7 @@ */ #include #include +#include #include #include #include @@ -2172,6 +2173,12 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->tracepoints), &mod->num_tracepoints); #endif +#ifdef CONFIG_EVENT_TRACING + mod->trace_events = section_objs(hdr, sechdrs, secstrings, + "_ftrace_events", + sizeof(*mod->trace_events), + &mod->num_trace_events); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8b9e621b80b..a4b177720a6 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -713,7 +713,13 @@ event_subsystem_dir(const char *name, struct dentry *d_events) return d_events; } - system->name = name; + system->name = kstrdup(name, GFP_KERNEL); + if (!system->name) { + debugfs_remove(system->entry); + kfree(system); + return d_events; + } + list_add(&system->list, &event_subsystems); system->preds = NULL; @@ -738,7 +744,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) * If the trace point header did not define TRACE_SYSTEM * then the system would be called "TRACE_SYSTEM". */ - if (strcmp(call->system, "TRACE_SYSTEM") != 0) + if (strcmp(call->system, TRACE_SYSTEM) != 0) d_events = event_subsystem_dir(call->system, d_events); if (call->raw_init) { @@ -757,21 +763,13 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return -1; } - if (call->regfunc) { - entry = debugfs_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/enable' entry\n", call->name); - } + if (call->regfunc) + entry = trace_create_file("enable", 0644, call->dir, call, + &ftrace_enable_fops); - if (call->id) { - entry = debugfs_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); - if (!entry) - pr_warning("Could not create debugfs '%s/id' entry\n", - call->name); - } + if (call->id) + entry = trace_create_file("id", 0444, call->dir, call, + &ftrace_event_id_fops); if (call->define_fields) { ret = call->define_fields(); @@ -780,40 +778,102 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) " events/%s\n", call->name); return ret; } - entry = debugfs_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/filter' entry\n", call->name); + entry = trace_create_file("filter", 0644, call->dir, call, + &ftrace_event_filter_fops); } /* A trace may not want to export its format */ if (!call->show_format) return 0; - entry = debugfs_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); - if (!entry) - pr_warning("Could not create debugfs " - "'%s/format' entry\n", call->name); + entry = trace_create_file("format", 0444, call->dir, call, + &ftrace_event_format_fops); return 0; } +#define for_each_event(event, start, end) \ + for (event = start; \ + (unsigned long)event < (unsigned long)end; \ + event++) + +static void trace_module_add_events(struct module *mod) +{ + struct ftrace_event_call *call, *start, *end; + struct dentry *d_events; + + start = mod->trace_events; + end = mod->trace_events + mod->num_trace_events; + + if (start == end) + return; + + d_events = event_trace_events_dir(); + if (!d_events) + return; + + for_each_event(call, start, end) { + /* The linker may leave blanks */ + if (!call->name) + continue; + call->mod = mod; + list_add(&call->list, &ftrace_events); + event_create_dir(call, d_events); + } +} + +static void trace_module_remove_events(struct module *mod) +{ + struct ftrace_event_call *call, *p; + + list_for_each_entry_safe(call, p, &ftrace_events, list) { + if (call->mod == mod) { + if (call->enabled) { + call->enabled = 0; + call->unregfunc(); + } + if (call->event) + unregister_ftrace_event(call->event); + debugfs_remove_recursive(call->dir); + list_del(&call->list); + } + } +} + +int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + mutex_lock(&event_mutex); + switch (val) { + case MODULE_STATE_COMING: + trace_module_add_events(mod); + break; + case MODULE_STATE_GOING: + trace_module_remove_events(mod); + break; + } + mutex_unlock(&event_mutex); + + return 0; +} + +struct notifier_block trace_module_nb = { + .notifier_call = trace_module_notify, + .priority = 0, +}; + extern struct ftrace_event_call __start_ftrace_events[]; extern struct ftrace_event_call __stop_ftrace_events[]; -#define for_each_event(event) \ - for (event = __start_ftrace_events; \ - (unsigned long)event < (unsigned long)__stop_ftrace_events; \ - event++) - static __init int event_trace_init(void) { struct ftrace_event_call *call; struct dentry *d_tracer; struct dentry *entry; struct dentry *d_events; + int ret; d_tracer = tracing_init_dentry(); if (!d_tracer) @@ -837,7 +897,7 @@ static __init int event_trace_init(void) if (!d_events) return 0; - for_each_event(call) { + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) continue; @@ -845,6 +905,10 @@ static __init int event_trace_init(void) event_create_dir(call, d_events); } + ret = register_module_notifier(&trace_module_nb); + if (!ret) + pr_warning("Failed to register trace events module notifier\n"); + return 0; } fs_initcall(event_trace_init); From 19e4529ee7345079eeacc8e40cf69a304a64dc23 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 14 Apr 2009 17:27:18 +1000 Subject: [PATCH 0382/2061] modules: Fix up build when CONFIG_MODULE_UNLOAD=n. Commit 3d43321b7015387cfebbe26436d0e9d299162ea1 ("modules: sysctl to block module loading") introduces a modules_disabled variable that is only defined if CONFIG_MODULE_UNLOAD is enabled, despite being used in other places. This moves it up and fixes up the build. CC kernel/module.o kernel/module.c: In function 'sys_init_module': kernel/module.c:2401: error: 'modules_disabled' undeclared (first use in this function) kernel/module.c:2401: error: (Each undeclared identifier is reported only once kernel/module.c:2401: error: for each function it appears in.) make[1]: *** [kernel/module.o] Error 1 make: *** [kernel/module.o] Error 2 Signed-off-by: Paul Mundt Signed-off-by: James Morris --- kernel/module.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/module.c b/kernel/module.c index eeb3f7b1383..ee7ab612daf 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -71,6 +71,9 @@ static DEFINE_MUTEX(module_mutex); static LIST_HEAD(modules); +/* Block module loading/unloading? */ +int modules_disabled = 0; + /* Waiting for a module to finish initializing? */ static DECLARE_WAIT_QUEUE_HEAD(module_wq); @@ -778,9 +781,6 @@ static void wait_for_zero_refcount(struct module *mod) mutex_lock(&module_mutex); } -/* Block module loading/unloading? */ -int modules_disabled = 0; - SYSCALL_DEFINE2(delete_module, const char __user *, name_user, unsigned int, flags) { From 61f919a12fbdc3fd20f980a34a118d597198a392 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:22:32 -0400 Subject: [PATCH 0383/2061] tracing/events: fix compile for modules disabled Impact: compile fix The addition of TRACE_EVENT for modules breaks the build for when modules are disabled. This code fixes that. Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index a4b177720a6..6591d83e1e7 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -797,6 +797,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) (unsigned long)event < (unsigned long)end; \ event++) +#ifdef CONFIG_MODULES static void trace_module_add_events(struct module *mod) { struct ftrace_event_call *call, *start, *end; @@ -840,8 +841,8 @@ static void trace_module_remove_events(struct module *mod) } } -int trace_module_notify(struct notifier_block *self, - unsigned long val, void *data) +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) { struct module *mod = data; @@ -858,6 +859,13 @@ int trace_module_notify(struct notifier_block *self, return 0; } +#else +static int trace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ struct notifier_block trace_module_nb = { .notifier_call = trace_module_notify, From ecda8ae02a08ef065ff387f5cb2a2d4999da2408 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 18:49:38 -0400 Subject: [PATCH 0384/2061] tracing/events: fix lockdep system name Impact: fix compile error of lockdep event tracer Ingo Molnar pointed out that the system name for the lockdep tracer was "lock" which is used to include the event trace file name. It should be "lockdep" Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- include/trace/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/lockdep.h b/include/trace/lockdep.h index 4d301e758de..45e326b5c7f 100644 --- a/include/trace/lockdep.h +++ b/include/trace/lockdep.h @@ -5,7 +5,7 @@ #include #undef TRACE_SYSTEM -#define TRACE_SYSTEM lock +#define TRACE_SYSTEM lockdep #ifdef CONFIG_LOCKDEP From ad8d75fff811a6a230f7f43b05a6483099349533 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 19:39:12 -0400 Subject: [PATCH 0385/2061] tracing/events: move trace point headers into include/trace/events Impact: clean up Create a sub directory in include/trace called events to keep the trace point headers in their own separate directory. Only headers that declare trace points should be defined in this directory. Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Neil Horman Cc: Zhao Lei Cc: Eduard - Gabriel Munteanu Cc: Pekka Enberg Signed-off-by: Steven Rostedt --- include/linux/kmemtrace.h | 2 +- include/trace/define_trace.h | 2 +- include/trace/{ => events}/irq.h | 0 include/trace/{ => events}/kmem.h | 0 include/trace/{ => events}/lockdep.h | 0 include/trace/{ => events}/sched.h | 0 include/trace/{ => events}/skb.h | 0 kernel/exit.c | 2 +- kernel/fork.c | 3 ++- kernel/irq/handle.c | 2 +- kernel/kthread.c | 2 +- kernel/lockdep.c | 2 +- kernel/sched.c | 2 +- kernel/signal.c | 2 +- kernel/softirq.c | 2 +- kernel/trace/ftrace.c | 2 +- kernel/trace/trace_sched_switch.c | 2 +- kernel/trace/trace_sched_wakeup.c | 2 +- mm/util.c | 2 +- net/core/drop_monitor.c | 2 +- net/core/net-traces.c | 2 +- net/core/skbuff.c | 2 +- 22 files changed, 18 insertions(+), 17 deletions(-) rename include/trace/{ => events}/irq.h (100%) rename include/trace/{ => events}/kmem.h (100%) rename include/trace/{ => events}/lockdep.h (100%) rename include/trace/{ => events}/sched.h (100%) rename include/trace/{ => events}/skb.h (100%) diff --git a/include/linux/kmemtrace.h b/include/linux/kmemtrace.h index 15c45a27a92..b616d3930c3 100644 --- a/include/linux/kmemtrace.h +++ b/include/linux/kmemtrace.h @@ -9,7 +9,7 @@ #ifdef __KERNEL__ -#include +#include #ifdef CONFIG_KMEMTRACE extern void kmemtrace_init(void); diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 980eb66a6e3..18869417109 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -43,7 +43,7 @@ #endif #ifndef TRACE_INCLUDE_PATH -# define __TRACE_INCLUDE(system) +# define __TRACE_INCLUDE(system) # define UNDEF_TRACE_INCLUDE_FILE #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) diff --git a/include/trace/irq.h b/include/trace/events/irq.h similarity index 100% rename from include/trace/irq.h rename to include/trace/events/irq.h diff --git a/include/trace/kmem.h b/include/trace/events/kmem.h similarity index 100% rename from include/trace/kmem.h rename to include/trace/events/kmem.h diff --git a/include/trace/lockdep.h b/include/trace/events/lockdep.h similarity index 100% rename from include/trace/lockdep.h rename to include/trace/events/lockdep.h diff --git a/include/trace/sched.h b/include/trace/events/sched.h similarity index 100% rename from include/trace/sched.h rename to include/trace/events/sched.h diff --git a/include/trace/skb.h b/include/trace/events/skb.h similarity index 100% rename from include/trace/skb.h rename to include/trace/events/skb.h diff --git a/kernel/exit.c b/kernel/exit.c index 2fe9d2c7eee..cab535c427b 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,7 +48,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/fork.c b/kernel/fork.c index 4bebf263923..085f73ebcea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -61,7 +61,6 @@ #include #include #include -#include #include #include @@ -71,6 +70,8 @@ #include #include +#include + /* * Protected counters by write_lock_irq(&tasklist_lock) */ diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 983d8be8dff..37c63633e78 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -20,7 +20,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include #include "internals.h" diff --git a/kernel/kthread.c b/kernel/kthread.c index e1c76924545..41c88fe4050 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #define KTHREAD_NICE_LEVEL (-5) diff --git a/kernel/lockdep.c b/kernel/lockdep.c index 257f21a76c5..47b201ecc6d 100644 --- a/kernel/lockdep.c +++ b/kernel/lockdep.c @@ -48,7 +48,7 @@ #include "lockdep_internals.h" #define CREATE_TRACE_POINTS -#include +#include #ifdef CONFIG_PROVE_LOCKING int prove_locking = 1; diff --git a/kernel/sched.c b/kernel/sched.c index e6d4518d47e..9f7ffd00b6e 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -79,7 +79,7 @@ #include "sched_cpupri.h" #define CREATE_TRACE_POINTS -#include +#include /* * Convert user-nice values [ -20 ... 0 ... 19 ] diff --git a/kernel/signal.c b/kernel/signal.c index 1d5703ff003..94ec0a4dde0 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -27,7 +27,7 @@ #include #include #include -#include +#include #include #include diff --git a/kernel/softirq.c b/kernel/softirq.c index a2d9b458ac2..7ab9dfd8d08 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include /* diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 8e6a0b5c994..a2348898858 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -29,7 +29,7 @@ #include #include -#include +#include #include diff --git a/kernel/trace/trace_sched_switch.c b/kernel/trace/trace_sched_switch.c index 9d8cccdfaa0..a98106dd979 100644 --- a/kernel/trace/trace_sched_switch.c +++ b/kernel/trace/trace_sched_switch.c @@ -10,7 +10,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index 5bc00e8f153..b8b13c5540f 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -15,7 +15,7 @@ #include #include #include -#include +#include #include "trace.h" diff --git a/mm/util.c b/mm/util.c index 0e74a22791c..6794a336e9a 100644 --- a/mm/util.c +++ b/mm/util.c @@ -7,7 +7,7 @@ #include #define CREATE_TRACE_POINTS -#include +#include /** * kstrdup - allocate space for and copy an existing string diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index 9fd0dc3cca9..b75b6cea49d 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -23,7 +23,7 @@ #include #include -#include +#include #include diff --git a/net/core/net-traces.c b/net/core/net-traces.c index 80177205947..499a67eaf3a 100644 --- a/net/core/net-traces.c +++ b/net/core/net-traces.c @@ -24,6 +24,6 @@ #include #define CREATE_TRACE_POINTS -#include +#include EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index ce6356cd9f7..12806b84445 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -65,7 +65,7 @@ #include #include -#include +#include #include "kmap_skb.h" From 9cfe06f8cd5c8c3ad6ab323973e87dde670642b8 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 14 Apr 2009 21:37:03 -0400 Subject: [PATCH 0386/2061] tracing/events: add trace-events-sample This patch adds a sample to the samples directory on how to create and use TRACE_EVENT trace points. Signed-off-by: Steven Rostedt --- samples/Kconfig | 7 ++ samples/Makefile | 2 +- samples/trace_events/Makefile | 8 ++ samples/trace_events/trace-events-sample.c | 56 ++++++++++ samples/trace_events/trace-events-sample.h | 124 +++++++++++++++++++++ 5 files changed, 196 insertions(+), 1 deletion(-) create mode 100644 samples/trace_events/Makefile create mode 100644 samples/trace_events/trace-events-sample.c create mode 100644 samples/trace_events/trace-events-sample.h diff --git a/samples/Kconfig b/samples/Kconfig index 4b02f5a0e65..93f41c05109 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -19,6 +19,13 @@ config SAMPLE_TRACEPOINTS help This build tracepoints example modules. +config SAMPLE_TRACE_EVENTS + tristate "Build trace_events examples" + depends on EVENT_TRACING + default m + help + This build trace event example modules. + config SAMPLE_KOBJECT tristate "Build kobject examples" help diff --git a/samples/Makefile b/samples/Makefile index 10eaca89fe1..13e4b470b53 100644 --- a/samples/Makefile +++ b/samples/Makefile @@ -1,3 +1,3 @@ # Makefile for Linux samples code -obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ +obj-$(CONFIG_SAMPLES) += markers/ kobject/ kprobes/ tracepoints/ trace_events/ diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile new file mode 100644 index 00000000000..06c6dea1eb8 --- /dev/null +++ b/samples/trace_events/Makefile @@ -0,0 +1,8 @@ +# builds the trace events example kernel modules; +# then to use one (as root): insmod + +PWD := $(shell pwd) + +CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + +obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c new file mode 100644 index 00000000000..f33b3ba744a --- /dev/null +++ b/samples/trace_events/trace-events-sample.c @@ -0,0 +1,56 @@ +#include +#include + +/* + * Any file that uses trace points, must include the header. + * But only one file, must include the header by defining + * CREATE_TRACE_POINTS first. This will make the C code that + * creates the handles for the trace points. + */ +#define CREATE_TRACE_POINTS +#include "trace-events-sample.h" + + +static void simple_thread_func(int cnt) +{ + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ); + trace_foo_bar("hello", cnt); + + if (!(cnt % 10)) + /* It is really important that I say "hi!" */ + printk(KERN_EMERG "hi!\n"); +} + +static int simple_thread(void *arg) +{ + int cnt = 0; + + while (!kthread_should_stop()) + simple_thread_func(cnt++); + + return 0; +} + +static struct task_struct *simple_tsk; + +static int __init trace_event_init(void) +{ + simple_tsk = kthread_run(simple_thread, NULL, "event-sample"); + if (IS_ERR(simple_tsk)) + return -1; + + return 0; +} + +static void __exit trace_event_exit(void) +{ + kthread_stop(simple_tsk); +} + +module_init(trace_event_init); +module_exit(trace_event_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("trace-events-sample"); +MODULE_LICENSE("GPL"); diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h new file mode 100644 index 00000000000..eab46443e61 --- /dev/null +++ b/samples/trace_events/trace-events-sample.h @@ -0,0 +1,124 @@ +/* + * Notice that this file is not protected like a normal header. + * We also must allow for rereading of this file. The + * + * || defined(TRACE_HEADER_MULTI_READ) + * + * serves this purpose. + */ +#if !defined(_TRACE_EVENT_SAMPLE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_EVENT_SAMPLE_H + +/* + * All trace headers should include tracepoint.h, until we finally + * make it into a standard header. + */ +#include + +/* + * If TRACE_SYSTEM is defined, that will be the directory created + * in the ftrace directory under /debugfs/tracing/events/ + * + * The define_trace.h belowe will also look for a file name of + * TRACE_SYSTEM.h where TRACE_SYSTEM is what is defined here. + * + * If you want a different system than file name, you can override + * the header name by defining TRACE_INCLUDE_FILE + * + * If this file was called, goofy.h, then we would define: + * + * #define TRACE_INCLUDE_FILE goofy + * + */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM trace-events-sample + +/* + * The TRACE_EVENT macro is broken up into 5 parts. + * + * name: name of the trace point. This is also how to enable the tracepoint. + * A function called trace_foo_bar() will be created. + * + * proto: the prototype of the function trace_foo_bar() + * Here it is trace_foo_bar(char *foo, int bar). + * + * args: must match the arguments in the prototype. + * Here it is simply "foo, bar". + * + * struct: This defines the way the data will be stored in the ring buffer. + * There are currently two types of elements. __field and __array. + * a __field is broken up into (type, name). Where type can be any + * type but an array. + * For an array. there are three fields. (type, name, size). The + * type of elements in the array, the name of the field and the size + * of the array. + * + * __array( char, foo, 10) is the same as saying char foo[10]. + * + * fast_assign: This is a C like function that is used to store the items + * into the ring buffer. + * + * printk: This is a way to print out the data in pretty print. This is + * useful if the system crashes and you are logging via a serial line, + * the data can be printed to the console using this "printk" method. + * + * Note, that for both the assign and the printk, __entry is the handler + * to the data structure in the ring buffer, and is defined by the + * TP_STRUCT__entry. + */ +TRACE_EVENT(foo_bar, + + TP_PROTO(char *foo, int bar), + + TP_ARGS(foo, bar), + + TP_STRUCT__entry( + __array( char, foo, 10 ) + __field( int, bar ) + ), + + TP_fast_assign( + strncpy(__entry->foo, foo, 10); + __entry->bar = bar; + ), + + TP_printk("foo %s %d", __entry->foo, __entry->bar) +); +#endif + +/***** NOTICE! The #if protection ends here. *****/ + + +/* + * There are several ways I could have done this. If I left out the + * TRACE_INCLUDE_PATH, then it would default to the kernel source + * include/trace/events directory. + * + * I could specify a path from the define_trace.h file back to this + * file. + * + * #define TRACE_INCLUDE_PATH ../../samples/trace_events + * + * But I chose to simply make it use the current directory and then in + * the Makefile I added: + * + * CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ + * + * This will make sure the current path is part of the include + * structure for our file so that we can find it. + * + * I could have made only the top level directory the include: + * + * CFLAGS_trace-events-sample.o := -I$(PWD) + * + * And then let the path to this directory be the TRACE_INCLUDE_PATH: + * + * #define TRACE_INCLUDE_PATH samples/trace_events + * + * But then if something defines "samples" or "trace_events" then we + * could risk that being converted too, and give us an unexpected + * result. + */ +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . +#include From 05725f7eb4b8acb147c5fc7b91397b1f6bcab00d Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 14 Apr 2009 20:17:16 +0200 Subject: [PATCH 0387/2061] rculist: use list_entry_rcu in places where it's appropriate Use previously introduced list_entry_rcu instead of an open-coded list_entry + rcu_dereference combination. Signed-off-by: Jiri Pirko Reviewed-by: Paul E. McKenney Cc: dipankar@in.ibm.com LKML-Reference: <20090414181715.GA3634@psychotron.englab.brq.redhat.com> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 8 +++++--- ipc/sem.c | 4 ++-- security/integrity/ima/ima_fs.c | 4 ++-- security/smack/smackfs.c | 8 ++++---- 4 files changed, 13 insertions(+), 11 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..886df41e745 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -77,6 +77,7 @@ struct sched_param { #include #include #include +#include #include #include @@ -2010,7 +2011,8 @@ static inline unsigned long wait_task_inactive(struct task_struct *p, } #endif -#define next_task(p) list_entry(rcu_dereference((p)->tasks.next), struct task_struct, tasks) +#define next_task(p) \ + list_entry_rcu((p)->tasks.next, struct task_struct, tasks) #define for_each_process(p) \ for (p = &init_task ; (p = next_task(p)) != &init_task ; ) @@ -2049,8 +2051,8 @@ int same_thread_group(struct task_struct *p1, struct task_struct *p2) static inline struct task_struct *next_thread(const struct task_struct *p) { - return list_entry(rcu_dereference(p->thread_group.next), - struct task_struct, thread_group); + return list_entry_rcu(p->thread_group.next, + struct task_struct, thread_group); } static inline int thread_group_empty(struct task_struct *p) diff --git a/ipc/sem.c b/ipc/sem.c index 16a2189e96f..87c2b641fd7 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1290,8 +1290,8 @@ void exit_sem(struct task_struct *tsk) int i; rcu_read_lock(); - un = list_entry(rcu_dereference(ulp->list_proc.next), - struct sem_undo, list_proc); + un = list_entry_rcu(ulp->list_proc.next, + struct sem_undo, list_proc); if (&un->list_proc == &ulp->list_proc) semid = -1; else diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ffbe259700b..510186f0b72 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -84,8 +84,8 @@ static void *ima_measurements_next(struct seq_file *m, void *v, loff_t *pos) * against concurrent list-extension */ rcu_read_lock(); - qe = list_entry(rcu_dereference(qe->later.next), - struct ima_queue_entry, later); + qe = list_entry_rcu(qe->later.next, + struct ima_queue_entry, later); rcu_read_unlock(); (*pos)++; diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index e03a7e19c73..11d2cb19d7a 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -734,8 +734,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) return; } - m = list_entry(rcu_dereference(smk_netlbladdr_list.next), - struct smk_netlbladdr, list); + m = list_entry_rcu(smk_netlbladdr_list.next, + struct smk_netlbladdr, list); /* the comparison '>' is a bit hacky, but works */ if (new->smk_mask.s_addr > m->smk_mask.s_addr) { @@ -748,8 +748,8 @@ static void smk_netlbladdr_insert(struct smk_netlbladdr *new) list_add_rcu(&new->list, &m->list); return; } - m_next = list_entry(rcu_dereference(m->list.next), - struct smk_netlbladdr, list); + m_next = list_entry_rcu(m->list.next, + struct smk_netlbladdr, list); if (new->smk_mask.s_addr > m_next->smk_mask.s_addr) { list_add_rcu(&new->list, &m->list); return; From b206525ad1f653b7da35f5827be93770d28eae11 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 14 Apr 2009 23:04:37 +0530 Subject: [PATCH 0388/2061] x86: k8 convert node_to_k8_nb_misc() from a macro to an inline function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Converting node_to_k8_nb_misc() from a macro to an inline function makes compiler see the 'node' parameter in the !CONFIG_K8_NB too, which eliminates these compiler warnings: arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘show_cache_disable’: arch/x86/kernel/cpu/intel_cacheinfo.c:712: warning: unused variable ‘node’ arch/x86/kernel/cpu/intel_cacheinfo.c: In function ‘store_cache_disable’: arch/x86/kernel/cpu/intel_cacheinfo.c:739: warning: unused variable ‘node’ Signed-off-by: Jaswinder Singh Rajput Cc: Andreas Herrmann Cc: Mark Langsdorf LKML-Reference: <1239730477.2966.26.camel@ht.satnam> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/k8.h | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/k8.h b/arch/x86/include/asm/k8.h index c23b3d171be..c2d1f3b58e5 100644 --- a/arch/x86/include/asm/k8.h +++ b/arch/x86/include/asm/k8.h @@ -13,10 +13,15 @@ extern void k8_flush_garts(void); extern int k8_scan_nodes(unsigned long start, unsigned long end); #ifdef CONFIG_K8_NB -#define node_to_k8_nb_misc(node) \ - (node < num_k8_northbridges) ? k8_northbridges[node] : NULL +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return (node < num_k8_northbridges) ? k8_northbridges[node] : NULL; +} #else -#define node_to_k8_nb_misc(node) NULL +static inline struct pci_dev *node_to_k8_nb_misc(int node) +{ + return NULL; +} #endif From e6a1a89d572c31b62d6dcf11a371c7323852d9b2 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Wed, 15 Apr 2009 18:22:41 +0900 Subject: [PATCH 0389/2061] dma-debug: add dma_debug_resize_entries() to adjust the number of dma_debug_entries We use a static value for the number of dma_debug_entries. It can be overwritten by a kernel command line option. Some IOMMUs (e.g. GART) can't set an appropriate value by a kernel command line option because they can't know such value until they finish initializing up their hardware. This patch adds dma_debug_resize_entries() enables IOMMUs to adjust the number of dma_debug_entries anytime. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: fujita.tomonori@lab.ntt.co.jp Cc: akpm@linux-foundation.org LKML-Reference: <20090415182234R.fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- include/linux/dma-debug.h | 7 ++++ lib/dma-debug.c | 72 +++++++++++++++++++++++++++++++++++---- 2 files changed, 73 insertions(+), 6 deletions(-) diff --git a/include/linux/dma-debug.h b/include/linux/dma-debug.h index 28d53cb7b5a..171ad8aedc8 100644 --- a/include/linux/dma-debug.h +++ b/include/linux/dma-debug.h @@ -32,6 +32,8 @@ extern void dma_debug_add_bus(struct bus_type *bus); extern void dma_debug_init(u32 num_entries); +extern int dma_debug_resize_entries(u32 num_entries); + extern void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, @@ -91,6 +93,11 @@ static inline void dma_debug_init(u32 num_entries) { } +static inline int dma_debug_resize_entries(u32 num_entries) +{ + return 0; +} + static inline void debug_dma_map_page(struct device *dev, struct page *page, size_t offset, size_t size, int direction, dma_addr_t dma_addr, diff --git a/lib/dma-debug.c b/lib/dma-debug.c index d3da7edc034..5d61019330c 100644 --- a/lib/dma-debug.c +++ b/lib/dma-debug.c @@ -85,6 +85,7 @@ static u32 show_num_errors = 1; static u32 num_free_entries; static u32 min_free_entries; +static u32 nr_total_entries; /* number of preallocated entries requested by kernel cmdline */ static u32 req_entries; @@ -257,6 +258,21 @@ static void add_dma_entry(struct dma_debug_entry *entry) put_hash_bucket(bucket, &flags); } +static struct dma_debug_entry *__dma_entry_alloc(void) +{ + struct dma_debug_entry *entry; + + entry = list_entry(free_entries.next, struct dma_debug_entry, list); + list_del(&entry->list); + memset(entry, 0, sizeof(*entry)); + + num_free_entries -= 1; + if (num_free_entries < min_free_entries) + min_free_entries = num_free_entries; + + return entry; +} + /* struct dma_entry allocator * * The next two functions implement the allocator for @@ -276,9 +292,7 @@ static struct dma_debug_entry *dma_entry_alloc(void) goto out; } - entry = list_entry(free_entries.next, struct dma_debug_entry, list); - list_del(&entry->list); - memset(entry, 0, sizeof(*entry)); + entry = __dma_entry_alloc(); #ifdef CONFIG_STACKTRACE entry->stacktrace.max_entries = DMA_DEBUG_STACKTRACE_ENTRIES; @@ -286,9 +300,6 @@ static struct dma_debug_entry *dma_entry_alloc(void) entry->stacktrace.skip = 2; save_stack_trace(&entry->stacktrace); #endif - num_free_entries -= 1; - if (num_free_entries < min_free_entries) - min_free_entries = num_free_entries; out: spin_unlock_irqrestore(&free_entries_lock, flags); @@ -310,6 +321,53 @@ static void dma_entry_free(struct dma_debug_entry *entry) spin_unlock_irqrestore(&free_entries_lock, flags); } +int dma_debug_resize_entries(u32 num_entries) +{ + int i, delta, ret = 0; + unsigned long flags; + struct dma_debug_entry *entry; + LIST_HEAD(tmp); + + spin_lock_irqsave(&free_entries_lock, flags); + + if (nr_total_entries < num_entries) { + delta = num_entries - nr_total_entries; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + for (i = 0; i < delta; i++) { + entry = kzalloc(sizeof(*entry), GFP_KERNEL); + if (!entry) + break; + + list_add_tail(&entry->list, &tmp); + } + + spin_lock_irqsave(&free_entries_lock, flags); + + list_splice(&tmp, &free_entries); + nr_total_entries += i; + num_free_entries += i; + } else { + delta = nr_total_entries - num_entries; + + for (i = 0; i < delta && !list_empty(&free_entries); i++) { + entry = __dma_entry_alloc(); + kfree(entry); + } + + nr_total_entries -= i; + } + + if (nr_total_entries != num_entries) + ret = 1; + + spin_unlock_irqrestore(&free_entries_lock, flags); + + return ret; +} +EXPORT_SYMBOL(dma_debug_resize_entries); + /* * DMA-API debugging init code * @@ -490,6 +548,8 @@ void dma_debug_init(u32 num_entries) return; } + nr_total_entries = num_free_entries; + printk(KERN_INFO "DMA-API: debugging enabled by kernel config\n"); } From 19c1a6f5764d787113fa323ffb18be7991208f82 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Tue, 14 Apr 2009 09:43:19 +0900 Subject: [PATCH 0390/2061] x86 gart: reimplement IOMMU_LEAK feature by using DMA_API_DEBUG IOMMU_LEAK, GART's own feature, dumps the used IOMMU entries when IOMMU entries is full, which might be useful to find a bad driver that eats IOMMU entries. DMA_API_DEBUG provides the similar feature, debug_dma_dump_mappings, and it's better than GART's IOMMU_LEAK feature. GART's IOMMU_LEAK feature doesn't say who uses IOMMU entries so it's hard to find a bad driver. This patch reimplements the GART's IOMMU_LEAK feature by using DMA_API_DEBUG. Signed-off-by: FUJITA Tomonori Acked-by: Joerg Roedel Cc: Andrew Morton LKML-Reference: <1239669799-23579-2-git-send-email-fujita.tomonori@lab.ntt.co.jp> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 3 +-- arch/x86/kernel/pci-gart_64.c | 45 +++++++---------------------------- 2 files changed, 9 insertions(+), 39 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index d8359e73317..5865712d105 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -161,8 +161,7 @@ config IOMMU_DEBUG config IOMMU_LEAK bool "IOMMU leak tracing" - depends on DEBUG_KERNEL - depends on IOMMU_DEBUG + depends on IOMMU_DEBUG && DMA_API_DEBUG ---help--- Add a simple leak tracer to the IOMMU code. This is useful when you are debugging a buggy device driver that leaks IOMMU mappings. diff --git a/arch/x86/kernel/pci-gart_64.c b/arch/x86/kernel/pci-gart_64.c index b284b58c035..1e8920d98f7 100644 --- a/arch/x86/kernel/pci-gart_64.c +++ b/arch/x86/kernel/pci-gart_64.c @@ -144,48 +144,21 @@ static void flush_gart(void) } #ifdef CONFIG_IOMMU_LEAK - -#define SET_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = __builtin_return_address(0);\ - } while (0) - -#define CLEAR_LEAK(x) \ - do { \ - if (iommu_leak_tab) \ - iommu_leak_tab[x] = NULL; \ - } while (0) - /* Debugging aid for drivers that don't free their IOMMU tables */ -static void **iommu_leak_tab; static int leak_trace; static int iommu_leak_pages = 20; static void dump_leak(void) { - int i; static int dump; - if (dump || !iommu_leak_tab) + if (dump) return; dump = 1; - show_stack(NULL, NULL); - /* Very crude. dump some from the end of the table too */ - printk(KERN_DEBUG "Dumping %d pages from end of IOMMU:\n", - iommu_leak_pages); - for (i = 0; i < iommu_leak_pages; i += 2) { - printk(KERN_DEBUG "%lu: ", iommu_pages-i); - printk_address((unsigned long) iommu_leak_tab[iommu_pages-i], - 0); - printk(KERN_CONT "%c", (i+1)%2 == 0 ? '\n' : ' '); - } - printk(KERN_DEBUG "\n"); + show_stack(NULL, NULL); + debug_dma_dump_mappings(NULL); } -#else -# define SET_LEAK(x) -# define CLEAR_LEAK(x) #endif static void iommu_full(struct device *dev, size_t size, int dir) @@ -248,7 +221,6 @@ static dma_addr_t dma_map_area(struct device *dev, dma_addr_t phys_mem, for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = GPTE_ENCODE(phys_mem); - SET_LEAK(iommu_page + i); phys_mem += PAGE_SIZE; } return iommu_bus_base + iommu_page*PAGE_SIZE + (phys_mem & ~PAGE_MASK); @@ -294,7 +266,6 @@ static void gart_unmap_page(struct device *dev, dma_addr_t dma_addr, npages = iommu_num_pages(dma_addr, size, PAGE_SIZE); for (i = 0; i < npages; i++) { iommu_gatt_base[iommu_page + i] = gart_unmapped_entry; - CLEAR_LEAK(iommu_page + i); } free_iommu(iommu_page, npages); } @@ -377,7 +348,6 @@ static int __dma_map_cont(struct device *dev, struct scatterlist *start, pages = iommu_num_pages(s->offset, s->length, PAGE_SIZE); while (pages--) { iommu_gatt_base[iommu_page] = GPTE_ENCODE(addr); - SET_LEAK(iommu_page); addr += PAGE_SIZE; iommu_page++; } @@ -801,11 +771,12 @@ void __init gart_iommu_init(void) #ifdef CONFIG_IOMMU_LEAK if (leak_trace) { - iommu_leak_tab = (void *)__get_free_pages(GFP_KERNEL|__GFP_ZERO, - get_order(iommu_pages*sizeof(void *))); - if (!iommu_leak_tab) + int ret; + + ret = dma_debug_resize_entries(iommu_pages); + if (ret) printk(KERN_DEBUG - "PCI-DMA: Cannot allocate leak trace area\n"); + "PCI-DMA: Cannot trace all the entries\n"); } #endif From 13318a7186d8e0ae08c996ea4111a945e7789772 Mon Sep 17 00:00:00 2001 From: Miao Xie Date: Wed, 15 Apr 2009 09:59:10 +0800 Subject: [PATCH 0391/2061] sched: use group_first_cpu() instead of cpumask_first(sched_group_cpus()) Impact: cleanup This patch changes cpumask_first(sched_group_cpus()) to group_first_cpu() for maintainability. Signed-off-by: Miao Xie Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- kernel/sched.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 92b4b56ad09..7601ceebf7c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7995,7 +7995,7 @@ static void init_numa_sched_groups_power(struct sched_group *group_head) struct sched_domain *sd; sd = &per_cpu(phys_domains, j).sd; - if (j != cpumask_first(sched_group_cpus(sd->groups))) { + if (j != group_first_cpu(sd->groups)) { /* * Only add "power" once for each * physical package. @@ -8073,7 +8073,7 @@ static void init_sched_groups_power(int cpu, struct sched_domain *sd) WARN_ON(!sd || !sd->groups); - if (cpu != cpumask_first(sched_group_cpus(sd->groups))) + if (cpu != group_first_cpu(sd->groups)) return; child = sd->child; From ff7b1b4f000cea84f071c1b6aa2918b2119d66f1 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 15 Apr 2009 16:55:05 +0100 Subject: [PATCH 0392/2061] perfcounters: export perf_tpcounter_event Needed for modular tracepoint support. Signed-off-by: Steven Whitehouse Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7f9521c3c01..09396098dd0 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2590,6 +2590,7 @@ void perf_tpcounter_event(int event_id) __perf_swcounter_event(PERF_TYPE_TRACEPOINT, event_id, 1, 1, regs, 0); } +EXPORT_SYMBOL_GPL(perf_tpcounter_event); extern int ftrace_profile_enable(int); extern void ftrace_profile_disable(int); From 0207a2efb43d81e29e23662b5d035945688a103f Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 16 Apr 2009 14:40:56 +0900 Subject: [PATCH 0393/2061] sh: Add support for SH7724 (SH-Mobile R2R) CPU subtype. This implements initial support for the SH-Mobile R2R CPU. Based on Rev 0.11 of the initial SH7724 hardware manual. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 10 + arch/sh/include/asm/processor.h | 2 +- arch/sh/include/cpu-sh4/cpu/freq.h | 18 + arch/sh/include/cpu-sh4/cpu/sh7724.h | 255 +++ arch/sh/kernel/cpu/sh4/probe.c | 6 + arch/sh/kernel/cpu/sh4a/Makefile | 3 + arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 112 +- arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c | 2230 +++++++++++++++++++++++ arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 371 ++++ arch/sh/kernel/setup.c | 3 +- arch/sh/oprofile/common.c | 1 + 11 files changed, 2997 insertions(+), 14 deletions(-) create mode 100644 arch/sh/include/cpu-sh4/cpu/sh7724.h create mode 100644 arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c create mode 100644 arch/sh/kernel/cpu/sh4a/setup-sh7724.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e7390dd0283..505d1acbd0a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -347,6 +347,15 @@ config CPU_SUBTYPE_SH7723 help Select SH7723 if you have an SH-MobileR2 CPU. +config CPU_SUBTYPE_SH7724 + bool "Support SH7724 processor" + select CPU_SH4A + select CPU_SHX2 + select ARCH_SPARSEMEM_ENABLE + select SYS_SUPPORTS_CMT + help + Select SH7724 if you have an SH-MobileR2R CPU. + config CPU_SUBTYPE_SH7763 bool "Support SH7763 processor" select CPU_SH4A @@ -495,6 +504,7 @@ config SH_PCLK_FREQ CPU_SUBTYPE_SH7203 || CPU_SUBTYPE_SH7206 || \ CPU_SUBTYPE_SH7263 || CPU_SUBTYPE_MXG || \ CPU_SUBTYPE_SH7786 + default "41666666" if CPU_SUBTYPE_SH7724 default "60000000" if CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R default "66000000" if CPU_SUBTYPE_SH4_202 default "50000000" diff --git a/arch/sh/include/asm/processor.h b/arch/sh/include/asm/processor.h index 1fd58b42143..005c962c8b1 100644 --- a/arch/sh/include/asm/processor.h +++ b/arch/sh/include/asm/processor.h @@ -32,7 +32,7 @@ enum cpu_type { /* SH-4A types */ CPU_SH7763, CPU_SH7770, CPU_SH7780, CPU_SH7781, CPU_SH7785, CPU_SH7786, - CPU_SH7723, CPU_SHX3, + CPU_SH7723, CPU_SH7724, CPU_SHX3, /* SH4AL-DSP types */ CPU_SH7343, CPU_SH7722, CPU_SH7366, diff --git a/arch/sh/include/cpu-sh4/cpu/freq.h b/arch/sh/include/cpu-sh4/cpu/freq.h index 749d1c43433..ccf1d999db6 100644 --- a/arch/sh/include/cpu-sh4/cpu/freq.h +++ b/arch/sh/include/cpu-sh4/cpu/freq.h @@ -25,6 +25,24 @@ #elif defined(CONFIG_CPU_SUBTYPE_SH7763) || \ defined(CONFIG_CPU_SUBTYPE_SH7780) #define FRQCR 0xffc80000 +#elif defined(CONFIG_CPU_SUBTYPE_SH7724) +#define FRQCRA 0xa4150000 +#define FRQCRB 0xa4150004 +#define VCLKCR 0xa4150048 + +#define FCLKACR 0xa4150008 +#define FCLKBCR 0xa415000c +#define FRQCR FRQCRA +#define SCLKACR FCLKACR +#define SCLKBCR FCLKBCR +#define FCLKACR 0xa4150008 +#define FCLKBCR 0xa415000c +#define IrDACLKCR 0xa4150018 + +#define MSTPCR0 0xa4150030 +#define MSTPCR1 0xa4150034 +#define MSTPCR2 0xa4150038 + #elif defined(CONFIG_CPU_SUBTYPE_SH7785) #define FRQCR0 0xffc80000 #define FRQCR1 0xffc80004 diff --git a/arch/sh/include/cpu-sh4/cpu/sh7724.h b/arch/sh/include/cpu-sh4/cpu/sh7724.h new file mode 100644 index 00000000000..34605c9e354 --- /dev/null +++ b/arch/sh/include/cpu-sh4/cpu/sh7724.h @@ -0,0 +1,255 @@ +#ifndef __ASM_SH7724_H__ +#define __ASM_SH7724_H__ + +enum { + /* PTA */ + GPIO_PTA7, GPIO_PTA6, GPIO_PTA5, GPIO_PTA4, + GPIO_PTA3, GPIO_PTA2, GPIO_PTA1, GPIO_PTA0, + + /* PTB */ + GPIO_PTB7, GPIO_PTB6, GPIO_PTB5, GPIO_PTB4, + GPIO_PTB3, GPIO_PTB2, GPIO_PTB1, GPIO_PTB0, + + /* PTC */ + GPIO_PTC7, GPIO_PTC6, GPIO_PTC5, GPIO_PTC4, + GPIO_PTC3, GPIO_PTC2, GPIO_PTC1, GPIO_PTC0, + + /* PTD */ + GPIO_PTD7, GPIO_PTD6, GPIO_PTD5, GPIO_PTD4, + GPIO_PTD3, GPIO_PTD2, GPIO_PTD1, GPIO_PTD0, + + /* PTE */ + GPIO_PTE7, GPIO_PTE6, GPIO_PTE5, GPIO_PTE4, + GPIO_PTE3, GPIO_PTE2, GPIO_PTE1, GPIO_PTE0, + + /* PTF */ + GPIO_PTF7, GPIO_PTF6, GPIO_PTF5, GPIO_PTF4, + GPIO_PTF3, GPIO_PTF2, GPIO_PTF1, GPIO_PTF0, + + /* PTG */ + GPIO_PTG5, GPIO_PTG4, + GPIO_PTG3, GPIO_PTG2, GPIO_PTG1, GPIO_PTG0, + + /* PTH */ + GPIO_PTH7, GPIO_PTH6, GPIO_PTH5, GPIO_PTH4, + GPIO_PTH3, GPIO_PTH2, GPIO_PTH1, GPIO_PTH0, + + /* PTJ */ + GPIO_PTJ7, GPIO_PTJ6, GPIO_PTJ5, + GPIO_PTJ3, GPIO_PTJ2, GPIO_PTJ1, GPIO_PTJ0, + + /* PTK */ + GPIO_PTK7, GPIO_PTK6, GPIO_PTK5, GPIO_PTK4, + GPIO_PTK3, GPIO_PTK2, GPIO_PTK1, GPIO_PTK0, + + /* PTL */ + GPIO_PTL7, GPIO_PTL6, GPIO_PTL5, GPIO_PTL4, + GPIO_PTL3, GPIO_PTL2, GPIO_PTL1, GPIO_PTL0, + + /* PTM */ + GPIO_PTM7, GPIO_PTM6, GPIO_PTM5, GPIO_PTM4, + GPIO_PTM3, GPIO_PTM2, GPIO_PTM1, GPIO_PTM0, + + /* PTN */ + GPIO_PTN7, GPIO_PTN6, GPIO_PTN5, GPIO_PTN4, + GPIO_PTN3, GPIO_PTN2, GPIO_PTN1, GPIO_PTN0, + + /* PTQ */ + GPIO_PTQ7, GPIO_PTQ6, GPIO_PTQ5, GPIO_PTQ4, + GPIO_PTQ3, GPIO_PTQ2, GPIO_PTQ1, GPIO_PTQ0, + + /* PTR */ + GPIO_PTR7, GPIO_PTR6, GPIO_PTR5, GPIO_PTR4, + GPIO_PTR3, GPIO_PTR2, GPIO_PTR1, GPIO_PTR0, + + /* PTS */ + GPIO_PTS6, GPIO_PTS5, GPIO_PTS4, + GPIO_PTS3, GPIO_PTS2, GPIO_PTS1, GPIO_PTS0, + + /* PTT */ + GPIO_PTT7, GPIO_PTT6, GPIO_PTT5, GPIO_PTT4, + GPIO_PTT3, GPIO_PTT2, GPIO_PTT1, GPIO_PTT0, + + /* PTU */ + GPIO_PTU7, GPIO_PTU6, GPIO_PTU5, GPIO_PTU4, + GPIO_PTU3, GPIO_PTU2, GPIO_PTU1, GPIO_PTU0, + + /* PTV */ + GPIO_PTV7, GPIO_PTV6, GPIO_PTV5, GPIO_PTV4, + GPIO_PTV3, GPIO_PTV2, GPIO_PTV1, GPIO_PTV0, + + /* PTW */ + GPIO_PTW7, GPIO_PTW6, GPIO_PTW5, GPIO_PTW4, + GPIO_PTW3, GPIO_PTW2, GPIO_PTW1, GPIO_PTW0, + + /* PTX */ + GPIO_PTX7, GPIO_PTX6, GPIO_PTX5, GPIO_PTX4, + GPIO_PTX3, GPIO_PTX2, GPIO_PTX1, GPIO_PTX0, + + /* PTY */ + GPIO_PTY7, GPIO_PTY6, GPIO_PTY5, GPIO_PTY4, + GPIO_PTY3, GPIO_PTY2, GPIO_PTY1, GPIO_PTY0, + + /* PTZ */ + GPIO_PTZ7, GPIO_PTZ6, GPIO_PTZ5, GPIO_PTZ4, + GPIO_PTZ3, GPIO_PTZ2, GPIO_PTZ1, GPIO_PTZ0, + + /* BSC (PTA/PTB/PTJ/PTQ/PTR/PTT) */ + GPIO_FN_D31, GPIO_FN_D30, GPIO_FN_D29, GPIO_FN_D28, + GPIO_FN_D27, GPIO_FN_D26, GPIO_FN_D25, GPIO_FN_D24, + GPIO_FN_D23, GPIO_FN_D22, GPIO_FN_D21, GPIO_FN_D20, + GPIO_FN_D19, GPIO_FN_D18, GPIO_FN_D17, GPIO_FN_D16, + GPIO_FN_D15, GPIO_FN_D14, GPIO_FN_D13, GPIO_FN_D12, + GPIO_FN_D11, GPIO_FN_D10, GPIO_FN_D9, GPIO_FN_D8, + GPIO_FN_D7, GPIO_FN_D6, GPIO_FN_D5, GPIO_FN_D4, + GPIO_FN_D3, GPIO_FN_D2, GPIO_FN_D1, GPIO_FN_D0, + GPIO_FN_A25, GPIO_FN_A24, GPIO_FN_A23, GPIO_FN_A22, + GPIO_FN_CS6B_CE1B, GPIO_FN_CS6A_CE2B, + GPIO_FN_CS5B_CE1A, GPIO_FN_CS5A_CE2A, + GPIO_FN_WE3_ICIOWR, GPIO_FN_WE2_ICIORD, + GPIO_FN_IOIS16, GPIO_FN_WAIT, + GPIO_FN_BS, + + /* KEYSC (PTA/PTB)*/ + GPIO_FN_KEYOUT5_IN5, GPIO_FN_KEYOUT4_IN6, GPIO_FN_KEYIN4, + GPIO_FN_KEYIN3, GPIO_FN_KEYIN2, GPIO_FN_KEYIN1, GPIO_FN_KEYIN0, + GPIO_FN_KEYOUT3, GPIO_FN_KEYOUT2, GPIO_FN_KEYOUT1, GPIO_FN_KEYOUT0, + + /* ATAPI (PTA/PTB/PTK/PTR/PTS/PTW) */ + GPIO_FN_IDED15, GPIO_FN_IDED14, GPIO_FN_IDED13, GPIO_FN_IDED12, + GPIO_FN_IDED11, GPIO_FN_IDED10, GPIO_FN_IDED9, GPIO_FN_IDED8, + GPIO_FN_IDED7, GPIO_FN_IDED6, GPIO_FN_IDED5, GPIO_FN_IDED4, + GPIO_FN_IDED3, GPIO_FN_IDED2, GPIO_FN_IDED1, GPIO_FN_IDED0, + GPIO_FN_IDEA2, GPIO_FN_IDEA1, GPIO_FN_IDEA0, GPIO_FN_IDEIOWR, + GPIO_FN_IODREQ, GPIO_FN_IDECS0, GPIO_FN_IDECS1, GPIO_FN_IDEIORD, + GPIO_FN_DIRECTION, GPIO_FN_EXBUF_ENB, GPIO_FN_IDERST, GPIO_FN_IODACK, + GPIO_FN_IDEINT, GPIO_FN_IDEIORDY, + + /* TPU (PTB/PTR/PTS) */ + GPIO_FN_TPUTO3, GPIO_FN_TPUTO2, GPIO_FN_TPUTO1, GPIO_FN_TPUTO0, + GPIO_FN_TPUTI3, GPIO_FN_TPUTI2, + + /* LCDC (PTC/PTD/PTE/PTF/PTM/PTR) */ + GPIO_FN_LCDD23, GPIO_FN_LCDD22, GPIO_FN_LCDD21, GPIO_FN_LCDD20, + GPIO_FN_LCDD19, GPIO_FN_LCDD18, GPIO_FN_LCDD17, GPIO_FN_LCDD16, + GPIO_FN_LCDD15, GPIO_FN_LCDD14, GPIO_FN_LCDD13, GPIO_FN_LCDD12, + GPIO_FN_LCDD11, GPIO_FN_LCDD10, GPIO_FN_LCDD9, GPIO_FN_LCDD8, + GPIO_FN_LCDD7, GPIO_FN_LCDD6, GPIO_FN_LCDD5, GPIO_FN_LCDD4, + GPIO_FN_LCDD3, GPIO_FN_LCDD2, GPIO_FN_LCDD1, GPIO_FN_LCDD0, + GPIO_FN_LCDVSYN, GPIO_FN_LCDDISP, GPIO_FN_LCDRS, GPIO_FN_LCDHSYN, + GPIO_FN_LCDCS, GPIO_FN_LCDDON, GPIO_FN_LCDDCK, GPIO_FN_LCDWR, + GPIO_FN_LCDVEPWC, GPIO_FN_LCDVCPWC, GPIO_FN_LCDRD, GPIO_FN_LCDLCLK, + + /* SCIF0 (PTF/PTM) */ + GPIO_FN_SCIF0_TXD, GPIO_FN_SCIF0_RXD, GPIO_FN_SCIF0_SCK, + + /* SCIF1 (PTL) */ + GPIO_FN_SCIF1_SCK, GPIO_FN_SCIF1_RXD, GPIO_FN_SCIF1_TXD, + + /* SCIF2 (PTE/PTF/PTN) with LCDC, VOU */ + GPIO_FN_SCIF2_L_TXD, GPIO_FN_SCIF2_L_SCK, GPIO_FN_SCIF2_L_RXD, + GPIO_FN_SCIF2_V_TXD, GPIO_FN_SCIF2_V_SCK, GPIO_FN_SCIF2_V_RXD, + + /* SCIF3 (PTL/PTN/PTZ) with VOU, IRQ */ + GPIO_FN_SCIF3_V_SCK, GPIO_FN_SCIF3_V_RXD, GPIO_FN_SCIF3_V_TXD, + GPIO_FN_SCIF3_V_CTS, GPIO_FN_SCIF3_V_RTS, + GPIO_FN_SCIF3_I_SCK, GPIO_FN_SCIF3_I_RXD, GPIO_FN_SCIF3_I_TXD, + GPIO_FN_SCIF3_I_CTS, GPIO_FN_SCIF3_I_RTS, + + /* SCIF4 (PTE) */ + GPIO_FN_SCIF4_SCK, GPIO_FN_SCIF4_RXD, GPIO_FN_SCIF4_TXD, + + /* SCIF5 (PTS) */ + GPIO_FN_SCIF5_SCK, GPIO_FN_SCIF5_RXD, GPIO_FN_SCIF5_TXD, + + /* FSI (PTE/PTU/PTV) */ + GPIO_FN_FSIMCKB, GPIO_FN_FSIMCKA, GPIO_FN_FSIOASD, + GPIO_FN_FSIIABCK, GPIO_FN_FSIIALRCK, GPIO_FN_FSIOABCK, + GPIO_FN_FSIOALRCK, GPIO_FN_CLKAUDIOAO, GPIO_FN_FSIIBSD, + GPIO_FN_FSIOBSD, GPIO_FN_FSIIBBCK, GPIO_FN_FSIIBLRCK, + GPIO_FN_FSIOBBCK, GPIO_FN_FSIOBLRCK, GPIO_FN_CLKAUDIOBO, + GPIO_FN_FSIIASD, + + /* AUD (PTG) */ + GPIO_FN_AUDCK, GPIO_FN_AUDSYNC, GPIO_FN_AUDATA3, + GPIO_FN_AUDATA2, GPIO_FN_AUDATA1, GPIO_FN_AUDATA0, + + /* VIO (PTS) (common?) */ + GPIO_FN_VIO_CKO, + + /* VIO0 (PTH/PTK) */ + GPIO_FN_VIO0_D15, GPIO_FN_VIO0_D14, GPIO_FN_VIO0_D13, GPIO_FN_VIO0_D12, + GPIO_FN_VIO0_D11, GPIO_FN_VIO0_D10, GPIO_FN_VIO0_D9, GPIO_FN_VIO0_D8, + GPIO_FN_VIO0_D7, GPIO_FN_VIO0_D6, GPIO_FN_VIO0_D5, GPIO_FN_VIO0_D4, + GPIO_FN_VIO0_D3, GPIO_FN_VIO0_D2, GPIO_FN_VIO0_D1, GPIO_FN_VIO0_D0, + GPIO_FN_VIO0_VD, GPIO_FN_VIO0_CLK, + GPIO_FN_VIO0_FLD, GPIO_FN_VIO0_HD, + + /* VIO1 (PTK/PTS) */ + GPIO_FN_VIO1_D7, GPIO_FN_VIO1_D6, GPIO_FN_VIO1_D5, GPIO_FN_VIO1_D4, + GPIO_FN_VIO1_D3, GPIO_FN_VIO1_D2, GPIO_FN_VIO1_D1, GPIO_FN_VIO1_D0, + GPIO_FN_VIO1_FLD, GPIO_FN_VIO1_HD, GPIO_FN_VIO1_VD, GPIO_FN_VIO1_CLK, + + /* Eth (PTL/PTN/PTX) */ + GPIO_FN_RMII_RXD0, GPIO_FN_RMII_RXD1, + GPIO_FN_RMII_TXD0, GPIO_FN_RMII_TXD1, + GPIO_FN_RMII_REF_CLK, GPIO_FN_RMII_TX_EN, + GPIO_FN_RMII_RX_ER, GPIO_FN_RMII_CRS_DV, + GPIO_FN_LNKSTA, GPIO_FN_MDIO, + GPIO_FN_MDC, + + /* System (PTJ) */ + GPIO_FN_PDSTATUS, GPIO_FN_STATUS2, GPIO_FN_STATUS0, + + /* VOU (PTL/PTM/PTN*/ + GPIO_FN_DV_D15, GPIO_FN_DV_D14, GPIO_FN_DV_D13, GPIO_FN_DV_D12, + GPIO_FN_DV_D11, GPIO_FN_DV_D10, GPIO_FN_DV_D9, GPIO_FN_DV_D8, + GPIO_FN_DV_D7, GPIO_FN_DV_D6, GPIO_FN_DV_D5, GPIO_FN_DV_D4, + GPIO_FN_DV_D3, GPIO_FN_DV_D2, GPIO_FN_DV_D1, GPIO_FN_DV_D0, + GPIO_FN_DV_CLKI, GPIO_FN_DV_CLK, GPIO_FN_DV_VSYNC, GPIO_FN_DV_HSYNC, + + /* MSIOF0 (PTL/PTM) */ + GPIO_FN_MSIOF0_RXD, GPIO_FN_MSIOF0_TXD, + GPIO_FN_MSIOF0_MCK, GPIO_FN_MSIOF0_TSCK, + GPIO_FN_MSIOF0_SS1, GPIO_FN_MSIOF0_SS2, + GPIO_FN_MSIOF0_TSYNC, GPIO_FN_MSIOF0_RSCK, + GPIO_FN_MSIOF0_RSYNC, + + /* MSIOF1 (PTV) */ + GPIO_FN_MSIOF1_RXD, GPIO_FN_MSIOF1_TXD, + GPIO_FN_MSIOF1_MCK, GPIO_FN_MSIOF1_TSCK, + GPIO_FN_MSIOF1_SS1, GPIO_FN_MSIOF1_SS2, + GPIO_FN_MSIOF1_TSYNC, GPIO_FN_MSIOF1_RSCK, + GPIO_FN_MSIOF1_RSYNC, + + /* DMAC (PTU/PTX) */ + GPIO_FN_DMAC_DACK0, GPIO_FN_DMAC_DREQ0, + GPIO_FN_DMAC_DACK1, GPIO_FN_DMAC_DREQ1, + + /* SDHI0 (PTY) */ + GPIO_FN_SDHI0CD, GPIO_FN_SDHI0WP, GPIO_FN_SDHI0CMD, GPIO_FN_SDHI0CLK, + GPIO_FN_SDHI0D3, GPIO_FN_SDHI0D2, GPIO_FN_SDHI0D1, GPIO_FN_SDHI0D0, + + /* SDHI1 (PTW) */ + GPIO_FN_SDHI1CD, GPIO_FN_SDHI1WP, GPIO_FN_SDHI1CMD, GPIO_FN_SDHI1CLK, + GPIO_FN_SDHI1D3, GPIO_FN_SDHI1D2, GPIO_FN_SDHI1D1, GPIO_FN_SDHI1D0, + + /* MMC (PTW/PTX)*/ + GPIO_FN_MMC_D7, GPIO_FN_MMC_D6, GPIO_FN_MMC_D5, GPIO_FN_MMC_D4, + GPIO_FN_MMC_D3, GPIO_FN_MMC_D2, GPIO_FN_MMC_D1, GPIO_FN_MMC_D0, + GPIO_FN_MMC_CLK, GPIO_FN_MMC_CMD, + + /* IrDA (PTX) */ + GPIO_FN_IRDA_OUT, GPIO_FN_IRDA_IN, + + /* TSIF (PTX) */ + GPIO_FN_TSIF_TS0_SDAT, GPIO_FN_TSIF_TS0_SCK, + GPIO_FN_TSIF_TS0_SDEN, GPIO_FN_TSIF_TS0_SPSYNC, + + /* IRQ (PTZ) */ + GPIO_FN_INTC_IRQ7, GPIO_FN_INTC_IRQ6, GPIO_FN_INTC_IRQ5, + GPIO_FN_INTC_IRQ4, GPIO_FN_INTC_IRQ3, GPIO_FN_INTC_IRQ2, + GPIO_FN_INTC_IRQ1, GPIO_FN_INTC_IRQ0, +}; + +#endif /* __ASM_SH7724_H__ */ diff --git a/arch/sh/kernel/cpu/sh4/probe.c b/arch/sh/kernel/cpu/sh4/probe.c index 91e3677ae09..973ff831c8a 100644 --- a/arch/sh/kernel/cpu/sh4/probe.c +++ b/arch/sh/kernel/cpu/sh4/probe.c @@ -156,6 +156,12 @@ int __init detect_cpu_and_cache_system(void) break; } break; + case 0x300b: + boot_cpu_data.type = CPU_SH7724; + boot_cpu_data.icache.ways = 4; + boot_cpu_data.dcache.ways = 4; + boot_cpu_data.flags |= CPU_HAS_LLSC | CPU_HAS_FPU; + break; case 0x4000: /* 1st cut */ case 0x4001: /* 2nd cut */ boot_cpu_data.type = CPU_SHX3; diff --git a/arch/sh/kernel/cpu/sh4a/Makefile b/arch/sh/kernel/cpu/sh4a/Makefile index 1a92361feeb..afd6fba4784 100644 --- a/arch/sh/kernel/cpu/sh4a/Makefile +++ b/arch/sh/kernel/cpu/sh4a/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7786) += setup-sh7786.o obj-$(CONFIG_CPU_SUBTYPE_SH7343) += setup-sh7343.o obj-$(CONFIG_CPU_SUBTYPE_SH7722) += setup-sh7722.o obj-$(CONFIG_CPU_SUBTYPE_SH7723) += setup-sh7723.o +obj-$(CONFIG_CPU_SUBTYPE_SH7724) += setup-sh7724.o obj-$(CONFIG_CPU_SUBTYPE_SH7366) += setup-sh7366.o obj-$(CONFIG_CPU_SUBTYPE_SHX3) += setup-shx3.o @@ -26,12 +27,14 @@ clock-$(CONFIG_CPU_SUBTYPE_SH7786) := clock-sh7786.o clock-$(CONFIG_CPU_SUBTYPE_SH7343) := clock-sh7722.o clock-$(CONFIG_CPU_SUBTYPE_SH7722) := clock-sh7722.o clock-$(CONFIG_CPU_SUBTYPE_SH7723) := clock-sh7722.o +clock-$(CONFIG_CPU_SUBTYPE_SH7724) := clock-sh7722.o clock-$(CONFIG_CPU_SUBTYPE_SH7366) := clock-sh7722.o clock-$(CONFIG_CPU_SUBTYPE_SHX3) := clock-shx3.o # Pinmux setup pinmux-$(CONFIG_CPU_SUBTYPE_SH7722) := pinmux-sh7722.o pinmux-$(CONFIG_CPU_SUBTYPE_SH7723) := pinmux-sh7723.o +pinmux-$(CONFIG_CPU_SUBTYPE_SH7724) := pinmux-sh7724.o pinmux-$(CONFIG_CPU_SUBTYPE_SH7785) := pinmux-sh7785.o pinmux-$(CONFIG_CPU_SUBTYPE_SH7786) := pinmux-sh7786.o diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 0e174af2187..1ccdfc561fe 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -130,6 +130,12 @@ static void adjust_clocks(int originate, int *l, unsigned long v[], * is quite simple.. */ +#if defined(CONFIG_CPU_SUBTYPE_SH7724) +#define STCPLL(frqcr) ((((frqcr >> 24) & 0x3f) + 1) * 2) +#else +#define STCPLL(frqcr) (((frqcr >> 24) & 0x1f) + 1) +#endif + /* * Instead of having two separate multipliers/divisors set, like this: * @@ -139,13 +145,17 @@ static void adjust_clocks(int originate, int *l, unsigned long v[], * I created the divisors2 array, which is used to calculate rate like * rate = parent * 2 / divisors2[ divisor ]; */ +#if defined(CONFIG_CPU_SUBTYPE_SH7724) +static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 }; +#else static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 }; +#endif static void master_clk_recalc(struct clk *clk) { unsigned frqcr = ctrl_inl(FRQCR); - clk->rate = CONFIG_SH_PCLK_FREQ * (((frqcr >> 24) & 0x1f) + 1); + clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr); } static void master_clk_init(struct clk *clk) @@ -161,13 +171,30 @@ static void module_clk_recalc(struct clk *clk) { unsigned long frqcr = ctrl_inl(FRQCR); - clk->rate = clk->parent->rate / (((frqcr >> 24) & 0x1f) + 1); + clk->rate = clk->parent->rate / STCPLL(frqcr); } +#if defined(CONFIG_CPU_SUBTYPE_SH7724) +#define MASTERDIVS { 12, 16, 24, 30, 32, 36, 48 } +#define STCMASK 0x3f +#define DIVCALC(div) (div/2-1) +#define FRQCRKICK 0x80000000 +#elif defined(CONFIG_CPU_SUBTYPE_SH7723) +#define MASTERDIVS { 6, 8, 12, 16 } +#define STCMASK 0x1f +#define DIVCALC(div) (div-1) +#define FRQCRKICK 0x00000000 +#else +#define MASTERDIVS { 2, 3, 4, 6, 8, 16 } +#define STCMASK 0x1f +#define DIVCALC(div) (div-1) +#define FRQCRKICK 0x00000000 +#endif + static int master_clk_setrate(struct clk *clk, unsigned long rate, int id) { int div = rate / clk->rate; - int master_divs[] = { 2, 3, 4, 6, 8, 16 }; + int master_divs[] = MASTERDIVS; int index; unsigned long frqcr; @@ -180,8 +207,9 @@ static int master_clk_setrate(struct clk *clk, unsigned long rate, int id) div = master_divs[index - 1]; frqcr = ctrl_inl(FRQCR); - frqcr &= ~(0xF << 24); - frqcr |= ( (div-1) << 24); + frqcr &= ~(STCMASK << 24); + frqcr |= (DIVCALC(div) << 24); + frqcr |= FRQCRKICK; ctrl_outl(frqcr, FRQCR); return 0; @@ -377,6 +405,7 @@ static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate, /* clear FRQCR bits */ frqcr &= ~(ctx.mask << ctx.shift); frqcr |= div << ctx.shift; + frqcr |= FRQCRKICK; /* ...and perform actual change */ ctrl_outl(frqcr, FRQCR); @@ -542,8 +571,8 @@ static struct clk sh7722_r_clock = { .flags = CLK_RATE_PROPAGATES, }; -#ifndef CONFIG_CPU_SUBTYPE_SH7343 - +#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\ + !defined(CONFIG_CPU_SUBTYPE_SH7724) /* * these three clocks - SIU A, SIU B, IrDA - share the same clk_ops * methods of clk_ops determine which register they should access by @@ -560,15 +589,16 @@ static struct clk sh7722_siu_b_clock = { .arch_flags = SCLKBCR, .ops = &sh7722_siu_clk_ops, }; +#endif /* CONFIG_CPU_SUBTYPE_SH7343, SH7724 */ -#if defined(CONFIG_CPU_SUBTYPE_SH7722) +#if defined(CONFIG_CPU_SUBTYPE_SH7722) ||\ + defined(CONFIG_CPU_SUBTYPE_SH7724) static struct clk sh7722_irda_clock = { .name = "irda_clk", .arch_flags = IrDACLKCR, .ops = &sh7722_siu_clk_ops, }; #endif -#endif /* CONFIG_CPU_SUBTYPE_SH7343 */ static struct clk sh7722_video_clock = { .name = "video_clk", @@ -715,6 +745,61 @@ static struct clk sh7722_mstpcr_clocks[] = { MSTPCR("vpu0", "bus_clk", 2, 1), MSTPCR("lcdc0", "bus_clk", 2, 0), #endif +#if defined(CONFIG_CPU_SUBTYPE_SH7724) + /* See Datasheet : Overview -> Block Diagram */ + MSTPCR("tlb0", "cpu_clk", 0, 31), + MSTPCR("ic0", "cpu_clk", 0, 30), + MSTPCR("oc0", "cpu_clk", 0, 29), + MSTPCR("rs0", "bus_clk", 0, 28), + MSTPCR("ilmem0", "cpu_clk", 0, 27), + MSTPCR("l2c0", "sh_clk", 0, 26), + MSTPCR("fpu0", "cpu_clk", 0, 24), + MSTPCR("intc0", "peripheral_clk", 0, 22), + MSTPCR("dmac0", "bus_clk", 0, 21), + MSTPCR("sh0", "sh_clk", 0, 20), + MSTPCR("hudi0", "peripheral_clk", 0, 19), + MSTPCR("ubc0", "cpu_clk", 0, 17), + MSTPCR("tmu0", "peripheral_clk", 0, 15), + MSTPCR("cmt0", "r_clk", 0, 14), + MSTPCR("rwdt0", "r_clk", 0, 13), + MSTPCR("dmac1", "bus_clk", 0, 12), + MSTPCR("tmu1", "peripheral_clk", 0, 10), + MSTPCR("scif0", "peripheral_clk", 0, 9), + MSTPCR("scif1", "peripheral_clk", 0, 8), + MSTPCR("scif2", "peripheral_clk", 0, 7), + MSTPCR("scif3", "bus_clk", 0, 6), + MSTPCR("scif4", "bus_clk", 0, 5), + MSTPCR("scif5", "bus_clk", 0, 4), + MSTPCR("msiof0", "bus_clk", 0, 2), + MSTPCR("msiof1", "bus_clk", 0, 1), + MSTPCR("keysc0", "r_clk", 1, 12), + MSTPCR("rtc0", "r_clk", 1, 11), + MSTPCR("i2c0", "peripheral_clk", 1, 9), + MSTPCR("i2c1", "peripheral_clk", 1, 8), + MSTPCR("mmc0", "bus_clk", 2, 29), + MSTPCR("eth0", "bus_clk", 2, 28), + MSTPCR("atapi0", "bus_clk", 2, 26), + MSTPCR("tpu0", "bus_clk", 2, 25), + MSTPCR("irda0", "peripheral_clk", 2, 24), + MSTPCR("tsif0", "bus_clk", 2, 22), + MSTPCR("usb1", "bus_clk", 2, 21), + MSTPCR("usb0", "bus_clk", 2, 20), + MSTPCR("2dg0", "bus_clk", 2, 19), + MSTPCR("sdhi0", "bus_clk", 2, 18), + MSTPCR("sdhi1", "bus_clk", 2, 17), + MSTPCR("veu1", "bus_clk", 2, 15), + MSTPCR("ceu1", "bus_clk", 2, 13), + MSTPCR("beu1", "bus_clk", 2, 12), + MSTPCR("2ddmac0", "sh_clk", 2, 10), + MSTPCR("spu0", "bus_clk", 2, 9), + MSTPCR("jpu0", "bus_clk", 2, 6), + MSTPCR("vou0", "bus_clk", 2, 5), + MSTPCR("beu0", "bus_clk", 2, 4), + MSTPCR("ceu0", "bus_clk", 2, 3), + MSTPCR("veu0", "bus_clk", 2, 2), + MSTPCR("vpu0", "bus_clk", 2, 1), + MSTPCR("lcdc0", "bus_clk", 2, 0), +#endif #if defined(CONFIG_CPU_SUBTYPE_SH7343) MSTPCR("uram0", "umem_clk", 0, 28), MSTPCR("xymem0", "bus_clk", 0, 26), @@ -786,12 +871,15 @@ static struct clk *sh7722_clocks[] = { &sh7722_sh_clock, &sh7722_peripheral_clock, &sh7722_sdram_clock, -#ifndef CONFIG_CPU_SUBTYPE_SH7343 +#if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\ + !defined(CONFIG_CPU_SUBTYPE_SH7724) &sh7722_siu_a_clock, &sh7722_siu_b_clock, -#if defined(CONFIG_CPU_SUBTYPE_SH7722) - &sh7722_irda_clock, #endif +/* 7724 should support FSI clock */ +#if defined(CONFIG_CPU_SUBTYPE_SH7722) || \ + defined(CONFIG_CPU_SUBTYPE_SH7724) + &sh7722_irda_clock, #endif &sh7722_video_clock, }; diff --git a/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c new file mode 100644 index 00000000000..1af0f958637 --- /dev/null +++ b/arch/sh/kernel/cpu/sh4a/pinmux-sh7724.c @@ -0,0 +1,2230 @@ +/* + * SH7724 Pinmux + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * Kuninori Morimoto + * + * Based on SH7723 Pinmux + * Copyright (C) 2008 Magnus Damm + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include + +enum { + PINMUX_RESERVED = 0, + + PINMUX_DATA_BEGIN, + PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA, + PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA, + PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA, + PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA, + PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA, + PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA, + PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA, + PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA, + PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA, + PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA, + PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA, + PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA, + PTG5_DATA, PTG4_DATA, + PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA, + PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA, + PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA, + PTJ7_DATA, PTJ6_DATA, PTJ5_DATA, + PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA, + PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA, + PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA, + PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA, + PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA, + PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA, + PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA, + PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA, + PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA, + PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA, + PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA, + PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA, + PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA, + PTS6_DATA, PTS5_DATA, PTS4_DATA, + PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA, + PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA, + PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA, + PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA, + PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA, + PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA, + PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA, + PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA, + PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA, + PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA, + PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA, + PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA, + PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA, + PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA, + PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA, + PINMUX_DATA_END, + + PINMUX_INPUT_BEGIN, + PTA7_IN, PTA6_IN, PTA5_IN, PTA4_IN, + PTA3_IN, PTA2_IN, PTA1_IN, PTA0_IN, + PTB7_IN, PTB6_IN, PTB5_IN, PTB4_IN, + PTB3_IN, PTB2_IN, PTB1_IN, PTB0_IN, + PTC7_IN, PTC6_IN, PTC5_IN, PTC4_IN, + PTC3_IN, PTC2_IN, PTC1_IN, PTC0_IN, + PTD7_IN, PTD6_IN, PTD5_IN, PTD4_IN, + PTD3_IN, PTD2_IN, PTD1_IN, PTD0_IN, + PTE7_IN, PTE6_IN, PTE5_IN, PTE4_IN, + PTE3_IN, PTE2_IN, PTE1_IN, PTE0_IN, + PTF7_IN, PTF6_IN, PTF5_IN, PTF4_IN, + PTF3_IN, PTF2_IN, PTF1_IN, PTF0_IN, + PTH7_IN, PTH6_IN, PTH5_IN, PTH4_IN, + PTH3_IN, PTH2_IN, PTH1_IN, PTH0_IN, + PTJ3_IN, PTJ2_IN, PTJ1_IN, PTJ0_IN, + PTK7_IN, PTK6_IN, PTK5_IN, PTK4_IN, + PTK3_IN, PTK2_IN, PTK1_IN, PTK0_IN, + PTL7_IN, PTL6_IN, PTL5_IN, PTL4_IN, + PTL3_IN, PTL2_IN, PTL1_IN, PTL0_IN, + PTM7_IN, PTM6_IN, PTM5_IN, PTM4_IN, + PTM3_IN, PTM2_IN, PTM1_IN, PTM0_IN, + PTN7_IN, PTN6_IN, PTN5_IN, PTN4_IN, + PTN3_IN, PTN2_IN, PTN1_IN, PTN0_IN, + PTQ7_IN, PTQ6_IN, PTQ5_IN, PTQ4_IN, + PTQ3_IN, PTQ2_IN, PTQ1_IN, PTQ0_IN, + PTR7_IN, PTR6_IN, PTR5_IN, PTR4_IN, + PTR3_IN, PTR2_IN, PTR1_IN, PTR0_IN, + PTS6_IN, PTS5_IN, PTS4_IN, + PTS3_IN, PTS2_IN, PTS1_IN, PTS0_IN, + PTT7_IN, PTT6_IN, PTT5_IN, PTT4_IN, + PTT3_IN, PTT2_IN, PTT1_IN, PTT0_IN, + PTU7_IN, PTU6_IN, PTU5_IN, PTU4_IN, + PTU3_IN, PTU2_IN, PTU1_IN, PTU0_IN, + PTV7_IN, PTV6_IN, PTV5_IN, PTV4_IN, + PTV3_IN, PTV2_IN, PTV1_IN, PTV0_IN, + PTW7_IN, PTW6_IN, PTW5_IN, PTW4_IN, + PTW3_IN, PTW2_IN, PTW1_IN, PTW0_IN, + PTX7_IN, PTX6_IN, PTX5_IN, PTX4_IN, + PTX3_IN, PTX2_IN, PTX1_IN, PTX0_IN, + PTY7_IN, PTY6_IN, PTY5_IN, PTY4_IN, + PTY3_IN, PTY2_IN, PTY1_IN, PTY0_IN, + PTZ7_IN, PTZ6_IN, PTZ5_IN, PTZ4_IN, + PTZ3_IN, PTZ2_IN, PTZ1_IN, PTZ0_IN, + PINMUX_INPUT_END, + + PINMUX_INPUT_PULLUP_BEGIN, + PTA7_IN_PU, PTA6_IN_PU, PTA5_IN_PU, PTA4_IN_PU, + PTA3_IN_PU, PTA2_IN_PU, PTA1_IN_PU, PTA0_IN_PU, + PTB7_IN_PU, PTB6_IN_PU, PTB5_IN_PU, PTB4_IN_PU, + PTB3_IN_PU, PTB2_IN_PU, PTB1_IN_PU, PTB0_IN_PU, + PTC7_IN_PU, PTC6_IN_PU, PTC5_IN_PU, PTC4_IN_PU, + PTC3_IN_PU, PTC2_IN_PU, PTC1_IN_PU, PTC0_IN_PU, + PTD7_IN_PU, PTD6_IN_PU, PTD5_IN_PU, PTD4_IN_PU, + PTD3_IN_PU, PTD2_IN_PU, PTD1_IN_PU, PTD0_IN_PU, + PTE7_IN_PU, PTE6_IN_PU, PTE5_IN_PU, PTE4_IN_PU, + PTE3_IN_PU, PTE2_IN_PU, PTE1_IN_PU, PTE0_IN_PU, + PTF7_IN_PU, PTF6_IN_PU, PTF5_IN_PU, PTF4_IN_PU, + PTF3_IN_PU, PTF2_IN_PU, PTF1_IN_PU, PTF0_IN_PU, + PTH7_IN_PU, PTH6_IN_PU, PTH5_IN_PU, PTH4_IN_PU, + PTH3_IN_PU, PTH2_IN_PU, PTH1_IN_PU, PTH0_IN_PU, + PTJ3_IN_PU, PTJ2_IN_PU, PTJ1_IN_PU, PTJ0_IN_PU, + PTK7_IN_PU, PTK6_IN_PU, PTK5_IN_PU, PTK4_IN_PU, + PTK3_IN_PU, PTK2_IN_PU, PTK1_IN_PU, PTK0_IN_PU, + PTL7_IN_PU, PTL6_IN_PU, PTL5_IN_PU, PTL4_IN_PU, + PTL3_IN_PU, PTL2_IN_PU, PTL1_IN_PU, PTL0_IN_PU, + PTM7_IN_PU, PTM6_IN_PU, PTM5_IN_PU, PTM4_IN_PU, + PTM3_IN_PU, PTM2_IN_PU, PTM1_IN_PU, PTM0_IN_PU, + PTN7_IN_PU, PTN6_IN_PU, PTN5_IN_PU, PTN4_IN_PU, + PTN3_IN_PU, PTN2_IN_PU, PTN1_IN_PU, PTN0_IN_PU, + PTQ7_IN_PU, PTQ6_IN_PU, PTQ5_IN_PU, PTQ4_IN_PU, + PTQ3_IN_PU, PTQ2_IN_PU, PTQ1_IN_PU, PTQ0_IN_PU, + PTR7_IN_PU, PTR6_IN_PU, PTR5_IN_PU, PTR4_IN_PU, + PTR3_IN_PU, PTR2_IN_PU, PTR1_IN_PU, PTR0_IN_PU, + PTS6_IN_PU, PTS5_IN_PU, PTS4_IN_PU, + PTS3_IN_PU, PTS2_IN_PU, PTS1_IN_PU, PTS0_IN_PU, + PTT7_IN_PU, PTT6_IN_PU, PTT5_IN_PU, PTT4_IN_PU, + PTT3_IN_PU, PTT2_IN_PU, PTT1_IN_PU, PTT0_IN_PU, + PTU7_IN_PU, PTU6_IN_PU, PTU5_IN_PU, PTU4_IN_PU, + PTU3_IN_PU, PTU2_IN_PU, PTU1_IN_PU, PTU0_IN_PU, + PTV7_IN_PU, PTV6_IN_PU, PTV5_IN_PU, PTV4_IN_PU, + PTV3_IN_PU, PTV2_IN_PU, PTV1_IN_PU, PTV0_IN_PU, + PTW7_IN_PU, PTW6_IN_PU, PTW5_IN_PU, PTW4_IN_PU, + PTW3_IN_PU, PTW2_IN_PU, PTW1_IN_PU, PTW0_IN_PU, + PTX7_IN_PU, PTX6_IN_PU, PTX5_IN_PU, PTX4_IN_PU, + PTX3_IN_PU, PTX2_IN_PU, PTX1_IN_PU, PTX0_IN_PU, + PTY7_IN_PU, PTY6_IN_PU, PTY5_IN_PU, PTY4_IN_PU, + PTY3_IN_PU, PTY2_IN_PU, PTY1_IN_PU, PTY0_IN_PU, + PTZ7_IN_PU, PTZ6_IN_PU, PTZ5_IN_PU, PTZ4_IN_PU, + PTZ3_IN_PU, PTZ2_IN_PU, PTZ1_IN_PU, PTZ0_IN_PU, + PINMUX_INPUT_PULLUP_END, + + PINMUX_OUTPUT_BEGIN, + PTA7_OUT, PTA6_OUT, PTA5_OUT, PTA4_OUT, + PTA3_OUT, PTA2_OUT, PTA1_OUT, PTA0_OUT, + PTB7_OUT, PTB6_OUT, PTB5_OUT, PTB4_OUT, + PTB3_OUT, PTB2_OUT, PTB1_OUT, PTB0_OUT, + PTC7_OUT, PTC6_OUT, PTC5_OUT, PTC4_OUT, + PTC3_OUT, PTC2_OUT, PTC1_OUT, PTC0_OUT, + PTD7_OUT, PTD6_OUT, PTD5_OUT, PTD4_OUT, + PTD3_OUT, PTD2_OUT, PTD1_OUT, PTD0_OUT, + PTE7_OUT, PTE6_OUT, PTE5_OUT, PTE4_OUT, + PTE3_OUT, PTE2_OUT, PTE1_OUT, PTE0_OUT, + PTF7_OUT, PTF6_OUT, PTF5_OUT, PTF4_OUT, + PTF3_OUT, PTF2_OUT, PTF1_OUT, PTF0_OUT, + PTG5_OUT, PTG4_OUT, + PTG3_OUT, PTG2_OUT, PTG1_OUT, PTG0_OUT, + PTH7_OUT, PTH6_OUT, PTH5_OUT, PTH4_OUT, + PTH3_OUT, PTH2_OUT, PTH1_OUT, PTH0_OUT, + PTJ7_OUT, PTJ6_OUT, PTJ5_OUT, + PTJ3_OUT, PTJ2_OUT, PTJ1_OUT, PTJ0_OUT, + PTK7_OUT, PTK6_OUT, PTK5_OUT, PTK4_OUT, + PTK3_OUT, PTK2_OUT, PTK1_OUT, PTK0_OUT, + PTL7_OUT, PTL6_OUT, PTL5_OUT, PTL4_OUT, + PTL3_OUT, PTL2_OUT, PTL1_OUT, PTL0_OUT, + PTM7_OUT, PTM6_OUT, PTM5_OUT, PTM4_OUT, + PTM3_OUT, PTM2_OUT, PTM1_OUT, PTM0_OUT, + PTN7_OUT, PTN6_OUT, PTN5_OUT, PTN4_OUT, + PTN3_OUT, PTN2_OUT, PTN1_OUT, PTN0_OUT, + PTQ7_OUT, PTQ6_OUT, PTQ5_OUT, PTQ4_OUT, + PTQ3_OUT, PTQ2_OUT, PTQ1_OUT, PTQ0_OUT, + PTR7_OUT, PTR6_OUT, PTR5_OUT, PTR4_OUT, + PTR1_OUT, PTR0_OUT, + PTS6_OUT, PTS5_OUT, PTS4_OUT, + PTS3_OUT, PTS2_OUT, PTS1_OUT, PTS0_OUT, + PTT7_OUT, PTT6_OUT, PTT5_OUT, PTT4_OUT, + PTT3_OUT, PTT2_OUT, PTT1_OUT, PTT0_OUT, + PTU7_OUT, PTU6_OUT, PTU5_OUT, PTU4_OUT, + PTU3_OUT, PTU2_OUT, PTU1_OUT, PTU0_OUT, + PTV7_OUT, PTV6_OUT, PTV5_OUT, PTV4_OUT, + PTV3_OUT, PTV2_OUT, PTV1_OUT, PTV0_OUT, + PTW7_OUT, PTW6_OUT, PTW5_OUT, PTW4_OUT, + PTW3_OUT, PTW2_OUT, PTW1_OUT, PTW0_OUT, + PTX7_OUT, PTX6_OUT, PTX5_OUT, PTX4_OUT, + PTX3_OUT, PTX2_OUT, PTX1_OUT, PTX0_OUT, + PTY7_OUT, PTY6_OUT, PTY5_OUT, PTY4_OUT, + PTY3_OUT, PTY2_OUT, PTY1_OUT, PTY0_OUT, + PTZ7_OUT, PTZ6_OUT, PTZ5_OUT, PTZ4_OUT, + PTZ3_OUT, PTZ2_OUT, PTZ1_OUT, PTZ0_OUT, + PINMUX_OUTPUT_END, + + PINMUX_FUNCTION_BEGIN, + PTA7_FN, PTA6_FN, PTA5_FN, PTA4_FN, + PTA3_FN, PTA2_FN, PTA1_FN, PTA0_FN, + PTB7_FN, PTB6_FN, PTB5_FN, PTB4_FN, + PTB3_FN, PTB2_FN, PTB1_FN, PTB0_FN, + PTC7_FN, PTC6_FN, PTC5_FN, PTC4_FN, + PTC3_FN, PTC2_FN, PTC1_FN, PTC0_FN, + PTD7_FN, PTD6_FN, PTD5_FN, PTD4_FN, + PTD3_FN, PTD2_FN, PTD1_FN, PTD0_FN, + PTE7_FN, PTE6_FN, PTE5_FN, PTE4_FN, + PTE3_FN, PTE2_FN, PTE1_FN, PTE0_FN, + PTF7_FN, PTF6_FN, PTF5_FN, PTF4_FN, + PTF3_FN, PTF2_FN, PTF1_FN, PTF0_FN, + PTG5_FN, PTG4_FN, + PTG3_FN, PTG2_FN, PTG1_FN, PTG0_FN, + PTH7_FN, PTH6_FN, PTH5_FN, PTH4_FN, + PTH3_FN, PTH2_FN, PTH1_FN, PTH0_FN, + PTJ7_FN, PTJ6_FN, PTJ5_FN, + PTJ3_FN, PTJ2_FN, PTJ1_FN, PTJ0_FN, + PTK7_FN, PTK6_FN, PTK5_FN, PTK4_FN, + PTK3_FN, PTK2_FN, PTK1_FN, PTK0_FN, + PTL7_FN, PTL6_FN, PTL5_FN, PTL4_FN, + PTL3_FN, PTL2_FN, PTL1_FN, PTL0_FN, + PTM7_FN, PTM6_FN, PTM5_FN, PTM4_FN, + PTM3_FN, PTM2_FN, PTM1_FN, PTM0_FN, + PTN7_FN, PTN6_FN, PTN5_FN, PTN4_FN, + PTN3_FN, PTN2_FN, PTN1_FN, PTN0_FN, + PTQ7_FN, PTQ6_FN, PTQ5_FN, PTQ4_FN, + PTQ3_FN, PTQ2_FN, PTQ1_FN, PTQ0_FN, + PTR7_FN, PTR6_FN, PTR5_FN, PTR4_FN, + PTR3_FN, PTR2_FN, PTR1_FN, PTR0_FN, + PTS6_FN, PTS5_FN, PTS4_FN, + PTS3_FN, PTS2_FN, PTS1_FN, PTS0_FN, + PTT7_FN, PTT6_FN, PTT5_FN, PTT4_FN, + PTT3_FN, PTT2_FN, PTT1_FN, PTT0_FN, + PTU7_FN, PTU6_FN, PTU5_FN, PTU4_FN, + PTU3_FN, PTU2_FN, PTU1_FN, PTU0_FN, + PTV7_FN, PTV6_FN, PTV5_FN, PTV4_FN, + PTV3_FN, PTV2_FN, PTV1_FN, PTV0_FN, + PTW7_FN, PTW6_FN, PTW5_FN, PTW4_FN, + PTW3_FN, PTW2_FN, PTW1_FN, PTW0_FN, + PTX7_FN, PTX6_FN, PTX5_FN, PTX4_FN, + PTX3_FN, PTX2_FN, PTX1_FN, PTX0_FN, + PTY7_FN, PTY6_FN, PTY5_FN, PTY4_FN, + PTY3_FN, PTY2_FN, PTY1_FN, PTY0_FN, + PTZ7_FN, PTZ6_FN, PTZ5_FN, PTZ4_FN, + PTZ3_FN, PTZ2_FN, PTZ1_FN, PTZ0_FN, + + + PSA15_0, PSA15_1, + PSA14_0, PSA14_1, + PSA13_0, PSA13_1, + PSA12_0, PSA12_1, + PSA10_0, PSA10_1, + PSA9_0, PSA9_1, + PSA8_0, PSA8_1, + PSA7_0, PSA7_1, + PSA6_0, PSA6_1, + PSA5_0, PSA5_1, + PSA3_0, PSA3_1, + PSA2_0, PSA2_1, + PSA1_0, PSA1_1, + PSA0_0, PSA0_1, + + PSB14_0, PSB14_1, + PSB13_0, PSB13_1, + PSB12_0, PSB12_1, + PSB11_0, PSB11_1, + PSB10_0, PSB10_1, + PSB9_0, PSB9_1, + PSB8_0, PSB8_1, + PSB7_0, PSB7_1, + PSB6_0, PSB6_1, + PSB5_0, PSB5_1, + PSB4_0, PSB4_1, + PSB3_0, PSB3_1, + PSB2_0, PSB2_1, + PSB1_0, PSB1_1, + PSB0_0, PSB0_1, + + PSC15_0, PSC15_1, + PSC14_0, PSC14_1, + PSC13_0, PSC13_1, + PSC12_0, PSC12_1, + PSC11_0, PSC11_1, + PSC10_0, PSC10_1, + PSC9_0, PSC9_1, + PSC8_0, PSC8_1, + PSC7_0, PSC7_1, + PSC6_0, PSC6_1, + PSC5_0, PSC5_1, + PSC4_0, PSC4_1, + PSC2_0, PSC2_1, + PSC1_0, PSC1_1, + PSC0_0, PSC0_1, + + PSD15_0, PSD15_1, + PSD14_0, PSD14_1, + PSD13_0, PSD13_1, + PSD12_0, PSD12_1, + PSD11_0, PSD11_1, + PSD10_0, PSD10_1, + PSD9_0, PSD9_1, + PSD8_0, PSD8_1, + PSD7_0, PSD7_1, + PSD6_0, PSD6_1, + PSD5_0, PSD5_1, + PSD4_0, PSD4_1, + PSD3_0, PSD3_1, + PSD2_0, PSD2_1, + PSD1_0, PSD1_1, + PSD0_0, PSD0_1, + + PSE15_0, PSE15_1, + PSE14_0, PSE14_1, + PSE13_0, PSE13_1, + PSE12_0, PSE12_1, + PSE11_0, PSE11_1, + PSE10_0, PSE10_1, + PSE9_0, PSE9_1, + PSE8_0, PSE8_1, + PSE7_0, PSE7_1, + PSE6_0, PSE6_1, + PSE5_0, PSE5_1, + PSE4_0, PSE4_1, + PSE3_0, PSE3_1, + PSE2_0, PSE2_1, + PSE1_0, PSE1_1, + PSE0_0, PSE0_1, + PINMUX_FUNCTION_END, + + PINMUX_MARK_BEGIN, + /*PTA*/ + D23_MARK, KEYOUT2_MARK, IDED15_MARK, + D22_MARK, KEYOUT1_MARK, IDED14_MARK, + D21_MARK, KEYOUT0_MARK, IDED13_MARK, + D20_MARK, KEYIN4_MARK, IDED12_MARK, + D19_MARK, KEYIN3_MARK, IDED11_MARK, + D18_MARK, KEYIN2_MARK, IDED10_MARK, + D17_MARK, KEYIN1_MARK, IDED9_MARK, + D16_MARK, KEYIN0_MARK, IDED8_MARK, + + /*PTB*/ + D31_MARK, TPUTO1_MARK, IDEA1_MARK, + D30_MARK, TPUTO0_MARK, IDEA0_MARK, + D29_MARK, IODREQ_MARK, + D28_MARK, IDECS0_MARK, + D27_MARK, IDECS1_MARK, + D26_MARK, KEYOUT5_IN5_MARK, IDEIORD_MARK, + D25_MARK, KEYOUT4_IN6_MARK, IDEIOWR_MARK, + D24_MARK, KEYOUT3_MARK, IDEINT_MARK, + + /*PTC*/ + LCDD7_MARK, + LCDD6_MARK, + LCDD5_MARK, + LCDD4_MARK, + LCDD3_MARK, + LCDD2_MARK, + LCDD1_MARK, + LCDD0_MARK, + + /*PTD*/ + LCDD15_MARK, + LCDD14_MARK, + LCDD13_MARK, + LCDD12_MARK, + LCDD11_MARK, + LCDD10_MARK, + LCDD9_MARK, + LCDD8_MARK, + + /*PTE*/ + FSIMCKB_MARK, + FSIMCKA_MARK, + LCDD21_MARK, SCIF2_L_TXD_MARK, + LCDD20_MARK, SCIF4_SCK_MARK, + LCDD19_MARK, SCIF4_RXD_MARK, + LCDD18_MARK, SCIF4_TXD_MARK, + LCDD17_MARK, + LCDD16_MARK, + + /*PTF*/ + LCDVSYN_MARK, + LCDDISP_MARK, LCDRS_MARK, + LCDHSYN_MARK, LCDCS_MARK, + LCDDON_MARK, + LCDDCK_MARK, LCDWR_MARK, + LCDVEPWC_MARK, SCIF0_TXD_MARK, + LCDD23_MARK, SCIF2_L_SCK_MARK, + LCDD22_MARK, SCIF2_L_RXD_MARK, + + /*PTG*/ + AUDCK_MARK, + AUDSYNC_MARK, + AUDATA3_MARK, + AUDATA2_MARK, + AUDATA1_MARK, + AUDATA0_MARK, + + /*PTH*/ + VIO0_VD_MARK, + VIO0_CLK_MARK, + VIO0_D7_MARK, + VIO0_D6_MARK, + VIO0_D5_MARK, + VIO0_D4_MARK, + VIO0_D3_MARK, + VIO0_D2_MARK, + + /*PTJ*/ + PDSTATUS_MARK, + STATUS2_MARK, + STATUS0_MARK, + A25_MARK, BS_MARK, + A24_MARK, + A23_MARK, + A22_MARK, + + /*PTK*/ + VIO1_D5_MARK, VIO0_D13_MARK, IDED5_MARK, + VIO1_D4_MARK, VIO0_D12_MARK, IDED4_MARK, + VIO1_D3_MARK, VIO0_D11_MARK, IDED3_MARK, + VIO1_D2_MARK, VIO0_D10_MARK, IDED2_MARK, + VIO1_D1_MARK, VIO0_D9_MARK, IDED1_MARK, + VIO1_D0_MARK, VIO0_D8_MARK, IDED0_MARK, + VIO0_FLD_MARK, + VIO0_HD_MARK, + + /*PTL*/ + DV_D5_MARK, SCIF3_V_SCK_MARK, RMII_RXD0_MARK, + DV_D4_MARK, SCIF3_V_RXD_MARK, RMII_RXD1_MARK, + DV_D3_MARK, SCIF3_V_TXD_MARK, RMII_REF_CLK_MARK, + DV_D2_MARK, SCIF1_SCK_MARK, RMII_TX_EN_MARK, + DV_D1_MARK, SCIF1_RXD_MARK, RMII_TXD0_MARK, + DV_D0_MARK, SCIF1_TXD_MARK, RMII_TXD1_MARK, + DV_D15_MARK, + DV_D14_MARK, MSIOF0_MCK_MARK, + + /*PTM*/ + DV_D13_MARK, MSIOF0_TSCK_MARK, + DV_D12_MARK, MSIOF0_RXD_MARK, + DV_D11_MARK, MSIOF0_TXD_MARK, + DV_D10_MARK, MSIOF0_TSYNC_MARK, + DV_D9_MARK, MSIOF0_SS1_MARK, MSIOF0_RSCK_MARK, + DV_D8_MARK, MSIOF0_SS2_MARK, MSIOF0_RSYNC_MARK, + LCDVCPWC_MARK, SCIF0_RXD_MARK, + LCDRD_MARK, SCIF0_SCK_MARK, + + /*PTN*/ + VIO0_D1_MARK, + VIO0_D0_MARK, + DV_CLKI_MARK, + DV_CLK_MARK, SCIF2_V_SCK_MARK, + DV_VSYNC_MARK, SCIF2_V_RXD_MARK, + DV_HSYNC_MARK, SCIF2_V_TXD_MARK, + DV_D7_MARK, SCIF3_V_CTS_MARK, RMII_RX_ER_MARK, + DV_D6_MARK, SCIF3_V_RTS_MARK, RMII_CRS_DV_MARK, + + /*PTQ*/ + D7_MARK, + D6_MARK, + D5_MARK, + D4_MARK, + D3_MARK, + D2_MARK, + D1_MARK, + D0_MARK, + + /*PTR*/ + CS6B_CE1B_MARK, + CS6A_CE2B_MARK, + CS5B_CE1A_MARK, + CS5A_CE2A_MARK, + IOIS16_MARK, LCDLCLK_MARK, + WAIT_MARK, + WE3_ICIOWR_MARK, TPUTO3_MARK, TPUTI3_MARK, + WE2_ICIORD_MARK, TPUTO2_MARK, IDEA2_MARK, + + /*PTS*/ + VIO_CKO_MARK, + VIO1_FLD_MARK, TPUTI2_MARK, IDEIORDY_MARK, + VIO1_HD_MARK, SCIF5_SCK_MARK, + VIO1_VD_MARK, SCIF5_RXD_MARK, + VIO1_CLK_MARK, SCIF5_TXD_MARK, + VIO1_D7_MARK, VIO0_D15_MARK, IDED7_MARK, + VIO1_D6_MARK, VIO0_D14_MARK, IDED6_MARK, + + /*PTT*/ + D15_MARK, + D14_MARK, + D13_MARK, + D12_MARK, + D11_MARK, + D10_MARK, + D9_MARK, + D8_MARK, + + /*PTU*/ + DMAC_DACK0_MARK, + DMAC_DREQ0_MARK, + FSIOASD_MARK, + FSIIABCK_MARK, + FSIIALRCK_MARK, + FSIOABCK_MARK, + FSIOALRCK_MARK, + CLKAUDIOAO_MARK, + + /*PTV*/ + FSIIBSD_MARK, MSIOF1_SS2_MARK, MSIOF1_RSYNC_MARK, + FSIOBSD_MARK, MSIOF1_SS1_MARK, MSIOF1_RSCK_MARK, + FSIIBBCK_MARK, MSIOF1_RXD_MARK, + FSIIBLRCK_MARK, MSIOF1_TSYNC_MARK, + FSIOBBCK_MARK, MSIOF1_TSCK_MARK, + FSIOBLRCK_MARK, MSIOF1_TXD_MARK, + CLKAUDIOBO_MARK, MSIOF1_MCK_MARK, + FSIIASD_MARK, + + /*PTW*/ + MMC_D7_MARK, SDHI1CD_MARK, IODACK_MARK, + MMC_D6_MARK, SDHI1WP_MARK, IDERST_MARK, + MMC_D5_MARK, SDHI1D3_MARK, EXBUF_ENB_MARK, + MMC_D4_MARK, SDHI1D2_MARK, DIRECTION_MARK, + MMC_D3_MARK, SDHI1D1_MARK, + MMC_D2_MARK, SDHI1D0_MARK, + MMC_D1_MARK, SDHI1CMD_MARK, + MMC_D0_MARK, SDHI1CLK_MARK, + + /*PTX*/ + DMAC_DACK1_MARK, IRDA_OUT_MARK, + DMAC_DREQ1_MARK, IRDA_IN_MARK, + TSIF_TS0_SDAT_MARK, LNKSTA_MARK, + TSIF_TS0_SCK_MARK, MDIO_MARK, + TSIF_TS0_SDEN_MARK, MDC_MARK, + TSIF_TS0_SPSYNC_MARK, + MMC_CLK_MARK, + MMC_CMD_MARK, + + /*PTY*/ + SDHI0CD_MARK, + SDHI0WP_MARK, + SDHI0D3_MARK, + SDHI0D2_MARK, + SDHI0D1_MARK, + SDHI0D0_MARK, + SDHI0CMD_MARK, + SDHI0CLK_MARK, + + /*PTZ*/ + INTC_IRQ7_MARK, SCIF3_I_CTS_MARK, + INTC_IRQ6_MARK, SCIF3_I_RTS_MARK, + INTC_IRQ5_MARK, SCIF3_I_SCK_MARK, + INTC_IRQ4_MARK, SCIF3_I_RXD_MARK, + INTC_IRQ3_MARK, SCIF3_I_TXD_MARK, + INTC_IRQ2_MARK, + INTC_IRQ1_MARK, + INTC_IRQ0_MARK, + PINMUX_MARK_END, +}; + +static pinmux_enum_t pinmux_data[] = { + /* PTA GPIO */ + PINMUX_DATA(PTA7_DATA, PTA7_IN, PTA7_OUT, PTA7_IN_PU), + PINMUX_DATA(PTA6_DATA, PTA6_IN, PTA6_OUT, PTA6_IN_PU), + PINMUX_DATA(PTA5_DATA, PTA5_IN, PTA5_OUT, PTA5_IN_PU), + PINMUX_DATA(PTA4_DATA, PTA4_IN, PTA4_OUT, PTA4_IN_PU), + PINMUX_DATA(PTA3_DATA, PTA3_IN, PTA3_OUT, PTA3_IN_PU), + PINMUX_DATA(PTA2_DATA, PTA2_IN, PTA2_OUT, PTA2_IN_PU), + PINMUX_DATA(PTA1_DATA, PTA1_IN, PTA1_OUT, PTA1_IN_PU), + PINMUX_DATA(PTA0_DATA, PTA0_IN, PTA0_OUT, PTA0_IN_PU), + + /* PTB GPIO */ + PINMUX_DATA(PTB7_DATA, PTB7_IN, PTB7_OUT, PTB7_IN_PU), + PINMUX_DATA(PTB6_DATA, PTB6_IN, PTB6_OUT, PTB6_IN_PU), + PINMUX_DATA(PTB5_DATA, PTB5_IN, PTB5_OUT, PTB5_IN_PU), + PINMUX_DATA(PTB4_DATA, PTB4_IN, PTB4_OUT, PTB4_IN_PU), + PINMUX_DATA(PTB3_DATA, PTB3_IN, PTB3_OUT, PTB3_IN_PU), + PINMUX_DATA(PTB2_DATA, PTB2_IN, PTB2_OUT, PTB2_IN_PU), + PINMUX_DATA(PTB1_DATA, PTB1_IN, PTB1_OUT, PTB1_IN_PU), + PINMUX_DATA(PTB0_DATA, PTB0_IN, PTB0_OUT, PTB0_IN_PU), + + /* PTC GPIO */ + PINMUX_DATA(PTC7_DATA, PTC7_IN, PTC7_OUT, PTC7_IN_PU), + PINMUX_DATA(PTC6_DATA, PTC6_IN, PTC6_OUT, PTC6_IN_PU), + PINMUX_DATA(PTC5_DATA, PTC5_IN, PTC5_OUT, PTC5_IN_PU), + PINMUX_DATA(PTC4_DATA, PTC4_IN, PTC4_OUT, PTC4_IN_PU), + PINMUX_DATA(PTC3_DATA, PTC3_IN, PTC3_OUT, PTC3_IN_PU), + PINMUX_DATA(PTC2_DATA, PTC2_IN, PTC2_OUT, PTC2_IN_PU), + PINMUX_DATA(PTC1_DATA, PTC1_IN, PTC1_OUT, PTC1_IN_PU), + PINMUX_DATA(PTC0_DATA, PTC0_IN, PTC0_OUT, PTC0_IN_PU), + + /* PTD GPIO */ + PINMUX_DATA(PTD7_DATA, PTD7_IN, PTD7_OUT, PTD7_IN_PU), + PINMUX_DATA(PTD6_DATA, PTD6_IN, PTD6_OUT, PTD6_IN_PU), + PINMUX_DATA(PTD5_DATA, PTD5_IN, PTD5_OUT, PTD5_IN_PU), + PINMUX_DATA(PTD4_DATA, PTD4_IN, PTD4_OUT, PTD4_IN_PU), + PINMUX_DATA(PTD3_DATA, PTD3_IN, PTD3_OUT, PTD3_IN_PU), + PINMUX_DATA(PTD2_DATA, PTD2_IN, PTD2_OUT, PTD2_IN_PU), + PINMUX_DATA(PTD1_DATA, PTD1_IN, PTD1_OUT, PTD1_IN_PU), + PINMUX_DATA(PTD0_DATA, PTD0_IN, PTD0_OUT, PTD0_IN_PU), + + /* PTE GPIO */ + PINMUX_DATA(PTE7_DATA, PTE7_IN, PTE7_OUT, PTE7_IN_PU), + PINMUX_DATA(PTE6_DATA, PTE6_IN, PTE6_OUT, PTE6_IN_PU), + PINMUX_DATA(PTE5_DATA, PTE5_IN, PTE5_OUT, PTE5_IN_PU), + PINMUX_DATA(PTE4_DATA, PTE4_IN, PTE4_OUT, PTE4_IN_PU), + PINMUX_DATA(PTE3_DATA, PTE3_IN, PTE3_OUT, PTE3_IN_PU), + PINMUX_DATA(PTE2_DATA, PTE2_IN, PTE2_OUT, PTE2_IN_PU), + PINMUX_DATA(PTE1_DATA, PTE1_IN, PTE1_OUT, PTE1_IN_PU), + PINMUX_DATA(PTE0_DATA, PTE0_IN, PTE0_OUT, PTE0_IN_PU), + + /* PTF GPIO */ + PINMUX_DATA(PTF7_DATA, PTF7_IN, PTF7_OUT, PTF7_IN_PU), + PINMUX_DATA(PTF6_DATA, PTF6_IN, PTF6_OUT, PTF6_IN_PU), + PINMUX_DATA(PTF5_DATA, PTF5_IN, PTF5_OUT, PTF5_IN_PU), + PINMUX_DATA(PTF4_DATA, PTF4_IN, PTF4_OUT, PTF4_IN_PU), + PINMUX_DATA(PTF3_DATA, PTF3_IN, PTF3_OUT, PTF3_IN_PU), + PINMUX_DATA(PTF2_DATA, PTF2_IN, PTF2_OUT, PTF2_IN_PU), + PINMUX_DATA(PTF1_DATA, PTF1_IN, PTF1_OUT, PTF1_IN_PU), + PINMUX_DATA(PTF0_DATA, PTF0_IN, PTF0_OUT, PTF0_IN_PU), + + /* PTG GPIO */ + PINMUX_DATA(PTG5_DATA, PTG5_OUT), + PINMUX_DATA(PTG4_DATA, PTG4_OUT), + PINMUX_DATA(PTG3_DATA, PTG3_OUT), + PINMUX_DATA(PTG2_DATA, PTG2_OUT), + PINMUX_DATA(PTG1_DATA, PTG1_OUT), + PINMUX_DATA(PTG0_DATA, PTG0_OUT), + + /* PTH GPIO */ + PINMUX_DATA(PTH7_DATA, PTH7_IN, PTH7_OUT, PTH7_IN_PU), + PINMUX_DATA(PTH6_DATA, PTH6_IN, PTH6_OUT, PTH6_IN_PU), + PINMUX_DATA(PTH5_DATA, PTH5_IN, PTH5_OUT, PTH5_IN_PU), + PINMUX_DATA(PTH4_DATA, PTH4_IN, PTH4_OUT, PTH4_IN_PU), + PINMUX_DATA(PTH3_DATA, PTH3_IN, PTH3_OUT, PTH3_IN_PU), + PINMUX_DATA(PTH2_DATA, PTH2_IN, PTH2_OUT, PTH2_IN_PU), + PINMUX_DATA(PTH1_DATA, PTH1_IN, PTH1_OUT, PTH1_IN_PU), + PINMUX_DATA(PTH0_DATA, PTH0_IN, PTH0_OUT, PTH0_IN_PU), + + /* PTJ GPIO */ + PINMUX_DATA(PTJ7_DATA, PTJ7_OUT), + PINMUX_DATA(PTJ6_DATA, PTJ6_OUT), + PINMUX_DATA(PTJ5_DATA, PTJ5_OUT), + PINMUX_DATA(PTJ3_DATA, PTJ3_IN, PTJ3_OUT, PTJ3_IN_PU), + PINMUX_DATA(PTJ2_DATA, PTJ2_IN, PTJ2_OUT, PTJ2_IN_PU), + PINMUX_DATA(PTJ1_DATA, PTJ1_IN, PTJ1_OUT, PTJ1_IN_PU), + PINMUX_DATA(PTJ0_DATA, PTJ0_IN, PTJ0_OUT, PTJ0_IN_PU), + + /* PTK GPIO */ + PINMUX_DATA(PTK7_DATA, PTK7_IN, PTK7_OUT, PTK7_IN_PU), + PINMUX_DATA(PTK6_DATA, PTK6_IN, PTK6_OUT, PTK6_IN_PU), + PINMUX_DATA(PTK5_DATA, PTK5_IN, PTK5_OUT, PTK5_IN_PU), + PINMUX_DATA(PTK4_DATA, PTK4_IN, PTK4_OUT, PTK4_IN_PU), + PINMUX_DATA(PTK3_DATA, PTK3_IN, PTK3_OUT, PTK3_IN_PU), + PINMUX_DATA(PTK2_DATA, PTK2_IN, PTK2_OUT, PTK2_IN_PU), + PINMUX_DATA(PTK1_DATA, PTK1_IN, PTK1_OUT, PTK1_IN_PU), + PINMUX_DATA(PTK0_DATA, PTK0_IN, PTK0_OUT, PTK0_IN_PU), + + /* PTL GPIO */ + PINMUX_DATA(PTL7_DATA, PTL7_IN, PTL7_OUT, PTL7_IN_PU), + PINMUX_DATA(PTL6_DATA, PTL6_IN, PTL6_OUT, PTL6_IN_PU), + PINMUX_DATA(PTL5_DATA, PTL5_IN, PTL5_OUT, PTL5_IN_PU), + PINMUX_DATA(PTL4_DATA, PTL4_IN, PTL4_OUT, PTL4_IN_PU), + PINMUX_DATA(PTL3_DATA, PTL3_IN, PTL3_OUT, PTL3_IN_PU), + PINMUX_DATA(PTL2_DATA, PTL2_IN, PTL2_OUT, PTL2_IN_PU), + PINMUX_DATA(PTL1_DATA, PTL1_IN, PTL1_OUT, PTL1_IN_PU), + PINMUX_DATA(PTL0_DATA, PTL0_IN, PTL0_OUT, PTL0_IN_PU), + + /* PTM GPIO */ + PINMUX_DATA(PTM7_DATA, PTM7_IN, PTM7_OUT, PTM7_IN_PU), + PINMUX_DATA(PTM6_DATA, PTM6_IN, PTM6_OUT, PTM6_IN_PU), + PINMUX_DATA(PTM5_DATA, PTM5_IN, PTM5_OUT, PTM5_IN_PU), + PINMUX_DATA(PTM4_DATA, PTM4_IN, PTM4_OUT, PTM4_IN_PU), + PINMUX_DATA(PTM3_DATA, PTM3_IN, PTM3_OUT, PTM3_IN_PU), + PINMUX_DATA(PTM2_DATA, PTM2_IN, PTM2_OUT, PTM2_IN_PU), + PINMUX_DATA(PTM1_DATA, PTM1_IN, PTM1_OUT, PTM1_IN_PU), + PINMUX_DATA(PTM0_DATA, PTM0_IN, PTM0_OUT, PTM0_IN_PU), + + /* PTN GPIO */ + PINMUX_DATA(PTN7_DATA, PTN7_IN, PTN7_OUT, PTN7_IN_PU), + PINMUX_DATA(PTN6_DATA, PTN6_IN, PTN6_OUT, PTN6_IN_PU), + PINMUX_DATA(PTN5_DATA, PTN5_IN, PTN5_OUT, PTN5_IN_PU), + PINMUX_DATA(PTN4_DATA, PTN4_IN, PTN4_OUT, PTN4_IN_PU), + PINMUX_DATA(PTN3_DATA, PTN3_IN, PTN3_OUT, PTN3_IN_PU), + PINMUX_DATA(PTN2_DATA, PTN2_IN, PTN2_OUT, PTN2_IN_PU), + PINMUX_DATA(PTN1_DATA, PTN1_IN, PTN1_OUT, PTN1_IN_PU), + PINMUX_DATA(PTN0_DATA, PTN0_IN, PTN0_OUT, PTN0_IN_PU), + + /* PTQ GPIO */ + PINMUX_DATA(PTQ7_DATA, PTQ7_IN, PTQ7_OUT, PTQ7_IN_PU), + PINMUX_DATA(PTQ6_DATA, PTQ6_IN, PTQ6_OUT, PTQ6_IN_PU), + PINMUX_DATA(PTQ5_DATA, PTQ5_IN, PTQ5_OUT, PTQ5_IN_PU), + PINMUX_DATA(PTQ4_DATA, PTQ4_IN, PTQ4_OUT, PTQ4_IN_PU), + PINMUX_DATA(PTQ3_DATA, PTQ3_IN, PTQ3_OUT, PTQ3_IN_PU), + PINMUX_DATA(PTQ2_DATA, PTQ2_IN, PTQ2_OUT, PTQ2_IN_PU), + PINMUX_DATA(PTQ1_DATA, PTQ1_IN, PTQ1_OUT, PTQ1_IN_PU), + PINMUX_DATA(PTQ0_DATA, PTQ0_IN, PTQ0_OUT, PTQ0_IN_PU), + + /* PTR GPIO */ + PINMUX_DATA(PTR7_DATA, PTR7_IN, PTR7_OUT, PTR7_IN_PU), + PINMUX_DATA(PTR6_DATA, PTR6_IN, PTR6_OUT, PTR6_IN_PU), + PINMUX_DATA(PTR5_DATA, PTR5_IN, PTR5_OUT, PTR5_IN_PU), + PINMUX_DATA(PTR4_DATA, PTR4_IN, PTR4_OUT, PTR4_IN_PU), + PINMUX_DATA(PTR3_DATA, PTR3_IN, PTR3_IN_PU), + PINMUX_DATA(PTR2_DATA, PTR2_IN, PTR2_IN_PU), + PINMUX_DATA(PTR1_DATA, PTR1_IN, PTR1_OUT, PTR1_IN_PU), + PINMUX_DATA(PTR0_DATA, PTR0_IN, PTR0_OUT, PTR0_IN_PU), + + /* PTS GPIO */ + PINMUX_DATA(PTS6_DATA, PTS6_IN, PTS6_OUT, PTS6_IN_PU), + PINMUX_DATA(PTS5_DATA, PTS5_IN, PTS5_OUT, PTS5_IN_PU), + PINMUX_DATA(PTS4_DATA, PTS4_IN, PTS4_OUT, PTS4_IN_PU), + PINMUX_DATA(PTS3_DATA, PTS3_IN, PTS3_OUT, PTS3_IN_PU), + PINMUX_DATA(PTS2_DATA, PTS2_IN, PTS2_OUT, PTS2_IN_PU), + PINMUX_DATA(PTS1_DATA, PTS1_IN, PTS1_OUT, PTS1_IN_PU), + PINMUX_DATA(PTS0_DATA, PTS0_IN, PTS0_OUT, PTS0_IN_PU), + + /* PTT GPIO */ + PINMUX_DATA(PTT7_DATA, PTT7_IN, PTT7_OUT, PTT7_IN_PU), + PINMUX_DATA(PTT6_DATA, PTT6_IN, PTT6_OUT, PTT6_IN_PU), + PINMUX_DATA(PTT5_DATA, PTT5_IN, PTT5_OUT, PTT5_IN_PU), + PINMUX_DATA(PTT4_DATA, PTT4_IN, PTT4_OUT, PTT4_IN_PU), + PINMUX_DATA(PTT3_DATA, PTT3_IN, PTT3_OUT, PTT3_IN_PU), + PINMUX_DATA(PTT2_DATA, PTT2_IN, PTT2_OUT, PTT2_IN_PU), + PINMUX_DATA(PTT1_DATA, PTT1_IN, PTT1_OUT, PTT1_IN_PU), + PINMUX_DATA(PTT0_DATA, PTT0_IN, PTT0_OUT, PTT0_IN_PU), + + /* PTU GPIO */ + PINMUX_DATA(PTU7_DATA, PTU7_IN, PTU7_OUT, PTU7_IN_PU), + PINMUX_DATA(PTU6_DATA, PTU6_IN, PTU6_OUT, PTU6_IN_PU), + PINMUX_DATA(PTU5_DATA, PTU5_IN, PTU5_OUT, PTU5_IN_PU), + PINMUX_DATA(PTU4_DATA, PTU4_IN, PTU4_OUT, PTU4_IN_PU), + PINMUX_DATA(PTU3_DATA, PTU3_IN, PTU3_OUT, PTU3_IN_PU), + PINMUX_DATA(PTU2_DATA, PTU2_IN, PTU2_OUT, PTU2_IN_PU), + PINMUX_DATA(PTU1_DATA, PTU1_IN, PTU1_OUT, PTU1_IN_PU), + PINMUX_DATA(PTU0_DATA, PTU0_IN, PTU0_OUT, PTU0_IN_PU), + + /* PTV GPIO */ + PINMUX_DATA(PTV7_DATA, PTV7_IN, PTV7_OUT, PTV7_IN_PU), + PINMUX_DATA(PTV6_DATA, PTV6_IN, PTV6_OUT, PTV6_IN_PU), + PINMUX_DATA(PTV5_DATA, PTV5_IN, PTV5_OUT, PTV5_IN_PU), + PINMUX_DATA(PTV4_DATA, PTV4_IN, PTV4_OUT, PTV4_IN_PU), + PINMUX_DATA(PTV3_DATA, PTV3_IN, PTV3_OUT, PTV3_IN_PU), + PINMUX_DATA(PTV2_DATA, PTV2_IN, PTV2_OUT, PTV2_IN_PU), + PINMUX_DATA(PTV1_DATA, PTV1_IN, PTV1_OUT, PTV1_IN_PU), + PINMUX_DATA(PTV0_DATA, PTV0_IN, PTV0_OUT, PTV0_IN_PU), + + /* PTW GPIO */ + PINMUX_DATA(PTW7_DATA, PTW7_IN, PTW7_OUT, PTW7_IN_PU), + PINMUX_DATA(PTW6_DATA, PTW6_IN, PTW6_OUT, PTW6_IN_PU), + PINMUX_DATA(PTW5_DATA, PTW5_IN, PTW5_OUT, PTW5_IN_PU), + PINMUX_DATA(PTW4_DATA, PTW4_IN, PTW4_OUT, PTW4_IN_PU), + PINMUX_DATA(PTW3_DATA, PTW3_IN, PTW3_OUT, PTW3_IN_PU), + PINMUX_DATA(PTW2_DATA, PTW2_IN, PTW2_OUT, PTW2_IN_PU), + PINMUX_DATA(PTW1_DATA, PTW1_IN, PTW1_OUT, PTW1_IN_PU), + PINMUX_DATA(PTW0_DATA, PTW0_IN, PTW0_OUT, PTW0_IN_PU), + + /* PTX GPIO */ + PINMUX_DATA(PTX7_DATA, PTX7_IN, PTX7_OUT, PTX7_IN_PU), + PINMUX_DATA(PTX6_DATA, PTX6_IN, PTX6_OUT, PTX6_IN_PU), + PINMUX_DATA(PTX5_DATA, PTX5_IN, PTX5_OUT, PTX5_IN_PU), + PINMUX_DATA(PTX4_DATA, PTX4_IN, PTX4_OUT, PTX4_IN_PU), + PINMUX_DATA(PTX3_DATA, PTX3_IN, PTX3_OUT, PTX3_IN_PU), + PINMUX_DATA(PTX2_DATA, PTX2_IN, PTX2_OUT, PTX2_IN_PU), + PINMUX_DATA(PTX1_DATA, PTX1_IN, PTX1_OUT, PTX1_IN_PU), + PINMUX_DATA(PTX0_DATA, PTX0_IN, PTX0_OUT, PTX0_IN_PU), + + /* PTY GPIO */ + PINMUX_DATA(PTY7_DATA, PTY7_IN, PTY7_OUT, PTY7_IN_PU), + PINMUX_DATA(PTY6_DATA, PTY6_IN, PTY6_OUT, PTY6_IN_PU), + PINMUX_DATA(PTY5_DATA, PTY5_IN, PTY5_OUT, PTY5_IN_PU), + PINMUX_DATA(PTY4_DATA, PTY4_IN, PTY4_OUT, PTY4_IN_PU), + PINMUX_DATA(PTY3_DATA, PTY3_IN, PTY3_OUT, PTY3_IN_PU), + PINMUX_DATA(PTY2_DATA, PTY2_IN, PTY2_OUT, PTY2_IN_PU), + PINMUX_DATA(PTY1_DATA, PTY1_IN, PTY1_OUT, PTY1_IN_PU), + PINMUX_DATA(PTY0_DATA, PTY0_IN, PTY0_OUT, PTY0_IN_PU), + + /* PTZ GPIO */ + PINMUX_DATA(PTZ7_DATA, PTZ7_IN, PTZ7_OUT, PTZ7_IN_PU), + PINMUX_DATA(PTZ6_DATA, PTZ6_IN, PTZ6_OUT, PTZ6_IN_PU), + PINMUX_DATA(PTZ5_DATA, PTZ5_IN, PTZ5_OUT, PTZ5_IN_PU), + PINMUX_DATA(PTZ4_DATA, PTZ4_IN, PTZ4_OUT, PTZ4_IN_PU), + PINMUX_DATA(PTZ3_DATA, PTZ3_IN, PTZ3_OUT, PTZ3_IN_PU), + PINMUX_DATA(PTZ2_DATA, PTZ2_IN, PTZ2_OUT, PTZ2_IN_PU), + PINMUX_DATA(PTZ1_DATA, PTZ1_IN, PTZ1_OUT, PTZ1_IN_PU), + PINMUX_DATA(PTZ0_DATA, PTZ0_IN, PTZ0_OUT, PTZ0_IN_PU), + + /* PTA FN */ + PINMUX_DATA(D23_MARK, PSA15_0, PSA14_0, PTA7_FN), + PINMUX_DATA(D22_MARK, PSA15_0, PSA14_0, PTA6_FN), + PINMUX_DATA(D21_MARK, PSA15_0, PSA14_0, PTA5_FN), + PINMUX_DATA(D20_MARK, PSA15_0, PSA14_0, PTA4_FN), + PINMUX_DATA(D19_MARK, PSA15_0, PSA14_0, PTA3_FN), + PINMUX_DATA(D18_MARK, PSA15_0, PSA14_0, PTA2_FN), + PINMUX_DATA(D17_MARK, PSA15_0, PSA14_0, PTA1_FN), + PINMUX_DATA(D16_MARK, PSA15_0, PSA14_0, PTA0_FN), + + PINMUX_DATA(KEYOUT2_MARK, PSA15_0, PSA14_1, PTA7_FN), + PINMUX_DATA(KEYOUT1_MARK, PSA15_0, PSA14_1, PTA6_FN), + PINMUX_DATA(KEYOUT0_MARK, PSA15_0, PSA14_1, PTA5_FN), + PINMUX_DATA(KEYIN4_MARK, PSA15_0, PSA14_1, PTA4_FN), + PINMUX_DATA(KEYIN3_MARK, PSA15_0, PSA14_1, PTA3_FN), + PINMUX_DATA(KEYIN2_MARK, PSA15_0, PSA14_1, PTA2_FN), + PINMUX_DATA(KEYIN1_MARK, PSA15_0, PSA14_1, PTA1_FN), + PINMUX_DATA(KEYIN0_MARK, PSA15_0, PSA14_1, PTA0_FN), + + PINMUX_DATA(IDED15_MARK, PSA15_1, PSA14_0, PTA7_FN), + PINMUX_DATA(IDED14_MARK, PSA15_1, PSA14_0, PTA6_FN), + PINMUX_DATA(IDED13_MARK, PSA15_1, PSA14_0, PTA5_FN), + PINMUX_DATA(IDED12_MARK, PSA15_1, PSA14_0, PTA4_FN), + PINMUX_DATA(IDED11_MARK, PSA15_1, PSA14_0, PTA3_FN), + PINMUX_DATA(IDED10_MARK, PSA15_1, PSA14_0, PTA2_FN), + PINMUX_DATA(IDED9_MARK, PSA15_1, PSA14_0, PTA1_FN), + PINMUX_DATA(IDED8_MARK, PSA15_1, PSA14_0, PTA0_FN), + + /* PTB FN */ + PINMUX_DATA(D31_MARK, PSE15_0, PSE14_0, PTB7_FN), + PINMUX_DATA(D30_MARK, PSE15_0, PSE14_0, PTB6_FN), + PINMUX_DATA(D29_MARK, PSE11_0, PTB5_FN), + PINMUX_DATA(D28_MARK, PSE11_0, PTB4_FN), + PINMUX_DATA(D27_MARK, PSE11_0, PTB3_FN), + PINMUX_DATA(D26_MARK, PSA15_0, PSA14_0, PTB2_FN), + PINMUX_DATA(D25_MARK, PSA15_0, PSA14_0, PTB1_FN), + PINMUX_DATA(D24_MARK, PSA15_0, PSA14_0, PTB0_FN), + + PINMUX_DATA(IDEA1_MARK, PSE15_1, PSE14_0, PTB7_FN), + PINMUX_DATA(IDEA0_MARK, PSE15_1, PSE14_0, PTB6_FN), + PINMUX_DATA(IODREQ_MARK, PSE11_1, PTB5_FN), + PINMUX_DATA(IDECS0_MARK, PSE11_1, PTB4_FN), + PINMUX_DATA(IDECS1_MARK, PSE11_1, PTB3_FN), + PINMUX_DATA(IDEIORD_MARK, PSA15_1, PSA14_0, PTB2_FN), + PINMUX_DATA(IDEIOWR_MARK, PSA15_1, PSA14_0, PTB1_FN), + PINMUX_DATA(IDEINT_MARK, PSA15_1, PSA14_0, PTB0_FN), + + PINMUX_DATA(TPUTO1_MARK, PSE15_0, PSE14_1, PTB7_FN), + PINMUX_DATA(TPUTO0_MARK, PSE15_0, PSE14_1, PTB6_FN), + + PINMUX_DATA(KEYOUT5_IN5_MARK, PSA15_0, PSA14_1, PTB2_FN), + PINMUX_DATA(KEYOUT4_IN6_MARK, PSA15_0, PSA14_1, PTB1_FN), + PINMUX_DATA(KEYOUT3_MARK, PSA15_0, PSA14_1, PTB0_FN), + + /* PTC FN */ + PINMUX_DATA(LCDD7_MARK, PSD5_0, PTC7_FN), + PINMUX_DATA(LCDD6_MARK, PSD5_0, PTC6_FN), + PINMUX_DATA(LCDD5_MARK, PSD5_0, PTC5_FN), + PINMUX_DATA(LCDD4_MARK, PSD5_0, PTC4_FN), + PINMUX_DATA(LCDD3_MARK, PSD5_0, PTC3_FN), + PINMUX_DATA(LCDD2_MARK, PSD5_0, PTC2_FN), + PINMUX_DATA(LCDD1_MARK, PSD5_0, PTC1_FN), + PINMUX_DATA(LCDD0_MARK, PSD5_0, PTC0_FN), + + /* PTD FN */ + PINMUX_DATA(LCDD15_MARK, PSD5_0, PTD7_FN), + PINMUX_DATA(LCDD14_MARK, PSD5_0, PTD6_FN), + PINMUX_DATA(LCDD13_MARK, PSD5_0, PTD5_FN), + PINMUX_DATA(LCDD12_MARK, PSD5_0, PTD4_FN), + PINMUX_DATA(LCDD11_MARK, PSD5_0, PTD3_FN), + PINMUX_DATA(LCDD10_MARK, PSD5_0, PTD2_FN), + PINMUX_DATA(LCDD9_MARK, PSD5_0, PTD1_FN), + PINMUX_DATA(LCDD8_MARK, PSD5_0, PTD0_FN), + + /* PTE FN */ + PINMUX_DATA(FSIMCKB_MARK, PTE7_FN), + PINMUX_DATA(FSIMCKA_MARK, PTE6_FN), + + PINMUX_DATA(LCDD21_MARK, PSC5_0, PSC4_0, PTE5_FN), + PINMUX_DATA(LCDD20_MARK, PSD3_0, PSD2_0, PTE4_FN), + PINMUX_DATA(LCDD19_MARK, PSA3_0, PSA2_0, PTE3_FN), + PINMUX_DATA(LCDD18_MARK, PSA3_0, PSA2_0, PTE2_FN), + PINMUX_DATA(LCDD17_MARK, PSD5_0, PTE1_FN), + PINMUX_DATA(LCDD16_MARK, PSD5_0, PTE0_FN), + + PINMUX_DATA(SCIF2_L_TXD_MARK, PSC5_0, PSC4_1, PTE5_FN), + PINMUX_DATA(SCIF4_SCK_MARK, PSD3_0, PSD2_1, PTE4_FN), + PINMUX_DATA(SCIF4_RXD_MARK, PSA3_0, PSA2_1, PTE3_FN), + PINMUX_DATA(SCIF4_TXD_MARK, PSA3_0, PSA2_1, PTE2_FN), + + /* PTF FN */ + PINMUX_DATA(LCDVSYN_MARK, PSD8_0, PTF7_FN), + PINMUX_DATA(LCDDISP_MARK, PSD10_0, PSD9_0, PTF6_FN), + PINMUX_DATA(LCDHSYN_MARK, PSD10_0, PSD9_0, PTF5_FN), + PINMUX_DATA(LCDDON_MARK, PSD8_0, PTF4_FN), + PINMUX_DATA(LCDDCK_MARK, PSD10_0, PSD9_0, PTF3_FN), + PINMUX_DATA(LCDVEPWC_MARK, PSA6_0, PTF2_FN), + PINMUX_DATA(LCDD23_MARK, PSC7_0, PSC6_0, PTF1_FN), + PINMUX_DATA(LCDD22_MARK, PSC5_0, PSC4_0, PTF0_FN), + + PINMUX_DATA(LCDRS_MARK, PSD10_0, PSD9_1, PTF6_FN), + PINMUX_DATA(LCDCS_MARK, PSD10_0, PSD9_1, PTF5_FN), + PINMUX_DATA(LCDWR_MARK, PSD10_0, PSD9_1, PTF3_FN), + + PINMUX_DATA(SCIF0_TXD_MARK, PSA6_1, PTF2_FN), + PINMUX_DATA(SCIF2_L_SCK_MARK, PSC7_0, PSC6_1, PTF1_FN), + PINMUX_DATA(SCIF2_L_RXD_MARK, PSC5_0, PSC4_1, PTF0_FN), + + /* PTG FN */ + PINMUX_DATA(AUDCK_MARK, PTG5_FN), + PINMUX_DATA(AUDSYNC_MARK, PTG4_FN), + PINMUX_DATA(AUDATA3_MARK, PTG3_FN), + PINMUX_DATA(AUDATA2_MARK, PTG2_FN), + PINMUX_DATA(AUDATA1_MARK, PTG1_FN), + PINMUX_DATA(AUDATA0_MARK, PTG0_FN), + + /* PTH FN */ + PINMUX_DATA(VIO0_VD_MARK, PTH7_FN), + PINMUX_DATA(VIO0_CLK_MARK, PTH6_FN), + PINMUX_DATA(VIO0_D7_MARK, PTH5_FN), + PINMUX_DATA(VIO0_D6_MARK, PTH4_FN), + PINMUX_DATA(VIO0_D5_MARK, PTH3_FN), + PINMUX_DATA(VIO0_D4_MARK, PTH2_FN), + PINMUX_DATA(VIO0_D3_MARK, PTH1_FN), + PINMUX_DATA(VIO0_D2_MARK, PTH0_FN), + + /* PTJ FN */ + PINMUX_DATA(PDSTATUS_MARK, PTJ7_FN), + PINMUX_DATA(STATUS2_MARK, PTJ6_FN), + PINMUX_DATA(STATUS0_MARK, PTJ5_FN), + PINMUX_DATA(A25_MARK, PSA8_0, PTJ3_FN), + PINMUX_DATA(BS_MARK, PSA8_1, PTJ3_FN), + PINMUX_DATA(A24_MARK, PTJ2_FN), + PINMUX_DATA(A23_MARK, PTJ1_FN), + PINMUX_DATA(A22_MARK, PTJ0_FN), + + /* PTK FN */ + PINMUX_DATA(VIO1_D5_MARK, PSB7_0, PSB6_0, PTK7_FN), + PINMUX_DATA(VIO1_D4_MARK, PSB7_0, PSB6_0, PTK6_FN), + PINMUX_DATA(VIO1_D3_MARK, PSB7_0, PSB6_0, PTK5_FN), + PINMUX_DATA(VIO1_D2_MARK, PSB7_0, PSB6_0, PTK4_FN), + PINMUX_DATA(VIO1_D1_MARK, PSB7_0, PSB6_0, PTK3_FN), + PINMUX_DATA(VIO1_D0_MARK, PSB7_0, PSB6_0, PTK2_FN), + + PINMUX_DATA(VIO0_D13_MARK, PSB7_0, PSB6_1, PTK7_FN), + PINMUX_DATA(VIO0_D12_MARK, PSB7_0, PSB6_1, PTK6_FN), + PINMUX_DATA(VIO0_D11_MARK, PSB7_0, PSB6_1, PTK5_FN), + PINMUX_DATA(VIO0_D10_MARK, PSB7_0, PSB6_1, PTK4_FN), + PINMUX_DATA(VIO0_D9_MARK, PSB7_0, PSB6_1, PTK3_FN), + PINMUX_DATA(VIO0_D8_MARK, PSB7_0, PSB6_1, PTK2_FN), + + PINMUX_DATA(IDED5_MARK, PSB7_1, PSB6_0, PTK7_FN), + PINMUX_DATA(IDED4_MARK, PSB7_1, PSB6_0, PTK6_FN), + PINMUX_DATA(IDED3_MARK, PSB7_1, PSB6_0, PTK5_FN), + PINMUX_DATA(IDED2_MARK, PSB7_1, PSB6_0, PTK4_FN), + PINMUX_DATA(IDED1_MARK, PSB7_1, PSB6_0, PTK3_FN), + PINMUX_DATA(IDED0_MARK, PSB7_1, PSB6_0, PTK2_FN), + + PINMUX_DATA(VIO0_FLD_MARK, PTK1_FN), + PINMUX_DATA(VIO0_HD_MARK, PTK0_FN), + + /* PTL FN */ + PINMUX_DATA(DV_D5_MARK, PSB9_0, PSB8_0, PTL7_FN), + PINMUX_DATA(DV_D4_MARK, PSB9_0, PSB8_0, PTL6_FN), + PINMUX_DATA(DV_D3_MARK, PSE7_0, PSE6_0, PTL5_FN), + PINMUX_DATA(DV_D2_MARK, PSC9_0, PSC8_0, PTL4_FN), + PINMUX_DATA(DV_D1_MARK, PSC9_0, PSC8_0, PTL3_FN), + PINMUX_DATA(DV_D0_MARK, PSC9_0, PSC8_0, PTL2_FN), + PINMUX_DATA(DV_D15_MARK, PSD4_0, PTL1_FN), + PINMUX_DATA(DV_D14_MARK, PSE5_0, PSE4_0, PTL0_FN), + + PINMUX_DATA(SCIF3_V_SCK_MARK, PSB9_0, PSB8_1, PTL7_FN), + PINMUX_DATA(SCIF3_V_RXD_MARK, PSB9_0, PSB8_1, PTL6_FN), + PINMUX_DATA(SCIF3_V_TXD_MARK, PSE7_0, PSE6_1, PTL5_FN), + PINMUX_DATA(SCIF1_SCK_MARK, PSC9_0, PSC8_1, PTL4_FN), + PINMUX_DATA(SCIF1_RXD_MARK, PSC9_0, PSC8_1, PTL3_FN), + PINMUX_DATA(SCIF1_TXD_MARK, PSC9_0, PSC8_1, PTL2_FN), + + PINMUX_DATA(RMII_RXD0_MARK, PSB9_1, PSB8_0, PTL7_FN), + PINMUX_DATA(RMII_RXD1_MARK, PSB9_1, PSB8_0, PTL6_FN), + PINMUX_DATA(RMII_REF_CLK_MARK, PSE7_1, PSE6_0, PTL5_FN), + PINMUX_DATA(RMII_TX_EN_MARK, PSC9_1, PSC8_0, PTL4_FN), + PINMUX_DATA(RMII_TXD0_MARK, PSC9_1, PSC8_0, PTL3_FN), + PINMUX_DATA(RMII_TXD1_MARK, PSC9_1, PSC8_0, PTL2_FN), + + PINMUX_DATA(MSIOF0_MCK_MARK, PSE5_0, PSE4_1, PTL0_FN), + + /* PTM FN */ + PINMUX_DATA(DV_D13_MARK, PSC13_0, PSC12_0, PTM7_FN), + PINMUX_DATA(DV_D12_MARK, PSC13_0, PSC12_0, PTM6_FN), + PINMUX_DATA(DV_D11_MARK, PSC13_0, PSC12_0, PTM5_FN), + PINMUX_DATA(DV_D10_MARK, PSC13_0, PSC12_0, PTM4_FN), + PINMUX_DATA(DV_D9_MARK, PSC11_0, PSC10_0, PTM3_FN), + PINMUX_DATA(DV_D8_MARK, PSC11_0, PSC10_0, PTM2_FN), + + PINMUX_DATA(MSIOF0_TSCK_MARK, PSC13_0, PSC12_1, PTM7_FN), + PINMUX_DATA(MSIOF0_RXD_MARK, PSC13_0, PSC12_1, PTM6_FN), + PINMUX_DATA(MSIOF0_TXD_MARK, PSC13_0, PSC12_1, PTM5_FN), + PINMUX_DATA(MSIOF0_TSYNC_MARK, PSC13_0, PSC12_1, PTM4_FN), + PINMUX_DATA(MSIOF0_SS1_MARK, PSC11_0, PSC10_1, PTM3_FN), + PINMUX_DATA(MSIOF0_RSCK_MARK, PSC11_1, PSC10_0, PTM3_FN), + PINMUX_DATA(MSIOF0_SS2_MARK, PSC11_0, PSC10_1, PTM2_FN), + PINMUX_DATA(MSIOF0_RSYNC_MARK, PSC11_1, PSC10_0, PTM2_FN), + + PINMUX_DATA(LCDVCPWC_MARK, PSA6_0, PTM1_FN), + PINMUX_DATA(LCDRD_MARK, PSA7_0, PTM0_FN), + + PINMUX_DATA(SCIF0_RXD_MARK, PSA6_1, PTM1_FN), + PINMUX_DATA(SCIF0_SCK_MARK, PSA7_1, PTM0_FN), + + /* PTN FN */ + PINMUX_DATA(VIO0_D1_MARK, PTN7_FN), + PINMUX_DATA(VIO0_D0_MARK, PTN6_FN), + + PINMUX_DATA(DV_CLKI_MARK, PSD11_0, PTN5_FN), + PINMUX_DATA(DV_CLK_MARK, PSD13_0, PSD12_0, PTN4_FN), + PINMUX_DATA(DV_VSYNC_MARK, PSD15_0, PSD14_0, PTN3_FN), + PINMUX_DATA(DV_HSYNC_MARK, PSB5_0, PSB4_0, PTN2_FN), + PINMUX_DATA(DV_D7_MARK, PSB3_0, PSB2_0, PTN1_FN), + PINMUX_DATA(DV_D6_MARK, PSB1_0, PSB0_0, PTN0_FN), + + PINMUX_DATA(SCIF2_V_SCK_MARK, PSD13_0, PSD12_1, PTN4_FN), + PINMUX_DATA(SCIF2_V_RXD_MARK, PSD15_0, PSD14_1, PTN3_FN), + PINMUX_DATA(SCIF2_V_TXD_MARK, PSB5_0, PSB4_1, PTN2_FN), + PINMUX_DATA(SCIF3_V_CTS_MARK, PSB3_0, PSB2_1, PTN1_FN), + PINMUX_DATA(SCIF3_V_RTS_MARK, PSB1_0, PSB0_1, PTN0_FN), + + PINMUX_DATA(RMII_RX_ER_MARK, PSB3_1, PSB2_0, PTN1_FN), + PINMUX_DATA(RMII_CRS_DV_MARK, PSB1_1, PSB0_0, PTN0_FN), + + /* PTQ FN */ + PINMUX_DATA(D7_MARK, PTQ7_FN), + PINMUX_DATA(D6_MARK, PTQ6_FN), + PINMUX_DATA(D5_MARK, PTQ5_FN), + PINMUX_DATA(D4_MARK, PTQ4_FN), + PINMUX_DATA(D3_MARK, PTQ3_FN), + PINMUX_DATA(D2_MARK, PTQ2_FN), + PINMUX_DATA(D1_MARK, PTQ1_FN), + PINMUX_DATA(D0_MARK, PTQ0_FN), + + /* PTR FN */ + PINMUX_DATA(CS6B_CE1B_MARK, PTR7_FN), + PINMUX_DATA(CS6A_CE2B_MARK, PTR6_FN), + PINMUX_DATA(CS5B_CE1A_MARK, PTR5_FN), + PINMUX_DATA(CS5A_CE2A_MARK, PTR4_FN), + PINMUX_DATA(IOIS16_MARK, PSA5_0, PTR3_FN), + PINMUX_DATA(WAIT_MARK, PTR2_FN), + PINMUX_DATA(WE3_ICIOWR_MARK, PSA1_0, PSA0_0, PTR1_FN), + PINMUX_DATA(WE2_ICIORD_MARK, PSD1_0, PSD0_0, PTR0_FN), + + PINMUX_DATA(LCDLCLK_MARK, PSA5_1, PTR3_FN), + + PINMUX_DATA(IDEA2_MARK, PSD1_1, PSD0_0, PTR0_FN), + + PINMUX_DATA(TPUTO3_MARK, PSA1_0, PSA0_1, PTR1_FN), + PINMUX_DATA(TPUTI3_MARK, PSA1_1, PSA0_0, PTR1_FN), + PINMUX_DATA(TPUTO2_MARK, PSD1_0, PSD0_1, PTR0_FN), + + /* PTS FN */ + PINMUX_DATA(VIO_CKO_MARK, PTS6_FN), + + PINMUX_DATA(TPUTI2_MARK, PSE9_0, PSE8_1, PTS5_FN), + + PINMUX_DATA(IDEIORDY_MARK, PSE9_1, PSE8_0, PTS5_FN), + + PINMUX_DATA(VIO1_FLD_MARK, PSE9_0, PSE8_0, PTS5_FN), + PINMUX_DATA(VIO1_HD_MARK, PSA10_0, PTS4_FN), + PINMUX_DATA(VIO1_VD_MARK, PSA9_0, PTS3_FN), + PINMUX_DATA(VIO1_CLK_MARK, PSA9_0, PTS2_FN), + PINMUX_DATA(VIO1_D7_MARK, PSB7_0, PSB6_0, PTS1_FN), + PINMUX_DATA(VIO1_D6_MARK, PSB7_0, PSB6_0, PTS0_FN), + + PINMUX_DATA(SCIF5_SCK_MARK, PSA10_1, PTS4_FN), + PINMUX_DATA(SCIF5_RXD_MARK, PSA9_1, PTS3_FN), + PINMUX_DATA(SCIF5_TXD_MARK, PSA9_1, PTS2_FN), + + PINMUX_DATA(VIO0_D15_MARK, PSB7_0, PSB6_1, PTS1_FN), + PINMUX_DATA(VIO0_D14_MARK, PSB7_0, PSB6_1, PTS0_FN), + + PINMUX_DATA(IDED7_MARK, PSB7_1, PSB6_0, PTS1_FN), + PINMUX_DATA(IDED6_MARK, PSB7_1, PSB6_0, PTS0_FN), + + /* PTT FN */ + PINMUX_DATA(D15_MARK, PTT7_FN), + PINMUX_DATA(D14_MARK, PTT6_FN), + PINMUX_DATA(D13_MARK, PTT5_FN), + PINMUX_DATA(D12_MARK, PTT4_FN), + PINMUX_DATA(D11_MARK, PTT3_FN), + PINMUX_DATA(D10_MARK, PTT2_FN), + PINMUX_DATA(D9_MARK, PTT1_FN), + PINMUX_DATA(D8_MARK, PTT0_FN), + + /* PTU FN */ + PINMUX_DATA(DMAC_DACK0_MARK, PTU7_FN), + PINMUX_DATA(DMAC_DREQ0_MARK, PTU6_FN), + + PINMUX_DATA(FSIOASD_MARK, PSE1_0, PTU5_FN), + PINMUX_DATA(FSIIABCK_MARK, PSE1_0, PTU4_FN), + PINMUX_DATA(FSIIALRCK_MARK, PSE1_0, PTU3_FN), + PINMUX_DATA(FSIOABCK_MARK, PSE1_0, PTU2_FN), + PINMUX_DATA(FSIOALRCK_MARK, PSE1_0, PTU1_FN), + PINMUX_DATA(CLKAUDIOAO_MARK, PSE0_0, PTU0_FN), + + /* PTV FN */ + PINMUX_DATA(FSIIBSD_MARK, PSD7_0, PSD6_0, PTV7_FN), + PINMUX_DATA(FSIOBSD_MARK, PSD7_0, PSD6_0, PTV6_FN), + PINMUX_DATA(FSIIBBCK_MARK, PSC15_0, PSC14_0, PTV5_FN), + PINMUX_DATA(FSIIBLRCK_MARK, PSC15_0, PSC14_0, PTV4_FN), + PINMUX_DATA(FSIOBBCK_MARK, PSC15_0, PSC14_0, PTV3_FN), + PINMUX_DATA(FSIOBLRCK_MARK, PSC15_0, PSC14_0, PTV2_FN), + PINMUX_DATA(CLKAUDIOBO_MARK, PSE3_0, PSE2_0, PTV1_FN), + PINMUX_DATA(FSIIASD_MARK, PSE10_0, PTV0_FN), + + PINMUX_DATA(MSIOF1_SS2_MARK, PSD7_0, PSD6_1, PTV7_FN), + PINMUX_DATA(MSIOF1_RSYNC_MARK, PSD7_1, PSD6_0, PTV7_FN), + PINMUX_DATA(MSIOF1_SS1_MARK, PSD7_0, PSD6_1, PTV6_FN), + PINMUX_DATA(MSIOF1_RSCK_MARK, PSD7_1, PSD6_0, PTV6_FN), + PINMUX_DATA(MSIOF1_RXD_MARK, PSC15_0, PSC14_1, PTV5_FN), + PINMUX_DATA(MSIOF1_TSYNC_MARK, PSC15_0, PSC14_1, PTV4_FN), + PINMUX_DATA(MSIOF1_TSCK_MARK, PSC15_0, PSC14_1, PTV3_FN), + PINMUX_DATA(MSIOF1_TXD_MARK, PSC15_0, PSC14_1, PTV2_FN), + PINMUX_DATA(MSIOF1_MCK_MARK, PSE3_0, PSE2_1, PTV1_FN), + + /* PTW FN */ + PINMUX_DATA(MMC_D7_MARK, PSE13_0, PSE12_0, PTW7_FN), + PINMUX_DATA(MMC_D6_MARK, PSE13_0, PSE12_0, PTW6_FN), + PINMUX_DATA(MMC_D5_MARK, PSE13_0, PSE12_0, PTW5_FN), + PINMUX_DATA(MMC_D4_MARK, PSE13_0, PSE12_0, PTW4_FN), + PINMUX_DATA(MMC_D3_MARK, PSA13_0, PTW3_FN), + PINMUX_DATA(MMC_D2_MARK, PSA13_0, PTW2_FN), + PINMUX_DATA(MMC_D1_MARK, PSA13_0, PTW1_FN), + PINMUX_DATA(MMC_D0_MARK, PSA13_0, PTW0_FN), + + PINMUX_DATA(SDHI1CD_MARK, PSE13_0, PSE12_1, PTW7_FN), + PINMUX_DATA(SDHI1WP_MARK, PSE13_0, PSE12_1, PTW6_FN), + PINMUX_DATA(SDHI1D3_MARK, PSE13_0, PSE12_1, PTW5_FN), + PINMUX_DATA(SDHI1D2_MARK, PSE13_0, PSE12_1, PTW4_FN), + PINMUX_DATA(SDHI1D1_MARK, PSA13_1, PTW3_FN), + PINMUX_DATA(SDHI1D0_MARK, PSA13_1, PTW2_FN), + PINMUX_DATA(SDHI1CMD_MARK, PSA13_1, PTW1_FN), + PINMUX_DATA(SDHI1CLK_MARK, PSA13_1, PTW0_FN), + + PINMUX_DATA(IODACK_MARK, PSE13_1, PSE12_0, PTW7_FN), + PINMUX_DATA(IDERST_MARK, PSE13_1, PSE12_0, PTW6_FN), + PINMUX_DATA(EXBUF_ENB_MARK, PSE13_1, PSE12_0, PTW5_FN), + PINMUX_DATA(DIRECTION_MARK, PSE13_1, PSE12_0, PTW4_FN), + + /* PTX FN */ + PINMUX_DATA(DMAC_DACK1_MARK, PSA12_0, PTX7_FN), + PINMUX_DATA(DMAC_DREQ1_MARK, PSA12_0, PTX6_FN), + + PINMUX_DATA(IRDA_OUT_MARK, PSA12_1, PTX7_FN), + PINMUX_DATA(IRDA_IN_MARK, PSA12_1, PTX6_FN), + + PINMUX_DATA(TSIF_TS0_SDAT_MARK, PSC0_0, PTX5_FN), + PINMUX_DATA(TSIF_TS0_SCK_MARK, PSC1_0, PTX4_FN), + PINMUX_DATA(TSIF_TS0_SDEN_MARK, PSC2_0, PTX3_FN), + PINMUX_DATA(TSIF_TS0_SPSYNC_MARK, PTX2_FN), + + PINMUX_DATA(LNKSTA_MARK, PSC0_1, PTX5_FN), + PINMUX_DATA(MDIO_MARK, PSC1_1, PTX4_FN), + PINMUX_DATA(MDC_MARK, PSC2_1, PTX3_FN), + + PINMUX_DATA(MMC_CLK_MARK, PTX1_FN), + PINMUX_DATA(MMC_CMD_MARK, PTX0_FN), + + /* PTY FN */ + PINMUX_DATA(SDHI0CD_MARK, PTY7_FN), + PINMUX_DATA(SDHI0WP_MARK, PTY6_FN), + PINMUX_DATA(SDHI0D3_MARK, PTY5_FN), + PINMUX_DATA(SDHI0D2_MARK, PTY4_FN), + PINMUX_DATA(SDHI0D1_MARK, PTY3_FN), + PINMUX_DATA(SDHI0D0_MARK, PTY2_FN), + PINMUX_DATA(SDHI0CMD_MARK, PTY1_FN), + PINMUX_DATA(SDHI0CLK_MARK, PTY0_FN), + + /* PTZ FN */ + PINMUX_DATA(INTC_IRQ7_MARK, PSB10_0, PTZ7_FN), + PINMUX_DATA(INTC_IRQ6_MARK, PSB11_0, PTZ6_FN), + PINMUX_DATA(INTC_IRQ5_MARK, PSB12_0, PTZ5_FN), + PINMUX_DATA(INTC_IRQ4_MARK, PSB13_0, PTZ4_FN), + PINMUX_DATA(INTC_IRQ3_MARK, PSB14_0, PTZ3_FN), + PINMUX_DATA(INTC_IRQ2_MARK, PTZ2_FN), + PINMUX_DATA(INTC_IRQ1_MARK, PTZ1_FN), + PINMUX_DATA(INTC_IRQ0_MARK, PTZ0_FN), + + PINMUX_DATA(SCIF3_I_CTS_MARK, PSB10_1, PTZ7_FN), + PINMUX_DATA(SCIF3_I_RTS_MARK, PSB11_1, PTZ6_FN), + PINMUX_DATA(SCIF3_I_SCK_MARK, PSB12_1, PTZ5_FN), + PINMUX_DATA(SCIF3_I_RXD_MARK, PSB13_1, PTZ4_FN), + PINMUX_DATA(SCIF3_I_TXD_MARK, PSB14_1, PTZ3_FN), +}; + +static struct pinmux_gpio pinmux_gpios[] = { + /* PTA */ + PINMUX_GPIO(GPIO_PTA7, PTA7_DATA), + PINMUX_GPIO(GPIO_PTA6, PTA6_DATA), + PINMUX_GPIO(GPIO_PTA5, PTA5_DATA), + PINMUX_GPIO(GPIO_PTA4, PTA4_DATA), + PINMUX_GPIO(GPIO_PTA3, PTA3_DATA), + PINMUX_GPIO(GPIO_PTA2, PTA2_DATA), + PINMUX_GPIO(GPIO_PTA1, PTA1_DATA), + PINMUX_GPIO(GPIO_PTA0, PTA0_DATA), + + /* PTB */ + PINMUX_GPIO(GPIO_PTB7, PTB7_DATA), + PINMUX_GPIO(GPIO_PTB6, PTB6_DATA), + PINMUX_GPIO(GPIO_PTB5, PTB5_DATA), + PINMUX_GPIO(GPIO_PTB4, PTB4_DATA), + PINMUX_GPIO(GPIO_PTB3, PTB3_DATA), + PINMUX_GPIO(GPIO_PTB2, PTB2_DATA), + PINMUX_GPIO(GPIO_PTB1, PTB1_DATA), + PINMUX_GPIO(GPIO_PTB0, PTB0_DATA), + + /* PTC */ + PINMUX_GPIO(GPIO_PTC7, PTC7_DATA), + PINMUX_GPIO(GPIO_PTC6, PTC6_DATA), + PINMUX_GPIO(GPIO_PTC5, PTC5_DATA), + PINMUX_GPIO(GPIO_PTC4, PTC4_DATA), + PINMUX_GPIO(GPIO_PTC3, PTC3_DATA), + PINMUX_GPIO(GPIO_PTC2, PTC2_DATA), + PINMUX_GPIO(GPIO_PTC1, PTC1_DATA), + PINMUX_GPIO(GPIO_PTC0, PTC0_DATA), + + /* PTD */ + PINMUX_GPIO(GPIO_PTD7, PTD7_DATA), + PINMUX_GPIO(GPIO_PTD6, PTD6_DATA), + PINMUX_GPIO(GPIO_PTD5, PTD5_DATA), + PINMUX_GPIO(GPIO_PTD4, PTD4_DATA), + PINMUX_GPIO(GPIO_PTD3, PTD3_DATA), + PINMUX_GPIO(GPIO_PTD2, PTD2_DATA), + PINMUX_GPIO(GPIO_PTD1, PTD1_DATA), + PINMUX_GPIO(GPIO_PTD0, PTD0_DATA), + + /* PTE */ + PINMUX_GPIO(GPIO_PTE7, PTE7_DATA), + PINMUX_GPIO(GPIO_PTE6, PTE6_DATA), + PINMUX_GPIO(GPIO_PTE5, PTE5_DATA), + PINMUX_GPIO(GPIO_PTE4, PTE4_DATA), + PINMUX_GPIO(GPIO_PTE3, PTE3_DATA), + PINMUX_GPIO(GPIO_PTE2, PTE2_DATA), + PINMUX_GPIO(GPIO_PTE1, PTE1_DATA), + PINMUX_GPIO(GPIO_PTE0, PTE0_DATA), + + /* PTF */ + PINMUX_GPIO(GPIO_PTF7, PTF7_DATA), + PINMUX_GPIO(GPIO_PTF6, PTF6_DATA), + PINMUX_GPIO(GPIO_PTF5, PTF5_DATA), + PINMUX_GPIO(GPIO_PTF4, PTF4_DATA), + PINMUX_GPIO(GPIO_PTF3, PTF3_DATA), + PINMUX_GPIO(GPIO_PTF2, PTF2_DATA), + PINMUX_GPIO(GPIO_PTF1, PTF1_DATA), + PINMUX_GPIO(GPIO_PTF0, PTF0_DATA), + + /* PTG */ + PINMUX_GPIO(GPIO_PTG5, PTG5_DATA), + PINMUX_GPIO(GPIO_PTG4, PTG4_DATA), + PINMUX_GPIO(GPIO_PTG3, PTG3_DATA), + PINMUX_GPIO(GPIO_PTG2, PTG2_DATA), + PINMUX_GPIO(GPIO_PTG1, PTG1_DATA), + PINMUX_GPIO(GPIO_PTG0, PTG0_DATA), + + /* PTH */ + PINMUX_GPIO(GPIO_PTH7, PTH7_DATA), + PINMUX_GPIO(GPIO_PTH6, PTH6_DATA), + PINMUX_GPIO(GPIO_PTH5, PTH5_DATA), + PINMUX_GPIO(GPIO_PTH4, PTH4_DATA), + PINMUX_GPIO(GPIO_PTH3, PTH3_DATA), + PINMUX_GPIO(GPIO_PTH2, PTH2_DATA), + PINMUX_GPIO(GPIO_PTH1, PTH1_DATA), + PINMUX_GPIO(GPIO_PTH0, PTH0_DATA), + + /* PTJ */ + PINMUX_GPIO(GPIO_PTJ7, PTJ7_DATA), + PINMUX_GPIO(GPIO_PTJ6, PTJ6_DATA), + PINMUX_GPIO(GPIO_PTJ5, PTJ5_DATA), + PINMUX_GPIO(GPIO_PTJ3, PTJ3_DATA), + PINMUX_GPIO(GPIO_PTJ2, PTJ2_DATA), + PINMUX_GPIO(GPIO_PTJ1, PTJ1_DATA), + PINMUX_GPIO(GPIO_PTJ0, PTJ0_DATA), + + /* PTK */ + PINMUX_GPIO(GPIO_PTK7, PTK7_DATA), + PINMUX_GPIO(GPIO_PTK6, PTK6_DATA), + PINMUX_GPIO(GPIO_PTK5, PTK5_DATA), + PINMUX_GPIO(GPIO_PTK4, PTK4_DATA), + PINMUX_GPIO(GPIO_PTK3, PTK3_DATA), + PINMUX_GPIO(GPIO_PTK2, PTK2_DATA), + PINMUX_GPIO(GPIO_PTK1, PTK1_DATA), + PINMUX_GPIO(GPIO_PTK0, PTK0_DATA), + + /* PTL */ + PINMUX_GPIO(GPIO_PTL7, PTL7_DATA), + PINMUX_GPIO(GPIO_PTL6, PTL6_DATA), + PINMUX_GPIO(GPIO_PTL5, PTL5_DATA), + PINMUX_GPIO(GPIO_PTL4, PTL4_DATA), + PINMUX_GPIO(GPIO_PTL3, PTL3_DATA), + PINMUX_GPIO(GPIO_PTL2, PTL2_DATA), + PINMUX_GPIO(GPIO_PTL1, PTL1_DATA), + PINMUX_GPIO(GPIO_PTL0, PTL0_DATA), + + /* PTM */ + PINMUX_GPIO(GPIO_PTM7, PTM7_DATA), + PINMUX_GPIO(GPIO_PTM6, PTM6_DATA), + PINMUX_GPIO(GPIO_PTM5, PTM5_DATA), + PINMUX_GPIO(GPIO_PTM4, PTM4_DATA), + PINMUX_GPIO(GPIO_PTM3, PTM3_DATA), + PINMUX_GPIO(GPIO_PTM2, PTM2_DATA), + PINMUX_GPIO(GPIO_PTM1, PTM1_DATA), + PINMUX_GPIO(GPIO_PTM0, PTM0_DATA), + + /* PTN */ + PINMUX_GPIO(GPIO_PTN7, PTN7_DATA), + PINMUX_GPIO(GPIO_PTN6, PTN6_DATA), + PINMUX_GPIO(GPIO_PTN5, PTN5_DATA), + PINMUX_GPIO(GPIO_PTN4, PTN4_DATA), + PINMUX_GPIO(GPIO_PTN3, PTN3_DATA), + PINMUX_GPIO(GPIO_PTN2, PTN2_DATA), + PINMUX_GPIO(GPIO_PTN1, PTN1_DATA), + PINMUX_GPIO(GPIO_PTN0, PTN0_DATA), + + /* PTQ */ + PINMUX_GPIO(GPIO_PTQ7, PTQ7_DATA), + PINMUX_GPIO(GPIO_PTQ6, PTQ6_DATA), + PINMUX_GPIO(GPIO_PTQ5, PTQ5_DATA), + PINMUX_GPIO(GPIO_PTQ4, PTQ4_DATA), + PINMUX_GPIO(GPIO_PTQ3, PTQ3_DATA), + PINMUX_GPIO(GPIO_PTQ2, PTQ2_DATA), + PINMUX_GPIO(GPIO_PTQ1, PTQ1_DATA), + PINMUX_GPIO(GPIO_PTQ0, PTQ0_DATA), + + /* PTR */ + PINMUX_GPIO(GPIO_PTR7, PTR7_DATA), + PINMUX_GPIO(GPIO_PTR6, PTR6_DATA), + PINMUX_GPIO(GPIO_PTR5, PTR5_DATA), + PINMUX_GPIO(GPIO_PTR4, PTR4_DATA), + PINMUX_GPIO(GPIO_PTR3, PTR3_DATA), + PINMUX_GPIO(GPIO_PTR2, PTR2_DATA), + PINMUX_GPIO(GPIO_PTR1, PTR1_DATA), + PINMUX_GPIO(GPIO_PTR0, PTR0_DATA), + + /* PTS */ + PINMUX_GPIO(GPIO_PTS6, PTS6_DATA), + PINMUX_GPIO(GPIO_PTS5, PTS5_DATA), + PINMUX_GPIO(GPIO_PTS4, PTS4_DATA), + PINMUX_GPIO(GPIO_PTS3, PTS3_DATA), + PINMUX_GPIO(GPIO_PTS2, PTS2_DATA), + PINMUX_GPIO(GPIO_PTS1, PTS1_DATA), + PINMUX_GPIO(GPIO_PTS0, PTS0_DATA), + + /* PTT */ + PINMUX_GPIO(GPIO_PTT7, PTT7_DATA), + PINMUX_GPIO(GPIO_PTT6, PTT6_DATA), + PINMUX_GPIO(GPIO_PTT5, PTT5_DATA), + PINMUX_GPIO(GPIO_PTT4, PTT4_DATA), + PINMUX_GPIO(GPIO_PTT3, PTT3_DATA), + PINMUX_GPIO(GPIO_PTT2, PTT2_DATA), + PINMUX_GPIO(GPIO_PTT1, PTT1_DATA), + PINMUX_GPIO(GPIO_PTT0, PTT0_DATA), + + /* PTU */ + PINMUX_GPIO(GPIO_PTU7, PTU7_DATA), + PINMUX_GPIO(GPIO_PTU6, PTU6_DATA), + PINMUX_GPIO(GPIO_PTU5, PTU5_DATA), + PINMUX_GPIO(GPIO_PTU4, PTU4_DATA), + PINMUX_GPIO(GPIO_PTU3, PTU3_DATA), + PINMUX_GPIO(GPIO_PTU2, PTU2_DATA), + PINMUX_GPIO(GPIO_PTU1, PTU1_DATA), + PINMUX_GPIO(GPIO_PTU0, PTU0_DATA), + + /* PTV */ + PINMUX_GPIO(GPIO_PTV7, PTV7_DATA), + PINMUX_GPIO(GPIO_PTV6, PTV6_DATA), + PINMUX_GPIO(GPIO_PTV5, PTV5_DATA), + PINMUX_GPIO(GPIO_PTV4, PTV4_DATA), + PINMUX_GPIO(GPIO_PTV3, PTV3_DATA), + PINMUX_GPIO(GPIO_PTV2, PTV2_DATA), + PINMUX_GPIO(GPIO_PTV1, PTV1_DATA), + PINMUX_GPIO(GPIO_PTV0, PTV0_DATA), + + /* PTW */ + PINMUX_GPIO(GPIO_PTW7, PTW7_DATA), + PINMUX_GPIO(GPIO_PTW6, PTW6_DATA), + PINMUX_GPIO(GPIO_PTW5, PTW5_DATA), + PINMUX_GPIO(GPIO_PTW4, PTW4_DATA), + PINMUX_GPIO(GPIO_PTW3, PTW3_DATA), + PINMUX_GPIO(GPIO_PTW2, PTW2_DATA), + PINMUX_GPIO(GPIO_PTW1, PTW1_DATA), + PINMUX_GPIO(GPIO_PTW0, PTW0_DATA), + + /* PTX */ + PINMUX_GPIO(GPIO_PTX7, PTX7_DATA), + PINMUX_GPIO(GPIO_PTX6, PTX6_DATA), + PINMUX_GPIO(GPIO_PTX5, PTX5_DATA), + PINMUX_GPIO(GPIO_PTX4, PTX4_DATA), + PINMUX_GPIO(GPIO_PTX3, PTX3_DATA), + PINMUX_GPIO(GPIO_PTX2, PTX2_DATA), + PINMUX_GPIO(GPIO_PTX1, PTX1_DATA), + PINMUX_GPIO(GPIO_PTX0, PTX0_DATA), + + /* PTY */ + PINMUX_GPIO(GPIO_PTY7, PTY7_DATA), + PINMUX_GPIO(GPIO_PTY6, PTY6_DATA), + PINMUX_GPIO(GPIO_PTY5, PTY5_DATA), + PINMUX_GPIO(GPIO_PTY4, PTY4_DATA), + PINMUX_GPIO(GPIO_PTY3, PTY3_DATA), + PINMUX_GPIO(GPIO_PTY2, PTY2_DATA), + PINMUX_GPIO(GPIO_PTY1, PTY1_DATA), + PINMUX_GPIO(GPIO_PTY0, PTY0_DATA), + + /* PTZ */ + PINMUX_GPIO(GPIO_PTZ7, PTZ7_DATA), + PINMUX_GPIO(GPIO_PTZ6, PTZ6_DATA), + PINMUX_GPIO(GPIO_PTZ5, PTZ5_DATA), + PINMUX_GPIO(GPIO_PTZ4, PTZ4_DATA), + PINMUX_GPIO(GPIO_PTZ3, PTZ3_DATA), + PINMUX_GPIO(GPIO_PTZ2, PTZ2_DATA), + PINMUX_GPIO(GPIO_PTZ1, PTZ1_DATA), + PINMUX_GPIO(GPIO_PTZ0, PTZ0_DATA), + + /* BSC */ + PINMUX_GPIO(GPIO_FN_D31, D31_MARK), + PINMUX_GPIO(GPIO_FN_D30, D30_MARK), + PINMUX_GPIO(GPIO_FN_D29, D29_MARK), + PINMUX_GPIO(GPIO_FN_D28, D28_MARK), + PINMUX_GPIO(GPIO_FN_D27, D27_MARK), + PINMUX_GPIO(GPIO_FN_D26, D26_MARK), + PINMUX_GPIO(GPIO_FN_D25, D25_MARK), + PINMUX_GPIO(GPIO_FN_D24, D24_MARK), + PINMUX_GPIO(GPIO_FN_D23, D23_MARK), + PINMUX_GPIO(GPIO_FN_D22, D22_MARK), + PINMUX_GPIO(GPIO_FN_D21, D21_MARK), + PINMUX_GPIO(GPIO_FN_D20, D20_MARK), + PINMUX_GPIO(GPIO_FN_D19, D19_MARK), + PINMUX_GPIO(GPIO_FN_D18, D18_MARK), + PINMUX_GPIO(GPIO_FN_D17, D17_MARK), + PINMUX_GPIO(GPIO_FN_D16, D16_MARK), + PINMUX_GPIO(GPIO_FN_D15, D15_MARK), + PINMUX_GPIO(GPIO_FN_D14, D14_MARK), + PINMUX_GPIO(GPIO_FN_D13, D13_MARK), + PINMUX_GPIO(GPIO_FN_D12, D12_MARK), + PINMUX_GPIO(GPIO_FN_D11, D11_MARK), + PINMUX_GPIO(GPIO_FN_D10, D10_MARK), + PINMUX_GPIO(GPIO_FN_D9, D9_MARK), + PINMUX_GPIO(GPIO_FN_D8, D8_MARK), + PINMUX_GPIO(GPIO_FN_D7, D7_MARK), + PINMUX_GPIO(GPIO_FN_D6, D6_MARK), + PINMUX_GPIO(GPIO_FN_D5, D5_MARK), + PINMUX_GPIO(GPIO_FN_D4, D4_MARK), + PINMUX_GPIO(GPIO_FN_D3, D3_MARK), + PINMUX_GPIO(GPIO_FN_D2, D2_MARK), + PINMUX_GPIO(GPIO_FN_D1, D1_MARK), + PINMUX_GPIO(GPIO_FN_D0, D0_MARK), + PINMUX_GPIO(GPIO_FN_A25, A25_MARK), + PINMUX_GPIO(GPIO_FN_A24, A24_MARK), + PINMUX_GPIO(GPIO_FN_A23, A23_MARK), + PINMUX_GPIO(GPIO_FN_A22, A22_MARK), + PINMUX_GPIO(GPIO_FN_CS6B_CE1B, CS6B_CE1B_MARK), + PINMUX_GPIO(GPIO_FN_CS6A_CE2B, CS6A_CE2B_MARK), + PINMUX_GPIO(GPIO_FN_CS5B_CE1A, CS5B_CE1A_MARK), + PINMUX_GPIO(GPIO_FN_CS5A_CE2A, CS5A_CE2A_MARK), + PINMUX_GPIO(GPIO_FN_WE3_ICIOWR, WE3_ICIOWR_MARK), + PINMUX_GPIO(GPIO_FN_WE2_ICIORD, WE2_ICIORD_MARK), + PINMUX_GPIO(GPIO_FN_IOIS16, IOIS16_MARK), + PINMUX_GPIO(GPIO_FN_WAIT, WAIT_MARK), + PINMUX_GPIO(GPIO_FN_BS, BS_MARK), + + /* KEYSC */ + PINMUX_GPIO(GPIO_FN_KEYOUT5_IN5, KEYOUT5_IN5_MARK), + PINMUX_GPIO(GPIO_FN_KEYOUT4_IN6, KEYOUT4_IN6_MARK), + PINMUX_GPIO(GPIO_FN_KEYIN4, KEYIN4_MARK), + PINMUX_GPIO(GPIO_FN_KEYIN3, KEYIN3_MARK), + PINMUX_GPIO(GPIO_FN_KEYIN2, KEYIN2_MARK), + PINMUX_GPIO(GPIO_FN_KEYIN1, KEYIN1_MARK), + PINMUX_GPIO(GPIO_FN_KEYIN0, KEYIN0_MARK), + PINMUX_GPIO(GPIO_FN_KEYOUT3, KEYOUT3_MARK), + PINMUX_GPIO(GPIO_FN_KEYOUT2, KEYOUT2_MARK), + PINMUX_GPIO(GPIO_FN_KEYOUT1, KEYOUT1_MARK), + PINMUX_GPIO(GPIO_FN_KEYOUT0, KEYOUT0_MARK), + + /* ATAPI */ + PINMUX_GPIO(GPIO_FN_IDED15, IDED15_MARK), + PINMUX_GPIO(GPIO_FN_IDED14, IDED14_MARK), + PINMUX_GPIO(GPIO_FN_IDED13, IDED13_MARK), + PINMUX_GPIO(GPIO_FN_IDED12, IDED12_MARK), + PINMUX_GPIO(GPIO_FN_IDED11, IDED11_MARK), + PINMUX_GPIO(GPIO_FN_IDED10, IDED10_MARK), + PINMUX_GPIO(GPIO_FN_IDED9, IDED9_MARK), + PINMUX_GPIO(GPIO_FN_IDED8, IDED8_MARK), + PINMUX_GPIO(GPIO_FN_IDED7, IDED7_MARK), + PINMUX_GPIO(GPIO_FN_IDED6, IDED6_MARK), + PINMUX_GPIO(GPIO_FN_IDED5, IDED5_MARK), + PINMUX_GPIO(GPIO_FN_IDED4, IDED4_MARK), + PINMUX_GPIO(GPIO_FN_IDED3, IDED3_MARK), + PINMUX_GPIO(GPIO_FN_IDED2, IDED2_MARK), + PINMUX_GPIO(GPIO_FN_IDED1, IDED1_MARK), + PINMUX_GPIO(GPIO_FN_IDED0, IDED0_MARK), + PINMUX_GPIO(GPIO_FN_IDEA2, IDEA2_MARK), + PINMUX_GPIO(GPIO_FN_IDEA1, IDEA1_MARK), + PINMUX_GPIO(GPIO_FN_IDEA0, IDEA0_MARK), + PINMUX_GPIO(GPIO_FN_IDEIOWR, IDEIOWR_MARK), + PINMUX_GPIO(GPIO_FN_IODREQ, IODREQ_MARK), + PINMUX_GPIO(GPIO_FN_IDECS0, IDECS0_MARK), + PINMUX_GPIO(GPIO_FN_IDECS1, IDECS1_MARK), + PINMUX_GPIO(GPIO_FN_IDEIORD, IDEIORD_MARK), + PINMUX_GPIO(GPIO_FN_DIRECTION, DIRECTION_MARK), + PINMUX_GPIO(GPIO_FN_EXBUF_ENB, EXBUF_ENB_MARK), + PINMUX_GPIO(GPIO_FN_IDERST, IDERST_MARK), + PINMUX_GPIO(GPIO_FN_IODACK, IODACK_MARK), + PINMUX_GPIO(GPIO_FN_IDEINT, IDEINT_MARK), + PINMUX_GPIO(GPIO_FN_IDEIORDY, IDEIORDY_MARK), + + /* TPU */ + PINMUX_GPIO(GPIO_FN_TPUTO3, TPUTO3_MARK), + PINMUX_GPIO(GPIO_FN_TPUTO2, TPUTO2_MARK), + PINMUX_GPIO(GPIO_FN_TPUTO1, TPUTO1_MARK), + PINMUX_GPIO(GPIO_FN_TPUTO0, TPUTO0_MARK), + PINMUX_GPIO(GPIO_FN_TPUTI3, TPUTI3_MARK), + PINMUX_GPIO(GPIO_FN_TPUTI2, TPUTI2_MARK), + + /* LCDC */ + PINMUX_GPIO(GPIO_FN_LCDD23, LCDD23_MARK), + PINMUX_GPIO(GPIO_FN_LCDD22, LCDD22_MARK), + PINMUX_GPIO(GPIO_FN_LCDD21, LCDD21_MARK), + PINMUX_GPIO(GPIO_FN_LCDD20, LCDD20_MARK), + PINMUX_GPIO(GPIO_FN_LCDD19, LCDD19_MARK), + PINMUX_GPIO(GPIO_FN_LCDD18, LCDD18_MARK), + PINMUX_GPIO(GPIO_FN_LCDD17, LCDD17_MARK), + PINMUX_GPIO(GPIO_FN_LCDD16, LCDD16_MARK), + PINMUX_GPIO(GPIO_FN_LCDD15, LCDD15_MARK), + PINMUX_GPIO(GPIO_FN_LCDD14, LCDD14_MARK), + PINMUX_GPIO(GPIO_FN_LCDD13, LCDD13_MARK), + PINMUX_GPIO(GPIO_FN_LCDD12, LCDD12_MARK), + PINMUX_GPIO(GPIO_FN_LCDD11, LCDD11_MARK), + PINMUX_GPIO(GPIO_FN_LCDD10, LCDD10_MARK), + PINMUX_GPIO(GPIO_FN_LCDD9, LCDD9_MARK), + PINMUX_GPIO(GPIO_FN_LCDD8, LCDD8_MARK), + PINMUX_GPIO(GPIO_FN_LCDD7, LCDD7_MARK), + PINMUX_GPIO(GPIO_FN_LCDD6, LCDD6_MARK), + PINMUX_GPIO(GPIO_FN_LCDD5, LCDD5_MARK), + PINMUX_GPIO(GPIO_FN_LCDD4, LCDD4_MARK), + PINMUX_GPIO(GPIO_FN_LCDD3, LCDD3_MARK), + PINMUX_GPIO(GPIO_FN_LCDD2, LCDD2_MARK), + PINMUX_GPIO(GPIO_FN_LCDD1, LCDD1_MARK), + PINMUX_GPIO(GPIO_FN_LCDD0, LCDD0_MARK), + PINMUX_GPIO(GPIO_FN_LCDVSYN, LCDVSYN_MARK), + PINMUX_GPIO(GPIO_FN_LCDDISP, LCDDISP_MARK), + PINMUX_GPIO(GPIO_FN_LCDRS, LCDRS_MARK), + PINMUX_GPIO(GPIO_FN_LCDHSYN, LCDHSYN_MARK), + PINMUX_GPIO(GPIO_FN_LCDCS, LCDCS_MARK), + PINMUX_GPIO(GPIO_FN_LCDDON, LCDDON_MARK), + PINMUX_GPIO(GPIO_FN_LCDDCK, LCDDCK_MARK), + PINMUX_GPIO(GPIO_FN_LCDWR, LCDWR_MARK), + PINMUX_GPIO(GPIO_FN_LCDVEPWC, LCDVEPWC_MARK), + PINMUX_GPIO(GPIO_FN_LCDVCPWC, LCDVCPWC_MARK), + PINMUX_GPIO(GPIO_FN_LCDRD, LCDRD_MARK), + PINMUX_GPIO(GPIO_FN_LCDLCLK, LCDLCLK_MARK), + + /* SCIF0 */ + PINMUX_GPIO(GPIO_FN_SCIF0_TXD, SCIF0_TXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF0_RXD, SCIF0_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF0_SCK, SCIF0_SCK_MARK), + + /* SCIF1 */ + PINMUX_GPIO(GPIO_FN_SCIF1_SCK, SCIF1_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF1_RXD, SCIF1_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF1_TXD, SCIF1_TXD_MARK), + + /* SCIF2 */ + PINMUX_GPIO(GPIO_FN_SCIF2_L_TXD, SCIF2_L_TXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF2_L_SCK, SCIF2_L_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF2_L_RXD, SCIF2_L_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF2_V_TXD, SCIF2_V_TXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF2_V_SCK, SCIF2_V_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF2_V_RXD, SCIF2_V_RXD_MARK), + + /* SCIF3 */ + PINMUX_GPIO(GPIO_FN_SCIF3_V_SCK, SCIF3_V_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_V_RXD, SCIF3_V_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_V_TXD, SCIF3_V_TXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_V_CTS, SCIF3_V_CTS_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_V_RTS, SCIF3_V_RTS_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_I_SCK, SCIF3_I_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_I_RXD, SCIF3_I_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_I_TXD, SCIF3_I_TXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_I_CTS, SCIF3_I_CTS_MARK), + PINMUX_GPIO(GPIO_FN_SCIF3_I_RTS, SCIF3_I_RTS_MARK), + + /* SCIF4 */ + PINMUX_GPIO(GPIO_FN_SCIF4_SCK, SCIF4_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF4_RXD, SCIF4_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF4_TXD, SCIF4_TXD_MARK), + + /* SCIF5 */ + PINMUX_GPIO(GPIO_FN_SCIF5_SCK, SCIF5_SCK_MARK), + PINMUX_GPIO(GPIO_FN_SCIF5_RXD, SCIF5_RXD_MARK), + PINMUX_GPIO(GPIO_FN_SCIF5_TXD, SCIF5_TXD_MARK), + + /* FSI */ + PINMUX_GPIO(GPIO_FN_FSIMCKB, FSIMCKB_MARK), + PINMUX_GPIO(GPIO_FN_FSIMCKA, FSIMCKA_MARK), + PINMUX_GPIO(GPIO_FN_FSIOASD, FSIOASD_MARK), + PINMUX_GPIO(GPIO_FN_FSIIABCK, FSIIABCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIIALRCK, FSIIALRCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIOABCK, FSIOABCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIOALRCK, FSIOALRCK_MARK), + PINMUX_GPIO(GPIO_FN_CLKAUDIOAO, CLKAUDIOAO_MARK), + PINMUX_GPIO(GPIO_FN_FSIIBSD, FSIIBSD_MARK), + PINMUX_GPIO(GPIO_FN_FSIOBSD, FSIOBSD_MARK), + PINMUX_GPIO(GPIO_FN_FSIIBBCK, FSIIBBCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIIBLRCK, FSIIBLRCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIOBBCK, FSIOBBCK_MARK), + PINMUX_GPIO(GPIO_FN_FSIOBLRCK, FSIOBLRCK_MARK), + PINMUX_GPIO(GPIO_FN_CLKAUDIOBO, CLKAUDIOBO_MARK), + PINMUX_GPIO(GPIO_FN_FSIIASD, FSIIASD_MARK), + + /* AUD */ + PINMUX_GPIO(GPIO_FN_AUDCK, AUDCK_MARK), + PINMUX_GPIO(GPIO_FN_AUDSYNC, AUDSYNC_MARK), + PINMUX_GPIO(GPIO_FN_AUDATA3, AUDATA3_MARK), + PINMUX_GPIO(GPIO_FN_AUDATA2, AUDATA2_MARK), + PINMUX_GPIO(GPIO_FN_AUDATA1, AUDATA1_MARK), + PINMUX_GPIO(GPIO_FN_AUDATA0, AUDATA0_MARK), + + /* VIO */ + PINMUX_GPIO(GPIO_FN_VIO_CKO, VIO_CKO_MARK), + + /* VIO0 */ + PINMUX_GPIO(GPIO_FN_VIO0_D15, VIO0_D15_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D14, VIO0_D14_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D13, VIO0_D13_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D12, VIO0_D12_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D11, VIO0_D11_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D10, VIO0_D10_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D9, VIO0_D9_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D8, VIO0_D8_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D7, VIO0_D7_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D6, VIO0_D6_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D5, VIO0_D5_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D4, VIO0_D4_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D3, VIO0_D3_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D2, VIO0_D2_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D1, VIO0_D1_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_D0, VIO0_D0_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_VD, VIO0_VD_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_CLK, VIO0_CLK_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_FLD, VIO0_FLD_MARK), + PINMUX_GPIO(GPIO_FN_VIO0_HD, VIO0_HD_MARK), + + /* VIO1 */ + PINMUX_GPIO(GPIO_FN_VIO1_D7, VIO1_D7_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D6, VIO1_D6_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D5, VIO1_D5_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D4, VIO1_D4_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D3, VIO1_D3_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D2, VIO1_D2_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D1, VIO1_D1_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_D0, VIO1_D0_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_FLD, VIO1_FLD_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_HD, VIO1_HD_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_VD, VIO1_VD_MARK), + PINMUX_GPIO(GPIO_FN_VIO1_CLK, VIO1_CLK_MARK), + + /* Eth */ + PINMUX_GPIO(GPIO_FN_RMII_RXD0, RMII_RXD0_MARK), + PINMUX_GPIO(GPIO_FN_RMII_RXD1, RMII_RXD1_MARK), + PINMUX_GPIO(GPIO_FN_RMII_TXD0, RMII_TXD0_MARK), + PINMUX_GPIO(GPIO_FN_RMII_TXD1, RMII_TXD1_MARK), + PINMUX_GPIO(GPIO_FN_RMII_REF_CLK, RMII_REF_CLK_MARK), + PINMUX_GPIO(GPIO_FN_RMII_TX_EN, RMII_TX_EN_MARK), + PINMUX_GPIO(GPIO_FN_RMII_RX_ER, RMII_RX_ER_MARK), + PINMUX_GPIO(GPIO_FN_RMII_CRS_DV, RMII_CRS_DV_MARK), + PINMUX_GPIO(GPIO_FN_LNKSTA, LNKSTA_MARK), + PINMUX_GPIO(GPIO_FN_MDIO, MDIO_MARK), + PINMUX_GPIO(GPIO_FN_MDC, MDC_MARK), + + /* System */ + PINMUX_GPIO(GPIO_FN_PDSTATUS, PDSTATUS_MARK), + PINMUX_GPIO(GPIO_FN_STATUS2, STATUS2_MARK), + PINMUX_GPIO(GPIO_FN_STATUS0, STATUS0_MARK), + + /* VOU */ + PINMUX_GPIO(GPIO_FN_DV_D15, DV_D15_MARK), + PINMUX_GPIO(GPIO_FN_DV_D14, DV_D14_MARK), + PINMUX_GPIO(GPIO_FN_DV_D13, DV_D13_MARK), + PINMUX_GPIO(GPIO_FN_DV_D12, DV_D12_MARK), + PINMUX_GPIO(GPIO_FN_DV_D11, DV_D11_MARK), + PINMUX_GPIO(GPIO_FN_DV_D10, DV_D10_MARK), + PINMUX_GPIO(GPIO_FN_DV_D9, DV_D9_MARK), + PINMUX_GPIO(GPIO_FN_DV_D8, DV_D8_MARK), + PINMUX_GPIO(GPIO_FN_DV_D7, DV_D7_MARK), + PINMUX_GPIO(GPIO_FN_DV_D6, DV_D6_MARK), + PINMUX_GPIO(GPIO_FN_DV_D5, DV_D5_MARK), + PINMUX_GPIO(GPIO_FN_DV_D4, DV_D4_MARK), + PINMUX_GPIO(GPIO_FN_DV_D3, DV_D3_MARK), + PINMUX_GPIO(GPIO_FN_DV_D2, DV_D2_MARK), + PINMUX_GPIO(GPIO_FN_DV_D1, DV_D1_MARK), + PINMUX_GPIO(GPIO_FN_DV_D0, DV_D0_MARK), + PINMUX_GPIO(GPIO_FN_DV_CLKI, DV_CLKI_MARK), + PINMUX_GPIO(GPIO_FN_DV_CLK, DV_CLK_MARK), + PINMUX_GPIO(GPIO_FN_DV_VSYNC, DV_VSYNC_MARK), + PINMUX_GPIO(GPIO_FN_DV_HSYNC, DV_HSYNC_MARK), + + /* MSIOF0 */ + PINMUX_GPIO(GPIO_FN_MSIOF0_RXD, MSIOF0_RXD_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_TXD, MSIOF0_TXD_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_MCK, MSIOF0_MCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_TSCK, MSIOF0_TSCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_SS1, MSIOF0_SS1_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_SS2, MSIOF0_SS2_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_TSYNC, MSIOF0_TSYNC_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_RSCK, MSIOF0_RSCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF0_RSYNC, MSIOF0_RSYNC_MARK), + + /* MSIOF1 */ + PINMUX_GPIO(GPIO_FN_MSIOF1_RXD, MSIOF1_RXD_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_TXD, MSIOF1_TXD_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_MCK, MSIOF1_MCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_TSCK, MSIOF1_TSCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_SS1, MSIOF1_SS1_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_SS2, MSIOF1_SS2_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_TSYNC, MSIOF1_TSYNC_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_RSCK, MSIOF1_RSCK_MARK), + PINMUX_GPIO(GPIO_FN_MSIOF1_RSYNC, MSIOF1_RSYNC_MARK), + + /* DMAC */ + PINMUX_GPIO(GPIO_FN_DMAC_DACK0, DMAC_DACK0_MARK), + PINMUX_GPIO(GPIO_FN_DMAC_DREQ0, DMAC_DREQ0_MARK), + PINMUX_GPIO(GPIO_FN_DMAC_DACK1, DMAC_DACK1_MARK), + PINMUX_GPIO(GPIO_FN_DMAC_DREQ1, DMAC_DREQ1_MARK), + + /* SDHI0 */ + PINMUX_GPIO(GPIO_FN_SDHI0CD, SDHI0CD_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0WP, SDHI0WP_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0CMD, SDHI0CMD_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0CLK, SDHI0CLK_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0D3, SDHI0D3_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0D2, SDHI0D2_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0D1, SDHI0D1_MARK), + PINMUX_GPIO(GPIO_FN_SDHI0D0, SDHI0D0_MARK), + + /* SDHI1 */ + PINMUX_GPIO(GPIO_FN_SDHI1CD, SDHI1CD_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1WP, SDHI1WP_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1CMD, SDHI1CMD_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1CLK, SDHI1CLK_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1D3, SDHI1D3_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1D2, SDHI1D2_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1D1, SDHI1D1_MARK), + PINMUX_GPIO(GPIO_FN_SDHI1D0, SDHI1D0_MARK), + + /* MMC */ + PINMUX_GPIO(GPIO_FN_MMC_D7, MMC_D7_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D6, MMC_D6_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D5, MMC_D5_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D4, MMC_D4_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D3, MMC_D3_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D2, MMC_D2_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D1, MMC_D1_MARK), + PINMUX_GPIO(GPIO_FN_MMC_D0, MMC_D0_MARK), + PINMUX_GPIO(GPIO_FN_MMC_CLK, MMC_CLK_MARK), + PINMUX_GPIO(GPIO_FN_MMC_CMD, MMC_CMD_MARK), + + /* IrDA */ + PINMUX_GPIO(GPIO_FN_IRDA_OUT, IRDA_OUT_MARK), + PINMUX_GPIO(GPIO_FN_IRDA_IN, IRDA_IN_MARK), + + /* TSIF */ + PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDAT, TSIF_TS0_SDAT_MARK), + PINMUX_GPIO(GPIO_FN_TSIF_TS0_SCK, TSIF_TS0_SCK_MARK), + PINMUX_GPIO(GPIO_FN_TSIF_TS0_SDEN, TSIF_TS0_SDEN_MARK), + PINMUX_GPIO(GPIO_FN_TSIF_TS0_SPSYNC, TSIF_TS0_SPSYNC_MARK), + + /* IRQ */ + PINMUX_GPIO(GPIO_FN_INTC_IRQ7, INTC_IRQ7_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ6, INTC_IRQ6_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ5, INTC_IRQ5_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ4, INTC_IRQ4_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ3, INTC_IRQ3_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ2, INTC_IRQ2_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ1, INTC_IRQ1_MARK), + PINMUX_GPIO(GPIO_FN_INTC_IRQ0, INTC_IRQ0_MARK), + }; + +static struct pinmux_cfg_reg pinmux_config_regs[] = { + { PINMUX_CFG_REG("PACR", 0xa4050100, 16, 2) { + PTA7_FN, PTA7_OUT, PTA7_IN_PU, PTA7_IN, + PTA6_FN, PTA6_OUT, PTA6_IN_PU, PTA6_IN, + PTA5_FN, PTA5_OUT, PTA5_IN_PU, PTA5_IN, + PTA4_FN, PTA4_OUT, PTA4_IN_PU, PTA4_IN, + PTA3_FN, PTA3_OUT, PTA3_IN_PU, PTA3_IN, + PTA2_FN, PTA2_OUT, PTA2_IN_PU, PTA2_IN, + PTA1_FN, PTA1_OUT, PTA1_IN_PU, PTA1_IN, + PTA0_FN, PTA0_OUT, PTA0_IN_PU, PTA0_IN } + }, + { PINMUX_CFG_REG("PBCR", 0xa4050102, 16, 2) { + PTB7_FN, PTB7_OUT, PTB7_IN_PU, PTB7_IN, + PTB6_FN, PTB6_OUT, PTB6_IN_PU, PTB6_IN, + PTB5_FN, PTB5_OUT, PTB5_IN_PU, PTB5_IN, + PTB4_FN, PTB4_OUT, PTB4_IN_PU, PTB4_IN, + PTB3_FN, PTB3_OUT, PTB3_IN_PU, PTB3_IN, + PTB2_FN, PTB2_OUT, PTB2_IN_PU, PTB2_IN, + PTB1_FN, PTB1_OUT, PTB1_IN_PU, PTB1_IN, + PTB0_FN, PTB0_OUT, PTB0_IN_PU, PTB0_IN } + }, + { PINMUX_CFG_REG("PCCR", 0xa4050104, 16, 2) { + PTC7_FN, PTC7_OUT, PTC7_IN_PU, PTC7_IN, + PTC6_FN, PTC6_OUT, PTC6_IN_PU, PTC6_IN, + PTC5_FN, PTC5_OUT, PTC5_IN_PU, PTC5_IN, + PTC4_FN, PTC4_OUT, PTC4_IN_PU, PTC4_IN, + PTC3_FN, PTC3_OUT, PTC3_IN_PU, PTC3_IN, + PTC2_FN, PTC2_OUT, PTC2_IN_PU, PTC2_IN, + PTC1_FN, PTC1_OUT, PTC1_IN_PU, PTC1_IN, + PTC0_FN, PTC0_OUT, PTC0_IN_PU, PTC0_IN } + }, + { PINMUX_CFG_REG("PDCR", 0xa4050106, 16, 2) { + PTD7_FN, PTD7_OUT, PTD7_IN_PU, PTD7_IN, + PTD6_FN, PTD6_OUT, PTD6_IN_PU, PTD6_IN, + PTD5_FN, PTD5_OUT, PTD5_IN_PU, PTD5_IN, + PTD4_FN, PTD4_OUT, PTD4_IN_PU, PTD4_IN, + PTD3_FN, PTD3_OUT, PTD3_IN_PU, PTD3_IN, + PTD2_FN, PTD2_OUT, PTD2_IN_PU, PTD2_IN, + PTD1_FN, PTD1_OUT, PTD1_IN_PU, PTD1_IN, + PTD0_FN, PTD0_OUT, PTD0_IN_PU, PTD0_IN } + }, + { PINMUX_CFG_REG("PECR", 0xa4050108, 16, 2) { + PTE7_FN, PTE7_OUT, PTE7_IN_PU, PTE7_IN, + PTE6_FN, PTE6_OUT, PTE6_IN_PU, PTE6_IN, + PTE5_FN, PTE5_OUT, PTE5_IN_PU, PTE5_IN, + PTE4_FN, PTE4_OUT, PTE4_IN_PU, PTE4_IN, + PTE3_FN, PTE3_OUT, PTE3_IN_PU, PTE3_IN, + PTE2_FN, PTE2_OUT, PTE2_IN_PU, PTE2_IN, + PTE1_FN, PTE1_OUT, PTE1_IN_PU, PTE1_IN, + PTE0_FN, PTE0_OUT, PTE0_IN_PU, PTE0_IN } + }, + { PINMUX_CFG_REG("PFCR", 0xa405010a, 16, 2) { + PTF7_FN, PTF7_OUT, PTF7_IN_PU, PTF7_IN, + PTF6_FN, PTF6_OUT, PTF6_IN_PU, PTF6_IN, + PTF5_FN, PTF5_OUT, PTF5_IN_PU, PTF5_IN, + PTF4_FN, PTF4_OUT, PTF4_IN_PU, PTF4_IN, + PTF3_FN, PTF3_OUT, PTF3_IN_PU, PTF3_IN, + PTF2_FN, PTF2_OUT, PTF2_IN_PU, PTF2_IN, + PTF1_FN, PTF1_OUT, PTF1_IN_PU, PTF1_IN, + PTF0_FN, PTF0_OUT, PTF0_IN_PU, PTF0_IN } + }, + { PINMUX_CFG_REG("PGCR", 0xa405010c, 16, 2) { + 0, 0, 0, 0, + 0, 0, 0, 0, + PTG5_FN, PTG5_OUT, 0, 0, + PTG4_FN, PTG4_OUT, 0, 0, + PTG3_FN, PTG3_OUT, 0, 0, + PTG2_FN, PTG2_OUT, 0, 0, + PTG1_FN, PTG1_OUT, 0, 0, + PTG0_FN, PTG0_OUT, 0, 0 } + }, + { PINMUX_CFG_REG("PHCR", 0xa405010e, 16, 2) { + PTH7_FN, PTH7_OUT, PTH7_IN_PU, PTH7_IN, + PTH6_FN, PTH6_OUT, PTH6_IN_PU, PTH6_IN, + PTH5_FN, PTH5_OUT, PTH5_IN_PU, PTH5_IN, + PTH4_FN, PTH4_OUT, PTH4_IN_PU, PTH4_IN, + PTH3_FN, PTH3_OUT, PTH3_IN_PU, PTH3_IN, + PTH2_FN, PTH2_OUT, PTH2_IN_PU, PTH2_IN, + PTH1_FN, PTH1_OUT, PTH1_IN_PU, PTH1_IN, + PTH0_FN, PTH0_OUT, PTH0_IN_PU, PTH0_IN } + }, + { PINMUX_CFG_REG("PJCR", 0xa4050110, 16, 2) { + PTJ7_FN, PTJ7_OUT, 0, 0, + PTJ6_FN, PTJ6_OUT, 0, 0, + PTJ5_FN, PTJ5_OUT, 0, 0, + 0, 0, 0, 0, + PTJ3_FN, PTJ3_OUT, PTJ3_IN_PU, PTJ3_IN, + PTJ2_FN, PTJ2_OUT, PTJ2_IN_PU, PTJ2_IN, + PTJ1_FN, PTJ1_OUT, PTJ1_IN_PU, PTJ1_IN, + PTJ0_FN, PTJ0_OUT, PTJ0_IN_PU, PTJ0_IN } + }, + { PINMUX_CFG_REG("PKCR", 0xa4050112, 16, 2) { + PTK7_FN, PTK7_OUT, PTK7_IN_PU, PTK7_IN, + PTK6_FN, PTK6_OUT, PTK6_IN_PU, PTK6_IN, + PTK5_FN, PTK5_OUT, PTK5_IN_PU, PTK5_IN, + PTK4_FN, PTK4_OUT, PTK4_IN_PU, PTK4_IN, + PTK3_FN, PTK3_OUT, PTK3_IN_PU, PTK3_IN, + PTK2_FN, PTK2_OUT, PTK2_IN_PU, PTK2_IN, + PTK1_FN, PTK1_OUT, PTK1_IN_PU, PTK1_IN, + PTK0_FN, PTK0_OUT, PTK0_IN_PU, PTK0_IN } + }, + { PINMUX_CFG_REG("PLCR", 0xa4050114, 16, 2) { + PTL7_FN, PTL7_OUT, PTL7_IN_PU, PTL7_IN, + PTL6_FN, PTL6_OUT, PTL6_IN_PU, PTL6_IN, + PTL5_FN, PTL5_OUT, PTL5_IN_PU, PTL5_IN, + PTL4_FN, PTL4_OUT, PTL4_IN_PU, PTL4_IN, + PTL3_FN, PTL3_OUT, PTL3_IN_PU, PTL3_IN, + PTL2_FN, PTL2_OUT, PTL2_IN_PU, PTL2_IN, + PTL1_FN, PTL1_OUT, PTL1_IN_PU, PTL1_IN, + PTL0_FN, PTL0_OUT, PTL0_IN_PU, PTL0_IN } + }, + { PINMUX_CFG_REG("PMCR", 0xa4050116, 16, 2) { + PTM7_FN, PTM7_OUT, PTM7_IN_PU, PTM7_IN, + PTM6_FN, PTM6_OUT, PTM6_IN_PU, PTM6_IN, + PTM5_FN, PTM5_OUT, PTM5_IN_PU, PTM5_IN, + PTM4_FN, PTM4_OUT, PTM4_IN_PU, PTM4_IN, + PTM3_FN, PTM3_OUT, PTM3_IN_PU, PTM3_IN, + PTM2_FN, PTM2_OUT, PTM2_IN_PU, PTM2_IN, + PTM1_FN, PTM1_OUT, PTM1_IN_PU, PTM1_IN, + PTM0_FN, PTM0_OUT, PTM0_IN_PU, PTM0_IN } + }, + { PINMUX_CFG_REG("PNCR", 0xa4050118, 16, 2) { + PTN7_FN, PTN7_OUT, PTN7_IN_PU, PTN7_IN, + PTN6_FN, PTN6_OUT, PTN6_IN_PU, PTN6_IN, + PTN5_FN, PTN5_OUT, PTN5_IN_PU, PTN5_IN, + PTN4_FN, PTN4_OUT, PTN4_IN_PU, PTN4_IN, + PTN3_FN, PTN3_OUT, PTN3_IN_PU, PTN3_IN, + PTN2_FN, PTN2_OUT, PTN2_IN_PU, PTN2_IN, + PTN1_FN, PTN1_OUT, PTN1_IN_PU, PTN1_IN, + PTN0_FN, PTN0_OUT, PTN0_IN_PU, PTN0_IN } + }, + { PINMUX_CFG_REG("PQCR", 0xa405011a, 16, 2) { + PTQ7_FN, PTQ7_OUT, PTQ7_IN_PU, PTQ7_IN, + PTQ6_FN, PTQ6_OUT, PTQ6_IN_PU, PTQ6_IN, + PTQ5_FN, PTQ5_OUT, PTQ5_IN_PU, PTQ5_IN, + PTQ4_FN, PTQ4_OUT, PTQ4_IN_PU, PTQ4_IN, + PTQ3_FN, PTQ3_OUT, PTQ3_IN_PU, PTQ3_IN, + PTQ2_FN, PTQ2_OUT, PTQ2_IN_PU, PTQ2_IN, + PTQ1_FN, PTQ1_OUT, PTQ1_IN_PU, PTQ1_IN, + PTQ0_FN, PTQ0_OUT, PTQ0_IN_PU, PTQ0_IN } + }, + { PINMUX_CFG_REG("PRCR", 0xa405011c, 16, 2) { + PTR7_FN, PTR7_OUT, PTR7_IN_PU, PTR7_IN, + PTR6_FN, PTR6_OUT, PTR6_IN_PU, PTR6_IN, + PTR5_FN, PTR5_OUT, PTR5_IN_PU, PTR5_IN, + PTR4_FN, PTR4_OUT, PTR4_IN_PU, PTR4_IN, + PTR3_FN, 0, PTR3_IN_PU, PTR3_IN, + PTR2_FN, 0, PTR2_IN_PU, PTR2_IN, + PTR1_FN, PTR1_OUT, PTR1_IN_PU, PTR1_IN, + PTR0_FN, PTR0_OUT, PTR0_IN_PU, PTR0_IN } + }, + { PINMUX_CFG_REG("PSCR", 0xa405011e, 16, 2) { + 0, 0, 0, 0, + PTS6_FN, PTS6_OUT, PTS6_IN_PU, PTS6_IN, + PTS5_FN, PTS5_OUT, PTS5_IN_PU, PTS5_IN, + PTS4_FN, PTS4_OUT, PTS4_IN_PU, PTS4_IN, + PTS3_FN, PTS3_OUT, PTS3_IN_PU, PTS3_IN, + PTS2_FN, PTS2_OUT, PTS2_IN_PU, PTS2_IN, + PTS1_FN, PTS1_OUT, PTS1_IN_PU, PTS1_IN, + PTS0_FN, PTS0_OUT, PTS0_IN_PU, PTS0_IN } + }, + { PINMUX_CFG_REG("PTCR", 0xa4050140, 16, 2) { + PTT7_FN, PTT7_OUT, PTT7_IN_PU, PTT7_IN, + PTT6_FN, PTT6_OUT, PTT6_IN_PU, PTT6_IN, + PTT5_FN, PTT5_OUT, PTT5_IN_PU, PTT5_IN, + PTT4_FN, PTT4_OUT, PTT4_IN_PU, PTT4_IN, + PTT3_FN, PTT3_OUT, PTT3_IN_PU, PTT3_IN, + PTT2_FN, PTT2_OUT, PTT2_IN_PU, PTT2_IN, + PTT1_FN, PTT1_OUT, PTT1_IN_PU, PTT1_IN, + PTT0_FN, PTT0_OUT, PTT0_IN_PU, PTT0_IN } + }, + { PINMUX_CFG_REG("PUCR", 0xa4050142, 16, 2) { + PTU7_FN, PTU7_OUT, PTU7_IN_PU, PTU7_IN, + PTU6_FN, PTU6_OUT, PTU6_IN_PU, PTU6_IN, + PTU5_FN, PTU5_OUT, PTU5_IN_PU, PTU5_IN, + PTU4_FN, PTU4_OUT, PTU4_IN_PU, PTU4_IN, + PTU3_FN, PTU3_OUT, PTU3_IN_PU, PTU3_IN, + PTU2_FN, PTU2_OUT, PTU2_IN_PU, PTU2_IN, + PTU1_FN, PTU1_OUT, PTU1_IN_PU, PTU1_IN, + PTU0_FN, PTU0_OUT, PTU0_IN_PU, PTU0_IN } + }, + { PINMUX_CFG_REG("PVCR", 0xa4050144, 16, 2) { + PTV7_FN, PTV7_OUT, PTV7_IN_PU, PTV7_IN, + PTV6_FN, PTV6_OUT, PTV6_IN_PU, PTV6_IN, + PTV5_FN, PTV5_OUT, PTV5_IN_PU, PTV5_IN, + PTV4_FN, PTV4_OUT, PTV4_IN_PU, PTV4_IN, + PTV3_FN, PTV3_OUT, PTV3_IN_PU, PTV3_IN, + PTV2_FN, PTV2_OUT, PTV2_IN_PU, PTV2_IN, + PTV1_FN, PTV1_OUT, PTV1_IN_PU, PTV1_IN, + PTV0_FN, PTV0_OUT, PTV0_IN_PU, PTV0_IN } + }, + { PINMUX_CFG_REG("PWCR", 0xa4050146, 16, 2) { + PTW7_FN, PTW7_OUT, PTW7_IN_PU, PTW7_IN, + PTW6_FN, PTW6_OUT, PTW6_IN_PU, PTW6_IN, + PTW5_FN, PTW5_OUT, PTW5_IN_PU, PTW5_IN, + PTW4_FN, PTW4_OUT, PTW4_IN_PU, PTW4_IN, + PTW3_FN, PTW3_OUT, PTW3_IN_PU, PTW3_IN, + PTW2_FN, PTW2_OUT, PTW2_IN_PU, PTW2_IN, + PTW1_FN, PTW1_OUT, PTW1_IN_PU, PTW1_IN, + PTW0_FN, PTW0_OUT, PTW0_IN_PU, PTW0_IN } + }, + { PINMUX_CFG_REG("PXCR", 0xa4050148, 16, 2) { + PTX7_FN, PTX7_OUT, PTX7_IN_PU, PTX7_IN, + PTX6_FN, PTX6_OUT, PTX6_IN_PU, PTX6_IN, + PTX5_FN, PTX5_OUT, PTX5_IN_PU, PTX5_IN, + PTX4_FN, PTX4_OUT, PTX4_IN_PU, PTX4_IN, + PTX3_FN, PTX3_OUT, PTX3_IN_PU, PTX3_IN, + PTX2_FN, PTX2_OUT, PTX2_IN_PU, PTX2_IN, + PTX1_FN, PTX1_OUT, PTX1_IN_PU, PTX1_IN, + PTX0_FN, PTX0_OUT, PTX0_IN_PU, PTX0_IN } + }, + { PINMUX_CFG_REG("PYCR", 0xa405014a, 16, 2) { + PTY7_FN, PTY7_OUT, PTY7_IN_PU, PTY7_IN, + PTY6_FN, PTY6_OUT, PTY6_IN_PU, PTY6_IN, + PTY5_FN, PTY5_OUT, PTY5_IN_PU, PTY5_IN, + PTY4_FN, PTY4_OUT, PTY4_IN_PU, PTY4_IN, + PTY3_FN, PTY3_OUT, PTY3_IN_PU, PTY3_IN, + PTY2_FN, PTY2_OUT, PTY2_IN_PU, PTY2_IN, + PTY1_FN, PTY1_OUT, PTY1_IN_PU, PTY1_IN, + PTY0_FN, PTY0_OUT, PTY0_IN_PU, PTY0_IN } + }, + { PINMUX_CFG_REG("PZCR", 0xa405014c, 16, 2) { + PTZ7_FN, PTZ7_OUT, PTZ7_IN_PU, PTZ7_IN, + PTZ6_FN, PTZ6_OUT, PTZ6_IN_PU, PTZ6_IN, + PTZ5_FN, PTZ5_OUT, PTZ5_IN_PU, PTZ5_IN, + PTZ4_FN, PTZ4_OUT, PTZ4_IN_PU, PTZ4_IN, + PTZ3_FN, PTZ3_OUT, PTZ3_IN_PU, PTZ3_IN, + PTZ2_FN, PTZ2_OUT, PTZ2_IN_PU, PTZ2_IN, + PTZ1_FN, PTZ1_OUT, PTZ1_IN_PU, PTZ1_IN, + PTZ0_FN, PTZ0_OUT, PTZ0_IN_PU, PTZ0_IN } + }, + { PINMUX_CFG_REG("PSELA", 0xa405014e, 16, 1) { + PSA15_0, PSA15_1, + PSA14_0, PSA14_1, + PSA13_0, PSA13_1, + PSA12_0, PSA12_1, + 0, 0, + PSA10_0, PSA10_1, + PSA9_0, PSA9_1, + PSA8_0, PSA8_1, + PSA7_0, PSA7_1, + PSA6_0, PSA6_1, + PSA5_0, PSA5_1, + 0, 0, + PSA3_0, PSA3_1, + PSA2_0, PSA2_1, + PSA1_0, PSA1_1, + PSA0_0, PSA0_1} + }, + { PINMUX_CFG_REG("PSELB", 0xa4050150, 16, 1) { + 0, 0, + PSB14_0, PSB14_1, + PSB13_0, PSB13_1, + PSB12_0, PSB12_1, + PSB11_0, PSB11_1, + PSB10_0, PSB10_1, + PSB9_0, PSB9_1, + PSB8_0, PSB8_1, + PSB7_0, PSB7_1, + PSB6_0, PSB6_1, + PSB5_0, PSB5_1, + PSB4_0, PSB4_1, + PSB3_0, PSB3_1, + PSB2_0, PSB2_1, + PSB1_0, PSB1_1, + PSB0_0, PSB0_1} + }, + { PINMUX_CFG_REG("PSELC", 0xa4050152, 16, 1) { + PSC15_0, PSC15_1, + PSC14_0, PSC14_1, + PSC13_0, PSC13_1, + PSC12_0, PSC12_1, + PSC11_0, PSC11_1, + PSC10_0, PSC10_1, + PSC9_0, PSC9_1, + PSC8_0, PSC8_1, + PSC7_0, PSC7_1, + PSC6_0, PSC6_1, + PSC5_0, PSC5_1, + PSC4_0, PSC4_1, + 0, 0, + PSC2_0, PSC2_1, + PSC1_0, PSC1_1, + PSC0_0, PSC0_1} + }, + { PINMUX_CFG_REG("PSELD", 0xa4050154, 16, 1) { + PSD15_0, PSD15_1, + PSD14_0, PSD14_1, + PSD13_0, PSD13_1, + PSD12_0, PSD12_1, + PSD11_0, PSD11_1, + PSD10_0, PSD10_1, + PSD9_0, PSD9_1, + PSD8_0, PSD8_1, + PSD7_0, PSD7_1, + PSD6_0, PSD6_1, + PSD5_0, PSD5_1, + PSD4_0, PSD4_1, + PSD3_0, PSD3_1, + PSD2_0, PSD2_1, + PSD1_0, PSD1_1, + PSD0_0, PSD0_1} + }, + { PINMUX_CFG_REG("PSELE", 0xa4050156, 16, 1) { + PSE15_0, PSE15_1, + PSE14_0, PSE14_1, + PSE13_0, PSE13_1, + PSE12_0, PSE12_1, + PSE11_0, PSE11_1, + PSE10_0, PSE10_1, + PSE9_0, PSE9_1, + PSE8_0, PSE8_1, + PSE7_0, PSE7_1, + PSE6_0, PSE6_1, + PSE5_0, PSE5_1, + PSE4_0, PSE4_1, + PSE3_0, PSE3_1, + PSE2_0, PSE2_1, + PSE1_0, PSE1_1, + PSE0_0, PSE0_1} + }, + {} +}; + +static struct pinmux_data_reg pinmux_data_regs[] = { + { PINMUX_DATA_REG("PADR", 0xa4050120, 8) { + PTA7_DATA, PTA6_DATA, PTA5_DATA, PTA4_DATA, + PTA3_DATA, PTA2_DATA, PTA1_DATA, PTA0_DATA } + }, + { PINMUX_DATA_REG("PBDR", 0xa4050122, 8) { + PTB7_DATA, PTB6_DATA, PTB5_DATA, PTB4_DATA, + PTB3_DATA, PTB2_DATA, PTB1_DATA, PTB0_DATA } + }, + { PINMUX_DATA_REG("PCDR", 0xa4050124, 8) { + PTC7_DATA, PTC6_DATA, PTC5_DATA, PTC4_DATA, + PTC3_DATA, PTC2_DATA, PTC1_DATA, PTC0_DATA } + }, + { PINMUX_DATA_REG("PDDR", 0xa4050126, 8) { + PTD7_DATA, PTD6_DATA, PTD5_DATA, PTD4_DATA, + PTD3_DATA, PTD2_DATA, PTD1_DATA, PTD0_DATA } + }, + { PINMUX_DATA_REG("PEDR", 0xa4050128, 8) { + PTE7_DATA, PTE6_DATA, PTE5_DATA, PTE4_DATA, + PTE3_DATA, PTE2_DATA, PTE1_DATA, PTE0_DATA } + }, + { PINMUX_DATA_REG("PFDR", 0xa405012a, 8) { + PTF7_DATA, PTF6_DATA, PTF5_DATA, PTF4_DATA, + PTF3_DATA, PTF2_DATA, PTF1_DATA, PTF0_DATA } + }, + { PINMUX_DATA_REG("PGDR", 0xa405012c, 8) { + 0, 0, PTG5_DATA, PTG4_DATA, + PTG3_DATA, PTG2_DATA, PTG1_DATA, PTG0_DATA } + }, + { PINMUX_DATA_REG("PHDR", 0xa405012e, 8) { + PTH7_DATA, PTH6_DATA, PTH5_DATA, PTH4_DATA, + PTH3_DATA, PTH2_DATA, PTH1_DATA, PTH0_DATA } + }, + { PINMUX_DATA_REG("PJDR", 0xa4050130, 8) { + PTJ7_DATA, PTJ6_DATA, PTJ5_DATA, 0, + PTJ3_DATA, PTJ2_DATA, PTJ1_DATA, PTJ0_DATA } + }, + { PINMUX_DATA_REG("PKDR", 0xa4050132, 8) { + PTK7_DATA, PTK6_DATA, PTK5_DATA, PTK4_DATA, + PTK3_DATA, PTK2_DATA, PTK1_DATA, PTK0_DATA } + }, + { PINMUX_DATA_REG("PLDR", 0xa4050134, 8) { + PTL7_DATA, PTL6_DATA, PTL5_DATA, PTL4_DATA, + PTL3_DATA, PTL2_DATA, PTL1_DATA, PTL0_DATA } + }, + { PINMUX_DATA_REG("PMDR", 0xa4050136, 8) { + PTM7_DATA, PTM6_DATA, PTM5_DATA, PTM4_DATA, + PTM3_DATA, PTM2_DATA, PTM1_DATA, PTM0_DATA } + }, + { PINMUX_DATA_REG("PNDR", 0xa4050138, 8) { + PTN7_DATA, PTN6_DATA, PTN5_DATA, PTN4_DATA, + PTN3_DATA, PTN2_DATA, PTN1_DATA, PTN0_DATA } + }, + { PINMUX_DATA_REG("PQDR", 0xa405013a, 8) { + PTQ7_DATA, PTQ6_DATA, PTQ5_DATA, PTQ4_DATA, + PTQ3_DATA, PTQ2_DATA, PTQ1_DATA, PTQ0_DATA } + }, + { PINMUX_DATA_REG("PRDR", 0xa405013c, 8) { + PTR7_DATA, PTR6_DATA, PTR5_DATA, PTR4_DATA, + PTR3_DATA, PTR2_DATA, PTR1_DATA, PTR0_DATA } + }, + { PINMUX_DATA_REG("PSDR", 0xa405013e, 8) { + 0, PTS6_DATA, PTS5_DATA, PTS4_DATA, + PTS3_DATA, PTS2_DATA, PTS1_DATA, PTS0_DATA } + }, + { PINMUX_DATA_REG("PTDR", 0xa4050160, 8) { + PTT7_DATA, PTT6_DATA, PTT5_DATA, PTT4_DATA, + PTT3_DATA, PTT2_DATA, PTT1_DATA, PTT0_DATA } + }, + { PINMUX_DATA_REG("PUDR", 0xa4050162, 8) { + PTU7_DATA, PTU6_DATA, PTU5_DATA, PTU4_DATA, + PTU3_DATA, PTU2_DATA, PTU1_DATA, PTU0_DATA } + }, + { PINMUX_DATA_REG("PVDR", 0xa4050164, 8) { + PTV7_DATA, PTV6_DATA, PTV5_DATA, PTV4_DATA, + PTV3_DATA, PTV2_DATA, PTV1_DATA, PTV0_DATA } + }, + { PINMUX_DATA_REG("PWDR", 0xa4050166, 8) { + PTW7_DATA, PTW6_DATA, PTW5_DATA, PTW4_DATA, + PTW3_DATA, PTW2_DATA, PTW1_DATA, PTW0_DATA } + }, + { PINMUX_DATA_REG("PXDR", 0xa4050168, 8) { + PTX7_DATA, PTX6_DATA, PTX5_DATA, PTX4_DATA, + PTX3_DATA, PTX2_DATA, PTX1_DATA, PTX0_DATA } + }, + { PINMUX_DATA_REG("PYDR", 0xa405016a, 8) { + PTY7_DATA, PTY6_DATA, PTY5_DATA, PTY4_DATA, + PTY3_DATA, PTY2_DATA, PTY1_DATA, PTY0_DATA } + }, + { PINMUX_DATA_REG("PZDR", 0xa405016c, 8) { + PTZ7_DATA, PTZ6_DATA, PTZ5_DATA, PTZ4_DATA, + PTZ3_DATA, PTZ2_DATA, PTZ1_DATA, PTZ0_DATA } + }, + { }, +}; + +static struct pinmux_info sh7724_pinmux_info = { + .name = "sh7724_pfc", + .reserved_id = PINMUX_RESERVED, + .data = { PINMUX_DATA_BEGIN, PINMUX_DATA_END }, + .input = { PINMUX_INPUT_BEGIN, PINMUX_INPUT_END }, + .input_pu = { PINMUX_INPUT_PULLUP_BEGIN, PINMUX_INPUT_PULLUP_END }, + .output = { PINMUX_OUTPUT_BEGIN, PINMUX_OUTPUT_END }, + .mark = { PINMUX_MARK_BEGIN, PINMUX_MARK_END }, + .function = { PINMUX_FUNCTION_BEGIN, PINMUX_FUNCTION_END }, + + .first_gpio = GPIO_PTA7, + .last_gpio = GPIO_FN_INTC_IRQ0, + + .gpios = pinmux_gpios, + .cfg_regs = pinmux_config_regs, + .data_regs = pinmux_data_regs, + + .gpio_data = pinmux_data, + .gpio_data_size = ARRAY_SIZE(pinmux_data), +}; + +static int __init plat_pinmux_setup(void) +{ + return register_pinmux(&sh7724_pinmux_info); +} +arch_initcall(plat_pinmux_setup); diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c new file mode 100644 index 00000000000..4327b1e080b --- /dev/null +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -0,0 +1,371 @@ +/* + * SH7724 Setup + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * Kuninori Morimoto + * + * Based on SH7723 Setup + * Copyright (C) 2008 Paul Mundt + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* Serial */ +static struct plat_sci_port sci_platform_data[] = { + { + .mapbase = 0xffe00000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIF, + .irqs = { 80, 80, 80, 80 }, + }, { + .mapbase = 0xffe10000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIF, + .irqs = { 81, 81, 81, 81 }, + }, { + .mapbase = 0xffe20000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIF, + .irqs = { 82, 82, 82, 82 }, + }, { + .mapbase = 0xa4e30000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIFA, + .irqs = { 56, 56, 56, 56 }, + }, { + .mapbase = 0xa4e40000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIFA, + .irqs = { 88, 88, 88, 88 }, + }, { + .mapbase = 0xa4e50000, + .flags = UPF_BOOT_AUTOCONF, + .type = PORT_SCIFA, + .irqs = { 109, 109, 109, 109 }, + }, { + .flags = 0, + } +}; + +static struct platform_device sci_device = { + .name = "sh-sci", + .id = -1, + .dev = { + .platform_data = sci_platform_data, + }, +}; + +/* RTC */ +static struct resource rtc_resources[] = { + [0] = { + .start = 0xa465fec0, + .end = 0xa465fec0 + 0x58 - 1, + .flags = IORESOURCE_IO, + }, + [1] = { + /* Period IRQ */ + .start = 69, + .flags = IORESOURCE_IRQ, + }, + [2] = { + /* Carry IRQ */ + .start = 70, + .flags = IORESOURCE_IRQ, + }, + [3] = { + /* Alarm IRQ */ + .start = 68, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device rtc_device = { + .name = "sh-rtc", + .id = -1, + .num_resources = ARRAY_SIZE(rtc_resources), + .resource = rtc_resources, +}; + +static struct platform_device *sh7724_devices[] __initdata = { + &sci_device, + &rtc_device, +}; + +static int __init sh7724_devices_setup(void) +{ + clk_always_enable("rtc0"); /* RTC */ + + return platform_add_devices(sh7724_devices, + ARRAY_SIZE(sh7724_devices)); +} +device_initcall(sh7724_devices_setup); + +enum { + UNUSED = 0, + + /* interrupt sources */ + IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7, + HUDI, + DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3, + _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK, + DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3, + VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI, + SCIFA_SCIFA0, + VPU_VPUI, + TPU_TPUI, + CEU21I, + BEU21I, + USB_USI0, + ATAPI, + RTC_ATI, RTC_PRI, RTC_CUI, + DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR, + DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR, + KEYSC_KEYI, + SCIF_SCIF0, SCIF_SCIF1, SCIF_SCIF2, + VEU3F0I, + MSIOF_MSIOFI0, MSIOF_MSIOFI1, + SPU_SPUI0, SPU_SPUI1, + SCIFA_SCIFA1, +/* ICB_ICBI, */ + ETHI, + I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI, + I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI, + SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2, + CMT_CMTI, + TSIF_TSIFI, +/* ICB_LMBI, */ + FSI_FSI, + SCIFA_SCIFA2, + TMU0_TUNI0, TMU0_TUNI1, TMU0_TUNI2, + IRDA_IRDAI, + SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2, + JPU_JPUI, + MMC_MMCI0, MMC_MMCI1, MMC_MMCI2, + LCDC_LCDCI, + TMU1_TUNI0, TMU1_TUNI1, TMU1_TUNI2, + + /* interrupt groups */ + DMAC1A, _2DG, DMAC0A, VIO, RTC, + DMAC1B, DMAC0B, I2C0, I2C1, SDHI0, SDHI1, SPU, MMC, +}; + +static struct intc_vect vectors[] __initdata = { + INTC_VECT(IRQ0, 0x600), INTC_VECT(IRQ1, 0x620), + INTC_VECT(IRQ2, 0x640), INTC_VECT(IRQ3, 0x660), + INTC_VECT(IRQ4, 0x680), INTC_VECT(IRQ5, 0x6a0), + INTC_VECT(IRQ6, 0x6c0), INTC_VECT(IRQ7, 0x6e0), + + INTC_VECT(DMAC1A_DEI0, 0x700), + INTC_VECT(DMAC1A_DEI1, 0x720), + INTC_VECT(DMAC1A_DEI2, 0x740), + INTC_VECT(DMAC1A_DEI3, 0x760), + + INTC_VECT(_2DG_TRI, 0x780), + INTC_VECT(_2DG_INI, 0x7A0), + INTC_VECT(_2DG_CEI, 0x7C0), + INTC_VECT(_2DG_BRK, 0x7E0), + + INTC_VECT(DMAC0A_DEI0, 0x800), + INTC_VECT(DMAC0A_DEI1, 0x820), + INTC_VECT(DMAC0A_DEI2, 0x840), + INTC_VECT(DMAC0A_DEI3, 0x860), + + INTC_VECT(VIO_CEU20I, 0x880), + INTC_VECT(VIO_BEU20I, 0x8A0), + INTC_VECT(VIO_VEU3F1, 0x8C0), + INTC_VECT(VIO_VOUI, 0x8E0), + + INTC_VECT(SCIFA_SCIFA0, 0x900), + INTC_VECT(VPU_VPUI, 0x980), + INTC_VECT(TPU_TPUI, 0x9A0), + INTC_VECT(CEU21I, 0x9E0), + INTC_VECT(BEU21I, 0xA00), + INTC_VECT(USB_USI0, 0xA20), + INTC_VECT(ATAPI, 0xA60), + + INTC_VECT(RTC_ATI, 0xA80), + INTC_VECT(RTC_PRI, 0xAA0), + INTC_VECT(RTC_CUI, 0xAC0), + + INTC_VECT(DMAC1B_DEI4, 0xB00), + INTC_VECT(DMAC1B_DEI5, 0xB20), + INTC_VECT(DMAC1B_DADERR, 0xB40), + + INTC_VECT(DMAC0B_DEI4, 0xB80), + INTC_VECT(DMAC0B_DEI5, 0xBA0), + INTC_VECT(DMAC0B_DADERR, 0xBC0), + + INTC_VECT(KEYSC_KEYI, 0xBE0), + INTC_VECT(SCIF_SCIF0, 0xC00), + INTC_VECT(SCIF_SCIF1, 0xC20), + INTC_VECT(SCIF_SCIF2, 0xC40), + INTC_VECT(VEU3F0I, 0xC60), + INTC_VECT(MSIOF_MSIOFI0, 0xC80), + INTC_VECT(MSIOF_MSIOFI1, 0xCA0), + INTC_VECT(SPU_SPUI0, 0xCC0), + INTC_VECT(SPU_SPUI1, 0xCE0), + INTC_VECT(SCIFA_SCIFA1, 0xD00), + +/* INTC_VECT(ICB_ICBI, 0xD20), */ + INTC_VECT(ETHI, 0xD60), + + INTC_VECT(I2C1_ALI, 0xD80), + INTC_VECT(I2C1_TACKI, 0xDA0), + INTC_VECT(I2C1_WAITI, 0xDC0), + INTC_VECT(I2C1_DTEI, 0xDE0), + + INTC_VECT(I2C0_ALI, 0xE00), + INTC_VECT(I2C0_TACKI, 0xE20), + INTC_VECT(I2C0_WAITI, 0xE40), + INTC_VECT(I2C0_DTEI, 0xE60), + + INTC_VECT(SDHI0_SDHII0, 0xE80), + INTC_VECT(SDHI0_SDHII1, 0xEA0), + INTC_VECT(SDHI0_SDHII2, 0xEC0), + + INTC_VECT(CMT_CMTI, 0xF00), + INTC_VECT(TSIF_TSIFI, 0xF20), +/* INTC_VECT(ICB_LMBI, 0xF60), */ + INTC_VECT(FSI_FSI, 0xF80), + INTC_VECT(SCIFA_SCIFA2, 0xFA0), + + INTC_VECT(TMU0_TUNI0, 0x400), + INTC_VECT(TMU0_TUNI1, 0x420), + INTC_VECT(TMU0_TUNI2, 0x440), + + INTC_VECT(IRDA_IRDAI, 0x480), + + INTC_VECT(SDHI1_SDHII0, 0x4E0), + INTC_VECT(SDHI1_SDHII1, 0x500), + INTC_VECT(SDHI1_SDHII2, 0x520), + + INTC_VECT(JPU_JPUI, 0x560), + + INTC_VECT(MMC_MMCI0, 0x580), + INTC_VECT(MMC_MMCI1, 0x5A0), + INTC_VECT(MMC_MMCI2, 0x5C0), + + INTC_VECT(LCDC_LCDCI, 0xF40), + + INTC_VECT(TMU1_TUNI0, 0x920), + INTC_VECT(TMU1_TUNI1, 0x940), + INTC_VECT(TMU1_TUNI2, 0x960), +}; + +static struct intc_group groups[] __initdata = { + INTC_GROUP(DMAC1A, DMAC1A_DEI0, DMAC1A_DEI1, DMAC1A_DEI2, DMAC1A_DEI3), + INTC_GROUP(_2DG, _2DG_TRI, _2DG_INI, _2DG_CEI, _2DG_BRK), + INTC_GROUP(DMAC0A, DMAC0A_DEI0, DMAC0A_DEI1, DMAC0A_DEI2, DMAC0A_DEI3), + INTC_GROUP(VIO, VIO_CEU20I, VIO_BEU20I, VIO_VEU3F1, VIO_VOUI), + INTC_GROUP(RTC, RTC_ATI, RTC_PRI, RTC_CUI), + INTC_GROUP(DMAC1B, DMAC1B_DEI4, DMAC1B_DEI5, DMAC1B_DADERR), + INTC_GROUP(DMAC0B, DMAC0B_DEI4, DMAC0B_DEI5, DMAC0B_DADERR), + INTC_GROUP(I2C0, I2C0_ALI, I2C0_TACKI, I2C0_WAITI, I2C0_DTEI), + INTC_GROUP(I2C1, I2C1_ALI, I2C1_TACKI, I2C1_WAITI, I2C1_DTEI), + INTC_GROUP(SDHI0, SDHI0_SDHII0, SDHI0_SDHII1, SDHI0_SDHII2), + INTC_GROUP(SDHI1, SDHI1_SDHII0, SDHI1_SDHII1, SDHI1_SDHII2), + INTC_GROUP(SPU, SPU_SPUI0, SPU_SPUI1), + INTC_GROUP(MMC, MMC_MMCI0, MMC_MMCI1, MMC_MMCI2), +}; + +/* FIXMEEEEEEEEEEEEEEEEEEE !!!!! */ +/* very bad manual !! */ +static struct intc_mask_reg mask_registers[] __initdata = { + { 0xa4080080, 0xa40800c0, 8, /* IMR0 / IMCR0 */ + { 0, TMU1_TUNI2, TMU1_TUNI1, TMU1_TUNI0, + /*SDHII3?*/0, SDHI1_SDHII2, SDHI1_SDHII1, SDHI1_SDHII0 } }, + { 0xa4080084, 0xa40800c4, 8, /* IMR1 / IMCR1 */ + { VIO_VOUI, VIO_VEU3F1, VIO_BEU20I, VIO_CEU20I, + DMAC0A_DEI3, DMAC0A_DEI2, DMAC0A_DEI1, DMAC0A_DEI0 } }, + { 0xa4080088, 0xa40800c8, 8, /* IMR2 / IMCR2 */ + { 0, 0, 0, VPU_VPUI, ATAPI, ETHI, 0, /*SCIFA3*/SCIFA_SCIFA0 } }, + { 0xa408008c, 0xa40800cc, 8, /* IMR3 / IMCR3 */ + { DMAC1A_DEI3, DMAC1A_DEI2, DMAC1A_DEI1, DMAC1A_DEI0, + SPU_SPUI1, SPU_SPUI0, BEU21I, IRDA_IRDAI } }, + { 0xa4080090, 0xa40800d0, 8, /* IMR4 / IMCR4 */ + { 0, TMU0_TUNI2, TMU0_TUNI1, TMU0_TUNI0, + JPU_JPUI, 0, 0, LCDC_LCDCI } }, + { 0xa4080094, 0xa40800d4, 8, /* IMR5 / IMCR5 */ + { KEYSC_KEYI, DMAC0B_DADERR, DMAC0B_DEI5, DMAC0B_DEI4, + VEU3F0I, SCIF_SCIF2, SCIF_SCIF1, SCIF_SCIF0 } }, + { 0xa4080098, 0xa40800d8, 8, /* IMR6 / IMCR6 */ + { 0, 0, /*ICB_ICBI*/0, /*SCIFA4*/SCIFA_SCIFA1, + CEU21I, 0, MSIOF_MSIOFI1, MSIOF_MSIOFI0 } }, + { 0xa408009c, 0xa40800dc, 8, /* IMR7 / IMCR7 */ + { I2C0_DTEI, I2C0_WAITI, I2C0_TACKI, I2C0_ALI, + I2C1_DTEI, I2C1_WAITI, I2C1_TACKI, I2C1_ALI } }, + { 0xa40800a0, 0xa40800e0, 8, /* IMR8 / IMCR8 */ + { /*SDHII3*/0, SDHI0_SDHII2, SDHI0_SDHII1, SDHI0_SDHII0, + 0, 0, /*SCIFA5*/SCIFA_SCIFA2, FSI_FSI } }, + { 0xa40800a4, 0xa40800e4, 8, /* IMR9 / IMCR9 */ + { 0, 0, 0, CMT_CMTI, 0, /*USB1*/0, USB_USI0, 0 } }, + { 0xa40800a8, 0xa40800e8, 8, /* IMR10 / IMCR10 */ + { 0, DMAC1B_DADERR, DMAC1B_DEI5, DMAC1B_DEI4, + 0, RTC_ATI, RTC_PRI, RTC_CUI } }, + { 0xa40800ac, 0xa40800ec, 8, /* IMR11 / IMCR11 */ + { _2DG_BRK, _2DG_CEI, _2DG_INI, _2DG_TRI, + 0, TPU_TPUI, /*ICB_LMBI*/0, TSIF_TSIFI } }, + { 0xa40800b0, 0xa40800f0, 8, /* IMR12 / IMCR12 */ + { 0, 0, 0, 0, 0, 0, 0, 0/*2DDMAC*/ } }, + { 0xa4140044, 0xa4140064, 8, /* INTMSK00 / INTMSKCLR00 */ + { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } }, +}; + +static struct intc_prio_reg prio_registers[] __initdata = { + { 0xa4080000, 0, 16, 4, /* IPRA */ { TMU0_TUNI0, TMU0_TUNI1, + TMU0_TUNI2, IRDA_IRDAI } }, + { 0xa4080004, 0, 16, 4, /* IPRB */ { JPU_JPUI, LCDC_LCDCI, + DMAC1A, BEU21I } }, + { 0xa4080008, 0, 16, 4, /* IPRC */ { TMU1_TUNI0, TMU1_TUNI1, + TMU1_TUNI2, SPU } }, + { 0xa408000c, 0, 16, 4, /* IPRD */ { 0, MMC, 0, ATAPI } }, + { 0xa4080010, 0, 16, 4, /* IPRE */ + { DMAC0A, /*BEU?VEU?*/VIO, /*SCIFA3*/SCIFA_SCIFA0, /*VPU5F*/ + VPU_VPUI } }, + { 0xa4080014, 0, 16, 4, /* IPRF */ { KEYSC_KEYI, DMAC0B, + USB_USI0, CMT_CMTI } }, + { 0xa4080018, 0, 16, 4, /* IPRG */ { SCIF_SCIF0, SCIF_SCIF1, + SCIF_SCIF2, VEU3F0I } }, + { 0xa408001c, 0, 16, 4, /* IPRH */ { MSIOF_MSIOFI0, MSIOF_MSIOFI1, + I2C1, I2C0 } }, + { 0xa4080020, 0, 16, 4, /* IPRI */ { /*SCIFA4*/SCIFA_SCIFA1, /*ICB*/0, + TSIF_TSIFI, _2DG/*ICB?*/ } }, + { 0xa4080024, 0, 16, 4, /* IPRJ */ { CEU21I, ETHI, FSI_FSI, SDHI1 } }, + { 0xa4080028, 0, 16, 4, /* IPRK */ { RTC, DMAC1B, /*ICB?*/0, SDHI0 } }, + { 0xa408002c, 0, 16, 4, /* IPRL */ { /*SCIFA5*/SCIFA_SCIFA2, 0, + TPU_TPUI, /*2DDMAC*/0 } }, + { 0xa4140010, 0, 32, 4, /* INTPRI00 */ + { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } }, +}; + +static struct intc_sense_reg sense_registers[] __initdata = { + { 0xa414001c, 16, 2, /* ICR1 */ + { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } }, +}; + +static struct intc_mask_reg ack_registers[] __initdata = { + { 0xa4140024, 0, 8, /* INTREQ00 */ + { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, IRQ6, IRQ7 } }, +}; + +static DECLARE_INTC_DESC_ACK(intc_desc, "sh7724", vectors, groups, + mask_registers, prio_registers, sense_registers, + ack_registers); + +void __init plat_irq_setup(void) +{ + register_intc_controller(&intc_desc); +} diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 04a6004fccc..0e6ed804546 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -435,7 +435,8 @@ static const char *cpu_name[] = { [CPU_SH7722] = "SH7722", [CPU_SHX3] = "SH-X3", [CPU_SH5_101] = "SH5-101", [CPU_SH5_103] = "SH5-103", [CPU_MXG] = "MX-G", [CPU_SH7723] = "SH7723", - [CPU_SH7366] = "SH7366", [CPU_SH_NONE] = "Unknown" + [CPU_SH7366] = "SH7366", [CPU_SH7724] = "SH7724", + [CPU_SH_NONE] = "Unknown" }; const char *get_cpu_subtype(struct sh_cpuinfo *c) diff --git a/arch/sh/oprofile/common.c b/arch/sh/oprofile/common.c index 1b9d4304b3b..44f4e31c6d6 100644 --- a/arch/sh/oprofile/common.c +++ b/arch/sh/oprofile/common.c @@ -109,6 +109,7 @@ int __init oprofile_arch_init(struct oprofile_operations *ops) case CPU_SH7785: case CPU_SH7786: case CPU_SH7723: + case CPU_SH7724: case CPU_SHX3: lmodel = &op_model_sh4a_ops; break; From 47948d2bd6d27648a107a27357b3bc5ad054ff64 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 15 Apr 2009 11:42:47 +0900 Subject: [PATCH 0394/2061] serial: sh-sci: SH7724 support. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.h | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h index d0aa82d7fce..84cc6512f08 100644 --- a/drivers/serial/sh-sci.h +++ b/drivers/serial/sh-sci.h @@ -91,6 +91,9 @@ # define SCSPTR5 0xa4050128 # define SCIF_ORER 0x0001 /* overrun error bit */ # define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */ +#elif defined(CONFIG_CPU_SUBTYPE_SH7724) +# define SCIF_ORER 0x0001 /* overrun error bit */ +# define SCSCR_INIT(port) 0x0038 /* TIE=0,RIE=0,TE=1,RE=1,REIE=1 */ #elif defined(CONFIG_CPU_SUBTYPE_SH4_202) # define SCSPTR2 0xffe80020 /* 16 bit SCIF */ # define SCIF_ORER 0x0001 /* overrun error bit */ @@ -361,7 +364,8 @@ h8_sci_offset, h8_sci_size) \ CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size) #define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size) -#elif defined(CONFIG_CPU_SUBTYPE_SH7723) +#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\ + defined(CONFIG_CPU_SUBTYPE_SH7724) #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \ CPU_SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) #define SCIF_FNS(name, sh4_scif_offset, sh4_scif_size) \ @@ -390,7 +394,8 @@ SCIF_FNS(SCFDR, 0x1c, 16) SCIF_FNS(SCxTDR, 0x20, 8) SCIF_FNS(SCxRDR, 0x24, 8) SCIF_FNS(SCLSR, 0x24, 16) -#elif defined(CONFIG_CPU_SUBTYPE_SH7723) +#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\ + defined(CONFIG_CPU_SUBTYPE_SH7724) SCIx_FNS(SCSMR, 0x00, 16, 0x00, 16) SCIx_FNS(SCBRR, 0x04, 8, 0x04, 8) SCIx_FNS(SCSCR, 0x08, 16, 0x08, 16) @@ -604,6 +609,17 @@ static inline int sci_rxd_in(struct uart_port *port) return ctrl_inb(SCSPTR5) & 0x0008 ? 1 : 0; /* SCIF5 */ return 1; } +#elif defined(CONFIG_CPU_SUBTYPE_SH7724) +# define SCFSR 0x0010 +# define SCASSR 0x0014 +static inline int sci_rxd_in(struct uart_port *port) +{ + if (port->type == PORT_SCIF) + return ctrl_inw((port->mapbase + SCFSR)) & SCIF_BRK ? 1 : 0; + if (port->type == PORT_SCIFA) + return ctrl_inw((port->mapbase + SCASSR)) & SCIF_BRK ? 1 : 0; + return 1; +} #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103) static inline int sci_rxd_in(struct uart_port *port) { @@ -757,7 +773,8 @@ static inline int sci_rxd_in(struct uart_port *port) defined(CONFIG_CPU_SUBTYPE_SH7720) || \ defined(CONFIG_CPU_SUBTYPE_SH7721) #define SCBRR_VALUE(bps, clk) (((clk*2)+16*bps)/(32*bps)-1) -#elif defined(CONFIG_CPU_SUBTYPE_SH7723) +#elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\ + defined(CONFIG_CPU_SUBTYPE_SH7724) static inline int scbrr_calc(struct uart_port *port, int bps, int clk) { if (port->type == PORT_SCIF) From 40c7e8be556715079d0a9d7454ceb5371a2f0b39 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 16 Apr 2009 13:16:07 +0900 Subject: [PATCH 0395/2061] sh: sh7724: Add I2C support. This adds support for the SH-Mobile I2C controller on the SH7724. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 46 ++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 4327b1e080b..f17eda6688d 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -99,9 +99,55 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; +/* I2C0 */ +static struct resource iic0_resources[] = { + [0] = { + .name = "IIC0", + .start = 0x04470000, + .end = 0x04470018 - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 96, + .end = 99, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device iic0_device = { + .name = "i2c-sh_mobile", + .id = 0, /* "i2c0" clock */ + .num_resources = ARRAY_SIZE(iic0_resources), + .resource = iic0_resources, +}; + +/* I2C1 */ +static struct resource iic1_resources[] = { + [0] = { + .name = "IIC1", + .start = 0x04750000, + .end = 0x04750018 - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 92, + .end = 95, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device iic1_device = { + .name = "i2c-sh_mobile", + .id = 1, /* "i2c1" clock */ + .num_resources = ARRAY_SIZE(iic1_resources), + .resource = iic1_resources, +}; + static struct platform_device *sh7724_devices[] __initdata = { &sci_device, &rtc_device, + &iic0_device, + &iic1_device, }; static int __init sh7724_devices_setup(void) From cd5b9ef776feff440e7a889d1a565ceabfecbfa1 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 15 Apr 2009 11:43:03 +0900 Subject: [PATCH 0396/2061] sh: sh7724: Add VPU support. This adds uio_pdrv_genirq support for the VPU. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index f17eda6688d..499a6fcdf23 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -143,16 +143,49 @@ static struct platform_device iic1_device = { .resource = iic1_resources, }; +/* VPU */ +static struct uio_info vpu_platform_data = { + .name = "VPU5F", + .version = "0", + .irq = 60, +}; + +static struct resource vpu_resources[] = { + [0] = { + .name = "VPU", + .start = 0xfe900000, + .end = 0xfe902807, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* place holder for contiguous memory */ + }, +}; + +static struct platform_device vpu_device = { + .name = "uio_pdrv_genirq", + .id = 0, + .dev = { + .platform_data = &vpu_platform_data, + }, + .resource = vpu_resources, + .num_resources = ARRAY_SIZE(vpu_resources), +}; + static struct platform_device *sh7724_devices[] __initdata = { &sci_device, &rtc_device, &iic0_device, &iic1_device, + &vpu_device, }; static int __init sh7724_devices_setup(void) { clk_always_enable("rtc0"); /* RTC */ + clk_always_enable("vpu0"); /* VPU */ + + platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20); return platform_add_devices(sh7724_devices, ARRAY_SIZE(sh7724_devices)); From ad95b78c9f735da11ff9ec760e9b038cd82aead6 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Wed, 15 Apr 2009 11:43:07 +0900 Subject: [PATCH 0397/2061] sh: sh7724: Add VEU support. This adds uio_pdrv_genirq support for the VEU. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 64 ++++++++++++++++++++++++++ 1 file changed, 64 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 499a6fcdf23..65570ed69e6 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -172,20 +172,84 @@ static struct platform_device vpu_device = { .num_resources = ARRAY_SIZE(vpu_resources), }; +/* VEU0 */ +static struct uio_info veu0_platform_data = { + .name = "VEU3F0", + .version = "0", + .irq = 83, +}; + +static struct resource veu0_resources[] = { + [0] = { + .name = "VEU3F0", + .start = 0xfe920000, + .end = 0xfe9200cb - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* place holder for contiguous memory */ + }, +}; + +static struct platform_device veu0_device = { + .name = "uio_pdrv_genirq", + .id = 1, + .dev = { + .platform_data = &veu0_platform_data, + }, + .resource = veu0_resources, + .num_resources = ARRAY_SIZE(veu0_resources), +}; + +/* VEU1 */ +static struct uio_info veu1_platform_data = { + .name = "VEU3F1", + .version = "0", + .irq = 54, +}; + +static struct resource veu1_resources[] = { + [0] = { + .name = "VEU3F1", + .start = 0xfe924000, + .end = 0xfe9240cb - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + /* place holder for contiguous memory */ + }, +}; + +static struct platform_device veu1_device = { + .name = "uio_pdrv_genirq", + .id = 2, + .dev = { + .platform_data = &veu1_platform_data, + }, + .resource = veu1_resources, + .num_resources = ARRAY_SIZE(veu1_resources), +}; + static struct platform_device *sh7724_devices[] __initdata = { &sci_device, &rtc_device, &iic0_device, &iic1_device, &vpu_device, + &veu0_device, + &veu1_device, }; static int __init sh7724_devices_setup(void) { clk_always_enable("rtc0"); /* RTC */ clk_always_enable("vpu0"); /* VPU */ + clk_always_enable("veu1"); /* VEU3F1 */ + clk_always_enable("veu0"); /* VEU3F0 */ platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20); + platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20); + platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20); return platform_add_devices(sh7724_devices, ARRAY_SIZE(sh7724_devices)); From 6a3395beb99d7ae882ddf701c6fa6005ad7edebf Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 16 Apr 2009 15:36:13 +0900 Subject: [PATCH 0398/2061] sh: sh7724: Add CMT clockevents support. This enables support for the CMT clockevents driver on SH7724. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 33 ++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 65570ed69e6..8b87ba8f26b 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -230,7 +230,40 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; +static struct sh_cmt_config cmt_platform_data = { + .name = "CMT", + .channel_offset = 0x60, + .timer_bit = 5, + .clk = "cmt0", + .clockevent_rating = 125, + .clocksource_rating = 200, +}; + +static struct resource cmt_resources[] = { + [0] = { + .name = "CMT", + .start = 0x044a0060, + .end = 0x044a006b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt_device = { + .name = "sh_cmt", + .id = 0, + .dev = { + .platform_data = &cmt_platform_data, + }, + .resource = cmt_resources, + .num_resources = ARRAY_SIZE(cmt_resources), +}; + static struct platform_device *sh7724_devices[] __initdata = { + &cmt_device, &sci_device, &rtc_device, &iic0_device, From 59fe700dcbf3d6257ae86ca8c0192fc64b2eea1c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 16 Apr 2009 15:43:42 +0900 Subject: [PATCH 0399/2061] sh: Have SH7724 select ARCH_SHMOBILE. This is an SH-Mobile CPU, so select ARCH_SHMOBILE. This enables all of the PM functionality, amongst other things. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 505d1acbd0a..4c68fdedfa1 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -351,6 +351,7 @@ config CPU_SUBTYPE_SH7724 bool "Support SH7724 processor" select CPU_SH4A select CPU_SHX2 + select ARCH_SHMOBILE select ARCH_SPARSEMEM_ENABLE select SYS_SUPPORTS_CMT help From bc9f3d4291f8275924a9197a81208a5df0a15095 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 16 Apr 2009 15:44:58 +0900 Subject: [PATCH 0400/2061] sh: Add a generic defconfig for SH7724 platforms. Signed-off-by: Paul Mundt --- arch/sh/configs/sh7724_generic_defconfig | 707 +++++++++++++++++++++++ 1 file changed, 707 insertions(+) create mode 100644 arch/sh/configs/sh7724_generic_defconfig diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig new file mode 100644 index 00000000000..268d04ed8cd --- /dev/null +++ b/arch/sh/configs/sh7724_generic_defconfig @@ -0,0 +1,707 @@ +# +# Automatically generated make config: don't edit +# Linux kernel version: 2.6.30-rc2 +# Thu Apr 16 15:42:20 2009 +# +CONFIG_SUPERH=y +CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set +CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" +CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_FIND_NEXT_BIT=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y +CONFIG_GENERIC_IRQ_PROBE=y +# CONFIG_GENERIC_GPIO is not set +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_SYS_SUPPORTS_CMT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_HAVE_LATENCYTOP_SUPPORT=y +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set +CONFIG_ARCH_NO_VIRT_TO_BUS=y +CONFIG_ARCH_HAS_DEFAULT_IDLE=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" + +# +# General setup +# +CONFIG_EXPERIMENTAL=y +CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +CONFIG_LOCALVERSION="" +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +# CONFIG_BSD_PROCESS_ACCT is not set + +# +# RCU Subsystem +# +# CONFIG_CLASSIC_RCU is not set +CONFIG_TREE_RCU=y +# CONFIG_PREEMPT_RCU is not set +# CONFIG_RCU_TRACE is not set +CONFIG_RCU_FANOUT=32 +# CONFIG_RCU_FANOUT_EXACT is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set +# CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_GROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_USER_SCHED=y +# CONFIG_CGROUP_SCHED is not set +CONFIG_CGROUPS=y +# CONFIG_CGROUP_DEBUG is not set +# CONFIG_CGROUP_NS is not set +# CONFIG_CGROUP_FREEZER is not set +# CONFIG_CGROUP_DEVICE is not set +# CONFIG_CPUSETS is not set +# CONFIG_CGROUP_CPUACCT is not set +# CONFIG_RESOURCE_COUNTERS is not set +# CONFIG_RELAY is not set +# CONFIG_NAMESPACES is not set +# CONFIG_BLK_DEV_INITRD is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_EMBEDDED=y +# CONFIG_UID16 is not set +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set +CONFIG_HOTPLUG=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_VM_EVENT_COUNTERS=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# CONFIG_MARKERS is not set +CONFIG_OPROFILE=y +CONFIG_HAVE_OPROFILE=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set +CONFIG_HAVE_GENERIC_DMA_COHERENT=y +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +# CONFIG_MODULES is not set +CONFIG_BLOCK=y +# CONFIG_LBD is not set +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_BLK_DEV_INTEGRITY is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" +CONFIG_FREEZER=y + +# +# System type +# +CONFIG_CPU_SH4=y +CONFIG_CPU_SH4A=y +CONFIG_CPU_SHX2=y +CONFIG_ARCH_SHMOBILE=y +# CONFIG_CPU_SUBTYPE_SH7619 is not set +# CONFIG_CPU_SUBTYPE_SH7201 is not set +# CONFIG_CPU_SUBTYPE_SH7203 is not set +# CONFIG_CPU_SUBTYPE_SH7206 is not set +# CONFIG_CPU_SUBTYPE_SH7263 is not set +# CONFIG_CPU_SUBTYPE_MXG is not set +# CONFIG_CPU_SUBTYPE_SH7705 is not set +# CONFIG_CPU_SUBTYPE_SH7706 is not set +# CONFIG_CPU_SUBTYPE_SH7707 is not set +# CONFIG_CPU_SUBTYPE_SH7708 is not set +# CONFIG_CPU_SUBTYPE_SH7709 is not set +# CONFIG_CPU_SUBTYPE_SH7710 is not set +# CONFIG_CPU_SUBTYPE_SH7712 is not set +# CONFIG_CPU_SUBTYPE_SH7720 is not set +# CONFIG_CPU_SUBTYPE_SH7721 is not set +# CONFIG_CPU_SUBTYPE_SH7750 is not set +# CONFIG_CPU_SUBTYPE_SH7091 is not set +# CONFIG_CPU_SUBTYPE_SH7750R is not set +# CONFIG_CPU_SUBTYPE_SH7750S is not set +# CONFIG_CPU_SUBTYPE_SH7751 is not set +# CONFIG_CPU_SUBTYPE_SH7751R is not set +# CONFIG_CPU_SUBTYPE_SH7760 is not set +# CONFIG_CPU_SUBTYPE_SH4_202 is not set +# CONFIG_CPU_SUBTYPE_SH7723 is not set +CONFIG_CPU_SUBTYPE_SH7724=y +# CONFIG_CPU_SUBTYPE_SH7763 is not set +# CONFIG_CPU_SUBTYPE_SH7770 is not set +# CONFIG_CPU_SUBTYPE_SH7780 is not set +# CONFIG_CPU_SUBTYPE_SH7785 is not set +# CONFIG_CPU_SUBTYPE_SH7786 is not set +# CONFIG_CPU_SUBTYPE_SHX3 is not set +# CONFIG_CPU_SUBTYPE_SH7343 is not set +# CONFIG_CPU_SUBTYPE_SH7722 is not set +# CONFIG_CPU_SUBTYPE_SH7366 is not set + +# +# Memory management options +# +CONFIG_QUICKLIST=y +CONFIG_MMU=y +CONFIG_PAGE_OFFSET=0x80000000 +CONFIG_MEMORY_START=0x08000000 +CONFIG_MEMORY_SIZE=0x04000000 +CONFIG_29BIT=y +# CONFIG_X2TLB is not set +CONFIG_VSYSCALL=y +CONFIG_ARCH_FLATMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_MAX_ACTIVE_REGIONS=1 +CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_PAGE_SIZE_4KB=y +# CONFIG_PAGE_SIZE_8KB is not set +# CONFIG_PAGE_SIZE_16KB is not set +# CONFIG_PAGE_SIZE_64KB is not set +CONFIG_ENTRY_OFFSET=0x00001000 +CONFIG_SELECT_MEMORY_MODEL=y +# CONFIG_FLATMEM_MANUAL is not set +# CONFIG_DISCONTIGMEM_MANUAL is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_STATIC=y + +# +# Memory hotplug is currently incompatible with Software Suspend +# +CONFIG_PAGEFLAGS_EXTENDED=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MIGRATION=y +# CONFIG_PHYS_ADDR_T_64BIT is not set +CONFIG_ZONE_DMA_FLAG=0 +CONFIG_NR_QUICK=2 +CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y + +# +# Cache configuration +# +CONFIG_CACHE_WRITEBACK=y +# CONFIG_CACHE_WRITETHROUGH is not set +# CONFIG_CACHE_OFF is not set + +# +# Processor features +# +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_CPU_BIG_ENDIAN is not set +CONFIG_SH_FPU=y +# CONFIG_SH_STORE_QUEUES is not set +CONFIG_CPU_HAS_INTEVT=y +CONFIG_CPU_HAS_SR_RB=y +CONFIG_CPU_HAS_PTEA=y +CONFIG_CPU_HAS_FPU=y + +# +# Board support +# + +# +# Timer and clock configuration +# +CONFIG_SH_TMU=y +CONFIG_SH_TIMER_CMT=y +CONFIG_SH_TIMER_IRQ=16 +CONFIG_SH_PCLK_FREQ=41666666 +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_TABLE=y +# CONFIG_CPU_FREQ_DEBUG is not set +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_STAT_DETAILS is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_SH_CPU_FREQ=y + +# +# DMA support +# +# CONFIG_SH_DMA is not set + +# +# Companion Chips +# + +# +# Additional SuperH Device Drivers +# +# CONFIG_HEARTBEAT is not set +# CONFIG_PUSH_SWITCH is not set + +# +# Kernel features +# +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_300 is not set +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +# CONFIG_CRASH_DUMP is not set +CONFIG_KEXEC_JUMP=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_GUSA=y + +# +# Boot options +# +CONFIG_ZERO_PAGE_OFFSET=0x00001000 +CONFIG_BOOT_LINK_OFFSET=0x00800000 +# CONFIG_CMDLINE_BOOL is not set + +# +# Bus options +# +# CONFIG_ARCH_SUPPORTS_MSI is not set +# CONFIG_PCCARD is not set + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_HAVE_AOUT is not set +# CONFIG_BINFMT_MISC is not set + +# +# Power management options (EXPERIMENTAL) +# +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_SLEEP=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +# CONFIG_NET is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_STANDALONE=y +# CONFIG_PREVENT_FIRMWARE_BUILD is not set +CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_SYS_HYPERVISOR is not set +# CONFIG_MTD is not set +# CONFIG_PARPORT is not set +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_COW_COMMON is not set +# CONFIG_BLK_DEV_LOOP is not set +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_CDROM_PKTCDVD is not set +# CONFIG_BLK_DEV_HD is not set +# CONFIG_MISC_DEVICES is not set +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +# CONFIG_RAID_ATTRS is not set +# CONFIG_SCSI is not set +# CONFIG_SCSI_DMA is not set +# CONFIG_SCSI_NETLINK is not set +# CONFIG_ATA is not set +# CONFIG_MD is not set +# CONFIG_PHONE is not set + +# +# Input device support +# +# CONFIG_INPUT is not set + +# +# Hardware I/O ports +# +# CONFIG_SERIO is not set +# CONFIG_GAMEPORT is not set + +# +# Character devices +# +# CONFIG_VT is not set +# CONFIG_DEVKMEM is not set +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +# CONFIG_SERIAL_8250 is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_SH_SCI=y +CONFIG_SERIAL_SH_SCI_NR_UARTS=6 +CONFIG_SERIAL_SH_SCI_CONSOLE=y +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_UNIX98_PTYS is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_IPMI_HANDLER is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_R3964 is not set +# CONFIG_RAW_DRIVER is not set +# CONFIG_TCG_TPM is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +CONFIG_I2C_SH_MOBILE=y +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_SPI is not set +# CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set +# CONFIG_HWMON is not set +# CONFIG_THERMAL is not set +# CONFIG_THERMAL_HWMON is not set +# CONFIG_WATCHDOG is not set +CONFIG_SSB_POSSIBLE=y + +# +# Sonics Silicon Backplane +# +# CONFIG_SSB is not set + +# +# Multifunction device drivers +# +# CONFIG_MFD_CORE is not set +# CONFIG_MFD_SM501 is not set +# CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set +# CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_REGULATOR is not set + +# +# Multimedia devices +# + +# +# Multimedia core support +# +# CONFIG_VIDEO_DEV is not set +# CONFIG_VIDEO_MEDIA is not set + +# +# Multimedia drivers +# +# CONFIG_DAB is not set + +# +# Graphics support +# +# CONFIG_VGASTATE is not set +# CONFIG_VIDEO_OUTPUT_CONTROL is not set +# CONFIG_FB is not set +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set + +# +# Display device support +# +# CONFIG_DISPLAY_SUPPORT is not set +# CONFIG_SOUND is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_MMC is not set +# CONFIG_MEMSTICK is not set +# CONFIG_NEW_LEDS is not set +# CONFIG_ACCESSIBILITY is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set + +# +# RTC interfaces +# +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +# CONFIG_RTC_DRV_DS1307 is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8581 is not set + +# +# SPI RTC drivers +# + +# +# Platform RTC drivers +# +# CONFIG_RTC_DRV_DS1286 is not set +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T35 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_BQ4802 is not set +# CONFIG_RTC_DRV_V3020 is not set + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_SH=y +# CONFIG_DMADEVICES is not set +# CONFIG_AUXDISPLAY is not set +CONFIG_UIO=y +# CONFIG_UIO_PDRV is not set +CONFIG_UIO_PDRV_GENIRQ=y +# CONFIG_UIO_SMX is not set +# CONFIG_UIO_SERCOS3 is not set +# CONFIG_STAGING is not set + +# +# File systems +# +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +# CONFIG_EXT4_FS is not set +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +# CONFIG_FS_POSIX_ACL is not set +CONFIG_FILE_LOCKING=y +# CONFIG_XFS_FS is not set +# CONFIG_BTRFS_FS is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY is not set +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set +# CONFIG_FUSE_FS is not set + +# +# Caches +# +# CONFIG_FSCACHE is not set + +# +# CD-ROM/DVD Filesystems +# +# CONFIG_ISO9660_FS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_MSDOS_FS is not set +# CONFIG_VFAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +# CONFIG_PROC_FS is not set +# CONFIG_SYSFS is not set +# CONFIG_TMPFS is not set +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set +# CONFIG_MISC_FILESYSTEMS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_NLS is not set + +# +# Kernel hacking +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=1024 +# CONFIG_MAGIC_SYSRQ is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_KERNEL is not set +CONFIG_STACKTRACE=y +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_DEBUG_MEMORY_INIT is not set +# CONFIG_RCU_CPU_STALL_DETECTOR is not set +# CONFIG_LATENCYTOP is not set +# CONFIG_SYSCTL_SYSCALL_CHECK is not set +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y + +# +# Tracers +# +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_SH_STANDARD_BIOS is not set +# CONFIG_EARLY_SCIF_CONSOLE is not set +# CONFIG_MORE_COMPILE_OPTIONS is not set + +# +# Security options +# +# CONFIG_KEYS is not set +# CONFIG_SECURITYFS is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set +# CONFIG_CRYPTO is not set +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_GENERIC_FIND_LAST_BIT=y +# CONFIG_CRC_CCITT is not set +# CONFIG_CRC16 is not set +# CONFIG_CRC_T10DIF is not set +# CONFIG_CRC_ITU_T is not set +# CONFIG_CRC32 is not set +# CONFIG_CRC7 is not set +# CONFIG_LIBCRC32C is not set +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y +CONFIG_HAS_DMA=y From b8b47bfbe4eb1ae0e6891e49c86a5f4fb00413be Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 11 Mar 2009 15:41:51 +0900 Subject: [PATCH 0401/2061] sh: pass along struct pci_channel These patches rework the pci code for the sh architecture. Currently each board implements some kind of ioport to address mapping. Some boards use generic_io_base others try passing addresses as io ports. This is the first set of patches that try to unify the pci code as much as possible to avoid duplicated code. This will in the end lead to fewer lines board specific code and more generic code. This patch makes sure a struct pci_channel pointer is passed along to various pci functions such as pci_read_reg(), pci_write_reg(), pci_fixup_pcic(), sh7751_pcic_init() and sh7780_pcic_init(). Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-lboxre2.c | 23 +++++---- arch/sh/drivers/pci/fixups-r7780rp.c | 33 ++++++------ arch/sh/drivers/pci/fixups-rts7751r2d.c | 23 +++++---- arch/sh/drivers/pci/fixups-sdk7780.c | 46 +++++++++-------- arch/sh/drivers/pci/fixups-se7780.c | 32 ++++++------ arch/sh/drivers/pci/ops-landisk.c | 2 +- arch/sh/drivers/pci/ops-lboxre2.c | 2 +- arch/sh/drivers/pci/ops-r7780rp.c | 2 +- arch/sh/drivers/pci/ops-rts7751r2d.c | 2 +- arch/sh/drivers/pci/ops-sdk7780.c | 2 +- arch/sh/drivers/pci/ops-se7780.c | 2 +- arch/sh/drivers/pci/ops-sh4.c | 24 ++++----- arch/sh/drivers/pci/ops-snapgear.c | 2 +- arch/sh/drivers/pci/ops-titan.c | 2 +- arch/sh/drivers/pci/pci-sh4.h | 11 ++-- arch/sh/drivers/pci/pci-sh7751.c | 68 +++++++++++++------------ arch/sh/drivers/pci/pci-sh7751.h | 3 +- arch/sh/drivers/pci/pci-sh7780.c | 31 +++++------ arch/sh/drivers/pci/pci-sh7780.h | 3 +- 19 files changed, 164 insertions(+), 149 deletions(-) diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c index 1c1d41255ec..a82011d03cb 100644 --- a/arch/sh/drivers/pci/fixups-lboxre2.c +++ b/arch/sh/drivers/pci/fixups-lboxre2.c @@ -9,33 +9,34 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#include #include "pci-sh4.h" #define PCIMCR_MRSET_OFF 0xBFFFFFFF #define PCIMCR_RFSH_OFF 0xFFFFFFFB -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { unsigned long bcr1, mcr; bcr1 = ctrl_inl(SH7751_BCR1); bcr1 |= 0x40080000; /* Enable Bit 19 BREQEN, set PCIC to slave */ - pci_write_reg(bcr1, SH4_PCIBCR1); + pci_write_reg(chan, bcr1, SH4_PCIBCR1); /* Enable all interrupts, so we known what to fix */ - pci_write_reg(0x0000c3ff, SH4_PCIINTM); - pci_write_reg(0x0000380f, SH4_PCIAINTM); - pci_write_reg(0xfb900047, SH7751_PCICONF1); - pci_write_reg(0xab000001, SH7751_PCICONF4); + pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM); + pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); + pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1); + pci_write_reg(chan, 0xab000001, SH7751_PCICONF4); mcr = ctrl_inl(SH7751_MCR); mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF; - pci_write_reg(mcr, SH4_PCIMCR); + pci_write_reg(chan, mcr, SH4_PCIMCR); - pci_write_reg(0x0c000000, SH7751_PCICONF5); - pci_write_reg(0xd0000000, SH7751_PCICONF6); - pci_write_reg(0x0c000000, SH4_PCILAR0); - pci_write_reg(0x00000000, SH4_PCILAR1); + pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5); + pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6); + pci_write_reg(chan, 0x0c000000, SH4_PCILAR0); + pci_write_reg(chan, 0x00000000, SH4_PCILAR1); return 0; } diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c index 3e321df65d2..5b25021bbd6 100644 --- a/arch/sh/drivers/pci/fixups-r7780rp.c +++ b/arch/sh/drivers/pci/fixups-r7780rp.c @@ -14,32 +14,33 @@ #include "pci-sh4.h" #include -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { - pci_write_reg(0x000043ff, SH4_PCIINTM); - pci_write_reg(0x0000380f, SH4_PCIAINTM); + pci_write_reg(chan, 0x000043ff, SH4_PCIINTM); + pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - pci_write_reg(0xfbb00047, SH7780_PCICMD); - pci_write_reg(0x00000000, SH7780_PCIIBAR); + pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD); + pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR); - pci_write_reg(0x00011912, SH7780_PCISVID); - pci_write_reg(0x08000000, SH7780_PCICSCR0); - pci_write_reg(0x0000001b, SH7780_PCICSAR0); - pci_write_reg(0xfd000000, SH7780_PCICSCR1); - pci_write_reg(0x0000000f, SH7780_PCICSAR1); + pci_write_reg(chan, 0x00011912, SH7780_PCISVID); + pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0); + pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0); + pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1); + pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1); - pci_write_reg(0xfd000000, SH7780_PCIMBR0); - pci_write_reg(0x00fc0000, SH7780_PCIMBMR0); + pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0); + pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0); #ifdef CONFIG_32BIT - pci_write_reg(0xc0000000, SH7780_PCIMBR2); - pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); + pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2); + pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); #endif /* Set IOBR for windows containing area specified in pci.h */ - pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)), + pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)), SH7780_PCIIOBR); - pci_write_reg(((SH7780_PCI_IO_SIZE-1) & (7<<18)), SH7780_PCIIOBMR); + pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)), + SH7780_PCIIOBMR); return 0; } diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c index 904bce8768d..38852334d47 100644 --- a/arch/sh/drivers/pci/fixups-rts7751r2d.c +++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c @@ -10,34 +10,35 @@ * License. See the file "COPYING" in the main directory of this archive * for more details. */ +#include #include "pci-sh4.h" #define PCIMCR_MRSET_OFF 0xBFFFFFFF #define PCIMCR_RFSH_OFF 0xFFFFFFFB -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { unsigned long bcr1, mcr; bcr1 = ctrl_inl(SH7751_BCR1); bcr1 |= 0x40080000; /* Enable Bit 19 BREQEN, set PCIC to slave */ - pci_write_reg(bcr1, SH4_PCIBCR1); + pci_write_reg(chan, bcr1, SH4_PCIBCR1); /* Enable all interrupts, so we known what to fix */ - pci_write_reg(0x0000c3ff, SH4_PCIINTM); - pci_write_reg(0x0000380f, SH4_PCIAINTM); + pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM); + pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - pci_write_reg(0xfb900047, SH7751_PCICONF1); - pci_write_reg(0xab000001, SH7751_PCICONF4); + pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1); + pci_write_reg(chan, 0xab000001, SH7751_PCICONF4); mcr = ctrl_inl(SH7751_MCR); mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF; - pci_write_reg(mcr, SH4_PCIMCR); + pci_write_reg(chan, mcr, SH4_PCIMCR); - pci_write_reg(0x0c000000, SH7751_PCICONF5); - pci_write_reg(0xd0000000, SH7751_PCICONF6); - pci_write_reg(0x0c000000, SH4_PCILAR0); - pci_write_reg(0x00000000, SH4_PCILAR1); + pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5); + pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6); + pci_write_reg(chan, 0x0c000000, SH4_PCILAR0); + pci_write_reg(chan, 0x00000000, SH4_PCILAR1); return 0; } diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c index 2f8863099dd..3f6754a120e 100644 --- a/arch/sh/drivers/pci/fixups-sdk7780.c +++ b/arch/sh/drivers/pci/fixups-sdk7780.c @@ -14,46 +14,48 @@ #include "pci-sh4.h" #include -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable all interrupts, so we know what to fix */ - pci_write_reg(0x0000C3FF, SH7780_PCIIMR); - pci_write_reg(0x0000380F, SH7780_PCIAINTM); + pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR); + pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM); /* Set up standard PCI config registers */ - pci_write_reg(0xFB00, SH7780_PCISTATUS); - pci_write_reg(0x0047, SH7780_PCICMD); - pci_write_reg(0x00, SH7780_PCIPIF); - pci_write_reg(0x00, SH7780_PCISUB); - pci_write_reg(0x06, SH7780_PCIBCC); - pci_write_reg(0x1912, SH7780_PCISVID); - pci_write_reg(0x0001, SH7780_PCISID); + pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS); + pci_write_reg(chan, 0x0047, SH7780_PCICMD); + pci_write_reg(chan, 0x00, SH7780_PCIPIF); + pci_write_reg(chan, 0x00, SH7780_PCISUB); + pci_write_reg(chan, 0x06, SH7780_PCIBCC); + pci_write_reg(chan, 0x1912, SH7780_PCISVID); + pci_write_reg(chan, 0x0001, SH7780_PCISID); - pci_write_reg(0x08000000, SH7780_PCIMBAR0); /* PCI */ - pci_write_reg(0x08000000, SH7780_PCILAR0); /* SHwy */ - pci_write_reg(0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ + pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0); /* PCI */ + pci_write_reg(chan, 0x08000000, SH7780_PCILAR0); /* SHwy */ + pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ - pci_write_reg(0x00000000, SH7780_PCIMBAR1); - pci_write_reg(0x00000000, SH7780_PCILAR1); - pci_write_reg(0x00000000, SH7780_PCILSR1); + pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1); + pci_write_reg(chan, 0x00000000, SH7780_PCILAR1); + pci_write_reg(chan, 0x00000000, SH7780_PCILSR1); - pci_write_reg(0xAB000801, SH7780_PCIIBAR); + pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR); /* * Set the MBR so PCI address is one-to-one with window, * meaning all calls go straight through... use ifdef to * catch erroneous assumption. */ - pci_write_reg(0xFD000000 , SH7780_PCIMBR0); - pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ + pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0); + pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR); - pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR); + pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), + SH7780_PCIIOBR); + pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), + SH7780_PCIIOBMR); - pci_write_reg(0xA5000C01, SH7780_PCICR); + pci_write_reg(chan, 0xA5000C01, SH7780_PCICR); return 0; } diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c index 880cea1c0d8..b8e735e01c3 100644 --- a/arch/sh/drivers/pci/fixups-se7780.c +++ b/arch/sh/drivers/pci/fixups-se7780.c @@ -15,13 +15,13 @@ #include "pci-sh4.h" #include -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable all interrupts, so we know what to fix */ - pci_write_reg(0x0000C3FF, SH7780_PCIIMR); - pci_write_reg(0x0000380F, SH7780_PCIAINTM); + pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR); + pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM); /* Set up standard PCI config registers */ ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS)); @@ -32,29 +32,31 @@ int pci_fixup_pcic(void) ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID)); ctrl_outw(0x0001, PCI_REG(SH7780_PCISID)); - pci_write_reg(0x08000000, SH7780_PCIMBAR0); /* PCI */ - pci_write_reg(0x08000000, SH7780_PCILAR0); /* SHwy */ - pci_write_reg(0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ + pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0); /* PCI */ + pci_write_reg(chan, 0x08000000, SH7780_PCILAR0); /* SHwy */ + pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ - pci_write_reg(0x00000000, SH7780_PCIMBAR1); - pci_write_reg(0x00000000, SH7780_PCILAR1); - pci_write_reg(0x00000000, SH7780_PCILSR1); + pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1); + pci_write_reg(chan, 0x00000000, SH7780_PCILAR1); + pci_write_reg(chan, 0x00000000, SH7780_PCILSR1); - pci_write_reg(0xAB000801, SH7780_PCIIBAR); + pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR); /* * Set the MBR so PCI address is one-to-one with window, * meaning all calls go straight through... use ifdef to * catch erroneous assumption. */ - pci_write_reg(0xFD000000 , SH7780_PCIMBR0); - pci_write_reg(0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ + pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0); + pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR); - pci_write_reg((SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR); + pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), + SH7780_PCIIOBR); + pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), + SH7780_PCIIOBMR); - pci_write_reg(0xA5000C01, SH7780_PCICR); + pci_write_reg(chan, 0xA5000C01, SH7780_PCICR); return 0; } diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c index bff09ecf341..343c072a5a7 100644 --- a/arch/sh/drivers/pci/ops-landisk.c +++ b/arch/sh/drivers/pci/ops-landisk.c @@ -45,7 +45,7 @@ static struct sh4_pci_address_map sh7751_pci_map = { int __init pcibios_init_platform(void) { - return sh7751_pcic_init(&sh7751_pci_map); + return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c index 86c0b6fb737..8bff32a2210 100644 --- a/arch/sh/drivers/pci/ops-lboxre2.c +++ b/arch/sh/drivers/pci/ops-lboxre2.c @@ -59,5 +59,5 @@ static struct sh4_pci_address_map sh7751_pci_map = { int __init pcibios_init_platform(void) { - return sh7751_pcic_init(&sh7751_pci_map); + return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index 8555238e63e..bf32ee8b132 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -64,5 +64,5 @@ static struct sh4_pci_address_map sh7780_pci_map = { int __init pcibios_init_platform(void) { - return sh7780_pcic_init(&sh7780_pci_map); + return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index d6ca74b25d5..e4208a69732 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -69,6 +69,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { int __init pcibios_init_platform(void) { __set_io_port_base(SH7751_PCI_IO_BASE); - return sh7751_pcic_init(&sh7751_pci_map); + return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index 4dcc64184b2..21d59d4a215 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -69,5 +69,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = { int __init pcibios_init_platform(void) { printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n"); - return sh7780_pcic_init(&sdk7780_pci_map); + return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 3145c62484d..78a6f2bc4f1 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -92,5 +92,5 @@ int __init pcibios_init_platform(void) ctrl_outw(0x0013, FPGA_PCI_INTSEL1); ctrl_outw(0xE402, FPGA_PCI_INTSEL2); - return sh7780_pcic_init(&se7780_pci_map); + return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index 710a3b0306e..92d27f734f2 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -34,8 +34,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn, * so we must do byte alignment by hand */ spin_lock_irqsave(&sh4_pci_lock, flags); - pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); - data = pci_read_reg(SH4_PCIPDR); + pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); + data = pci_read_reg(NULL, SH4_PCIPDR); spin_unlock_irqrestore(&sh4_pci_lock, flags); switch (size) { @@ -68,8 +68,8 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn, u32 data; spin_lock_irqsave(&sh4_pci_lock, flags); - pci_write_reg(CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); - data = pci_read_reg(SH4_PCIPDR); + pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); + data = pci_read_reg(NULL, SH4_PCIPDR); spin_unlock_irqrestore(&sh4_pci_lock, flags); switch (size) { @@ -90,7 +90,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - pci_write_reg(data, SH4_PCIPDR); + pci_write_reg(NULL, data, SH4_PCIPDR); return PCIBIOS_SUCCESSFUL; } @@ -106,25 +106,25 @@ struct pci_ops sh4_pci_ops = { */ static unsigned int pci_probe = PCI_PROBE_CONF1; -int __init sh4_pci_check_direct(void) +int __init sh4_pci_check_direct(struct pci_channel *chan) { /* * Check if configuration works. */ if (pci_probe & PCI_PROBE_CONF1) { - unsigned int tmp = pci_read_reg(SH4_PCIPAR); + unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR); - pci_write_reg(P1SEG, SH4_PCIPAR); + pci_write_reg(chan, P1SEG, SH4_PCIPAR); - if (pci_read_reg(SH4_PCIPAR) == P1SEG) { - pci_write_reg(tmp, SH4_PCIPAR); + if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) { + pci_write_reg(chan, tmp, SH4_PCIPAR); printk(KERN_INFO "PCI: Using configuration type 1\n"); request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1"); return 0; } - pci_write_reg(tmp, SH4_PCIPAR); + pci_write_reg(chan, tmp, SH4_PCIPAR); } pr_debug("PCI: pci_check_direct failed\n"); @@ -163,7 +163,7 @@ char * __devinit pcibios_setup(char *str) return str; } -int __attribute__((weak)) pci_fixup_pcic(void) +int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan) { /* Nothing to do. */ return 0; diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c index 53dd893d4e5..cba80153dde 100644 --- a/arch/sh/drivers/pci/ops-snapgear.c +++ b/arch/sh/drivers/pci/ops-snapgear.c @@ -66,7 +66,7 @@ static struct sh4_pci_address_map sh7751_pci_map = { */ int __init pcibios_init_platform(void) { - return sh7751_pcic_init(&sh7751_pci_map); + return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c index a8f7801a34a..69fcc5c5d52 100644 --- a/arch/sh/drivers/pci/ops-titan.c +++ b/arch/sh/drivers/pci/ops-titan.c @@ -73,5 +73,5 @@ static struct sh4_pci_address_map sh7751_pci_map = { int __init pcibios_init_platform(void) { - return sh7751_pcic_init(&sh7751_pci_map); + return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h index a83dcf70c13..62ba3505608 100644 --- a/arch/sh/drivers/pci/pci-sh4.h +++ b/arch/sh/drivers/pci/pci-sh4.h @@ -154,8 +154,8 @@ /* arch/sh/kernel/drivers/pci/ops-sh4.c */ extern struct pci_ops sh4_pci_ops; -int sh4_pci_check_direct(void); -int pci_fixup_pcic(void); +int sh4_pci_check_direct(struct pci_channel *chan); +int pci_fixup_pcic(struct pci_channel *chan); struct sh4_pci_address_space { unsigned long base; @@ -168,13 +168,16 @@ struct sh4_pci_address_map { unsigned long flags; }; -static inline void pci_write_reg(unsigned long val, unsigned long reg) +static inline void pci_write_reg(struct pci_channel *chan, + unsigned long val, unsigned long reg) { ctrl_outl(val, PCI_REG(reg)); } -static inline unsigned long pci_read_reg(unsigned long reg) +static inline unsigned long pci_read_reg(struct pci_channel *chan, + unsigned long reg) { return ctrl_inl(PCI_REG(reg)); } + #endif /* __PCI_SH4_H */ diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 3065eb184f0..9c2c01490d6 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -40,21 +40,22 @@ static int __init sh7751_pci_init(void) pr_debug("PCI: Starting intialization.\n"); /* check for SH7751/SH7751R hardware */ - id = pci_read_reg(SH7751_PCICONF0); + id = pci_read_reg(NULL, SH7751_PCICONF0); if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) && id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) { pr_debug("PCI: This is not an SH7751(R) (%x)\n", id); return -ENODEV; } - if ((ret = sh4_pci_check_direct()) != 0) + if ((ret = sh4_pci_check_direct(NULL)) != 0) return ret; return pcibios_init_platform(); } subsys_initcall(sh7751_pci_init); -static int __init __area_sdram_check(unsigned int area) +static int __init __area_sdram_check(struct pci_channel *chan, + unsigned int area) { u32 word; @@ -65,7 +66,7 @@ static int __init __area_sdram_check(unsigned int area) area, word); return 0; } - pci_write_reg(word, SH4_PCIBCR1); + pci_write_reg(chan, word, SH4_PCIBCR1); word = (u16)ctrl_inw(SH7751_BCR2); /* check BCR2 for 32bit SDRAM interface*/ @@ -74,12 +75,13 @@ static int __init __area_sdram_check(unsigned int area) area, word); return 0; } - pci_write_reg(word, SH4_PCIBCR2); + pci_write_reg(chan, word, SH4_PCIBCR2); return 1; } -int __init sh7751_pcic_init(struct sh4_pci_address_map *map) +int __init sh7751_pcic_init(struct pci_channel *chan, + struct sh4_pci_address_map *map) { u32 reg; u32 word; @@ -90,10 +92,10 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) ctrl_outl(reg, SH7751_BCR1); /* Turn the clocks back on (not done in reset)*/ - pci_write_reg(0, SH4_PCICLKR); + pci_write_reg(chan, 0, SH4_PCICLKR); /* Clear Powerdown IRQ's (not done in reset) */ word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0; - pci_write_reg(word, SH4_PCIPINT); + pci_write_reg(chan, word, SH4_PCIPINT); /* * This code is unused for some boards as it is done in the @@ -103,11 +105,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) if (!(map->flags & SH4_PCIC_NO_RESET)) { /* toggle PCI reset pin */ word = SH4_PCICR_PREFIX | SH4_PCICR_PRST; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); /* Wait for a long time... not 1 sec. but long enough */ mdelay(100); word = SH4_PCICR_PREFIX; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); } /* set the command/status bits to: @@ -116,11 +118,11 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) */ word = SH7751_PCICONF1_WCC | SH7751_PCICONF1_PER | SH7751_PCICONF1_BUM | SH7751_PCICONF1_MES; - pci_write_reg(word, SH7751_PCICONF1); + pci_write_reg(chan, word, SH7751_PCICONF1); /* define this host as the host bridge */ word = PCI_BASE_CLASS_BRIDGE << 24; - pci_write_reg(word, SH7751_PCICONF2); + pci_write_reg(chan, word, SH7751_PCICONF2); /* Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping @@ -128,24 +130,24 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) * Window1 = map->window1.size @ cached area base = SDRAM */ word = map->window0.size - 1; - pci_write_reg(word, SH4_PCILSR0); + pci_write_reg(chan, word, SH4_PCILSR0); word = map->window1.size - 1; - pci_write_reg(word, SH4_PCILSR1); + pci_write_reg(chan, word, SH4_PCILSR1); /* Set the values on window 0 PCI config registers */ word = P2SEGADDR(map->window0.base); - pci_write_reg(word, SH4_PCILAR0); - pci_write_reg(word, SH7751_PCICONF5); + pci_write_reg(chan, word, SH4_PCILAR0); + pci_write_reg(chan, word, SH7751_PCICONF5); /* Set the values on window 1 PCI config registers */ word = PHYSADDR(map->window1.base); - pci_write_reg(word, SH4_PCILAR1); - pci_write_reg(word, SH7751_PCICONF6); + pci_write_reg(chan, word, SH4_PCILAR1); + pci_write_reg(chan, word, SH7751_PCICONF6); /* Set the local 16MB PCI memory space window to * the lowest PCI mapped address */ word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK; pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word); - pci_write_reg(word , SH4_PCIMBR); + pci_write_reg(chan, word , SH4_PCIMBR); /* Map IO space into PCI IO window * The IO window is 64K-PCIBIOS_MIN_IO in size @@ -160,19 +162,19 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) * correctly */ word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK; pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word); - pci_write_reg(word, SH4_PCIIOBR); + pci_write_reg(chan, word, SH4_PCIIOBR); /* Set PCI WCRx, BCRx's, copy from BSC locations */ /* check BCR for SDRAM in specified area */ switch (map->window0.base) { - case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(0); break; - case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(1); break; - case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(2); break; - case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(3); break; - case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(4); break; - case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(5); break; - case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(6); break; + case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break; + case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break; + case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break; + case SH7751_CS3_BASE_ADDR: word = __area_sdram_check(chan, 3); break; + case SH7751_CS4_BASE_ADDR: word = __area_sdram_check(chan, 4); break; + case SH7751_CS5_BASE_ADDR: word = __area_sdram_check(chan, 5); break; + case SH7751_CS6_BASE_ADDR: word = __area_sdram_check(chan, 6); break; } if (!word) @@ -180,25 +182,25 @@ int __init sh7751_pcic_init(struct sh4_pci_address_map *map) /* configure the wait control registers */ word = ctrl_inl(SH7751_WCR1); - pci_write_reg(word, SH4_PCIWCR1); + pci_write_reg(chan, word, SH4_PCIWCR1); word = ctrl_inl(SH7751_WCR2); - pci_write_reg(word, SH4_PCIWCR2); + pci_write_reg(chan, word, SH4_PCIWCR2); word = ctrl_inl(SH7751_WCR3); - pci_write_reg(word, SH4_PCIWCR3); + pci_write_reg(chan, word, SH4_PCIWCR3); word = ctrl_inl(SH7751_MCR); - pci_write_reg(word, SH4_PCIMCR); + pci_write_reg(chan, word, SH4_PCIMCR); /* NOTE: I'm ignoring the PCI error IRQs for now.. * TODO: add support for the internal error interrupts and * DMA interrupts... */ - pci_fixup_pcic(); + pci_fixup_pcic(chan); /* SH7751 init done, set central function init complete */ /* use round robin mode to stop a device starving/overruning */ word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); return 1; } diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h index 68e3cb5e6be..6f101e5a6c8 100644 --- a/arch/sh/drivers/pci/pci-sh7751.h +++ b/arch/sh/drivers/pci/pci-sh7751.h @@ -130,6 +130,7 @@ struct sh4_pci_address_map; /* arch/sh/drivers/pci/pci-sh7751.c */ -int sh7751_pcic_init(struct sh4_pci_address_map *map); +int sh7751_pcic_init(struct pci_channel *chan, + struct sh4_pci_address_map *map); #endif /* _PCI_SH7751_H_ */ diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index bae6a2cf047..56f673f66cb 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void) ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */ /* check for SH7780/SH7780R hardware */ - id = pci_read_reg(SH7780_PCIVID); + id = pci_read_reg(NULL, SH7780_PCIVID); if ((id & 0xffff) == SH7780_VENDOR_ID) { switch ((id >> 16) & 0xffff) { case SH7763_DEVICE_ID: @@ -82,14 +82,15 @@ static int __init sh7780_pci_init(void) ctrl_outl(0x33333333, INTC_INTPRI); } - if ((ret = sh4_pci_check_direct()) != 0) + if ((ret = sh4_pci_check_direct(NULL)) != 0) return ret; return pcibios_init_platform(); } core_initcall(sh7780_pci_init); -int __init sh7780_pcic_init(struct sh4_pci_address_map *map) +int __init sh7780_pcic_init(struct pci_channel *chan, + struct sh4_pci_address_map *map) { u32 word; @@ -101,34 +102,34 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map) if (!(map->flags & SH4_PCIC_NO_RESET)) { /* toggle PCI reset pin */ word = SH4_PCICR_PREFIX | SH4_PCICR_PRST; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); /* Wait for a long time... not 1 sec. but long enough */ mdelay(100); word = SH4_PCICR_PREFIX; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); } /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + * Mem space enable */ - pci_write_reg(0x00000046, SH7780_PCICMD); + pci_write_reg(chan, 0x00000046, SH7780_PCICMD); /* define this host as the host bridge */ word = PCI_BASE_CLASS_BRIDGE << 24; - pci_write_reg(word, SH7780_PCIRID); + pci_write_reg(chan, word, SH7780_PCIRID); /* Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping */ - pci_write_reg(map->window0.size - 0xfffff, SH4_PCILSR0); - pci_write_reg(map->window1.size - 0xfffff, SH4_PCILSR1); + pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0); + pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1); /* Set the values on window 0 PCI config registers */ - pci_write_reg(map->window0.base, SH4_PCILAR0); - pci_write_reg(map->window0.base, SH7780_PCIMBAR0); + pci_write_reg(chan, map->window0.base, SH4_PCILAR0); + pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0); /* Set the values on window 1 PCI config registers */ - pci_write_reg(map->window1.base, SH4_PCILAR1); - pci_write_reg(map->window1.base, SH7780_PCIMBAR1); + pci_write_reg(chan, map->window1.base, SH4_PCILAR1); + pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1); /* Map IO space into PCI IO window * The IO window is 64K-PCIBIOS_MIN_IO in size @@ -145,12 +146,12 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map) */ /* Apply any last-minute PCIC fixups */ - pci_fixup_pcic(); + pci_fixup_pcic(chan); /* SH7780 init done, set central function init complete */ /* use round robin mode to stop a device starving/overruning */ word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO; - pci_write_reg(word, SH4_PCICR); + pci_write_reg(chan, word, SH4_PCICR); return 1; } diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index 93adc7119b7..d34961153d5 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -109,6 +109,7 @@ struct sh4_pci_address_map; /* arch/sh/drivers/pci/pci-sh7780.c */ -int sh7780_pcic_init(struct sh4_pci_address_map *map); +int sh7780_pcic_init(struct pci_channel *chan, + struct sh4_pci_address_map *map); #endif /* _PCI_SH7780_H_ */ From d0e3db40e2a1352aa2a2f425a7d4631bddc03d51 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 11 Mar 2009 15:46:14 +0900 Subject: [PATCH 0402/2061] sh: add init member to pci_channel data This patch adds an init callback to struct pci_channel and makes sure it is initialized properly. Code is added to call this init function from pcibios_init(). Return values are adjusted and a warning is is printed if init fails. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/mach-dreamcast/setup.c | 6 ------ arch/sh/drivers/pci/ops-cayman.c | 2 +- arch/sh/drivers/pci/ops-dreamcast.c | 18 ++++++++---------- arch/sh/drivers/pci/ops-landisk.c | 2 +- arch/sh/drivers/pci/ops-lboxre2.c | 2 +- arch/sh/drivers/pci/ops-r7780rp.c | 2 +- arch/sh/drivers/pci/ops-rts7751r2d.c | 2 +- arch/sh/drivers/pci/ops-sdk7780.c | 2 +- arch/sh/drivers/pci/ops-se7780.c | 2 +- arch/sh/drivers/pci/ops-sh03.c | 2 +- arch/sh/drivers/pci/ops-snapgear.c | 2 +- arch/sh/drivers/pci/ops-titan.c | 2 +- arch/sh/drivers/pci/pci-sh5.c | 6 ++++++ arch/sh/drivers/pci/pci-sh5.h | 1 + arch/sh/drivers/pci/pci-sh7751.c | 11 +++++------ arch/sh/drivers/pci/pci-sh7751.h | 1 + arch/sh/drivers/pci/pci-sh7780.c | 9 ++++----- arch/sh/drivers/pci/pci-sh7780.h | 1 + arch/sh/drivers/pci/pci.c | 23 ++++++++++++++++++----- arch/sh/include/asm/pci.h | 2 ++ 20 files changed, 56 insertions(+), 42 deletions(-) diff --git a/arch/sh/boards/mach-dreamcast/setup.c b/arch/sh/boards/mach-dreamcast/setup.c index d1bee4884cd..ebe99227d4e 100644 --- a/arch/sh/boards/mach-dreamcast/setup.c +++ b/arch/sh/boards/mach-dreamcast/setup.c @@ -30,7 +30,6 @@ extern struct irq_chip systemasic_int; extern void aica_time_init(void); -extern int gapspci_init(void); extern int systemasic_irq_demux(int); static void __init dreamcast_setup(char **cmdline_p) @@ -51,11 +50,6 @@ static void __init dreamcast_setup(char **cmdline_p) handle_level_irq); board_time_init = aica_time_init; - -#ifdef CONFIG_PCI - if (gapspci_init() < 0) - printk(KERN_WARNING "GAPSPCI was not detected.\n"); -#endif } static struct sh_machine_vector mv_dreamcast __initmv = { diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c index 38ef76207af..f4a5e14f7e5 100644 --- a/arch/sh/drivers/pci/ops-cayman.c +++ b/arch/sh/drivers/pci/ops-cayman.c @@ -77,7 +77,7 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin) } struct pci_channel board_pci_channels[] = { - { &sh5_pci_ops, NULL, NULL, 0, 0xff }, + { sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c index f5d2a2aa6f3..f62063eb649 100644 --- a/arch/sh/drivers/pci/ops-dreamcast.c +++ b/arch/sh/drivers/pci/ops-dreamcast.c @@ -42,15 +42,6 @@ static struct resource gapspci_mem_resource = { .flags = IORESOURCE_MEM, }; -static struct pci_ops gapspci_pci_ops; - -struct pci_channel board_pci_channels[] = { - { &gapspci_pci_ops, &gapspci_io_resource, - &gapspci_mem_resource, 0, 1 }, - { 0, } -}; -EXPORT_SYMBOL(board_pci_channels); - /* * The !gapspci_config_access case really shouldn't happen, ever, unless * someone implicitly messes around with the last devfn value.. otherwise we @@ -116,7 +107,7 @@ static struct pci_ops gapspci_pci_ops = { * gapspci init */ -int __init gapspci_init(void) +static int __init gapspci_init(struct pci_channel *chan) { char idbuf[16]; int i; @@ -168,3 +159,10 @@ char * __devinit pcibios_setup(char *str) { return str; } + +struct pci_channel board_pci_channels[] = { + { gapspci_init, &gapspci_pci_ops, &gapspci_io_resource, + &gapspci_mem_resource, 0, 1 }, + { 0, } +}; +EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c index 343c072a5a7..c46911d95ec 100644 --- a/arch/sh/drivers/pci/ops-landisk.c +++ b/arch/sh/drivers/pci/ops-landisk.c @@ -30,7 +30,7 @@ static struct resource sh7751_mem_resource = { }; struct pci_channel board_pci_channels[] = { - {&sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff}, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff}, {NULL, NULL, NULL, 0, 0}, }; diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c index 8bff32a2210..f606df2195c 100644 --- a/arch/sh/drivers/pci/ops-lboxre2.c +++ b/arch/sh/drivers/pci/ops-lboxre2.c @@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = { extern struct pci_ops sh7751_pci_ops; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index bf32ee8b132..b51b7e4078d 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -43,7 +43,7 @@ static struct resource sh7780_mem_resource = { extern struct pci_ops sh7780_pci_ops; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff }, + { sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index e4208a69732..fe5a231b866 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -47,7 +47,7 @@ static struct resource sh7751_mem_resource = { extern struct pci_ops sh7751_pci_ops; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index 21d59d4a215..7277cd15ae6 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -49,7 +49,7 @@ static struct resource sdk7780_mem_resource = { }; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff }, + { sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 78a6f2bc4f1..76a74fb42fb 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -58,7 +58,7 @@ static struct resource se7780_mem_resource = { extern struct pci_ops se7780_pci_ops; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff }, + { sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c index e1703ff5a4d..0218135f0bb 100644 --- a/arch/sh/drivers/pci/ops-sh03.c +++ b/arch/sh/drivers/pci/ops-sh03.c @@ -39,7 +39,7 @@ static struct resource sh7751_mem_resource = { extern struct pci_ops sh4_pci_ops; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c index cba80153dde..2e254c6cf6c 100644 --- a/arch/sh/drivers/pci/ops-snapgear.c +++ b/arch/sh/drivers/pci/ops-snapgear.c @@ -40,7 +40,7 @@ static struct resource sh7751_mem_resource = { }; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { 0, } }; diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c index 69fcc5c5d52..ffa79bdf475 100644 --- a/arch/sh/drivers/pci/ops-titan.c +++ b/arch/sh/drivers/pci/ops-titan.c @@ -52,7 +52,7 @@ static struct resource sh7751_mem_resource = { }; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, + { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c index 7a97438762c..008a02ec0d9 100644 --- a/arch/sh/drivers/pci/pci-sh5.c +++ b/arch/sh/drivers/pci/pci-sh5.c @@ -27,6 +27,12 @@ unsigned long pcicr_virt; unsigned long PCI_IO_AREA; +int __init sh5_pci_init(struct pci_channel *chan) +{ + pr_debug("PCI: Starting intialization.\n"); + return pcibios_init_platform(); +} + /* Rounds a number UP to the nearest power of two. Used for * sizing the PCI window. */ diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h index 7cff3fc04d3..af09f384c7d 100644 --- a/arch/sh/drivers/pci/pci-sh5.h +++ b/arch/sh/drivers/pci/pci-sh5.h @@ -108,6 +108,7 @@ extern unsigned long pcicr_virt; extern struct pci_ops sh5_pci_ops; /* arch/sh/drivers/pci/pci-sh5.c */ +int sh5_pci_init(struct pci_channel *chan); int sh5pci_init(unsigned long memStart, unsigned long memSize); #endif /* __PCI_SH5_H */ diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 9c2c01490d6..230db8bd974 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -32,7 +32,7 @@ * space mapping) will be called via the platform defined function * pcibios_init_platform(). */ -static int __init sh7751_pci_init(void) +int __init sh7751_pci_init(struct pci_channel *chan) { unsigned int id; int ret; @@ -40,19 +40,18 @@ static int __init sh7751_pci_init(void) pr_debug("PCI: Starting intialization.\n"); /* check for SH7751/SH7751R hardware */ - id = pci_read_reg(NULL, SH7751_PCICONF0); + id = pci_read_reg(chan, SH7751_PCICONF0); if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) && id != ((SH7751R_DEVICE_ID << 16) | SH7751_VENDOR_ID)) { pr_debug("PCI: This is not an SH7751(R) (%x)\n", id); return -ENODEV; } - if ((ret = sh4_pci_check_direct(NULL)) != 0) + if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; return pcibios_init_platform(); } -subsys_initcall(sh7751_pci_init); static int __init __area_sdram_check(struct pci_channel *chan, unsigned int area) @@ -178,7 +177,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan, } if (!word) - return 0; + return -1; /* configure the wait control registers */ word = ctrl_inl(SH7751_WCR1); @@ -202,5 +201,5 @@ int __init sh7751_pcic_init(struct pci_channel *chan, word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM; pci_write_reg(chan, word, SH4_PCICR); - return 1; + return 0; } diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h index 6f101e5a6c8..0ea4387df13 100644 --- a/arch/sh/drivers/pci/pci-sh7751.h +++ b/arch/sh/drivers/pci/pci-sh7751.h @@ -130,6 +130,7 @@ struct sh4_pci_address_map; /* arch/sh/drivers/pci/pci-sh7751.c */ +int sh7751_pci_init(struct pci_channel *chan); int sh7751_pcic_init(struct pci_channel *chan, struct sh4_pci_address_map *map); diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 56f673f66cb..4706e880b08 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -45,7 +45,7 @@ * space mapping) will be called via the platform defined function * pcibios_init_platform(). */ -static int __init sh7780_pci_init(void) +int __init sh7780_pci_init(struct pci_channel *chan) { unsigned int id; int ret, match = 0; @@ -55,7 +55,7 @@ static int __init sh7780_pci_init(void) ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */ /* check for SH7780/SH7780R hardware */ - id = pci_read_reg(NULL, SH7780_PCIVID); + id = pci_read_reg(chan, SH7780_PCIVID); if ((id & 0xffff) == SH7780_VENDOR_ID) { switch ((id >> 16) & 0xffff) { case SH7763_DEVICE_ID: @@ -82,12 +82,11 @@ static int __init sh7780_pci_init(void) ctrl_outl(0x33333333, INTC_INTPRI); } - if ((ret = sh4_pci_check_direct(NULL)) != 0) + if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; return pcibios_init_platform(); } -core_initcall(sh7780_pci_init); int __init sh7780_pcic_init(struct pci_channel *chan, struct sh4_pci_address_map *map) @@ -153,5 +152,5 @@ int __init sh7780_pcic_init(struct pci_channel *chan, word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO; pci_write_reg(chan, word, SH4_PCICR); - return 1; + return 0; } diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index d34961153d5..2f3c92065ec 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -109,6 +109,7 @@ struct sh4_pci_address_map; /* arch/sh/drivers/pci/pci-sh7780.c */ +int sh7780_pci_init(struct pci_channel *chan); int sh7780_pcic_init(struct pci_channel *chan, struct sh4_pci_address_map *map); diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 0d6ac7a1db4..29ec16e69af 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -28,18 +28,31 @@ static int __init pcibios_init(void) struct pci_bus *bus; int busno; + /* init channels */ + busno = 0; + for (p = board_pci_channels; p->init; p++) { + if (p->init(p) == 0) + p->enabled = 1; + else + pr_err("Unable to init pci channel %d\n", busno); + busno++; + } + #ifdef CONFIG_PCI_AUTO /* assign resources */ busno = 0; - for (p = board_pci_channels; p->pci_ops != NULL; p++) - busno = pciauto_assign_resources(busno, p) + 1; + for (p = board_pci_channels; p->init; p++) + if (p->enabled) + busno = pciauto_assign_resources(busno, p) + 1; #endif /* scan the buses */ busno = 0; - for (p = board_pci_channels; p->pci_ops != NULL; p++) { - bus = pci_scan_bus(busno, p->pci_ops, p); - busno = bus->subordinate + 1; + for (p = board_pci_channels; p->init; p++) { + if (p->enabled) { + bus = pci_scan_bus(busno, p->pci_ops, p); + busno = bus->subordinate + 1; + } } pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index df1d383e18a..5c7a8f1d2d5 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -17,11 +17,13 @@ * external) PCI controllers. */ struct pci_channel { + int (*init)(struct pci_channel *chan); struct pci_ops *pci_ops; struct resource *io_resource; struct resource *mem_resource; int first_devfn; int last_devfn; + int enabled; }; /* From 710fa3c81151948ac4d836ef52b57cef91b0ab72 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 11 Mar 2009 15:47:23 +0900 Subject: [PATCH 0403/2061] sh: avoid using PCIBIOS_MIN_xxx Replaces PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM with direct struct pci_channel access. This allows us to have more than one pci channel. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-r7780rp.c | 2 +- arch/sh/drivers/pci/fixups-sdk7780.c | 2 +- arch/sh/drivers/pci/fixups-se7780.c | 2 +- arch/sh/drivers/pci/pci-sh5.c | 9 ++++++--- arch/sh/drivers/pci/pci-sh7751.c | 14 ++++++-------- arch/sh/drivers/pci/pci-sh7780.c | 10 ++++------ 6 files changed, 19 insertions(+), 20 deletions(-) diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c index 5b25021bbd6..31f8dfbfcbf 100644 --- a/arch/sh/drivers/pci/fixups-r7780rp.c +++ b/arch/sh/drivers/pci/fixups-r7780rp.c @@ -37,7 +37,7 @@ int pci_fixup_pcic(struct pci_channel *chan) #endif /* Set IOBR for windows containing area specified in pci.h */ - pci_write_reg(chan, (PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)), + pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR); pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)), SH7780_PCIIOBMR); diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c index 3f6754a120e..c2957312b30 100644 --- a/arch/sh/drivers/pci/fixups-sdk7780.c +++ b/arch/sh/drivers/pci/fixups-sdk7780.c @@ -50,7 +50,7 @@ int pci_fixup_pcic(struct pci_channel *chan) pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), + pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR); pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR); diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c index b8e735e01c3..a968af7d6f7 100644 --- a/arch/sh/drivers/pci/fixups-se7780.c +++ b/arch/sh/drivers/pci/fixups-se7780.c @@ -51,7 +51,7 @@ int pci_fixup_pcic(struct pci_channel *chan) pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(chan, PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE-1), + pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), SH7780_PCIIOBR); pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), SH7780_PCIIOBMR); diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c index 008a02ec0d9..7750da27628 100644 --- a/arch/sh/drivers/pci/pci-sh5.c +++ b/arch/sh/drivers/pci/pci-sh5.c @@ -206,6 +206,9 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize) return 0; } +#define xPCIBIOS_MIN_IO board_pci_channels->io_resource->start +#define xPCIBIOS_MIN_MEM board_pci_channels->mem_resource->start + void __devinit pcibios_fixup_bus(struct pci_bus *bus) { struct pci_dev *dev = bus->self; @@ -223,9 +226,9 @@ void __devinit pcibios_fixup_bus(struct pci_bus *bus) /* For now, propagate host limits to the bus; * we'll adjust them later. */ bus->resource[0]->end = 64*1024 - 1 ; - bus->resource[1]->end = PCIBIOS_MIN_MEM+(256*1024*1024)-1; - bus->resource[0]->start = PCIBIOS_MIN_IO; - bus->resource[1]->start = PCIBIOS_MIN_MEM; + bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1; + bus->resource[0]->start = xPCIBIOS_MIN_IO; + bus->resource[1]->start = xPCIBIOS_MIN_MEM; /* Turn off downstream PF memory address range by default */ bus->resource[2]->start = 1024*1024; diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 230db8bd974..447234c69ab 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -144,22 +144,20 @@ int __init sh7751_pcic_init(struct pci_channel *chan, /* Set the local 16MB PCI memory space window to * the lowest PCI mapped address */ - word = PCIBIOS_MIN_MEM & SH4_PCIMBR_MASK; + word = chan->mem_resource->start & SH4_PCIMBR_MASK; pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word); pci_write_reg(chan, word , SH4_PCIMBR); - /* Map IO space into PCI IO window - * The IO window is 64K-PCIBIOS_MIN_IO in size - * IO addresses will be translated to the - * PCI IO window base address + /* Map IO space into PCI IO window: + * IO addresses will be translated to the PCI IO window base address */ pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n", - PCIBIOS_MIN_IO, (64 << 10), - SH7751_PCI_IO_BASE + PCIBIOS_MIN_IO); + chan->io_resource->start, chan->io_resource->end, + SH7751_PCI_IO_BASE + chan->io_resource->start); /* Make sure the MSB's of IO window are set to access PCI space * correctly */ - word = PCIBIOS_MIN_IO & SH4_PCIIOBR_MASK; + word = chan->io_resource->start & SH4_PCIIOBR_MASK; pr_debug("PCI: Setting upper bits of IO window to 0x%x\n", word); pci_write_reg(chan, word, SH4_PCIIOBR); diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 4706e880b08..e8f3a308c07 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -130,14 +130,12 @@ int __init sh7780_pcic_init(struct pci_channel *chan, pci_write_reg(chan, map->window1.base, SH4_PCILAR1); pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1); - /* Map IO space into PCI IO window - * The IO window is 64K-PCIBIOS_MIN_IO in size - * IO addresses will be translated to the - * PCI IO window base address + /* Map IO space into PCI IO window: + * IO addresses will be translated to the PCI IO window base address */ pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n", - PCIBIOS_MIN_IO, (64 << 10), - SH7780_PCI_IO_BASE + PCIBIOS_MIN_IO); + chan->io_resource->start, chan->io_resource->end, + SH7780_PCI_IO_BASE + chan->io_resource->start); /* NOTE: I'm ignoring the PCI error IRQs for now.. * TODO: add support for the internal error interrupts and From b6706ef10f75921733d7275fd45d268f2f6254c8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:34:55 +0900 Subject: [PATCH 0404/2061] sh: hook in struct pci_channel in sysdata Store a struct pci_channel pointer in bus->sysdata. This makes whatever struct pci_channel assigned to a bus available for sh4_pci_read() and sh4_pci_write(). We also modify PCIBIOS_MIN_IO and PCIBIOS_MIN_MEM to use bus->sysdata - this to gives us support for multiple pci channels. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-sh4.c | 12 +++++++----- arch/sh/drivers/pci/pci-auto.c | 1 + arch/sh/include/asm/pci.h | 6 ++++-- 3 files changed, 12 insertions(+), 7 deletions(-) diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index 92d27f734f2..ee62e6de713 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -26,6 +26,7 @@ static DEFINE_SPINLOCK(sh4_pci_lock); static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { + struct pci_channel *chan = bus->sysdata; unsigned long flags; u32 data; @@ -34,8 +35,8 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn, * so we must do byte alignment by hand */ spin_lock_irqsave(&sh4_pci_lock, flags); - pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); - data = pci_read_reg(NULL, SH4_PCIPDR); + pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); + data = pci_read_reg(chan, SH4_PCIPDR); spin_unlock_irqrestore(&sh4_pci_lock, flags); switch (size) { @@ -63,13 +64,14 @@ static int sh4_pci_read(struct pci_bus *bus, unsigned int devfn, static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 val) { + struct pci_channel *chan = bus->sysdata; unsigned long flags; int shift; u32 data; spin_lock_irqsave(&sh4_pci_lock, flags); - pci_write_reg(NULL, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); - data = pci_read_reg(NULL, SH4_PCIPDR); + pci_write_reg(chan, CONFIG_CMD(bus, devfn, where), SH4_PCIPAR); + data = pci_read_reg(chan, SH4_PCIPDR); spin_unlock_irqrestore(&sh4_pci_lock, flags); switch (size) { @@ -90,7 +92,7 @@ static int sh4_pci_write(struct pci_bus *bus, unsigned int devfn, return PCIBIOS_FUNC_NOT_SUPPORTED; } - pci_write_reg(NULL, data, SH4_PCIPDR); + pci_write_reg(chan, data, SH4_PCIPDR); return PCIBIOS_SUCCESSFUL; } diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c index cf48b12ee58..1d715ec405b 100644 --- a/arch/sh/drivers/pci/pci-auto.c +++ b/arch/sh/drivers/pci/pci-auto.c @@ -67,6 +67,7 @@ static struct pci_dev *fake_pci_dev(struct pci_channel *hose, dev.devfn = devfn; bus.number = busnr; bus.ops = hose->pci_ops; + bus.sysdata = hose; if(busnr != top_bus) /* Fake a parent bus structure. */ diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 5c7a8f1d2d5..386587e0883 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -31,8 +31,10 @@ struct pci_channel { */ extern struct pci_channel board_pci_channels[]; -#define PCIBIOS_MIN_IO board_pci_channels->io_resource->start -#define PCIBIOS_MIN_MEM board_pci_channels->mem_resource->start +/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */ +#define __PCI_CHAN(bus) ((struct pci_channel *)bus->sysdata) +#define PCIBIOS_MIN_IO __PCI_CHAN(bus)->io_resource->start +#define PCIBIOS_MIN_MEM __PCI_CHAN(bus)->mem_resource->start /* * I/O routine helpers From e4c6a3604e07185046e2ce4be82a201f4447d788 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:35:04 +0900 Subject: [PATCH 0405/2061] sh: add reg_base member to pci_channel Store the base address of the pci host controller registers in struct pci_channel and use the address in pci_read_reg() and pci_write_reg(). Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-sh4.c | 4 ++-- arch/sh/drivers/pci/pci-sh4.h | 4 ++-- arch/sh/drivers/pci/pci-sh7751.c | 2 ++ arch/sh/drivers/pci/pci-sh7751.h | 1 - arch/sh/drivers/pci/pci-sh7780.c | 2 ++ arch/sh/drivers/pci/pci-sh7780.h | 1 - arch/sh/include/asm/pci.h | 1 + 7 files changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index ee62e6de713..540683d07c7 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -121,8 +121,8 @@ int __init sh4_pci_check_direct(struct pci_channel *chan) if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) { pci_write_reg(chan, tmp, SH4_PCIPAR); printk(KERN_INFO "PCI: Using configuration type 1\n"); - request_region(PCI_REG(SH4_PCIPAR), 8, "PCI conf1"); - + request_region(chan->reg_base + SH4_PCIPAR, 8, + "PCI conf1"); return 0; } diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h index 62ba3505608..90abfe3d39b 100644 --- a/arch/sh/drivers/pci/pci-sh4.h +++ b/arch/sh/drivers/pci/pci-sh4.h @@ -171,13 +171,13 @@ struct sh4_pci_address_map { static inline void pci_write_reg(struct pci_channel *chan, unsigned long val, unsigned long reg) { - ctrl_outl(val, PCI_REG(reg)); + ctrl_outl(val, chan->reg_base + reg); } static inline unsigned long pci_read_reg(struct pci_channel *chan, unsigned long reg) { - return ctrl_inl(PCI_REG(reg)); + return ctrl_inl(chan->reg_base + reg); } #endif /* __PCI_SH4_H */ diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 447234c69ab..201266b020f 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -39,6 +39,8 @@ int __init sh7751_pci_init(struct pci_channel *chan) pr_debug("PCI: Starting intialization.\n"); + chan->reg_base = 0xfe200000; + /* check for SH7751/SH7751R hardware */ id = pci_read_reg(chan, SH7751_PCICONF0); if (id != ((SH7751_DEVICE_ID << 16) | SH7751_VENDOR_ID) && diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h index 0ea4387df13..c390dd2f5e1 100644 --- a/arch/sh/drivers/pci/pci-sh7751.h +++ b/arch/sh/drivers/pci/pci-sh7751.h @@ -26,7 +26,6 @@ #define SH7751_PCI_IO_SIZE 0x40000 /* Size of IO window */ #define SH7751_PCIREG_BASE 0xFE200000 /* PCI regs base address */ -#define PCI_REG(n) (SH7751_PCIREG_BASE+ n) #define SH7751_PCICONF0 0x0 /* PCI Config Reg 0 */ #define SH7751_PCICONF0_DEVID 0xFFFF0000 /* Device ID */ diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index e8f3a308c07..9d6483a26cf 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -52,6 +52,8 @@ int __init sh7780_pci_init(struct pci_channel *chan) pr_debug("PCI: Starting intialization.\n"); + chan->reg_base = 0xfe040000; + ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */ /* check for SH7780/SH7780R hardware */ diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index 2f3c92065ec..fffcf1dcfed 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -35,7 +35,6 @@ #define SH7780_PCI_IO_SIZE 0x00400000 /* Size of IO window */ #define SH7780_PCIREG_BASE 0xFE040000 /* PCI regs base address */ -#define PCI_REG(n) (SH7780_PCIREG_BASE+n) /* SH7780 PCI Config Registers */ #define SH7780_PCIVID 0x000 /* Vendor ID */ diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 386587e0883..8e9e0edcf36 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -24,6 +24,7 @@ struct pci_channel { int first_devfn; int last_devfn; int enabled; + unsigned long reg_base; }; /* From ef53fdeb7e0cb139aff33665635b886700137abb Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:35:14 +0900 Subject: [PATCH 0406/2061] sh: add io_base member to pci_channel Store the io window base address in struct pci_channel and use that one instead of SH77xx_PCI_IO_BASE. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7751.c | 5 +++-- arch/sh/drivers/pci/pci-sh7780.c | 5 +++-- arch/sh/include/asm/pci.h | 1 + 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 201266b020f..2a6c7aab2d7 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -40,6 +40,7 @@ int __init sh7751_pci_init(struct pci_channel *chan) pr_debug("PCI: Starting intialization.\n"); chan->reg_base = 0xfe200000; + chan->io_base = 0xfe240000; /* check for SH7751/SH7751R hardware */ id = pci_read_reg(chan, SH7751_PCICONF0); @@ -153,9 +154,9 @@ int __init sh7751_pcic_init(struct pci_channel *chan, /* Map IO space into PCI IO window: * IO addresses will be translated to the PCI IO window base address */ - pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n", + pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n", chan->io_resource->start, chan->io_resource->end, - SH7751_PCI_IO_BASE + chan->io_resource->start); + chan->io_base + chan->io_resource->start); /* Make sure the MSB's of IO window are set to access PCI space * correctly */ diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 9d6483a26cf..87a7f3b7a38 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -53,6 +53,7 @@ int __init sh7780_pci_init(struct pci_channel *chan) pr_debug("PCI: Starting intialization.\n"); chan->reg_base = 0xfe040000; + chan->io_base = 0xfe200000; ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */ @@ -135,9 +136,9 @@ int __init sh7780_pcic_init(struct pci_channel *chan, /* Map IO space into PCI IO window: * IO addresses will be translated to the PCI IO window base address */ - pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%x\n", + pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n", chan->io_resource->start, chan->io_resource->end, - SH7780_PCI_IO_BASE + chan->io_resource->start); + chan->io_base + chan->io_resource->start); /* NOTE: I'm ignoring the PCI error IRQs for now.. * TODO: add support for the internal error interrupts and diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 8e9e0edcf36..84d12ebef08 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -25,6 +25,7 @@ struct pci_channel { int last_devfn; int enabled; unsigned long reg_base; + unsigned long io_base; }; /* From ef339f241b08a16af58897e6288ba200e0c7a8c7 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:35:22 +0900 Subject: [PATCH 0407/2061] sh: pci memory range checking code This patch changes the code to use __is_pci_memory() instead of is_pci_memaddr(). __is_pci_memory() loops through all the pci channels on the system to match memory windows. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/mach-titan/io.c | 2 +- arch/sh/drivers/pci/pci.c | 5 ++--- arch/sh/include/asm/pci.h | 23 +++++++++++++++++++---- arch/sh/mm/ioremap_32.c | 4 ++-- 4 files changed, 24 insertions(+), 10 deletions(-) diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c index 4badad4c6f3..053b3ed2ed8 100644 --- a/arch/sh/boards/mach-titan/io.c +++ b/arch/sh/boards/mach-titan/io.c @@ -117,7 +117,7 @@ void titan_outsl(unsigned long port, const void *src, unsigned long count) void __iomem *titan_ioport_map(unsigned long port, unsigned int size) { - if (PXSEG(port) || is_pci_memaddr(port)) + if (PXSEG(port)) return (void __iomem *)port; else if (is_pci_ioaddr(port)) return (void __iomem *)pci_ioaddr(port); diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 29ec16e69af..b9aca547804 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -167,9 +167,8 @@ void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) /* * Presently the IORESOURCE_MEM case is a bit special, most * SH7751 style PCI controllers have PCI memory at a fixed - * location in the address space where no remapping is desired - * (typically at 0xfd000000, but is_pci_memaddr() will know - * best). With the IORESOURCE_MEM case more care has to be taken + * location in the address space where no remapping is desired. + * With the IORESOURCE_MEM case more care has to be taken * to inhibit page table mapping for legacy cores, but this is * punted off to __ioremap(). * -- PFM. diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 84d12ebef08..ccf5c5ff62f 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -61,12 +61,8 @@ extern unsigned long PCI_IO_AREA; #define is_pci_ioaddr(port) \ (((port) >= PCIBIOS_MIN_IO) && \ ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE))) -#define is_pci_memaddr(port) \ - (((port) >= PCIBIOS_MIN_MEM) && \ - ((port) < (PCIBIOS_MIN_MEM + PCI_MEM_SIZE))) #else #define is_pci_ioaddr(port) (0) -#define is_pci_memaddr(port) (0) #endif struct pci_dev; @@ -127,6 +123,25 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, *strat = PCI_DMA_BURST_INFINITY; *strategy_parameter = ~0UL; } + +static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) +{ + struct pci_channel *p; + struct resource *res; + + for (p = board_pci_channels; p->init; p++) { + res = p->mem_resource; + if (p->enabled && (phys_addr >= res->start) && + (phys_addr + size) <= (res->end + 1)) + return 1; + } + return 0; +} +#else +static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) +{ + return 0; +} #endif /* Board-specific fixup routines. */ diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c index 60cc486d2c2..7e04cc8f3b9 100644 --- a/arch/sh/mm/ioremap_32.c +++ b/arch/sh/mm/ioremap_32.c @@ -56,7 +56,7 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, * P1/P2 space, ioremap() will already do the right thing, * and we'll never get this far. */ - if (is_pci_memaddr(phys_addr) && is_pci_memaddr(last_addr)) + if (__is_pci_memory(phys_addr, size)) return (void __iomem *)phys_addr; #if !defined(CONFIG_PMB_FIXED) @@ -121,7 +121,7 @@ void __iounmap(void __iomem *addr) unsigned long seg = PXSEG(vaddr); struct vm_struct *p; - if (seg < P3SEG || vaddr >= P3_ADDR_MAX || is_pci_memaddr(vaddr)) + if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0)) return; #ifdef CONFIG_PMB From 8ce0143b11cdc519b8e1fd94a262b654ef0bc3ab Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:35:31 +0900 Subject: [PATCH 0408/2061] sh: pci io port base address code Adds a __get_pci_io_base() function which is used to match a port range against struct pci_channel. This allows us to detect if a port range is assigned to pci or happens to be legacy port io. While at it, remove unused cpu-specific cruft. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-rts7751r2d.c | 1 - arch/sh/include/asm/pci.h | 47 ++++++++++++---------------- arch/sh/kernel/io.c | 5 +++ 3 files changed, 25 insertions(+), 28 deletions(-) diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index fe5a231b866..58ed1d58cff 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -68,7 +68,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { int __init pcibios_init_platform(void) { - __set_io_port_base(SH7751_PCI_IO_BASE); return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); } diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index ccf5c5ff62f..bb2c2fcddc9 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -38,33 +38,6 @@ extern struct pci_channel board_pci_channels[]; #define PCIBIOS_MIN_IO __PCI_CHAN(bus)->io_resource->start #define PCIBIOS_MIN_MEM __PCI_CHAN(bus)->mem_resource->start -/* - * I/O routine helpers - */ -#if defined(CONFIG_CPU_SUBTYPE_SH7780) || defined(CONFIG_CPU_SUBTYPE_SH7785) -#define PCI_IO_AREA 0xFE400000 -#define PCI_IO_SIZE 0x00400000 -#elif defined(CONFIG_CPU_SH5) -extern unsigned long PCI_IO_AREA; -#define PCI_IO_SIZE 0x00010000 -#else -#define PCI_IO_AREA 0xFE240000 -#define PCI_IO_SIZE 0x00040000 -#endif - -#define PCI_MEM_SIZE 0x01000000 - -#define SH4_PCIIOBR_MASK 0xFFFC0000 -#define pci_ioaddr(addr) (PCI_IO_AREA + (addr & ~SH4_PCIIOBR_MASK)) - -#if defined(CONFIG_PCI) -#define is_pci_ioaddr(port) \ - (((port) >= PCIBIOS_MIN_IO) && \ - ((port) < (PCIBIOS_MIN_IO + PCI_IO_SIZE))) -#else -#define is_pci_ioaddr(port) (0) -#endif - struct pci_dev; extern void pcibios_set_master(struct pci_dev *dev); @@ -137,11 +110,31 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) } return 0; } + +static inline void __iomem *__get_pci_io_base(unsigned long port, + unsigned long size) +{ + struct pci_channel *p; + struct resource *res; + + for (p = board_pci_channels; p->init; p++) { + res = p->io_resource; + if (p->enabled && (port >= res->start) && + (port + size) <= (res->end + 1)) + return (void __iomem *)(p->io_base + port); + } + return NULL; +} #else static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) { return 0; } +static inline void __iomem *__get_pci_io_base(unsigned long port, + unsigned long size) +{ + return NULL; +} #endif /* Board-specific fixup routines. */ diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c index 29cf4588fc0..59fb020718a 100644 --- a/arch/sh/kernel/io.c +++ b/arch/sh/kernel/io.c @@ -12,6 +12,7 @@ * for more details. */ #include +#include #include #include @@ -69,6 +70,10 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr) if (ret) return ret; + ret = __get_pci_io_base(port, nr); + if (ret) + return ret; + return __ioport_map(port, nr); } EXPORT_SYMBOL(ioport_map); From aa5d3ff99cc1f3dfe262ab9dd9179131fcfe39fd Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 19 Feb 2008 21:35:40 +0900 Subject: [PATCH 0409/2061] sh: export board_pci_channels in one place Instead of sometimes exporting board_pci_channels[] in the board specific code just export it in one place. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-cayman.c | 1 - arch/sh/drivers/pci/ops-lboxre2.c | 2 -- arch/sh/drivers/pci/ops-r7780rp.c | 1 - arch/sh/drivers/pci/ops-rts7751r2d.c | 1 - arch/sh/drivers/pci/ops-sdk7780.c | 1 - arch/sh/drivers/pci/ops-se7780.c | 1 - arch/sh/drivers/pci/ops-titan.c | 1 - arch/sh/drivers/pci/pci.c | 2 ++ 8 files changed, 2 insertions(+), 8 deletions(-) diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c index f4a5e14f7e5..cbaaf0234b7 100644 --- a/arch/sh/drivers/pci/ops-cayman.c +++ b/arch/sh/drivers/pci/ops-cayman.c @@ -80,7 +80,6 @@ struct pci_channel board_pci_channels[] = { { sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); int __init pcibios_init_platform(void) { diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c index f606df2195c..781496bfb1f 100644 --- a/arch/sh/drivers/pci/ops-lboxre2.c +++ b/arch/sh/drivers/pci/ops-lboxre2.c @@ -43,8 +43,6 @@ struct pci_channel board_pci_channels[] = { { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); - static struct sh4_pci_address_map sh7751_pci_map = { .window0 = { .base = SH7751_CS3_BASE_ADDR, diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index b51b7e4078d..c58f1cff9fb 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -46,7 +46,6 @@ struct pci_channel board_pci_channels[] = { { sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map sh7780_pci_map = { .window0 = { diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index 58ed1d58cff..d374cd37f45 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -50,7 +50,6 @@ struct pci_channel board_pci_channels[] = { { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map sh7751_pci_map = { .window0 = { diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index 7277cd15ae6..b34fbc54a7c 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -52,7 +52,6 @@ struct pci_channel board_pci_channels[] = { { sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map sdk7780_pci_map = { .window0 = { diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 76a74fb42fb..47302077a0c 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -61,7 +61,6 @@ struct pci_channel board_pci_channels[] = { { sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map se7780_pci_map = { .window0 = { diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c index ffa79bdf475..31ed03716a2 100644 --- a/arch/sh/drivers/pci/ops-titan.c +++ b/arch/sh/drivers/pci/ops-titan.c @@ -55,7 +55,6 @@ struct pci_channel board_pci_channels[] = { { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map sh7751_pci_map = { .window0 = { diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index b9aca547804..8aaacc99b71 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -187,3 +187,5 @@ void pci_iounmap(struct pci_dev *dev, void __iomem *addr) iounmap(addr); } EXPORT_SYMBOL(pci_iounmap); + +EXPORT_SYMBOL(board_pci_channels); From 10591578c84825a320734e59272f161385edcc41 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 11 Mar 2009 15:58:04 +0900 Subject: [PATCH 0410/2061] sh: drop duplicate symbol export on dreamcast and sh7785lcr. With board_pci_channels now being exported in a single place, update the boards that duplicated the export. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-dreamcast.c | 1 - arch/sh/drivers/pci/ops-sh7785lcr.c | 1 - 2 files changed, 2 deletions(-) diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c index f62063eb649..5bc81d5b660 100644 --- a/arch/sh/drivers/pci/ops-dreamcast.c +++ b/arch/sh/drivers/pci/ops-dreamcast.c @@ -165,4 +165,3 @@ struct pci_channel board_pci_channels[] = { &gapspci_mem_resource, 0, 1 }, { 0, } }; -EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c index fb0869f0bef..2a9f9a599d6 100644 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ b/arch/sh/drivers/pci/ops-sh7785lcr.c @@ -44,7 +44,6 @@ struct pci_channel board_pci_channels[] = { { &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; -EXPORT_SYMBOL(board_pci_channels); static struct sh4_pci_address_map sh7785_pci_map = { .window0 = { From 3aabce8d3d2f9af2c08c2f590ac9acb272ca8c95 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 11 Mar 2009 16:12:39 +0900 Subject: [PATCH 0411/2061] sh: sh7785lcr: Update for recent PCI changes. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-sh7785lcr.c | 33 +++++++++++++------------- arch/sh/drivers/pci/ops-sh7785lcr.c | 4 ++-- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c index 4949e601387..9e7dc79037e 100644 --- a/arch/sh/drivers/pci/fixups-sh7785lcr.c +++ b/arch/sh/drivers/pci/fixups-sh7785lcr.c @@ -15,32 +15,33 @@ #include #include "pci-sh4.h" -int pci_fixup_pcic(void) +int pci_fixup_pcic(struct pci_channel *chan) { - pci_write_reg(0x000043ff, SH4_PCIINTM); - pci_write_reg(0x0000380f, SH4_PCIAINTM); + pci_write_reg(chan, 0x000043ff, SH4_PCIINTM); + pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - pci_write_reg(0xfbb00047, SH7780_PCICMD); - pci_write_reg(0x00000000, SH7780_PCIIBAR); + pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD); + pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR); - pci_write_reg(0x00011912, SH7780_PCISVID); - pci_write_reg(0x08000000, SH7780_PCICSCR0); - pci_write_reg(0x0000001b, SH7780_PCICSAR0); - pci_write_reg(0xfd000000, SH7780_PCICSCR1); - pci_write_reg(0x0000000f, SH7780_PCICSAR1); + pci_write_reg(chan, 0x00011912, SH7780_PCISVID); + pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0); + pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0); + pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1); + pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1); - pci_write_reg(0xfd000000, SH7780_PCIMBR0); - pci_write_reg(0x00fc0000, SH7780_PCIMBMR0); + pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0); + pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0); #ifdef CONFIG_32BIT - pci_write_reg(0xc0000000, SH7780_PCIMBR2); - pci_write_reg(0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); + pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2); + pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); #endif /* Set IOBR for windows containing area specified in pci.h */ - pci_write_reg((PCIBIOS_MIN_IO & ~(SH7780_PCI_IO_SIZE - 1)), + pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1), SH7780_PCIIOBR); - pci_write_reg(((SH7780_PCI_IO_SIZE - 1) & (7 << 18)), SH7780_PCIIOBMR); + pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)), + SH7780_PCIIOBMR); return 0; } diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c index 2a9f9a599d6..afbb9bd4751 100644 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ b/arch/sh/drivers/pci/ops-sh7785lcr.c @@ -41,7 +41,7 @@ static struct resource sh7785_mem_resource = { }; struct pci_channel board_pci_channels[] = { - { &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, + { sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, { NULL, NULL, NULL, 0, 0 }, }; @@ -61,5 +61,5 @@ static struct sh4_pci_address_map sh7785_pci_map = { int __init pcibios_init_platform(void) { - return sh7780_pcic_init(&sh7785_pci_map); + return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map); } From f1a9ba8f15f89f3f444985251efaf809cd16da53 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 11 Mar 2009 16:17:53 +0900 Subject: [PATCH 0412/2061] sh: pci: drop duplicate PCIC fixups for SE7780 and SH7785LCR. SE7780 has the same PCIC fixup as SDK7780, and SH7785LCR the same as R7780RP. Switch to using those, and drop the duplicate code. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 4 +- arch/sh/drivers/pci/fixups-se7780.c | 62 -------------------------- arch/sh/drivers/pci/fixups-sh7785lcr.c | 47 ------------------- 3 files changed, 2 insertions(+), 111 deletions(-) delete mode 100644 arch/sh/drivers/pci/fixups-se7780.c delete mode 100644 arch/sh/drivers/pci/fixups-sh7785lcr.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 847e90894d1..67d710a04de 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -21,6 +21,6 @@ obj-$(CONFIG_SH_SDK7780) += ops-sdk7780.o fixups-sdk7780.o obj-$(CONFIG_SH_TITAN) += ops-titan.o obj-$(CONFIG_SH_LANDISK) += ops-landisk.o obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-lboxre2.o -obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += ops-se7780.o fixups-se7780.o +obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += ops-se7780.o fixups-sdk7780.o obj-$(CONFIG_SH_CAYMAN) += ops-cayman.o -obj-$(CONFIG_SH_SH7785LCR) += ops-sh7785lcr.o fixups-sh7785lcr.o +obj-$(CONFIG_SH_SH7785LCR) += ops-sh7785lcr.o fixups-r7780rp.o diff --git a/arch/sh/drivers/pci/fixups-se7780.c b/arch/sh/drivers/pci/fixups-se7780.c deleted file mode 100644 index a968af7d6f7..00000000000 --- a/arch/sh/drivers/pci/fixups-se7780.c +++ /dev/null @@ -1,62 +0,0 @@ -/* - * arch/sh/drivers/pci/fixups-se7780.c - * - * HITACHI UL Solution Engine 7780 PCI fixups - * - * Copyright (C) 2003 Lineo uSolutions, Inc. - * Copyright (C) 2004 - 2006 Paul Mundt - * Copyright (C) 2006 Nobuhiro Iwamatsu - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include "pci-sh4.h" -#include - -int pci_fixup_pcic(struct pci_channel *chan) -{ - ctrl_outl(0x00000001, SH7780_PCI_VCR2); - - /* Enable all interrupts, so we know what to fix */ - pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR); - pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM); - - /* Set up standard PCI config registers */ - ctrl_outw(0xFB00, PCI_REG(SH7780_PCISTATUS)); - ctrl_outw(0x0047, PCI_REG(SH7780_PCICMD)); - ctrl_outb( 0x00, PCI_REG(SH7780_PCIPIF)); - ctrl_outb( 0x00, PCI_REG(SH7780_PCISUB)); - ctrl_outb( 0x06, PCI_REG(SH7780_PCIBCC)); - ctrl_outw(0x1912, PCI_REG(SH7780_PCISVID)); - ctrl_outw(0x0001, PCI_REG(SH7780_PCISID)); - - pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0); /* PCI */ - pci_write_reg(chan, 0x08000000, SH7780_PCILAR0); /* SHwy */ - pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ - - pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1); - pci_write_reg(chan, 0x00000000, SH7780_PCILAR1); - pci_write_reg(chan, 0x00000000, SH7780_PCILSR1); - - pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR); - - /* - * Set the MBR so PCI address is one-to-one with window, - * meaning all calls go straight through... use ifdef to - * catch erroneous assumption. - */ - pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0); - pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ - - /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), - SH7780_PCIIOBR); - pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), - SH7780_PCIIOBMR); - - pci_write_reg(chan, 0xA5000C01, SH7780_PCICR); - - return 0; -} diff --git a/arch/sh/drivers/pci/fixups-sh7785lcr.c b/arch/sh/drivers/pci/fixups-sh7785lcr.c deleted file mode 100644 index 9e7dc79037e..00000000000 --- a/arch/sh/drivers/pci/fixups-sh7785lcr.c +++ /dev/null @@ -1,47 +0,0 @@ -/* - * arch/sh/drivers/pci/fixups-sh7785lcr.c - * - * R0P7785LC0011RL PCI fixups - * Copyright (C) 2008 Yoshihiro Shimoda - * - * Based on arch/sh/drivers/pci/fixups-r7780rp.c - * Copyright (C) 2003 Lineo uSolutions, Inc. - * Copyright (C) 2004 - 2006 Paul Mundt - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include "pci-sh4.h" - -int pci_fixup_pcic(struct pci_channel *chan) -{ - pci_write_reg(chan, 0x000043ff, SH4_PCIINTM); - pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - - pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD); - pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR); - - pci_write_reg(chan, 0x00011912, SH7780_PCISVID); - pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0); - pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0); - pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1); - pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1); - - pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0); - pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0); - -#ifdef CONFIG_32BIT - pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2); - pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); -#endif - - /* Set IOBR for windows containing area specified in pci.h */ - pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE - 1), - SH7780_PCIIOBR); - pci_write_reg(chan, ((SH7780_PCI_IO_SIZE - 1) & (7 << 18)), - SH7780_PCIIOBMR); - - return 0; -} From d0deef5b14af7d5bbd0003a0a2a1a32326e20a6d Mon Sep 17 00:00:00 2001 From: Shawn Du Date: Tue, 14 Apr 2009 13:58:56 +0800 Subject: [PATCH 0413/2061] blktrace: support per-partition tracing Though one can specify '-d /dev/sda1' when using blktrace, it still traces the whole sda. To support per-partition tracing, when we start tracing, we initialize bt->start_lba and bt->end_lba to the start and end sector of that partition. Note some actions are per device, thus we don't filter 0-sector events. The original patch and discussion can be found here: http://marc.info/?l=linux-btrace&m=122949374214540&w=2 Signed-off-by: Shawn Du Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Jens Axboe LKML-Reference: <49E42620.4050701@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/compat_ioctl.c | 2 +- drivers/scsi/sg.c | 1 + include/linux/blktrace_api.h | 24 +++++++++++++----------- kernel/trace/blktrace.c | 29 +++++++++++++++++++++-------- 4 files changed, 36 insertions(+), 20 deletions(-) diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46..f8c218cd08e 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -568,7 +568,7 @@ static int compat_blk_trace_setup(struct block_device *bdev, char __user *arg) memcpy(&buts.name, &cbuts.name, 32); mutex_lock(&bdev->bd_mutex); - ret = do_blk_trace_setup(q, b, bdev->bd_dev, &buts); + ret = do_blk_trace_setup(q, b, bdev->bd_dev, bdev, &buts); mutex_unlock(&bdev->bd_mutex); if (ret) return ret; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0b..49c98730bb8 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1065,6 +1065,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return blk_trace_setup(sdp->device->request_queue, sdp->disk->disk_name, MKDEV(SCSI_GENERIC_MAJOR, sdp->index), + NULL, (char *)arg); case BLKTRACESTART: return blk_trace_startstop(sdp->device->request_queue, 1); diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index d960889e92e..267edc4017e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -165,8 +165,9 @@ struct blk_trace { extern int blk_trace_ioctl(struct block_device *, unsigned, char __user *); extern void blk_trace_shutdown(struct request_queue *); -extern int do_blk_trace_setup(struct request_queue *q, - char *name, dev_t dev, struct blk_user_trace_setup *buts); +extern int do_blk_trace_setup(struct request_queue *q, char *name, + dev_t dev, struct block_device *bdev, + struct blk_user_trace_setup *buts); extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); /** @@ -193,6 +194,7 @@ extern void __trace_note_message(struct blk_trace *, const char *fmt, ...); extern void blk_add_driver_data(struct request_queue *q, struct request *rq, void *data, size_t len); extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); @@ -200,15 +202,15 @@ extern int blk_trace_remove(struct request_queue *q); extern struct attribute_group blk_trace_attr_group; #else /* !CONFIG_BLK_DEV_IO_TRACE */ -#define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) -#define blk_trace_shutdown(q) do { } while (0) -#define do_blk_trace_setup(q, name, dev, buts) (-ENOTTY) -#define blk_add_driver_data(q, rq, data, len) do {} while (0) -#define blk_trace_setup(q, name, dev, arg) (-ENOTTY) -#define blk_trace_startstop(q, start) (-ENOTTY) -#define blk_trace_remove(q) (-ENOTTY) -#define blk_add_trace_msg(q, fmt, ...) do { } while (0) - +# define blk_trace_ioctl(bdev, cmd, arg) (-ENOTTY) +# define blk_trace_shutdown(q) do { } while (0) +# define do_blk_trace_setup(q, name, dev, bdev, buts) (-ENOTTY) +# define blk_add_driver_data(q, rq, data, len) do {} while (0) +# define blk_trace_setup(q, name, dev, bdev, arg) (-ENOTTY) +# define blk_trace_startstop(q, start) (-ENOTTY) +# define blk_trace_remove(q) (-ENOTTY) +# define blk_add_trace_msg(q, fmt, ...) do { } while (0) #endif /* CONFIG_BLK_DEV_IO_TRACE */ + #endif /* __KERNEL__ */ #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 2b98195b338..e932654cf59 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -147,7 +147,7 @@ static int act_log_check(struct blk_trace *bt, u32 what, sector_t sector, { if (((bt->act_mask << BLK_TC_SHIFT) & what) == 0) return 1; - if (sector < bt->start_lba || sector > bt->end_lba) + if (sector && (sector < bt->start_lba || sector > bt->end_lba)) return 1; if (bt->pid && pid != bt->pid) return 1; @@ -192,7 +192,7 @@ static void __blk_add_trace(struct blk_trace *bt, sector_t sector, int bytes, what |= MASK_TC_BIT(rw, DISCARD); pid = tsk->pid; - if (unlikely(act_log_check(bt, what, sector, pid))) + if (act_log_check(bt, what, sector, pid)) return; cpu = raw_smp_processor_id(); @@ -407,11 +407,13 @@ static struct rchan_callbacks blk_relay_callbacks = { * Setup everything required to start tracing */ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, - struct blk_user_trace_setup *buts) + struct block_device *bdev, + struct blk_user_trace_setup *buts) { struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; + struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -480,11 +482,21 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - bt->start_lba = buts->start_lba; - bt->end_lba = buts->end_lba; - if (!bt->end_lba) + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else bt->end_lba = -1ULL; + /* overwrite with user settings */ + if (buts->start_lba) + bt->start_lba = buts->start_lba; + if (buts->end_lba) + bt->end_lba = buts->end_lba; + bt->pid = buts->pid; bt->trace_state = Blktrace_setup; @@ -505,6 +517,7 @@ err: } int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, + struct block_device *bdev, char __user *arg) { struct blk_user_trace_setup buts; @@ -514,7 +527,7 @@ int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (ret) return -EFAULT; - ret = do_blk_trace_setup(q, name, dev, &buts); + ret = do_blk_trace_setup(q, name, dev, bdev, &buts); if (ret) return ret; @@ -582,7 +595,7 @@ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) switch (cmd) { case BLKTRACESETUP: bdevname(bdev, b); - ret = blk_trace_setup(q, b, bdev->bd_dev, arg); + ret = blk_trace_setup(q, b, bdev->bd_dev, bdev, arg); break; case BLKTRACESTART: start = 1; From 9908c30997b8a73c95f836170b9998dae9aa3f4a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 13:59:34 +0800 Subject: [PATCH 0414/2061] blktrace: support per-partition tracing for ftrace plugin The previous patch adds support to trace a single partition for relay+ioctl blktrace, and this patch is for ftrace plugin blktrace: # echo 1 > /sys/block/sda/sda7/enable # cat start_lba 102398373 # cat end_lba 102703545 Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42646.4060608@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 39 +++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 14 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e932654cf59..d1098988052 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -403,6 +403,23 @@ static struct rchan_callbacks blk_relay_callbacks = { .remove_buf_file = blk_remove_buf_file_callback, }; +static void blk_trace_setup_lba(struct blk_trace *bt, + struct block_device *bdev) +{ + struct hd_struct *part = NULL; + + if (bdev) + part = bdev->bd_part; + + if (part) { + bt->start_lba = part->start_sect; + bt->end_lba = part->start_sect + part->nr_sects; + } else { + bt->start_lba = 0; + bt->end_lba = -1ULL; + } +} + /* * Setup everything required to start tracing */ @@ -413,7 +430,6 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, struct blk_trace *old_bt, *bt = NULL; struct dentry *dir = NULL; int ret, i; - struct hd_struct *part = NULL; if (!buts->buf_size || !buts->buf_nr) return -EINVAL; @@ -482,14 +498,7 @@ int do_blk_trace_setup(struct request_queue *q, char *name, dev_t dev, if (!bt->act_mask) bt->act_mask = (u16) -1; - if (bdev) - part = bdev->bd_part; - - if (part) { - bt->start_lba = part->start_sect; - bt->end_lba = part->start_sect + part->nr_sects; - } else - bt->end_lba = -1ULL; + blk_trace_setup_lba(bt, bdev); /* overwrite with user settings */ if (buts->start_lba) @@ -1370,7 +1379,8 @@ static int blk_trace_remove_queue(struct request_queue *q) /* * Setup everything required to start tracing */ -static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) +static int blk_trace_setup_queue(struct request_queue *q, + struct block_device *bdev) { struct blk_trace *old_bt, *bt = NULL; int ret = -ENOMEM; @@ -1383,9 +1393,10 @@ static int blk_trace_setup_queue(struct request_queue *q, dev_t dev) if (!bt->msg_data) goto free_bt; - bt->dev = dev; + bt->dev = bdev->bd_dev; bt->act_mask = (u16)-1; - bt->end_lba = -1ULL; + + blk_trace_setup_lba(bt, bdev); old_bt = xchg(&q->blk_trace, bt); if (old_bt != NULL) { @@ -1602,7 +1613,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, if (attr == &dev_attr_enable) { if (value) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); else ret = blk_trace_remove_queue(q); goto out_unlock_bdev; @@ -1610,7 +1621,7 @@ static ssize_t sysfs_blk_trace_attr_store(struct device *dev, ret = 0; if (q->blk_trace == NULL) - ret = blk_trace_setup_queue(q, bdev->bd_dev); + ret = blk_trace_setup_queue(q, bdev); if (ret == 0) { if (attr == &dev_attr_act_mask) From 1d54ad6da9192fed5dd3b60224d9f2dfea0dcd82 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 14 Apr 2009 14:00:05 +0800 Subject: [PATCH 0415/2061] blktrace: add trace/ to /sys/block/sda Impact: allow ftrace-plugin blktrace to trace device-mapper devices To trace a single partition: # echo 1 > /sys/block/sda/sda1/enable To trace the whole sda instead: # echo 1 > /sys/block/sda/enable Thus we also fix an issue reported by Ted, that ftrace-plugin blktrace can't be used to trace device-mapper devices. Now: # echo 1 > /sys/block/dm-0/trace/enable echo: write error: No such device or address # mount -t ext4 /dev/dm-0 /mnt # echo 1 > /sys/block/dm-0/trace/enable # echo blk > /debug/tracing/current_tracer Reported-by: Theodore Tso Signed-off-by: Li Zefan Acked-by: "Theodore Ts'o" Cc: Arnaldo Carvalho de Melo Cc: Shawn Du Cc: Jens Axboe LKML-Reference: <49E42665.6020506@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- block/blk-sysfs.c | 7 ++++++- include/linux/blktrace_api.h | 6 ++++++ kernel/trace/blktrace.c | 5 +++++ 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 73f36beff5c..8653d710b39 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -387,16 +387,21 @@ struct kobj_type blk_queue_ktype = { int blk_register_queue(struct gendisk *disk) { int ret; + struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; if (WARN_ON(!q)) return -ENXIO; + ret = blk_trace_init_sysfs(dev); + if (ret) + return ret; + if (!q->request_fn) return 0; - ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj), + ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); if (ret < 0) return ret; diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 267edc4017e..62763c95285 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -198,6 +198,7 @@ extern int blk_trace_setup(struct request_queue *q, char *name, dev_t dev, char __user *arg); extern int blk_trace_startstop(struct request_queue *q, int start); extern int blk_trace_remove(struct request_queue *q); +extern int blk_trace_init_sysfs(struct device *dev); extern struct attribute_group blk_trace_attr_group; @@ -210,6 +211,11 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) +static inline int blk_trace_init_sysfs(struct device *dev) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_IO_TRACE */ #endif /* __KERNEL__ */ diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index d1098988052..8e7c5da3a3e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1644,3 +1644,8 @@ out: return ret ? ret : count; } +int blk_trace_init_sysfs(struct device *dev) +{ + return sysfs_create_group(&dev->kobj, &blk_trace_attr_group); +} + From f3948f8857ef5de239f28a61dddb1554a0ae4c2c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 15 Apr 2009 11:02:56 +0800 Subject: [PATCH 0416/2061] blktrace: fix context-info when mixed-using blk tracer and trace events When current tracer is set to blk tracer, TRACE_ITER_CONTEXT_INFO is unset, but actually context-info is printed: pdflush-431 [000] 821.181576: 8,0 P N [pdflush] And then if we enable TRACE_ITER_CONTEXT_INFO: # echo context-info > trace_options We'll see context-info printed twice. What's worse, when we use blk tracer and trace events at the same time, we'll see no context-info for trace events at all: jbd2_commit_logging: dev dm-0:8 transaction 333227 jbd2_end_commit: dev dm-0:8 transaction 333227 head 332814 rm-25433 [001] 9578.307485: 8,18 m N cfq25433 slice expired t=0 rm-25433 [001] 9578.307486: 8,18 m N cfq25433 put_queue This patch adds blk_tracer->set_flags(), and context-info flag is unset only when we set the output to classic mode. Note after this patch, one should unset context-info explicitly if he wants to get binary output that can be parsed by blkparse: # echo nocontext-info > trace_options # echo bin > trace_options # echo blk > current_tracer # cat trace_pipe | blkparse -i - Reported-by: Theodore Ts'o Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Cc: Steven Rostedt LKML-Reference: <49E54E60.50408@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 8e7c5da3a3e..c32062bd10b 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1211,7 +1211,6 @@ static void blk_tracer_print_header(struct seq_file *m) static void blk_tracer_start(struct trace_array *tr) { blk_tracer_enabled = true; - trace_flags &= ~TRACE_ITER_CONTEXT_INFO; } static int blk_tracer_init(struct trace_array *tr) @@ -1224,7 +1223,6 @@ static int blk_tracer_init(struct trace_array *tr) static void blk_tracer_stop(struct trace_array *tr) { blk_tracer_enabled = false; - trace_flags |= TRACE_ITER_CONTEXT_INFO; } static void blk_tracer_reset(struct trace_array *tr) @@ -1289,9 +1287,6 @@ out: static enum print_line_t blk_trace_event_print(struct trace_iterator *iter, int flags) { - if (!trace_print_context(iter)) - return TRACE_TYPE_PARTIAL_LINE; - return print_one_line(iter, false); } @@ -1326,6 +1321,18 @@ static enum print_line_t blk_tracer_print_line(struct trace_iterator *iter) return print_one_line(iter, true); } +static int blk_tracer_set_flag(u32 old_flags, u32 bit, int set) +{ + /* don't output context-info for blk_classic output */ + if (bit == TRACE_BLK_OPT_CLASSIC) { + if (set) + trace_flags &= ~TRACE_ITER_CONTEXT_INFO; + else + trace_flags |= TRACE_ITER_CONTEXT_INFO; + } + return 0; +} + static struct tracer blk_tracer __read_mostly = { .name = "blk", .init = blk_tracer_init, @@ -1335,6 +1342,7 @@ static struct tracer blk_tracer __read_mostly = { .print_header = blk_tracer_print_header, .print_line = blk_tracer_print_line, .flags = &blk_tracer_flags, + .set_flag = blk_tracer_set_flag, }; static struct trace_event trace_blk_event = { From 0232ba9ce031d0fd8f331fa8b3c00e16901f54e6 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 16 Apr 2009 18:01:31 +0900 Subject: [PATCH 0417/2061] sh: pci: Kill off unused SH4_PCIC_NO_RESET code. Nothing ended up using this anymore, so just kill it off. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-landisk.c | 2 -- arch/sh/drivers/pci/ops-lboxre2.c | 5 ----- arch/sh/drivers/pci/ops-r7780rp.c | 2 -- arch/sh/drivers/pci/ops-rts7751r2d.c | 7 ------- arch/sh/drivers/pci/ops-sdk7780.c | 1 - arch/sh/drivers/pci/ops-se7780.c | 1 - arch/sh/drivers/pci/ops-sh7785lcr.c | 2 -- arch/sh/drivers/pci/ops-snapgear.c | 2 -- arch/sh/drivers/pci/ops-titan.c | 2 -- arch/sh/drivers/pci/pci-sh4.h | 4 ---- arch/sh/drivers/pci/pci-sh7751.c | 15 --------------- arch/sh/drivers/pci/pci-sh7780.c | 15 --------------- 12 files changed, 58 deletions(-) diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c index c46911d95ec..178b77828aa 100644 --- a/arch/sh/drivers/pci/ops-landisk.c +++ b/arch/sh/drivers/pci/ops-landisk.c @@ -39,8 +39,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { .base = SH7751_CS3_BASE_ADDR, .size = (64 << 20), /* 64MB */ }, - - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c index 781496bfb1f..91cabd84f02 100644 --- a/arch/sh/drivers/pci/ops-lboxre2.c +++ b/arch/sh/drivers/pci/ops-lboxre2.c @@ -48,11 +48,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { .base = SH7751_CS3_BASE_ADDR, .size = 0x04000000, }, - .window1 = { - .base = 0x00000000, /* Unused */ - .size = 0x00000000, /* Unused */ - }, - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index c58f1cff9fb..8ec6d225ef9 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -57,8 +57,6 @@ static struct sh4_pci_address_map sh7780_pci_map = { .base = SH7780_CS3_BASE_ADDR, .size = 0x04000000, }, - - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index d374cd37f45..96b916c0d6c 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -56,13 +56,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { .base = SH7751_CS3_BASE_ADDR, .size = 0x04000000, }, - - .window1 = { - .base = 0x00000000, /* Unused */ - .size = 0x00000000, /* Unused */ - }, - - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index b34fbc54a7c..6a0b7c06783 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -62,7 +62,6 @@ static struct sh4_pci_address_map sdk7780_pci_map = { .base = SH7780_CS3_BASE_ADDR, .size = 0x04000000, }, - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 47302077a0c..583b8e82ff9 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -67,7 +67,6 @@ static struct sh4_pci_address_map se7780_pci_map = { .base = SH7780_CS2_BASE_ADDR, .size = 0x04000000, }, - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c index afbb9bd4751..ab0d1decf2d 100644 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ b/arch/sh/drivers/pci/ops-sh7785lcr.c @@ -55,8 +55,6 @@ static struct sh4_pci_address_map sh7785_pci_map = { .size = 0x20000000, #endif }, - - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c index 2e254c6cf6c..dd2c5df2830 100644 --- a/arch/sh/drivers/pci/ops-snapgear.c +++ b/arch/sh/drivers/pci/ops-snapgear.c @@ -54,8 +54,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { .base = SH7751_CS2_BASE_ADDR, .size = SNAPGEAR_LSR1_SIZE, }, - - .flags = SH4_PCIC_NO_RESET, }; /* diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c index 31ed03716a2..e45bb62bf8c 100644 --- a/arch/sh/drivers/pci/ops-titan.c +++ b/arch/sh/drivers/pci/ops-titan.c @@ -66,8 +66,6 @@ static struct sh4_pci_address_map sh7751_pci_map = { .base = SH7751_CS2_BASE_ADDR, .size = SH7751_MEM_REGION_SIZE*2, }, - - .flags = SH4_PCIC_NO_RESET, }; int __init pcibios_init_platform(void) diff --git a/arch/sh/drivers/pci/pci-sh4.h b/arch/sh/drivers/pci/pci-sh4.h index 90abfe3d39b..3d5296cde62 100644 --- a/arch/sh/drivers/pci/pci-sh4.h +++ b/arch/sh/drivers/pci/pci-sh4.h @@ -149,9 +149,6 @@ #define SH4_PCIPDTR_PB0 0x000000001 /* Port 0 Enable */ #define SH4_PCIPDR 0x220 /* Port IO Data Register */ -/* Flags */ -#define SH4_PCIC_NO_RESET 0x0001 - /* arch/sh/kernel/drivers/pci/ops-sh4.c */ extern struct pci_ops sh4_pci_ops; int sh4_pci_check_direct(struct pci_channel *chan); @@ -165,7 +162,6 @@ struct sh4_pci_address_space { struct sh4_pci_address_map { struct sh4_pci_address_space window0; struct sh4_pci_address_space window1; - unsigned long flags; }; static inline void pci_write_reg(struct pci_channel *chan, diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 2a6c7aab2d7..af8874436d2 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -99,21 +99,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan, word = SH4_PCIPINT_D3 | SH4_PCIPINT_D0; pci_write_reg(chan, word, SH4_PCIPINT); - /* - * This code is unused for some boards as it is done in the - * bootloader and doing it here means the MAC addresses loaded - * by the bootloader get lost. - */ - if (!(map->flags & SH4_PCIC_NO_RESET)) { - /* toggle PCI reset pin */ - word = SH4_PCICR_PREFIX | SH4_PCICR_PRST; - pci_write_reg(chan, word, SH4_PCICR); - /* Wait for a long time... not 1 sec. but long enough */ - mdelay(100); - word = SH4_PCICR_PREFIX; - pci_write_reg(chan, word, SH4_PCICR); - } - /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + * Mem space enable diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 87a7f3b7a38..282cabe15e3 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -96,21 +96,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan, { u32 word; - /* - * This code is unused for some boards as it is done in the - * bootloader and doing it here means the MAC addresses loaded - * by the bootloader get lost. - */ - if (!(map->flags & SH4_PCIC_NO_RESET)) { - /* toggle PCI reset pin */ - word = SH4_PCICR_PREFIX | SH4_PCICR_PRST; - pci_write_reg(chan, word, SH4_PCICR); - /* Wait for a long time... not 1 sec. but long enough */ - mdelay(100); - word = SH4_PCICR_PREFIX; - pci_write_reg(chan, word, SH4_PCICR); - } - /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + * Mem space enable From d1f0ae5e2e45e74cff4c3bdefb0fc77608cdfeec Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 15 Apr 2009 21:34:55 +0200 Subject: [PATCH 0418/2061] x86: standardize Kbuild rules Introducing this Kbuild file allow us to: make arch/x86/ And thus building all the core part of x86. Signed-off-by: Sam Ravnborg Cc: Jaswinder Singh Rajput Signed-off-by: Ingo Molnar --- arch/x86/Kbuild | 16 ++++++++++++++++ arch/x86/Makefile | 19 ++----------------- 2 files changed, 18 insertions(+), 17 deletions(-) create mode 100644 arch/x86/Kbuild diff --git a/arch/x86/Kbuild b/arch/x86/Kbuild new file mode 100644 index 00000000000..ad8ec356fb3 --- /dev/null +++ b/arch/x86/Kbuild @@ -0,0 +1,16 @@ + +obj-$(CONFIG_KVM) += kvm/ + +# Xen paravirtualization support +obj-$(CONFIG_XEN) += xen/ + +# lguest paravirtualization support +obj-$(CONFIG_LGUEST_GUEST) += lguest/ + +obj-y += kernel/ +obj-y += mm/ + +obj-y += crypto/ +obj-y += vdso/ +obj-$(CONFIG_IA32_EMULATION) += ia32/ + diff --git a/arch/x86/Makefile b/arch/x86/Makefile index f05d8c91d9e..e81f0b27776 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile @@ -7,8 +7,6 @@ else KBUILD_DEFCONFIG := $(ARCH)_defconfig endif -core-$(CONFIG_KVM) += arch/x86/kvm/ - # BITS is used as extension for files which are available in a 32 bit # and a 64 bit version to simplify shared Makefiles. # e.g.: obj-y += foo_$(BITS).o @@ -118,21 +116,8 @@ head-y += arch/x86/kernel/init_task.o libs-y += arch/x86/lib/ -# Sub architecture files that needs linking first -core-y += $(fcore-y) - -# Xen paravirtualization support -core-$(CONFIG_XEN) += arch/x86/xen/ - -# lguest paravirtualization support -core-$(CONFIG_LGUEST_GUEST) += arch/x86/lguest/ - -core-y += arch/x86/kernel/ -core-y += arch/x86/mm/ - -core-y += arch/x86/crypto/ -core-y += arch/x86/vdso/ -core-$(CONFIG_IA32_EMULATION) += arch/x86/ia32/ +# See arch/x86/Kbuild for content of core part of the kernel +core-y += arch/x86/ # drivers-y are linked after core-y drivers-$(CONFIG_MATH_EMULATION) += arch/x86/math-emu/ From 84971bb401866d79c6b353cb1d8861c2b4621867 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 12:44:25 +0900 Subject: [PATCH 0419/2061] sh: pci: Kill off useless debugging printk() in pci-sh7780 init. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7780.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 282cabe15e3..b826c7bc62b 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -118,18 +118,6 @@ int __init sh7780_pcic_init(struct pci_channel *chan, pci_write_reg(chan, map->window1.base, SH4_PCILAR1); pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1); - /* Map IO space into PCI IO window: - * IO addresses will be translated to the PCI IO window base address - */ - pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n", - chan->io_resource->start, chan->io_resource->end, - chan->io_base + chan->io_resource->start); - - /* NOTE: I'm ignoring the PCI error IRQs for now.. - * TODO: add support for the internal error interrupts and - * DMA interrupts... - */ - /* Apply any last-minute PCIC fixups */ pci_fixup_pcic(chan); From b627b4ed3d056c5d00e8f3cb32d033b0ee6619a9 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 13:00:18 +0900 Subject: [PATCH 0420/2061] sh: pci: Move se7780 INTC fixups out of pci-sh7780.c. These fixups belong in the board INTC setup code, not in the middle of pci-sh7780.c. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7780/irq.c | 10 ++++++++-- arch/sh/drivers/pci/pci-sh7780.c | 24 ------------------------ 2 files changed, 8 insertions(+), 26 deletions(-) diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c index 66ad292c9fc..44b61a597a1 100644 --- a/arch/sh/boards/mach-se/7780/irq.c +++ b/arch/sh/boards/mach-se/7780/irq.c @@ -12,10 +12,13 @@ #include #include #include -#include -#include +#include +#include #include +#define INTC_BASE 0xffd00000 +#define INTC_ICR1 (INTC_BASE+0x1c) + /* * Initialize IRQ setting */ @@ -43,4 +46,7 @@ void __init init_se7780_IRQ(void) ctrl_outw((IRQPIN_PCCPW << IRQPOS_PCCPW), FPGA_INTSEL3); plat_irq_setup_pins(IRQ_MODE_IRQ); /* install handlers for IRQ0-7 */ + + /* ICR1: detect low level(for 2ndcut) */ + ctrl_outl(0xAAAA0000, INTC_ICR1); } diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index b826c7bc62b..45fa423f2e5 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -22,20 +22,6 @@ #include #include "pci-sh4.h" -#define INTC_BASE 0xffd00000 -#define INTC_ICR0 (INTC_BASE+0x0) -#define INTC_ICR1 (INTC_BASE+0x1c) -#define INTC_INTPRI (INTC_BASE+0x10) -#define INTC_INTREQ (INTC_BASE+0x24) -#define INTC_INTMSK0 (INTC_BASE+0x44) -#define INTC_INTMSK1 (INTC_BASE+0x48) -#define INTC_INTMSK2 (INTC_BASE+0x40080) -#define INTC_INTMSKCLR0 (INTC_BASE+0x64) -#define INTC_INTMSKCLR1 (INTC_BASE+0x68) -#define INTC_INTMSKCLR2 (INTC_BASE+0x40084) -#define INTC_INT2MSKR (INTC_BASE+0x40038) -#define INTC_INT2MSKCR (INTC_BASE+0x4003c) - /* * Initialization. Try all known PCI access methods. Note that we support * using both PCI BIOS and direct access: in such cases, we use I/O ports @@ -75,16 +61,6 @@ int __init sh7780_pci_init(struct pci_channel *chan) return -ENODEV; } - /* Setup the INTC */ - if (mach_is_7780se()) { - /* ICR0: IRL=use separately */ - ctrl_outl(0x00C00020, INTC_ICR0); - /* ICR1: detect low level(for 2ndcut) */ - ctrl_outl(0xAAAA0000, INTC_ICR1); - /* INTPRI: priority=3(all) */ - ctrl_outl(0x33333333, INTC_INTPRI); - } - if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; From 7e4ba0d77c96d328ba968ddff4a464d4d2fa7abc Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 14:07:57 +0900 Subject: [PATCH 0421/2061] sh: pci: Prefer P1SEG over P1SEGADDR for CONFIG_CMD. P1SEGADDR is obsolete and will be killed off completely in the future, so transition off of it and reference P1SEG explicitly. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-sh4.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index 540683d07c7..2a7f7b50ff0 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -1,22 +1,22 @@ /* * Generic SH-4 / SH-4A PCIC operations (SH7751, SH7780). * - * Copyright (C) 2002 - 2006 Paul Mundt + * Copyright (C) 2002 - 2009 Paul Mundt * * This file is subject to the terms and conditions of the GNU General Public * License v2. See the file "COPYING" in the main directory of this archive * for more details. */ #include +#include #include -#include #include "pci-sh4.h" /* * Direct access to PCI hardware... */ #define CONFIG_CMD(bus, devfn, where) \ - P1SEGADDR((bus->number << 16) | (devfn << 8) | (where & ~3)) + (P1SEG | (bus->number << 16) | (devfn << 8) | (where & ~3)) static DEFINE_SPINLOCK(sh4_pci_lock); From 0bbc9bc3189f24de946777af43c9033c8c4871e4 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 14:09:09 +0900 Subject: [PATCH 0422/2061] sh: pci: Set class/sub-class code correctly for SH7780 PCIC. The SH7780 PCI host controller implements a configuration header that requires a fair bit of hand-holding to initialize properly. By default it appears as a pre-2.0 host controller given the zeroed out class code, so fix this up properly. Some boards that happened to be using the R7780RP version of the PCIC fixups had set this correctly, but this belongs in the standard initialization, and is by no means board specific. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-sdk7780.c | 4 ---- arch/sh/drivers/pci/pci-sh7780.c | 7 +++---- 2 files changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c index c2957312b30..004efd486ee 100644 --- a/arch/sh/drivers/pci/fixups-sdk7780.c +++ b/arch/sh/drivers/pci/fixups-sdk7780.c @@ -16,8 +16,6 @@ int pci_fixup_pcic(struct pci_channel *chan) { - ctrl_outl(0x00000001, SH7780_PCI_VCR2); - /* Enable all interrupts, so we know what to fix */ pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR); pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM); @@ -26,8 +24,6 @@ int pci_fixup_pcic(struct pci_channel *chan) pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS); pci_write_reg(chan, 0x0047, SH7780_PCICMD); pci_write_reg(chan, 0x00, SH7780_PCIPIF); - pci_write_reg(chan, 0x00, SH7780_PCISUB); - pci_write_reg(chan, 0x06, SH7780_PCIBCC); pci_write_reg(chan, 0x1912, SH7780_PCISVID); pci_write_reg(chan, 0x0001, SH7780_PCISID); diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 45fa423f2e5..7f4f5903754 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -72,16 +72,15 @@ int __init sh7780_pcic_init(struct pci_channel *chan, { u32 word; + pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC); + pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB); + /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + * Mem space enable */ pci_write_reg(chan, 0x00000046, SH7780_PCICMD); - /* define this host as the host bridge */ - word = PCI_BASE_CLASS_BRIDGE << 24; - pci_write_reg(chan, word, SH7780_PCIRID); - /* Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping */ From 4e7b7fdb129995640f144b7de114e109c6b46a2a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 15:05:19 +0900 Subject: [PATCH 0423/2061] sh: pci: Rework SH7780 host controller detection. This reworks how the host controller is probed, and makes it a bit more verbose in the event a new type of controller is detected. Additionally, we also log the revision information. This now uses the proper access sizes for the vendor/device registers, rather than relying on a larger access that encapsulated both of them. Not all devices support 32-bit read cycles for these registers. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7780.c | 42 ++++++++++++++++++-------------- arch/sh/drivers/pci/pci-sh7780.h | 5 ++-- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 7f4f5903754..63b5151e9aa 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -34,33 +34,39 @@ int __init sh7780_pci_init(struct pci_channel *chan) { unsigned int id; - int ret, match = 0; + const char *type = NULL; + int ret; - pr_debug("PCI: Starting intialization.\n"); + printk(KERN_NOTICE "PCI: Starting intialization.\n"); chan->reg_base = 0xfe040000; chan->io_base = 0xfe200000; - ctrl_outl(0x00000001, SH7780_PCI_VCR2); /* Enable PCIC */ + /* Enable CPU access to the PCIC registers. */ + __raw_writel(PCIECR_ENBL, PCIECR); - /* check for SH7780/SH7780R hardware */ - id = pci_read_reg(chan, SH7780_PCIVID); - if ((id & 0xffff) == SH7780_VENDOR_ID) { - switch ((id >> 16) & 0xffff) { - case SH7763_DEVICE_ID: - case SH7780_DEVICE_ID: - case SH7781_DEVICE_ID: - case SH7785_DEVICE_ID: - match = 1; - break; - } - } - - if (unlikely(!match)) { - printk(KERN_ERR "PCI: This is not an SH7780 (%x)\n", id); + id = __raw_readw(chan->reg_base + SH7780_PCIVID); + if (id != SH7780_VENDOR_ID) { + printk(KERN_ERR "PCI: Unknown vendor ID 0x%04x.\n", id); return -ENODEV; } + id = __raw_readw(chan->reg_base + SH7780_PCIDID); + type = (id == SH7763_DEVICE_ID) ? "SH7763" : + (id == SH7780_DEVICE_ID) ? "SH7780" : + (id == SH7781_DEVICE_ID) ? "SH7781" : + (id == SH7785_DEVICE_ID) ? "SH7785" : + NULL; + if (unlikely(!type)) { + printk(KERN_ERR "PCI: Found an unsupported Renesas host " + "controller, device id 0x%04x.\n", id); + return -EINVAL; + } + + printk(KERN_NOTICE "PCI: Found a Renesas %s host " + "controller, revision %d.\n", type, + __raw_readb(chan->reg_base + SH7780_PCIRID)); + if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index fffcf1dcfed..213f1d8c9ca 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -20,9 +20,8 @@ #define SH7785_DEVICE_ID 0x0007 /* SH7780 Control Registers */ -#define SH7780_PCI_VCR0 0xFE000000 -#define SH7780_PCI_VCR1 0xFE000004 -#define SH7780_PCI_VCR2 0xFE000008 +#define PCIECR 0xFE000008 +#define PCIECR_ENBL 0x01 /* SH7780 Specific Values */ #define SH7780_PCI_CONFIG_BASE 0xFD000000 /* Config space base addr */ From ab78cbcf6877334fc20868b7df7887349e2e01c8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 15:08:01 +0900 Subject: [PATCH 0424/2061] sh: pci: Use the proper write size for class/sub-class code. Don't use pci_write_reg() for these, as it defaults to 32-bit. Rather than using the helper, use __raw_writeb() directly. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7780.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 63b5151e9aa..19bac2168f4 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -78,8 +78,10 @@ int __init sh7780_pcic_init(struct pci_channel *chan, { u32 word; - pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST >> 8, SH7780_PCIBCC); - pci_write_reg(chan, PCI_CLASS_BRIDGE_HOST & 0xff, SH7780_PCISUB); + __raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8, + chan->reg_base + SH7780_PCIBCC); + __raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff, + chan->reg_base + SH7780_PCISUB); /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + From c66c1d79a94a7a302e2dc6c93da40902423eac3e Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 16:38:00 +0900 Subject: [PATCH 0425/2061] sh: pci: Set pci_cache_line_size on SH7780 via the PCICLS register. The SH7780 PCIC contains a read-only cache line size register that we can derive pci_cache_line_size from. So, make sure that the software idea of the cache line size actually matches the host controller's idea. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7780.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 19bac2168f4..fa73b0d1588 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -22,15 +22,6 @@ #include #include "pci-sh4.h" -/* - * Initialization. Try all known PCI access methods. Note that we support - * using both PCI BIOS and direct access: in such cases, we use I/O ports - * to access config space. - * - * Note that the platform specific initialization (BSC registers, and memory - * space mapping) will be called via the platform defined function - * pcibios_init_platform(). - */ int __init sh7780_pci_init(struct pci_channel *chan) { unsigned int id; @@ -70,19 +61,31 @@ int __init sh7780_pci_init(struct pci_channel *chan) if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; + /* + * Platform specific initialization (BSC registers, and memory space + * mapping) will be called via the platform defined function + * pcibios_init_platform(). + */ return pcibios_init_platform(); } +extern u8 pci_cache_line_size; + int __init sh7780_pcic_init(struct pci_channel *chan, struct sh4_pci_address_map *map) { u32 word; + /* + * Set the class and sub-class codes. + */ __raw_writeb(PCI_CLASS_BRIDGE_HOST >> 8, chan->reg_base + SH7780_PCIBCC); __raw_writeb(PCI_CLASS_BRIDGE_HOST & 0xff, chan->reg_base + SH7780_PCISUB); + pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4; + /* set the command/status bits to: * Wait Cycle Control + Parity Enable + Bus Master + * Mem space enable From f1dcab756687622b658154ded1657538984edcdb Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 17:00:27 +0900 Subject: [PATCH 0426/2061] sh: pci: Set the I/O port base to the SH7780 I/O window default. Presently the I/O port base isn't being set anywhere, which allows things like generic_inl() to blow up. Fix this up to point at the PCI IO window. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7780.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index fa73b0d1588..207b7206fbd 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -112,5 +112,7 @@ int __init sh7780_pcic_init(struct pci_channel *chan, word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO; pci_write_reg(chan, word, SH4_PCICR); + __set_io_port_base(SH7780_PCI_IO_BASE); + return 0; } From ab1363a8929f32cc163cd3f50ca72f20d901b00c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 17:07:47 +0900 Subject: [PATCH 0427/2061] sh: pci: Consolidate PCI I/O and mem window definitions for SH7780. This consolidates all of the PCI I/O and memory window definitions across the pci-sh7780 users in pci-sh7780 itself. No functional changes, in that every platform had exactly the same implementation. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-r7780rp.c | 23 +---------------------- arch/sh/drivers/pci/ops-sdk7780.c | 21 +-------------------- arch/sh/drivers/pci/ops-se7780.c | 23 +---------------------- arch/sh/drivers/pci/ops-sh7785lcr.c | 21 +-------------------- arch/sh/drivers/pci/pci-sh7780.c | 25 ++++++++++++++++++++++--- arch/sh/drivers/pci/pci-sh7780.h | 4 +--- 6 files changed, 27 insertions(+), 90 deletions(-) diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index 8ec6d225ef9..044525df18c 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -26,27 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return irq_tab[slot]; } -static struct resource sh7780_io_resource = { - .name = "SH7780_IO", - .start = SH7780_PCI_IO_BASE, - .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7780_mem_resource = { - .name = "SH7780_mem", - .start = SH7780_PCI_MEMORY_BASE, - .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops sh7780_pci_ops; - -struct pci_channel board_pci_channels[] = { - { sh7780_pci_init, &sh4_pci_ops, &sh7780_io_resource, &sh7780_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - static struct sh4_pci_address_map sh7780_pci_map = { .window0 = { .base = SH7780_CS2_BASE_ADDR, @@ -61,5 +40,5 @@ static struct sh4_pci_address_map sh7780_pci_map = { int __init pcibios_init_platform(void) { - return sh7780_pcic_init(&board_pci_channels[0], &sh7780_pci_map); + return sh7780_pcic_init(&sh7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index 6a0b7c06783..570fd71f693 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -34,25 +34,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return sdk7780_irq_tab[pin-1][slot]; } -static struct resource sdk7780_io_resource = { - .name = "SH7780_IO", - .start = SH7780_PCI_IO_BASE, - .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sdk7780_mem_resource = { - .name = "SH7780_mem", - .start = SH7780_PCI_MEMORY_BASE, - .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -struct pci_channel board_pci_channels[] = { - { sh7780_pci_init, &sh4_pci_ops, &sdk7780_io_resource, &sdk7780_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - static struct sh4_pci_address_map sdk7780_pci_map = { .window0 = { .base = SH7780_CS2_BASE_ADDR, @@ -67,5 +48,5 @@ static struct sh4_pci_address_map sdk7780_pci_map = { int __init pcibios_init_platform(void) { printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n"); - return sh7780_pcic_init(&board_pci_channels[0], &sdk7780_pci_map); + return sh7780_pcic_init(&sdk7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 583b8e82ff9..8b0941515b0 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -41,27 +41,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return se7780_irq_tab[pin-1][slot]; } -static struct resource se7780_io_resource = { - .name = "SH7780_IO", - .start = SH7780_PCI_IO_BASE, - .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource se7780_mem_resource = { - .name = "SH7780_mem", - .start = SH7780_PCI_MEMORY_BASE, - .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops se7780_pci_ops; - -struct pci_channel board_pci_channels[] = { - { sh7780_pci_init, &sh4_pci_ops, &se7780_io_resource, &se7780_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - static struct sh4_pci_address_map se7780_pci_map = { .window0 = { .base = SH7780_CS2_BASE_ADDR, @@ -90,5 +69,5 @@ int __init pcibios_init_platform(void) ctrl_outw(0x0013, FPGA_PCI_INTSEL1); ctrl_outw(0xE402, FPGA_PCI_INTSEL2); - return sh7780_pcic_init(&board_pci_channels[0], &se7780_pci_map); + return sh7780_pcic_init(&se7780_pci_map); } diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c index ab0d1decf2d..bac46b5d158 100644 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ b/arch/sh/drivers/pci/ops-sh7785lcr.c @@ -26,25 +26,6 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return irq_tab[slot]; } -static struct resource sh7785_io_resource = { - .name = "SH7785_IO", - .start = SH7780_PCI_IO_BASE, - .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7785_mem_resource = { - .name = "SH7785_mem", - .start = SH7780_PCI_MEMORY_BASE, - .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -struct pci_channel board_pci_channels[] = { - { sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - static struct sh4_pci_address_map sh7785_pci_map = { .window0 = { #if defined(CONFIG_32BIT) @@ -59,5 +40,5 @@ static struct sh4_pci_address_map sh7785_pci_map = { int __init pcibios_init_platform(void) { - return sh7780_pcic_init(&board_pci_channels[0], &sh7785_pci_map); + return sh7780_pcic_init(&sh7785_pci_map); } diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 207b7206fbd..eb217ddf025 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -22,7 +22,7 @@ #include #include "pci-sh4.h" -int __init sh7780_pci_init(struct pci_channel *chan) +static int __init sh7780_pci_init(struct pci_channel *chan) { unsigned int id; const char *type = NULL; @@ -71,9 +71,28 @@ int __init sh7780_pci_init(struct pci_channel *chan) extern u8 pci_cache_line_size; -int __init sh7780_pcic_init(struct pci_channel *chan, - struct sh4_pci_address_map *map) +static struct resource sh7785_io_resource = { + .name = "SH7785_IO", + .start = SH7780_PCI_IO_BASE, + .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, + .flags = IORESOURCE_IO +}; + +static struct resource sh7785_mem_resource = { + .name = "SH7785_mem", + .start = SH7780_PCI_MEMORY_BASE, + .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, + .flags = IORESOURCE_MEM +}; + +struct pci_channel board_pci_channels[] = { + { sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, + { NULL, NULL, NULL, 0, 0 }, +}; + +int __init sh7780_pcic_init(struct sh4_pci_address_map *map) { + struct pci_channel *chan = &board_pci_channels[0]; u32 word; /* diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index 213f1d8c9ca..7a4f8a8dd69 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -107,8 +107,6 @@ struct sh4_pci_address_map; /* arch/sh/drivers/pci/pci-sh7780.c */ -int sh7780_pci_init(struct pci_channel *chan); -int sh7780_pcic_init(struct pci_channel *chan, - struct sh4_pci_address_map *map); +int sh7780_pcic_init(struct sh4_pci_address_map *map); #endif /* _PCI_SH7780_H_ */ From 4c7a47de897e89c25a40e228ac5319cbac7257fe Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 17:21:36 +0900 Subject: [PATCH 0428/2061] sh: pci: Kill off platform-specific multi-window mappings. Commit 68b42d1b548be1840aff7122fdebeb804daf0fa3 ("sh: sh7785lcr: Map whole PCI address space.") changed around the semantics of how various chip-selects are made accessible to PCI. Now that there is a single large mapping covering from CS0-CS6, there is no longer any need to do multi-window mapping. Subsequently, all of the differing implementations can be consolidated in to pci-sh7780. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7780/irq.c | 17 ++++++++++++++++ arch/sh/drivers/pci/ops-r7780rp.c | 17 ---------------- arch/sh/drivers/pci/ops-sdk7780.c | 17 ---------------- arch/sh/drivers/pci/ops-se7780.c | 31 ----------------------------- arch/sh/drivers/pci/ops-sh7785lcr.c | 17 ---------------- arch/sh/drivers/pci/pci-sh7780.c | 24 ++++++++++++++-------- arch/sh/drivers/pci/pci-sh7780.h | 5 ----- 7 files changed, 33 insertions(+), 95 deletions(-) diff --git a/arch/sh/boards/mach-se/7780/irq.c b/arch/sh/boards/mach-se/7780/irq.c index 44b61a597a1..b8d43b638fc 100644 --- a/arch/sh/boards/mach-se/7780/irq.c +++ b/arch/sh/boards/mach-se/7780/irq.c @@ -49,4 +49,21 @@ void __init init_se7780_IRQ(void) /* ICR1: detect low level(for 2ndcut) */ ctrl_outl(0xAAAA0000, INTC_ICR1); + + /* + * FPGA PCISEL register initialize + * + * CPU || SLOT1 | SLOT2 | S-ATA | USB + * ------------------------------------- + * INTA || INTA | INTD | -- | INTB + * ------------------------------------- + * INTB || INTB | INTA | -- | INTC + * ------------------------------------- + * INTC || INTC | INTB | INTA | -- + * ------------------------------------- + * INTD || INTD | INTC | -- | INTA + * ------------------------------------- + */ + ctrl_outw(0x0013, FPGA_PCI_INTSEL1); + ctrl_outw(0xE402, FPGA_PCI_INTSEL2); } diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c index 044525df18c..4ea136e4eac 100644 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ b/arch/sh/drivers/pci/ops-r7780rp.c @@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return irq_tab[slot]; } - -static struct sh4_pci_address_map sh7780_pci_map = { - .window0 = { - .base = SH7780_CS2_BASE_ADDR, - .size = 0x04000000, - }, - - .window1 = { - .base = SH7780_CS3_BASE_ADDR, - .size = 0x04000000, - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7780_pcic_init(&sh7780_pci_map); -} diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c index 570fd71f693..1d9a91bcfb7 100644 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ b/arch/sh/drivers/pci/ops-sdk7780.c @@ -33,20 +33,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return sdk7780_irq_tab[pin-1][slot]; } - -static struct sh4_pci_address_map sdk7780_pci_map = { - .window0 = { - .base = SH7780_CS2_BASE_ADDR, - .size = 0x04000000, - }, - .window1 = { - .base = SH7780_CS3_BASE_ADDR, - .size = 0x04000000, - }, -}; - -int __init pcibios_init_platform(void) -{ - printk(KERN_INFO "SH7780 PCI: Finished initializing PCI controller\n"); - return sh7780_pcic_init(&sdk7780_pci_map); -} diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c index 8b0941515b0..6c088ccfe47 100644 --- a/arch/sh/drivers/pci/ops-se7780.c +++ b/arch/sh/drivers/pci/ops-se7780.c @@ -40,34 +40,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return se7780_irq_tab[pin-1][slot]; } - -static struct sh4_pci_address_map se7780_pci_map = { - .window0 = { - .base = SH7780_CS2_BASE_ADDR, - .size = 0x04000000, - }, -}; - -int __init pcibios_init_platform(void) -{ - printk("SH7780 PCI: Finished initialization of the PCI controller\n"); - - /* - * FPGA PCISEL register initialize - * - * CPU || SLOT1 | SLOT2 | S-ATA | USB - * ------------------------------------- - * INTA || INTA | INTD | -- | INTB - * ------------------------------------- - * INTB || INTB | INTA | -- | INTC - * ------------------------------------- - * INTC || INTC | INTB | INTA | -- - * ------------------------------------- - * INTD || INTD | INTC | -- | INTA - * ------------------------------------- - */ - ctrl_outw(0x0013, FPGA_PCI_INTSEL1); - ctrl_outw(0xE402, FPGA_PCI_INTSEL2); - - return sh7780_pcic_init(&se7780_pci_map); -} diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c index bac46b5d158..0fe423e2350 100644 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ b/arch/sh/drivers/pci/ops-sh7785lcr.c @@ -25,20 +25,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return irq_tab[slot]; } - -static struct sh4_pci_address_map sh7785_pci_map = { - .window0 = { -#if defined(CONFIG_32BIT) - .base = SH7780_32BIT_DDR_BASE_ADDR, - .size = 0x40000000, -#else - .base = SH7780_CS0_BASE_ADDR, - .size = 0x20000000, -#endif - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7780_pcic_init(&sh7785_pci_map); -} diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index eb217ddf025..07c5529a273 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -90,7 +90,19 @@ struct pci_channel board_pci_channels[] = { { NULL, NULL, NULL, 0, 0 }, }; -int __init sh7780_pcic_init(struct sh4_pci_address_map *map) +static struct sh4_pci_address_map sh7780_pci_map = { + .window0 = { +#if defined(CONFIG_32BIT) + .base = SH7780_32BIT_DDR_BASE_ADDR, + .size = 0x40000000, +#else + .base = SH7780_CS0_BASE_ADDR, + .size = 0x20000000, +#endif + }, +}; + +int __init pcibios_init_platform(void) { struct pci_channel *chan = &board_pci_channels[0]; u32 word; @@ -114,14 +126,10 @@ int __init sh7780_pcic_init(struct sh4_pci_address_map *map) /* Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping */ - pci_write_reg(chan, map->window0.size - 0xfffff, SH4_PCILSR0); - pci_write_reg(chan, map->window1.size - 0xfffff, SH4_PCILSR1); + pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0); /* Set the values on window 0 PCI config registers */ - pci_write_reg(chan, map->window0.base, SH4_PCILAR0); - pci_write_reg(chan, map->window0.base, SH7780_PCIMBAR0); - /* Set the values on window 1 PCI config registers */ - pci_write_reg(chan, map->window1.base, SH4_PCILAR1); - pci_write_reg(chan, map->window1.base, SH7780_PCIMBAR1); + pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0); + pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0); /* Apply any last-minute PCIC fixups */ pci_fixup_pcic(chan); diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index 7a4f8a8dd69..4b65d4b26f7 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -104,9 +104,4 @@ #define SH7780_32BIT_DDR_BASE_ADDR 0x40000000 -struct sh4_pci_address_map; - -/* arch/sh/drivers/pci/pci-sh7780.c */ -int sh7780_pcic_init(struct sh4_pci_address_map *map); - #endif /* _PCI_SH7780_H_ */ From a6d377b6969235a3b5a6e87bdcef387d0976b41c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 20:11:44 +0900 Subject: [PATCH 0429/2061] sh: pci: Consolidate SH7780 PCIC IRQ routing. Now that the platform code is a bit leaner, we can start consolidating the various IRQ routing implementations. There are effectively only 2 variants, and the others can use those directly. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 8 +++--- arch/sh/drivers/pci/fixups-r7780rp.c | 10 ++++++- arch/sh/drivers/pci/fixups-sdk7780.c | 19 ++++++++++++- arch/sh/drivers/pci/ops-r7780rp.c | 27 ------------------ arch/sh/drivers/pci/ops-sdk7780.c | 35 ----------------------- arch/sh/drivers/pci/ops-se7780.c | 42 ---------------------------- arch/sh/drivers/pci/ops-sh7785lcr.c | 27 ------------------ 7 files changed, 31 insertions(+), 137 deletions(-) delete mode 100644 arch/sh/drivers/pci/ops-r7780rp.c delete mode 100644 arch/sh/drivers/pci/ops-sdk7780.c delete mode 100644 arch/sh/drivers/pci/ops-se7780.c delete mode 100644 arch/sh/drivers/pci/ops-sh7785lcr.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 67d710a04de..5003971476e 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -16,11 +16,11 @@ obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o obj-$(CONFIG_SH_SECUREEDGE5410) += ops-snapgear.o obj-$(CONFIG_SH_RTS7751R2D) += ops-rts7751r2d.o fixups-rts7751r2d.o obj-$(CONFIG_SH_SH03) += ops-sh03.o fixups-sh03.o -obj-$(CONFIG_SH_HIGHLANDER) += ops-r7780rp.o fixups-r7780rp.o -obj-$(CONFIG_SH_SDK7780) += ops-sdk7780.o fixups-sdk7780.o +obj-$(CONFIG_SH_HIGHLANDER) += fixups-r7780rp.o +obj-$(CONFIG_SH_SH7785LCR) += fixups-r7780rp.o +obj-$(CONFIG_SH_SDK7780) += fixups-sdk7780.o +obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += fixups-sdk7780.o obj-$(CONFIG_SH_TITAN) += ops-titan.o obj-$(CONFIG_SH_LANDISK) += ops-landisk.o obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-lboxre2.o -obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += ops-se7780.o fixups-sdk7780.o obj-$(CONFIG_SH_CAYMAN) += ops-cayman.o -obj-$(CONFIG_SH_SH7785LCR) += ops-sh7785lcr.o fixups-r7780rp.o diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c index 31f8dfbfcbf..864e92f6970 100644 --- a/arch/sh/drivers/pci/fixups-r7780rp.c +++ b/arch/sh/drivers/pci/fixups-r7780rp.c @@ -11,9 +11,17 @@ * for more details. */ #include +#include #include "pci-sh4.h" -#include +static char irq_tab[] __initdata = { + 65, 66, 67, 68, +}; + +int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) +{ + return irq_tab[slot]; +} int pci_fixup_pcic(struct pci_channel *chan) { pci_write_reg(chan, 0x000043ff, SH4_PCIINTM); diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c index 004efd486ee..da60e99894b 100644 --- a/arch/sh/drivers/pci/fixups-sdk7780.c +++ b/arch/sh/drivers/pci/fixups-sdk7780.c @@ -5,15 +5,32 @@ * * Copyright (C) 2003 Lineo uSolutions, Inc. * Copyright (C) 2004 - 2006 Paul Mundt + * Copyright (C) 2006 Nobuhiro Iwamatsu * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. */ #include +#include #include "pci-sh4.h" -#include +/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */ +static char sdk7780_irq_tab[4][16] __initdata = { + /* INTA */ + { 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + /* INTB */ + { 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + /* INTC */ + { 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, + /* INTD */ + { 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, +}; + +int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) +{ + return sdk7780_irq_tab[pin-1][slot]; +} int pci_fixup_pcic(struct pci_channel *chan) { /* Enable all interrupts, so we know what to fix */ diff --git a/arch/sh/drivers/pci/ops-r7780rp.c b/arch/sh/drivers/pci/ops-r7780rp.c deleted file mode 100644 index 4ea136e4eac..00000000000 --- a/arch/sh/drivers/pci/ops-r7780rp.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Author: Ian DaSilva (idasilva@mvista.com) - * - * Highly leveraged from pci-bigsur.c, written by Dustin McIntire. - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * PCI initialization for the Renesas SH7780 Highlander R7780RP-1 board - */ -#include -#include -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -static char irq_tab[] __initdata = { - 65, 66, 67, 68, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return irq_tab[slot]; -} diff --git a/arch/sh/drivers/pci/ops-sdk7780.c b/arch/sh/drivers/pci/ops-sdk7780.c deleted file mode 100644 index 1d9a91bcfb7..00000000000 --- a/arch/sh/drivers/pci/ops-sdk7780.c +++ /dev/null @@ -1,35 +0,0 @@ -/* - * linux/arch/sh/drivers/pci/ops-sdk7780.c - * - * Copyright (C) 2006 Nobuhiro Iwamatsu - * - * PCI initialization for the SDK7780SE03 - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - */ -#include -#include -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */ -static char sdk7780_irq_tab[4][16] __initdata = { - /* INTA */ - { 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTB */ - { 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTC */ - { 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTD */ - { 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return sdk7780_irq_tab[pin-1][slot]; -} diff --git a/arch/sh/drivers/pci/ops-se7780.c b/arch/sh/drivers/pci/ops-se7780.c deleted file mode 100644 index 6c088ccfe47..00000000000 --- a/arch/sh/drivers/pci/ops-se7780.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * linux/arch/sh/drivers/pci/ops-se7780.c - * - * Copyright (C) 2006 Nobuhiro Iwamatsu - * - * PCI initialization for the Hitachi UL Solution Engine 7780SE03 - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - */ -#include -#include -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -/* - * IDSEL = AD16 PCI slot - * IDSEL = AD17 PCI slot - * IDSEL = AD18 Serial ATA Controller (Silicon Image SiL3512A) - * IDSEL = AD19 USB Host Controller (NEC uPD7210100A) - */ - -/* IDSEL [16][17][18][19][20][21][22][23][24][25][26][27][28][29][30][31] */ -static char se7780_irq_tab[4][16] __initdata = { - /* INTA */ - { 65, 68, 67, 68, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTB */ - { 66, 65, -1, 65, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTC */ - { 67, 66, -1, 66, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, - /* INTD */ - { 68, 67, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return se7780_irq_tab[pin-1][slot]; -} diff --git a/arch/sh/drivers/pci/ops-sh7785lcr.c b/arch/sh/drivers/pci/ops-sh7785lcr.c deleted file mode 100644 index 0fe423e2350..00000000000 --- a/arch/sh/drivers/pci/ops-sh7785lcr.c +++ /dev/null @@ -1,27 +0,0 @@ -/* - * Author: Ian DaSilva (idasilva@mvista.com) - * - * Highly leveraged from pci-bigsur.c, written by Dustin McIntire. - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * PCI initialization for the Renesas R0P7785LC0011RL board - * Based on arch/sh/drivers/pci/ops-r7780rp.c - * - */ -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -static char irq_tab[] __initdata = { - 65, 66, 67, 68, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return irq_tab[slot]; -} From 62c7ae87cb5962d3dfaa6d916a15e4faa9e07363 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 17 Apr 2009 20:37:16 +0900 Subject: [PATCH 0430/2061] sh: pci: Start unifying the SH7780 PCIC initialization. This starts moving out the common initialization bits from the various fixup paths in to the shared init path. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-r7780rp.c | 20 +---------- arch/sh/drivers/pci/fixups-sdk7780.c | 32 +++--------------- arch/sh/drivers/pci/pci-sh7780.c | 50 +++++++++++++++++----------- arch/sh/drivers/pci/pci-sh7780.h | 5 --- 4 files changed, 37 insertions(+), 70 deletions(-) diff --git a/arch/sh/drivers/pci/fixups-r7780rp.c b/arch/sh/drivers/pci/fixups-r7780rp.c index 864e92f6970..15ca65cb667 100644 --- a/arch/sh/drivers/pci/fixups-r7780rp.c +++ b/arch/sh/drivers/pci/fixups-r7780rp.c @@ -22,33 +22,15 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return irq_tab[slot]; } + int pci_fixup_pcic(struct pci_channel *chan) { pci_write_reg(chan, 0x000043ff, SH4_PCIINTM); - pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - - pci_write_reg(chan, 0xfbb00047, SH7780_PCICMD); pci_write_reg(chan, 0x00000000, SH7780_PCIIBAR); - - pci_write_reg(chan, 0x00011912, SH7780_PCISVID); pci_write_reg(chan, 0x08000000, SH7780_PCICSCR0); pci_write_reg(chan, 0x0000001b, SH7780_PCICSAR0); pci_write_reg(chan, 0xfd000000, SH7780_PCICSCR1); pci_write_reg(chan, 0x0000000f, SH7780_PCICSAR1); - pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0); - pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0); - -#ifdef CONFIG_32BIT - pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2); - pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); -#endif - - /* Set IOBR for windows containing area specified in pci.h */ - pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), - SH7780_PCIIOBR); - pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)), - SH7780_PCIIOBMR); - return 0; } diff --git a/arch/sh/drivers/pci/fixups-sdk7780.c b/arch/sh/drivers/pci/fixups-sdk7780.c index da60e99894b..250b0edd736 100644 --- a/arch/sh/drivers/pci/fixups-sdk7780.c +++ b/arch/sh/drivers/pci/fixups-sdk7780.c @@ -35,40 +35,18 @@ int pci_fixup_pcic(struct pci_channel *chan) { /* Enable all interrupts, so we know what to fix */ pci_write_reg(chan, 0x0000C3FF, SH7780_PCIIMR); - pci_write_reg(chan, 0x0000380F, SH7780_PCIAINTM); /* Set up standard PCI config registers */ - pci_write_reg(chan, 0xFB00, SH7780_PCISTATUS); - pci_write_reg(chan, 0x0047, SH7780_PCICMD); - pci_write_reg(chan, 0x00, SH7780_PCIPIF); - pci_write_reg(chan, 0x1912, SH7780_PCISVID); - pci_write_reg(chan, 0x0001, SH7780_PCISID); - pci_write_reg(chan, 0x08000000, SH7780_PCIMBAR0); /* PCI */ - pci_write_reg(chan, 0x08000000, SH7780_PCILAR0); /* SHwy */ - pci_write_reg(chan, 0x07F00001, SH7780_PCILSR); /* size 128M w/ MBAR */ + pci_write_reg(chan, 0x08000000, SH4_PCILAR0); /* SHwy */ + pci_write_reg(chan, 0x07F00001, SH4_PCILSR0); /* size 128M w/ MBAR */ pci_write_reg(chan, 0x00000000, SH7780_PCIMBAR1); - pci_write_reg(chan, 0x00000000, SH7780_PCILAR1); - pci_write_reg(chan, 0x00000000, SH7780_PCILSR1); + pci_write_reg(chan, 0x00000000, SH4_PCILAR1); + pci_write_reg(chan, 0x00000000, SH4_PCILSR1); pci_write_reg(chan, 0xAB000801, SH7780_PCIIBAR); - - /* - * Set the MBR so PCI address is one-to-one with window, - * meaning all calls go straight through... use ifdef to - * catch erroneous assumption. - */ - pci_write_reg(chan, 0xFD000000 , SH7780_PCIMBR0); - pci_write_reg(chan, 0x00FC0000 , SH7780_PCIMBMR0); /* 16M */ - - /* Set IOBR for window containing area specified in pci.h */ - pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), - SH7780_PCIIOBR); - pci_write_reg(chan, (SH7780_PCI_IO_SIZE-1) & (7 << 18), - SH7780_PCIIOBMR); - - pci_write_reg(chan, 0xA5000C01, SH7780_PCICR); + pci_write_reg(chan, 0xA5000C01, SH4_PCICR); return 0; } diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 07c5529a273..f02d9dfcf25 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -1,19 +1,12 @@ /* - * Low-Level PCI Support for the SH7780 + * Low-Level PCI Support for the SH7780 * - * Dustin McIntire (dustin@sensoria.com) - * Derived from arch/i386/kernel/pci-*.c which bore the message: - * (c) 1999--2000 Martin Mares - * - * Ported to the new API by Paul Mundt - * With cleanup by Paul van Gool - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. + * Copyright (C) 2005 - 2009 Paul Mundt * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. */ -#undef DEBUG - #include #include #include @@ -117,13 +110,8 @@ int __init pcibios_init_platform(void) pci_cache_line_size = pci_read_reg(chan, SH7780_PCICLS) / 4; - /* set the command/status bits to: - * Wait Cycle Control + Parity Enable + Bus Master + - * Mem space enable - */ - pci_write_reg(chan, 0x00000046, SH7780_PCICMD); - - /* Set IO and Mem windows to local address + /* + * Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping */ pci_write_reg(chan, sh7780_pci_map.window0.size - 0xfffff, SH4_PCILSR0); @@ -131,9 +119,33 @@ int __init pcibios_init_platform(void) pci_write_reg(chan, sh7780_pci_map.window0.base, SH4_PCILAR0); pci_write_reg(chan, sh7780_pci_map.window0.base, SH7780_PCIMBAR0); + pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); + + /* Set up standard PCI config registers */ + __raw_writew(0xFB00, chan->reg_base + SH7780_PCISTATUS); + __raw_writew(0x0047, chan->reg_base + SH7780_PCICMD); + __raw_writew(0x1912, chan->reg_base + SH7780_PCISVID); + __raw_writew(0x0001, chan->reg_base + SH7780_PCISID); + + __raw_writeb(0x00, chan->reg_base + SH7780_PCIPIF); + /* Apply any last-minute PCIC fixups */ pci_fixup_pcic(chan); + pci_write_reg(chan, 0xfd000000, SH7780_PCIMBR0); + pci_write_reg(chan, 0x00fc0000, SH7780_PCIMBMR0); + +#ifdef CONFIG_32BIT + pci_write_reg(chan, 0xc0000000, SH7780_PCIMBR2); + pci_write_reg(chan, 0x20000000 - SH7780_PCI_IO_SIZE, SH7780_PCIMBMR2); +#endif + + /* Set IOBR for windows containing area specified in pci.h */ + pci_write_reg(chan, chan->io_resource->start & ~(SH7780_PCI_IO_SIZE-1), + SH7780_PCIIOBR); + pci_write_reg(chan, ((SH7780_PCI_IO_SIZE-1) & (7<<18)), + SH7780_PCIIOBMR); + /* SH7780 init done, set central function init complete */ /* use round robin mode to stop a device starving/overruning */ word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_FTO; diff --git a/arch/sh/drivers/pci/pci-sh7780.h b/arch/sh/drivers/pci/pci-sh7780.h index 4b65d4b26f7..4a52478c97c 100644 --- a/arch/sh/drivers/pci/pci-sh7780.h +++ b/arch/sh/drivers/pci/pci-sh7780.h @@ -65,11 +65,6 @@ #define SH7780_PCIPMCSR_BSE 0x046 #define SH7780_PCICDD 0x047 -#define SH7780_PCICR 0x100 /* PCI Control Register */ -#define SH7780_PCILSR 0x104 /* PCI Local Space Register0 */ -#define SH7780_PCILSR1 0x108 /* PCI Local Space Register1 */ -#define SH7780_PCILAR0 0x10C /* PCI Local Address Register1 */ -#define SH7780_PCILAR1 0x110 /* PCI Local Address Register1 */ #define SH7780_PCIIR 0x114 /* PCI Interrupt Register */ #define SH7780_PCIIMR 0x118 /* PCI Interrupt Mask Register */ #define SH7780_PCIAIR 0x11C /* Error Address Register */ From 93eb677d74a4f7d3edfb678c94f6c0544d9fbad2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:24:06 -0400 Subject: [PATCH 0431/2061] ftrace: use module notifier for function tracer The hooks in the module code for the function tracer must be called before any of that module code runs. The function tracer hooks modify the module (replacing calls to mcount to nops). If the code is executed while the change occurs, then the CPU can take a GPF. To handle the above with a bit of paranoia, I originally implemented the hooks as calls directly from the module code. After examining the notifier calls, it looks as though the start up notify is called before any of the module's code is executed. This makes the use of the notify safe with ftrace. Only the startup notify is required to be "safe". The shutdown simply removes the entries from the ftrace function list, and does not modify any code. This change has another benefit. It removes a issue with a reverse dependency in the mutexes of ftrace_lock and module_mutex. [ Impact: fix lock dependency bug, cleanup ] Cc: Rusty Russell Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 ---- include/linux/module.h | 4 ++ kernel/module.c | 19 ++++----- kernel/trace/ftrace.c | 90 ++++++++++++++++++++++++++++++------------ 4 files changed, 75 insertions(+), 45 deletions(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 53869bef610..97c83e1bc58 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -233,8 +233,6 @@ extern int ftrace_arch_read_dyn_info(char *buf, int size); extern int skip_trace(unsigned long ip); -extern void ftrace_release(void *start, unsigned long size); - extern void ftrace_disable_daemon(void); extern void ftrace_enable_daemon(void); #else @@ -325,13 +323,8 @@ static inline void __ftrace_enabled_restore(int enabled) #ifdef CONFIG_FTRACE_MCOUNT_RECORD extern void ftrace_init(void); -extern void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end); #else static inline void ftrace_init(void) { } -static inline void -ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) { } #endif /* diff --git a/include/linux/module.h b/include/linux/module.h index 6155fa44168..a8f2c0aa4c3 100644 --- a/include/linux/module.h +++ b/include/linux/module.h @@ -341,6 +341,10 @@ struct module struct ftrace_event_call *trace_events; unsigned int num_trace_events; #endif +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + unsigned long *ftrace_callsites; + unsigned int num_ftrace_callsites; +#endif #ifdef CONFIG_MODULE_UNLOAD /* What modules depend on me? */ diff --git a/kernel/module.c b/kernel/module.c index a0394706f10..2383e60fcf3 100644 --- a/kernel/module.c +++ b/kernel/module.c @@ -1490,9 +1490,6 @@ static void free_module(struct module *mod) /* Free any allocated parameters. */ destroy_params(mod->kp, mod->num_kp); - /* release any pointers to mcount in this module */ - ftrace_release(mod->module_core, mod->core_size); - /* This may be NULL, but that's OK */ module_free(mod, mod->module_init); kfree(mod->args); @@ -1893,11 +1890,9 @@ static noinline struct module *load_module(void __user *umod, unsigned int symindex = 0; unsigned int strindex = 0; unsigned int modindex, versindex, infoindex, pcpuindex; - unsigned int num_mcount; struct module *mod; long err = 0; void *percpu = NULL, *ptr = NULL; /* Stops spurious gcc warning */ - unsigned long *mseg; mm_segment_t old_fs; DEBUGP("load_module: umod=%p, len=%lu, uargs=%p\n", @@ -2179,7 +2174,13 @@ static noinline struct module *load_module(void __user *umod, sizeof(*mod->trace_events), &mod->num_trace_events); #endif - +#ifdef CONFIG_FTRACE_MCOUNT_RECORD + /* sechdrs[0].sh_size is always zero */ + mod->ftrace_callsites = section_objs(hdr, sechdrs, secstrings, + "__mcount_loc", + sizeof(*mod->ftrace_callsites), + &mod->num_ftrace_callsites); +#endif #ifdef CONFIG_MODVERSIONS if ((mod->num_syms && !mod->crcs) || (mod->num_gpl_syms && !mod->gpl_crcs) @@ -2244,11 +2245,6 @@ static noinline struct module *load_module(void __user *umod, dynamic_debug_setup(debug, num_debug); } - /* sechdrs[0].sh_size is always zero */ - mseg = section_objs(hdr, sechdrs, secstrings, "__mcount_loc", - sizeof(*mseg), &num_mcount); - ftrace_init_module(mod, mseg, mseg + num_mcount); - err = module_finalize(hdr, sechdrs, mod); if (err < 0) goto cleanup; @@ -2309,7 +2305,6 @@ static noinline struct module *load_module(void __user *umod, cleanup: kobject_del(&mod->mkobj.kobj); kobject_put(&mod->mkobj.kobj); - ftrace_release(mod->module_core, mod->core_size); free_unload: module_unload_free(mod); #if defined(CONFIG_MODULE_UNLOAD) && defined(CONFIG_SMP) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index a2348898858..5b606f45b6c 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -916,30 +916,6 @@ static void ftrace_free_rec(struct dyn_ftrace *rec) rec->flags |= FTRACE_FL_FREE; } -void ftrace_release(void *start, unsigned long size) -{ - struct dyn_ftrace *rec; - struct ftrace_page *pg; - unsigned long s = (unsigned long)start; - unsigned long e = s + size; - - if (ftrace_disabled || !start) - return; - - mutex_lock(&ftrace_lock); - do_for_each_ftrace_rec(pg, rec) { - if ((rec->ip >= s) && (rec->ip < e)) { - /* - * rec->ip is changed in ftrace_free_rec() - * It should not between s and e if record was freed. - */ - FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); - ftrace_free_rec(rec); - } - } while_for_each_ftrace_rec(); - mutex_unlock(&ftrace_lock); -} - static struct dyn_ftrace *ftrace_alloc_dyn_node(unsigned long ip) { struct dyn_ftrace *rec; @@ -2752,14 +2728,72 @@ static int ftrace_convert_nops(struct module *mod, return 0; } -void ftrace_init_module(struct module *mod, - unsigned long *start, unsigned long *end) +#ifdef CONFIG_MODULES +void ftrace_release(void *start, void *end) +{ + struct dyn_ftrace *rec; + struct ftrace_page *pg; + unsigned long s = (unsigned long)start; + unsigned long e = (unsigned long)end; + + if (ftrace_disabled || !start || start == end) + return; + + mutex_lock(&ftrace_lock); + do_for_each_ftrace_rec(pg, rec) { + if ((rec->ip >= s) && (rec->ip < e)) { + /* + * rec->ip is changed in ftrace_free_rec() + * It should not between s and e if record was freed. + */ + FTRACE_WARN_ON(rec->flags & FTRACE_FL_FREE); + ftrace_free_rec(rec); + } + } while_for_each_ftrace_rec(); + mutex_unlock(&ftrace_lock); +} + +static void ftrace_init_module(struct module *mod, + unsigned long *start, unsigned long *end) { if (ftrace_disabled || start == end) return; ftrace_convert_nops(mod, start, end); } +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + struct module *mod = data; + + switch (val) { + case MODULE_STATE_COMING: + ftrace_init_module(mod, mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + case MODULE_STATE_GOING: + ftrace_release(mod->ftrace_callsites, + mod->ftrace_callsites + + mod->num_ftrace_callsites); + break; + } + + return 0; +} +#else +static int ftrace_module_notify(struct notifier_block *self, + unsigned long val, void *data) +{ + return 0; +} +#endif /* CONFIG_MODULES */ + +struct notifier_block ftrace_module_nb = { + .notifier_call = ftrace_module_notify, + .priority = 0, +}; + extern unsigned long __start_mcount_loc[]; extern unsigned long __stop_mcount_loc[]; @@ -2791,6 +2825,10 @@ void __init ftrace_init(void) __start_mcount_loc, __stop_mcount_loc); + ret = register_module_notifier(&ftrace_module_nb); + if (!ret) + pr_warning("Failed to register trace ftrace module notifier\n"); + return; failed: ftrace_disabled = 1; From e6187007d6c365b551c69ea3df46f06fd1c8bd19 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 13:36:40 -0400 Subject: [PATCH 0432/2061] tracing/events: add startup tests for events MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As events start to become popular, and the new way to add tracing infrastructure into ftrace, it is important to catch any problems that might happen with a mistake in the TRACE_EVENT macro. This patch introduces a startup self test on the registered trace events. Note, it can only do a generic test, any type of testing that needs more involement is needed to be implemented by the tracepoint creators. The test goes down one by one enabling a trace point and running some random tasks (random in the sense that I just made them up). Those tasks are creating threads, grabbing mutexes and spinlocks and using workqueues. After testing each event individually, it does the same test after enabling each system of trace points. Like sched, irq, lockdep. Then finally it enables all tracepoints and performs the tasks again. The output to the console on bootup will look like this when everything works: Running tests on trace events: Testing event kfree_skb: OK Testing event kmalloc: OK Testing event kmem_cache_alloc: OK Testing event kmalloc_node: OK Testing event kmem_cache_alloc_node: OK Testing event kfree: OK Testing event kmem_cache_free: OK Testing event irq_handler_exit: OK Testing event irq_handler_entry: OK Testing event softirq_entry: OK Testing event softirq_exit: OK Testing event lock_acquire: OK Testing event lock_release: OK Testing event sched_kthread_stop: OK Testing event sched_kthread_stop_ret: OK Testing event sched_wait_task: OK Testing event sched_wakeup: OK Testing event sched_wakeup_new: OK Testing event sched_switch: OK Testing event sched_migrate_task: OK Testing event sched_process_free: OK Testing event sched_process_exit: OK Testing event sched_process_wait: OK Testing event sched_process_fork: OK Testing event sched_signal_send: OK Running tests on trace event systems: Testing event system skb: OK Testing event system kmem: OK Testing event system irq: OK Testing event system lockdep: OK Testing event system sched: OK Running tests on all trace events: Testing all events: OK [ folded in: tracing: add #include to fix build failure in test_work() This build failure occured on a few rare configs: kernel/trace/trace_events.c: In function ‘test_work’: kernel/trace/trace_events.c:975: error: implicit declaration of function ‘udelay’ kernel/trace/trace_events.c:980: error: implicit declaration of function ‘msleep’ delay.h is included in way too many other headers, hiding cases where new usage is added without header inclusion. [ Impact: build fix ] Signed-off-by: Ingo Molnar ] [ Impact: add event tracer self-tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 178 ++++++++++++++++++++++++++++++++++++ 1 file changed, 178 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6591d83e1e7..f81d6eec4e4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -8,10 +8,14 @@ * */ +#include +#include +#include #include #include #include #include +#include #include "trace_output.h" @@ -920,3 +924,177 @@ static __init int event_trace_init(void) return 0; } fs_initcall(event_trace_init); + +#ifdef CONFIG_FTRACE_STARTUP_TEST + +static DEFINE_SPINLOCK(test_spinlock); +static DEFINE_SPINLOCK(test_spinlock_irq); +static DEFINE_MUTEX(test_mutex); + +static __init void test_work(struct work_struct *dummy) +{ + spin_lock(&test_spinlock); + spin_lock_irq(&test_spinlock_irq); + udelay(1); + spin_unlock_irq(&test_spinlock_irq); + spin_unlock(&test_spinlock); + + mutex_lock(&test_mutex); + msleep(1); + mutex_unlock(&test_mutex); +} + +static __init int event_test_thread(void *unused) +{ + void *test_malloc; + + test_malloc = kmalloc(1234, GFP_KERNEL); + if (!test_malloc) + pr_info("failed to kmalloc\n"); + + schedule_on_each_cpu(test_work); + + kfree(test_malloc); + + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) + schedule(); + + return 0; +} + +/* + * Do various things that may trigger events. + */ +static __init void event_test_stuff(void) +{ + struct task_struct *test_thread; + + test_thread = kthread_run(event_test_thread, NULL, "test-events"); + msleep(1); + kthread_stop(test_thread); +} + +/* + * For every trace event defined, we will test each trace point separately, + * and then by groups, and finally all trace points. + */ +static __init int event_trace_self_tests(void) +{ + struct ftrace_event_call *call; + struct event_subsystem *system; + char *sysname; + int ret; + + pr_info("Running tests on trace events:\n"); + + list_for_each_entry(call, &ftrace_events, list) { + + /* Only test those that have a regfunc */ + if (!call->regfunc) + continue; + + pr_info("Testing event %s: ", call->name); + + /* + * If an event is already enabled, someone is using + * it and the self test should not be on. + */ + if (call->enabled) { + pr_warning("Enabled event during self test!\n"); + WARN_ON_ONCE(1); + continue; + } + + call->enabled = 1; + call->regfunc(); + + event_test_stuff(); + + call->unregfunc(); + call->enabled = 0; + + pr_cont("OK\n"); + } + + /* Now test at the sub system level */ + + pr_info("Running tests on trace event systems:\n"); + + list_for_each_entry(system, &event_subsystems, list) { + + /* the ftrace system is special, skip it */ + if (strcmp(system->name, "ftrace") == 0) + continue; + + pr_info("Testing event system %s: ", system->name); + + /* ftrace_set_clr_event can modify the name passed in. */ + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 1); + kfree(sysname); + if (WARN_ON_ONCE(ret)) { + pr_warning("error enabling system %s\n", + system->name); + continue; + } + + event_test_stuff(); + + sysname = kstrdup(system->name, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) + pr_warning("error disabling system %s\n", + system->name); + + pr_cont("OK\n"); + } + + /* Test with all events enabled */ + + pr_info("Running tests on all trace events:\n"); + pr_info("Testing all events: "); + + sysname = kmalloc(4, GFP_KERNEL); + if (WARN_ON(!sysname)) { + pr_warning("Can't allocate memory, giving up!\n"); + return 0; + } + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 1); + if (WARN_ON_ONCE(ret)) { + kfree(sysname); + pr_warning("error enabling all events\n"); + return 0; + } + + event_test_stuff(); + + /* reset sysname */ + memcpy(sysname, "*:*", 4); + ret = ftrace_set_clr_event(sysname, 0); + kfree(sysname); + + if (WARN_ON_ONCE(ret)) { + pr_warning("error disabling all events\n"); + return 0; + } + + pr_cont("OK\n"); + + return 0; +} + +late_initcall(event_trace_self_tests); + +#endif From d1b182a8d49ed6416325b4e0a1cb0f17cd4e702a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 15 Apr 2009 16:53:47 -0400 Subject: [PATCH 0433/2061] tracing/events/ring-buffer: expose format of ring buffer headers to users Currently, every thing needed to read the binary output from the ring buffers is available, with the exception of the way the ring buffers handles itself internally. This patch creates two special files in the debugfs/tracing/events directory: # cat /debug/tracing/events/header_page field: u64 timestamp; offset:0; size:8; field: local_t commit; offset:8; size:8; field: char data; offset:16; size:4080; # cat /debug/tracing/events/header_event type : 2 bits len : 3 bits time_delta : 27 bits array : 32 bits padding : type == 0 time_extend : type == 1 data : type == 3 This is to allow a userspace app to see if the ring buffer format changes or not. [ Impact: allow userspace apps to know of ringbuffer format changes ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 5 +++++ kernel/trace/ring_buffer.c | 44 +++++++++++++++++++++++++++++++++++++ kernel/trace/trace_events.c | 38 ++++++++++++++++++++++++++++++++ 3 files changed, 87 insertions(+) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index f0aa486d131..fac8f1ac6f4 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -166,6 +166,11 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data); int ring_buffer_read_page(struct ring_buffer *buffer, void **data_page, size_t len, int cpu, int full); +struct trace_seq; + +int ring_buffer_print_entry_header(struct trace_seq *s); +int ring_buffer_print_page_header(struct trace_seq *s); + enum ring_buffer_flags { RB_FL_OVERWRITE = 1 << 0, }; diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f935bd5ec3e..84a6055f37c 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -21,6 +21,28 @@ #include "trace.h" +/* + * The ring buffer header is special. We must manually up keep it. + */ +int ring_buffer_print_entry_header(struct trace_seq *s) +{ + int ret; + + ret = trace_seq_printf(s, "\ttype : 2 bits\n"); + ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); + ret = trace_seq_printf(s, "\tarray : 32 bits\n"); + ret = trace_seq_printf(s, "\n"); + ret = trace_seq_printf(s, "\tpadding : type == %d\n", + RINGBUF_TYPE_PADDING); + ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", + RINGBUF_TYPE_TIME_EXTEND); + ret = trace_seq_printf(s, "\tdata : type == %d\n", + RINGBUF_TYPE_DATA); + + return ret; +} + /* * The ring buffer is made up of a list of pages. A separate list of pages is * allocated for each CPU. A writer may only write to a buffer that is @@ -340,6 +362,28 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +int ring_buffer_print_page_header(struct trace_seq *s) +{ + struct buffer_data_page field; + int ret; + + ret = trace_seq_printf(s, "\tfield: u64 timestamp;\t" + "offset:0;\tsize:%u;\n", + (unsigned int)sizeof(field.time_stamp)); + + ret = trace_seq_printf(s, "\tfield: local_t commit;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), commit), + (unsigned int)sizeof(field.commit)); + + ret = trace_seq_printf(s, "\tfield: char data;\t" + "offset:%u;\tsize:%u;\n", + (unsigned int)offsetof(typeof(field), data), + (unsigned int)BUF_PAGE_SIZE); + + return ret; +} + /* * head_page == tail_page && head == tail then buffer is empty. */ diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f81d6eec4e4..7163a2bb021 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -610,6 +610,30 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +show_header(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) +{ + int (*func)(struct trace_seq *s) = filp->private_data; + struct trace_seq *s; + int r; + + if (*ppos) + return 0; + + s = kmalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + trace_seq_init(s); + + func(s); + r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); + + kfree(s); + + return r; +} + static const struct seq_operations show_event_seq_ops = { .start = t_start, .next = t_next, @@ -667,6 +691,11 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_show_header_fops = { + .open = tracing_open_generic, + .read = show_header, +}; + static struct dentry *event_trace_events_dir(void) { static struct dentry *d_tracer; @@ -909,6 +938,15 @@ static __init int event_trace_init(void) if (!d_events) return 0; + /* ring buffer internal formats */ + trace_create_file("header_page", 0444, d_events, + ring_buffer_print_page_header, + &ftrace_show_header_fops); + + trace_create_file("header_event", 0444, d_events, + ring_buffer_print_entry_header, + &ftrace_show_header_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) From 69abe6a5d18a9394baa325bab8f57748b037c517 Mon Sep 17 00:00:00 2001 From: Avadh Patel Date: Fri, 10 Apr 2009 16:04:48 -0400 Subject: [PATCH 0434/2061] tracing: add saved_cmdlines file to show cached task comms Export the cached task comms to userspace. This allows user apps to translate the pids from a trace into their respective task command lines. [ Impact: let userspace apps reading binary buffer know comm's of pids ] Signed-off-by: Avadh Patel [ added error checking and use of buf pointer to index file_buf ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 53 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 2d69b26b3cc..031c46f11bb 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2421,6 +2421,56 @@ static const struct file_operations tracing_readme_fops = { .read = tracing_readme_read, }; +static ssize_t +tracing_saved_cmdlines_read(struct file *file, char __user *ubuf, + size_t cnt, loff_t *ppos) +{ + char *buf_comm; + char *file_buf; + char *buf; + int len = 0; + int pid; + int i; + + file_buf = kmalloc(SAVED_CMDLINES*(16+TASK_COMM_LEN), GFP_KERNEL); + if (!file_buf) + return -ENOMEM; + + buf_comm = kmalloc(TASK_COMM_LEN, GFP_KERNEL); + if (!buf_comm) { + kfree(file_buf); + return -ENOMEM; + } + + buf = file_buf; + + for (i = 0; i < SAVED_CMDLINES; i++) { + int r; + + pid = map_cmdline_to_pid[i]; + if (pid == -1 || pid == NO_CMDLINE_MAP) + continue; + + trace_find_cmdline(pid, buf_comm); + r = sprintf(buf, "%d %s\n", pid, buf_comm); + buf += r; + len += r; + } + + len = simple_read_from_buffer(ubuf, cnt, ppos, + file_buf, len); + + kfree(file_buf); + kfree(buf_comm); + + return len; +} + +static const struct file_operations tracing_saved_cmdlines_fops = { + .open = tracing_open_generic, + .read = tracing_saved_cmdlines_read, +}; + static ssize_t tracing_ctrl_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) @@ -3973,6 +4023,9 @@ static __init int tracer_init_debugfs(void) trace_create_file("trace_marker", 0220, d_tracer, NULL, &tracing_mark_fops); + trace_create_file("saved_cmdlines", 0444, d_tracer, + NULL, &tracing_saved_cmdlines_fops); + #ifdef CONFIG_DYNAMIC_FTRACE trace_create_file("dyn_ftrace_total_info", 0444, d_tracer, &ftrace_update_tot_cnt, &tracing_dyn_info_fops); From 9ea21c1ecdb35ecdcac5fd9d95f62a1f6a7ffec0 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 12:15:44 -0400 Subject: [PATCH 0435/2061] tracing/events: perform function tracing in event selftests We can find some bugs in the trace events if we stress the writes as well. The function tracer is a good way to stress the events. [ Impact: extend scope of event tracer self-tests ] Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: Peter Zijlstra Cc: Frederic Weisbecker LKML-Reference: <20090416161746.604786131@goodmis.org> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 78 ++++++++++++++++++++++++++++++++++--- 1 file changed, 72 insertions(+), 6 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 7163a2bb021..1137f951be4 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1017,7 +1017,7 @@ static __init void event_test_stuff(void) * For every trace event defined, we will test each trace point separately, * and then by groups, and finally all trace points. */ -static __init int event_trace_self_tests(void) +static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; @@ -1071,7 +1071,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 1); kfree(sysname); @@ -1086,7 +1086,7 @@ static __init int event_trace_self_tests(void) sysname = kstrdup(system->name, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } ret = ftrace_set_clr_event(sysname, 0); kfree(sysname); @@ -1106,14 +1106,14 @@ static __init int event_trace_self_tests(void) sysname = kmalloc(4, GFP_KERNEL); if (WARN_ON(!sysname)) { pr_warning("Can't allocate memory, giving up!\n"); - return 0; + return; } memcpy(sysname, "*:*", 4); ret = ftrace_set_clr_event(sysname, 1); if (WARN_ON_ONCE(ret)) { kfree(sysname); pr_warning("error enabling all events\n"); - return 0; + return; } event_test_stuff(); @@ -1125,10 +1125,76 @@ static __init int event_trace_self_tests(void) if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); - return 0; + return; } pr_cont("OK\n"); +} + +#ifdef CONFIG_FUNCTION_TRACER + +static DEFINE_PER_CPU(atomic_t, test_event_disable); + +static void +function_test_events_call(unsigned long ip, unsigned long parent_ip) +{ + struct ring_buffer_event *event; + struct ftrace_entry *entry; + unsigned long flags; + long disabled; + int resched; + int cpu; + int pc; + + pc = preempt_count(); + resched = ftrace_preempt_disable(); + cpu = raw_smp_processor_id(); + disabled = atomic_inc_return(&per_cpu(test_event_disable, cpu)); + + if (disabled != 1) + goto out; + + local_save_flags(flags); + + event = trace_current_buffer_lock_reserve(TRACE_FN, sizeof(*entry), + flags, pc); + if (!event) + goto out; + entry = ring_buffer_event_data(event); + entry->ip = ip; + entry->parent_ip = parent_ip; + + trace_current_buffer_unlock_commit(event, flags, pc); + + out: + atomic_dec(&per_cpu(test_event_disable, cpu)); + ftrace_preempt_enable(resched); +} + +static struct ftrace_ops trace_ops __initdata = +{ + .func = function_test_events_call, +}; + +static __init void event_trace_self_test_with_function(void) +{ + register_ftrace_function(&trace_ops); + pr_info("Running tests again, along with the function tracer\n"); + event_trace_self_tests(); + unregister_ftrace_function(&trace_ops); +} +#else +static __init void event_trace_self_test_with_function(void) +{ +} +#endif + +static __init int event_trace_self_tests_init(void) +{ + + event_trace_self_tests(); + + event_trace_self_test_with_function(); return 0; } From 76aa81118ddfbb3dc31533030cf3ec329dd067a6 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 16 Apr 2009 23:35:39 -0700 Subject: [PATCH 0436/2061] tracing: avoid warnings from zero-arg tracepoints Tracepoints with no arguments can issue two warnings: "field" defined by not used "ret" is uninitialized in this function Mark field as being OK to leave unused, and initialize ret. [ Impact: fix false positive compiler warnings. ] Signed-off-by: Jeremy Fitzhardinge Acked-by: Steven Rostedt Cc: mathieu.desnoyers@polymtl.ca LKML-Reference: <1239950139-1119-5-git-send-email-jeremy@goop.org> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 60c5323bee6..39a3351f2e7 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -160,8 +160,8 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ static int \ ftrace_format_##call(struct trace_seq *s) \ { \ - struct ftrace_raw_##call field; \ - int ret; \ + struct ftrace_raw_##call field __attribute__((unused)); \ + int ret = 0; \ \ tstruct; \ \ From 339ae5d3c3fc2025e3657637921495fd600027c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 17 Apr 2009 10:34:30 +0800 Subject: [PATCH 0437/2061] tracing: fix file mode of trace and README trace is read-write and README is read-only. [ Impact: fix /debug/tracing/ file permissions. ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt LKML-Reference: <49E7EAB6.4070605@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 031c46f11bb..f681f646aa0 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4002,7 +4002,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("available_tracers", 0444, d_tracer, &global_trace, &show_traces_fops); - trace_create_file("current_tracer", 0444, d_tracer, + trace_create_file("current_tracer", 0644, d_tracer, &global_trace, &set_tracer_fops); trace_create_file("tracing_max_latency", 0644, d_tracer, @@ -4011,7 +4011,7 @@ static __init int tracer_init_debugfs(void) trace_create_file("tracing_thresh", 0644, d_tracer, &tracing_thresh, &tracing_max_lat_fops); - trace_create_file("README", 0644, d_tracer, + trace_create_file("README", 0444, d_tracer, NULL, &tracing_readme_fops); trace_create_file("trace_pipe", 0444, d_tracer, From 9b94b3a19b13e094c10f65f24bc358f6ffe4eacd Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Fri, 17 Apr 2009 12:07:46 +0200 Subject: [PATCH 0438/2061] x86: fixup numa_node information for AMD CPU northbridge functions Currently the numa_node attribute for these PCI devices is 0 (it corresponds to the numa_node for PCI bus 0). This is not a big issue but incorrect. This inconsistency can be fixed by reading the node number from CPU NB function 0. [ Impact: fill in dev->numa_node information, to optimize DMA allocations ] Signed-off-by: Andreas Herrmann Cc: jbarnes@virtuousgeek.org LKML-Reference: <20090417100746.GG16198@alberich.amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/quirks.c | 39 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 38 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/quirks.c b/arch/x86/kernel/quirks.c index e95022e4f5d..94ad0c029f0 100644 --- a/arch/x86/kernel/quirks.c +++ b/arch/x86/kernel/quirks.c @@ -493,5 +493,42 @@ void force_hpet_resume(void) break; } } - +#endif + +#if defined(CONFIG_PCI) && defined(CONFIG_NUMA) +/* Set correct numa_node information for AMD NB functions */ +static void __init quirk_amd_nb_node(struct pci_dev *dev) +{ + struct pci_dev *nb_ht; + unsigned int devfn; + u32 val; + + devfn = PCI_DEVFN(PCI_SLOT(dev->devfn), 0); + nb_ht = pci_get_slot(dev->bus, devfn); + if (!nb_ht) + return; + + pci_read_config_dword(nb_ht, 0x60, &val); + set_dev_node(&dev->dev, val & 7); + pci_dev_put(dev); +} + +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_ADDRMAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MEMCTL, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_HT, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MAP, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_DRAM, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC, + quirk_amd_nb_node); +DECLARE_PCI_FIXUP_FINAL(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_LINK, + quirk_amd_nb_node); #endif From 46de405f25f1d9fa73b657ffbb752aa0cc87a91d Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Fri, 17 Apr 2009 10:53:43 +0800 Subject: [PATCH 0439/2061] tracing: Remove include/trace/kmem_event_types.h kmem_event_types.h is no longer necessary since tracepoint definitions are put into include/trace/events/kmem.h [ Impact: remove now-unused file. ] Signed-off-by: Zhao Lei Acked-by: Steven Rostedt Cc: Frederic Weisbecker Cc: Tom Zanussi LKML-Reference: <49E7EF37.2080205@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/kmem_event_types.h | 193 ------------------------------- 1 file changed, 193 deletions(-) delete mode 100644 include/trace/kmem_event_types.h diff --git a/include/trace/kmem_event_types.h b/include/trace/kmem_event_types.h deleted file mode 100644 index 4ff420fe467..00000000000 --- a/include/trace/kmem_event_types.h +++ /dev/null @@ -1,193 +0,0 @@ - -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif - -#undef TRACE_SYSTEM -#define TRACE_SYSTEM kmem - -TRACE_EVENT(kmalloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmem_cache_alloc, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags) -); - -TRACE_EVENT(kmalloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kmem_cache_alloc_node, - - TP_PROTO(unsigned long call_site, - const void *ptr, - size_t bytes_req, - size_t bytes_alloc, - gfp_t gfp_flags, - int node), - - TP_ARGS(call_site, ptr, bytes_req, bytes_alloc, gfp_flags, node), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - __field( size_t, bytes_req ) - __field( size_t, bytes_alloc ) - __field( gfp_t, gfp_flags ) - __field( int, node ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - __entry->bytes_req = bytes_req; - __entry->bytes_alloc = bytes_alloc; - __entry->gfp_flags = gfp_flags; - __entry->node = node; - ), - - TP_printk("call_site=%lx ptr=%p bytes_req=%zu bytes_alloc=%zu gfp_flags=%08x node=%d", - __entry->call_site, - __entry->ptr, - __entry->bytes_req, - __entry->bytes_alloc, - __entry->gfp_flags, - __entry->node) -); - -TRACE_EVENT(kfree, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -TRACE_EVENT(kmem_cache_free, - - TP_PROTO(unsigned long call_site, const void *ptr), - - TP_ARGS(call_site, ptr), - - TP_STRUCT__entry( - __field( unsigned long, call_site ) - __field( const void *, ptr ) - ), - - TP_fast_assign( - __entry->call_site = call_site; - __entry->ptr = ptr; - ), - - TP_printk("call_site=%lx ptr=%p", __entry->call_site, __entry->ptr) -); - -#undef TRACE_SYSTEM From ac1adc55fc71c7515caa2eb0e63e49b3d1c6a47c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Fri, 17 Apr 2009 00:27:08 -0500 Subject: [PATCH 0440/2061] tracing/filters: add filter_mutex to protect filter predicates This patch adds a filter_mutex to prevent the filter predicates from being accessed concurrently by various external functions. It's based on a previous patch by Li Zefan: "[PATCH 7/7] tracing/filters: make filter preds RCU safe" v2 changes: - fixed wrong value returned in a add_subsystem_pred() failure case noticed by Li Zefan. [ Impact: fix trace filter corruption/crashes on parallel access ] Signed-off-by: Tom Zanussi Reviewed-by: Li Zefan Tested-by: Li Zefan Cc: Frederic Weisbecker Cc: Steven Rostedt Cc: paulmck@linux.vnet.ibm.com LKML-Reference: <1239946028.6639.13.camel@tropicana> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 4 +- kernel/trace/trace_events.c | 4 +- kernel/trace/trace_events_filter.c | 90 +++++++++++++++++++++++------- 3 files changed, 75 insertions(+), 23 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 8817c18ef97..247948e81b0 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -757,13 +757,15 @@ struct filter_pred { }; extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct filter_pred **preds, int n_preds, +extern void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s); extern int filter_parse(char **pbuf, struct filter_pred *pred); extern int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred); extern void filter_disable_preds(struct ftrace_event_call *call); extern void filter_free_subsystem_preds(struct event_subsystem *system); +extern void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s); extern int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1137f951be4..64f9d6d2735 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -488,7 +488,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call->preds, call->n_preds, s); + filter_print_preds(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -558,7 +558,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(system->preds, system->n_preds, s); + filter_print_subsystem_preds(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f8e5eab0424..e0fcfd2a16d 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -22,10 +22,13 @@ #include #include #include +#include #include "trace.h" #include "trace_output.h" +static DEFINE_MUTEX(filter_mutex); + static int filter_pred_64(struct filter_pred *pred, void *event) { u64 *addr = (u64 *)(event + pred->offset); @@ -112,8 +115,8 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -void filter_print_preds(struct filter_pred **preds, int n_preds, - struct trace_seq *s) +static void __filter_print_preds(struct filter_pred **preds, int n_preds, + struct trace_seq *s) { char *field_name; struct filter_pred *pred; @@ -138,6 +141,21 @@ void filter_print_preds(struct filter_pred **preds, int n_preds, } } +void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(call->preds, call->n_preds, s); + mutex_unlock(&filter_mutex); +} + +void filter_print_subsystem_preds(struct event_subsystem *system, + struct trace_seq *s) +{ + mutex_lock(&filter_mutex); + __filter_print_preds(system->preds, system->n_preds, s); + mutex_unlock(&filter_mutex); +} + static struct ftrace_event_field * find_event_field(struct ftrace_event_call *call, char *name) { @@ -180,7 +198,7 @@ static int filter_set_pred(struct filter_pred *dest, return 0; } -void filter_disable_preds(struct ftrace_event_call *call) +static void __filter_disable_preds(struct ftrace_event_call *call) { int i; @@ -190,6 +208,13 @@ void filter_disable_preds(struct ftrace_event_call *call) call->preds[i]->fn = filter_pred_none; } +void filter_disable_preds(struct ftrace_event_call *call) +{ + mutex_lock(&filter_mutex); + __filter_disable_preds(call); + mutex_unlock(&filter_mutex); +} + int init_preds(struct ftrace_event_call *call) { struct filter_pred *pred; @@ -223,7 +248,7 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -void filter_free_subsystem_preds(struct event_subsystem *system) +static void __filter_free_subsystem_preds(struct event_subsystem *system) { struct ftrace_event_call *call; int i; @@ -241,18 +266,25 @@ void filter_free_subsystem_preds(struct event_subsystem *system) continue; if (!strcmp(call->system, system->name)) - filter_disable_preds(call); + __filter_disable_preds(call); } } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred, - filter_pred_fn_t fn) +void filter_free_subsystem_preds(struct event_subsystem *system) +{ + mutex_lock(&filter_mutex); + __filter_free_subsystem_preds(system); + mutex_unlock(&filter_mutex); +} + +static int filter_add_pred_fn(struct ftrace_event_call *call, + struct filter_pred *pred, + filter_pred_fn_t fn) { int idx, err; if (call->n_preds && !pred->compound) - filter_disable_preds(call); + __filter_disable_preds(call); if (call->n_preds == MAX_FILTER_PRED) return -ENOSPC; @@ -276,7 +308,8 @@ static int is_string_field(const char *type) return 0; } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +static int __filter_add_pred(struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; @@ -293,7 +326,7 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); } else { if (pred->str_len) return -EINVAL; @@ -316,7 +349,18 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) return -EINVAL; } - return __filter_add_pred(call, pred, fn); + return filter_add_pred_fn(call, pred, fn); +} + +int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) +{ + int err; + + mutex_lock(&filter_mutex); + err = __filter_add_pred(call, pred); + mutex_unlock(&filter_mutex); + + return err; } int filter_add_subsystem_pred(struct event_subsystem *system, @@ -324,20 +368,27 @@ int filter_add_subsystem_pred(struct event_subsystem *system, { struct ftrace_event_call *call; + mutex_lock(&filter_mutex); + if (system->n_preds && !pred->compound) - filter_free_subsystem_preds(system); + __filter_free_subsystem_preds(system); if (!system->n_preds) { system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) + if (!system->preds) { + mutex_unlock(&filter_mutex); return -ENOMEM; + } } - if (system->n_preds == MAX_FILTER_PRED) + if (system->n_preds == MAX_FILTER_PRED) { + mutex_unlock(&filter_mutex); return -ENOSPC; + } system->preds[system->n_preds] = pred; + system->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -348,17 +399,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - if (!find_event_field(call, pred->field_name)) - continue; - - err = filter_add_pred(call, pred); + err = __filter_add_pred(call, pred); if (err == -ENOMEM) { system->preds[system->n_preds] = NULL; + system->n_preds--; + mutex_unlock(&filter_mutex); return err; } } - system->n_preds++; + mutex_unlock(&filter_mutex); return 0; } From b0afdc126d0515e76890f0a5f26b28501cfa298e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 13:02:22 -0400 Subject: [PATCH 0441/2061] tracing/events: enable code with EVENT_TRACING not EVENT_TRACER The CONFIG_EVENT_TRACER is the way to turn on event tracing when no other tracing has been configured. All code to get enabled should depend on CONFIG_EVENT_TRACING. That is what is enabled when TRACING (or CONFIG_EVENT_TRACER) is selected. This patch enables the include/trace/ftrace.h file when CONFIG_EVENT_TRACING is enabled. [ Impact: fix warning in event tracer selftest ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 18869417109..7f1f23d601e 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -56,7 +56,7 @@ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) -#ifdef CONFIG_EVENT_TRACER +#ifdef CONFIG_EVENT_TRACING #include #endif From 12acd473d45cf2e40de3782cb2de712e5cd4d715 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:01:56 -0400 Subject: [PATCH 0442/2061] tracing: add EXPORT_SYMBOL_GPL for trace commits Not all the necessary symbols were exported to allow for tracing by modules. This patch adds them in. [ Impact: allow modules to commit data to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f681f646aa0..183d788038e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -894,18 +894,20 @@ void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } +EXPORT_SYMBOL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } +EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { ring_buffer_discard_commit(global_trace.buffer, event); } -EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_discard_commit); void trace_function(struct trace_array *tr, From 261842b7c9099f56de2eb969c8ad65402d68e00e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 16 Apr 2009 21:41:52 -0400 Subject: [PATCH 0443/2061] tracing: add same level recursion detection The tracing infrastructure allows for recursion. That is, an interrupt may interrupt the act of tracing an event, and that interrupt may very well perform its own trace. This is a recursive trace, and is fine to do. The problem arises when there is a bug, and the utility doing the trace calls something that recurses back into the tracer. This recursion is not caused by an external event like an interrupt, but by code that is not expected to recurse. The result could be a lockup. This patch adds a bitmask to the task structure that keeps track of the trace recursion. To find the interrupt depth, the following algorithm is used: level = hardirq_count() + softirq_count() + in_nmi; Here, level will be the depth of interrutps and softirqs, and even handles the nmi. Then the corresponding bit is set in the recursion bitmask. If the bit was already set, we know we had a recursion at the same level and we warn about it and fail the writing to the buffer. After the data has been committed to the buffer, we clear the bit. No atomics are needed. The only races are with interrupts and they reset the bitmask before returning anywy. [ Impact: detect same irq level trace recursion ] Signed-off-by: Steven Rostedt --- include/linux/ftrace.h | 7 +++++++ include/linux/init_task.h | 1 + include/linux/sched.h | 4 +++- kernel/trace/ring_buffer.c | 42 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 53 insertions(+), 1 deletion(-) diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 97c83e1bc58..39b95c56587 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -488,8 +488,15 @@ static inline int test_tsk_trace_graph(struct task_struct *tsk) extern int ftrace_dump_on_oops; +#ifdef CONFIG_PREEMPT +#define INIT_TRACE_RECURSION .trace_recursion = 0, +#endif + #endif /* CONFIG_TRACING */ +#ifndef INIT_TRACE_RECURSION +#define INIT_TRACE_RECURSION +#endif #ifdef CONFIG_HW_BRANCH_TRACER diff --git a/include/linux/init_task.h b/include/linux/init_task.h index dcfb93337e9..6fc21852986 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -187,6 +187,7 @@ extern struct cred init_cred; INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ + INIT_TRACE_RECURSION \ } diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..7ede5e49091 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1428,7 +1428,9 @@ struct task_struct { #ifdef CONFIG_TRACING /* state flags for use by tracers */ unsigned long trace; -#endif + /* bitmask of trace recursion */ + unsigned long trace_recursion; +#endif /* CONFIG_TRACING */ }; /* Future-safe accessor for struct task_struct's cpus_allowed. */ diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 84a6055f37c..b421b0ea911 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,6 +1481,40 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } +static int trace_irq_level(void) +{ + return hardirq_count() + softirq_count() + in_nmi(); +} + +static int trace_recursive_lock(void) +{ + int level; + + level = trace_irq_level(); + + if (unlikely(current->trace_recursion & (1 << level))) { + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); + WARN_ON_ONCE(1); + return -1; + } + + current->trace_recursion |= 1 << level; + + return 0; +} + +static void trace_recursive_unlock(void) +{ + int level; + + level = trace_irq_level(); + + WARN_ON_ONCE(!current->trace_recursion & (1 << level)); + + current->trace_recursion &= ~(1 << level); +} + static DEFINE_PER_CPU(int, rb_need_resched); /** @@ -1514,6 +1548,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) /* If we are tracing schedule, we don't want to recurse */ resched = ftrace_preempt_disable(); + if (trace_recursive_lock()) + goto out_nocheck; + cpu = raw_smp_processor_id(); if (!cpumask_test_cpu(cpu, buffer->cpumask)) @@ -1543,6 +1580,9 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) return event; out: + trace_recursive_unlock(); + + out_nocheck: ftrace_preempt_enable(resched); return NULL; } @@ -1581,6 +1621,8 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, rb_commit(cpu_buffer, event); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ From 3189cdb31622f4e40688ce5a6fc5d940b42bc805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 16:13:55 -0400 Subject: [PATCH 0444/2061] tracing: protect trace_printk from recursion trace_printk can be called from any context, including NMIs. If this happens, then we must test for for recursion before grabbing any spinlocks. This patch prevents trace_printk from being called recursively. [ Impact: prevent hard lockup in lockdep event tracer ] Cc: Peter Zijlstra Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 183d788038e..b9a3adce922 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1259,6 +1259,7 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) struct trace_array_cpu *data; struct bprint_entry *entry; unsigned long flags; + int disable; int resched; int cpu, len = 0, size, pc; @@ -1273,7 +1274,8 @@ int trace_vbprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; /* Lockdep uses trace_printk for lock tracing */ @@ -1301,6 +1303,7 @@ out_unlock: local_irq_restore(flags); out: + atomic_dec_return(&data->disabled); ftrace_preempt_enable(resched); unpause_graph_tracing(); @@ -1320,6 +1323,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) int cpu, len = 0, size, pc; struct print_entry *entry; unsigned long irq_flags; + int disable; if (tracing_disabled || tracing_selftest_running) return 0; @@ -1329,7 +1333,8 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) cpu = raw_smp_processor_id(); data = tr->data[cpu]; - if (unlikely(atomic_read(&data->disabled))) + disable = atomic_inc_return(&data->disabled); + if (unlikely(disable != 1)) goto out; pause_graph_tracing(); @@ -1357,6 +1362,7 @@ int trace_vprintk(unsigned long ip, const char *fmt, va_list args) raw_local_irq_restore(irq_flags); unpause_graph_tracing(); out: + atomic_dec_return(&data->disabled); preempt_enable_notrace(); return len; From 8e668b5b3455207e4540fc7ccab9ecf70142f288 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 17 Apr 2009 17:17:55 -0400 Subject: [PATCH 0445/2061] tracing: remove format attribute of inline function Due to a cut and paste error, I added the gcc attribute for printf format to the static inline stub of trace_seq_printf. This will cause a compile failure. [ Impact: fix compiler error when CONFIG_TRACING is off ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt Cc: Andrew Morton Cc: =?ISO-8859-15?Q?Fr=E9d=E9ric_Weisbecker?= LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/trace_seq.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 15ca2c71af1..37db9bdfbc1 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -42,7 +42,6 @@ extern int trace_seq_path(struct trace_seq *s, struct path *path); #else /* CONFIG_TRACING */ static inline int trace_seq_printf(struct trace_seq *s, const char *fmt, ...) - __attribute__ ((format (printf, 2, 3))) { return 0; } From 6f41469c627fe118121dfce2a8134ad24da3df28 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0446/2061] block: clear req->errors on bio completion only for fs requests Impact: subtle behavior change For fs requests, rq is only carrier of bios and rq error status as a whole doesn't mean much. This is the reason why rq->errors is being cleared on each partial completion of a request as on each partial completion the error status is transferred to the respective bios. For pc requests, rq->errors is used to carry error status to the issuer and thus __end_that_request_first() doesn't clear it on such cases. The condition was fine till now as only fs and pc requests have used bio and thus the bio completion path. However, future changes will unify data accesses to bio and all non fs users care about rq error status. Clear rq->errors on bio completion only for fs requests. In general, the implicit clearing is a bit too subtle especially as the meaning of rq->errors is completely dependent on low level drivers. Unifying / cleaning up rq->errors usage and letting llds manage it would be better. TODO comment added. Signed-off-by: Tejun Heo Acked-by: Jens Axboe --- block/blk-core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1..cce7a88dc61 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1739,10 +1739,14 @@ static int __end_that_request_first(struct request *req, int error, trace_block_rq_complete(req->q, req); /* - * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual - * sense key with us all the way through + * For fs requests, rq is just carrier of independent bio's + * and each partial completion should be handled separately. + * Reset per-request error on each partial completion. + * + * TODO: tj: This is too subtle. It would be better to let + * low level drivers do what they see fit. */ - if (!blk_pc_request(req)) + if (blk_fs_request(req)) req->errors = 0; if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { From 1e75540ec5202cae63cd238c86bd880e3d496546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0447/2061] ide-tape: remove back-to-back REQUEST_SENSE detection Impact: fix an oops which always triggers ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when checking for back-to-back request sense issues but drive->pc can be NULL and even when it's not NULL, it's not safe to dereference it once the previous command is complete because pc could have been freed or was on stack. Kill back-to-back REQUEST_SENSE detection. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cb942a9b580..3a53e0834cf 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, { idetape_tape_t *tape = drive->driver_data; - if (drive->pc->c[0] == REQUEST_SENSE && - pc->c[0] == REQUEST_SENSE) { - printk(KERN_ERR "ide-tape: possible ide-tape.c bug - " - "Two request sense in serial were issued\n"); - } - if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE) drive->failed_pc = pc; From 853280a4dc8e3cc97ff10c1c02234d96078f437b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0448/2061] ide: use blk_run_queue() instead of blk_start_queueing() blk_start_queueing() is being phased out in favor of [__]blk_run_queue(). Switch. Signed-off-by: Tejun Heo --- drivers/ide/ide-park.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 310d03f2b5b..a914023d6d0 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) start_queue = 1; spin_unlock_irq(&hwif->lock); - if (start_queue) { - spin_lock_irq(q->queue_lock); - blk_start_queueing(q); - spin_unlock_irq(q->queue_lock); - } + if (start_queue) + blk_run_queue(q); return; } spin_unlock_irq(&hwif->lock); From 55f3f399422a4a3f6cb84ea4096dfaddf8817399 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0449/2061] ide: don't set REQ_SOFTBARRIER ide doesn't have to worry about REQ_SOFTBARRIER. Don't set it. Signed-off-by: Tejun Heo --- drivers/ide/ide-disk.c | 1 - drivers/ide/ide-ioctls.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index a9fbe2c3121..c2438804d3c 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) cmd->protocol = ATA_PROT_NODATA; rq->cmd_type = REQ_TYPE_ATA_TASKFILE; - rq->cmd_flags |= REQ_SOFTBARRIER; rq->special = cmd; } diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index c1c25ebbaa1..5991b23793f 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive) rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; - rq->cmd_flags |= REQ_SOFTBARRIER; if (blk_execute_rq(drive->queue, NULL, rq, 1)) ret = rq->errors; blk_put_request(rq); From 46a802e852c2106d755f697819ea80cd3ffdc222 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0450/2061] ide kill unused ide_cmd->special Impact: removal of unused field No one uses ide_cmd->special anymore. Kill it. Signed-off-by: Tejun Heo --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078..846a1e13240 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -324,7 +324,6 @@ struct ide_cmd { unsigned int cursg_ofs; struct request *rq; /* copy of request */ - void *special; /* valid_t generally */ }; /* ATAPI packet command flags */ From 1873b90cdea038715ec7140fccc2116fb930ffb5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0451/2061] ide-cd: clear sense buffer before issuing request sense Impact: code simplification ide_cd_request_sense_fixup() clears the tail of the sense buffer if the device didn't completely fill it. This patch makes cdrom_queue_request_sense() clear the sense buffer before issuing the command instead of clearing it afterwards. This simplifies code and eases future changes. Signed-off-by: Tejun Heo --- drivers/ide/ide-cd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3aec19d1fdf..509f1293cae 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, if (sense == NULL) sense = &info->sense_data; + memset(sense, 0, 18); + /* stuff the sense request in front of our current request */ blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_ATA_PC; @@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) * and some drives don't send them. Sigh. */ if (rq->cmd[0] == GPCMD_REQUEST_SENSE && - cmd->nleft > 0 && cmd->nleft <= 5) { - unsigned int ofs = cmd->nbytes - cmd->nleft; - - while (cmd->nleft > 0) { - *((u8 *)rq->data + ofs++) = 0; - cmd->nleft--; - } - } + cmd->nleft > 0 && cmd->nleft <= 5) + cmd->nleft = 0; } int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, From 7f006dc24fae158131116c9472874f12e16cf040 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0452/2061] ide-floppy: block pc always uses bio Impact: remove unnecessary code path Block pc requests always use bio and rq->data is always NULL. No need to worry about !rq->bio cases in idefloppy_block_pc_cmd(). Note that ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg fine. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-floppy.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 2b4868d95f8..3b22e066287 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - if (rq->data_len && rq_data_dir(rq) == WRITE) - pc->flags |= PC_FLAG_WRITING; - pc->buf = rq->data; - if (rq->bio) + if (rq->data_len) { pc->flags |= PC_FLAG_DMA_OK; - /* - * possibly problematic, doesn't look like ide-floppy correctly - * handled scattered requests if dma fails... - */ + if (rq_data_dir(rq) == WRITE) + pc->flags |= PC_FLAG_WRITING; + } + /* pio will be performed by ide_pio_bytes() which handles sg fine */ + pc->buf = NULL; pc->req_xfer = pc->buf_size = rq->data_len; } From eace4cb04c0edc9388e987bf9bbdef461f6daca4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0453/2061] ide-taskfile: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide_raw_taskfile() directly uses rq->buffer to carry pointer to the data buffer. This complicates both block interface and ide backend request handling. Use blk_rq_map_kern() instead and drop special handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg(). Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it to initialize bio rw flag. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 5 +---- drivers/ide/ide-taskfile.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 35dc38d3b2c..9b9e8b1aae5 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) struct scatterlist *sg = hwif->sg_table; struct request *rq = cmd->rq; - if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { - sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE); - cmd->sg_nents = 1; - } else if (!rq->bio) { + if (!rq->bio) { sg_init_one(sg, rq->data, rq->data_len); cmd->sg_nents = 1; } else diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 4aa6223c11b..f400eb4d4af 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; - rq->buffer = buf; + + if (cmd->tf_flags & IDE_TFLAG_WRITE) + rq->cmd_flags |= REQ_RW; /* * (ks) We transfer currently only whole sectors. @@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, * if we would find a solution to transfer any size. * To support special commands like READ LONG. */ - rq->hard_nr_sectors = rq->nr_sectors = nsect; - rq->hard_cur_sectors = rq->current_nr_sectors = nsect; - - if (cmd->tf_flags & IDE_TFLAG_WRITE) - rq->cmd_flags |= REQ_RW; + if (nsect) { + error = blk_rq_map_kern(drive->queue, rq, buf, + nsect * SECTOR_SIZE, __GFP_WAIT); + if (error) + goto put_req; + } rq->special = cmd; cmd->rq = rq; error = blk_execute_rq(drive->queue, NULL, rq, 0); - blk_put_request(rq); +put_req: + blk_put_request(rq); return error; } From c267cc1c4db4ccb3406d045a8da8660f0bbfe08d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0454/2061] ide-atapi: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide-atapi uses rq->buffer as private opaque value for internal special requests. rq->special isn't used for these cases (the only case where rq->special is used is for ide-tape rw requests). Use rq->special instead. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-tape.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 7201b176d75..2894577237b 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; - rq->buffer = (char *)pc; + rq->special = (char *)pc; rq->rq_disk = disk; if (pc->req_xfer) { @@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; - rq->buffer = (char *)pc; + rq->special = (char *)pc; if (pc->req_xfer) { rq->data = pc->buf; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 3b22e066287..94600331a27 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); } else if (blk_special_request(rq)) { - pc = (struct ide_atapi_pc *) rq->buffer; + pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; idefloppy_blockpc_cmd(floppy, pc, rq); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 3a53e0834cf..aadf53cfac6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, goto out; } if (rq->cmd[13] & REQ_IDETAPE_PC1) { - pc = (struct ide_atapi_pc *) rq->buffer; + pc = (struct ide_atapi_pc *)rq->special; rq->cmd[13] &= ~(REQ_IDETAPE_PC1); rq->cmd[13] |= REQ_IDETAPE_PC2; goto out; From cbfd082abfcbed8c57a12636f36e9bead8d6cfc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0455/2061] ide-cd: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide-cd uses rq->buffer to carry pointer to the original request when issuing REQUEST_SENSE. Use rq->special instead. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-cd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 509f1293cae..eb3c299d95d 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, rq->cmd_type = REQ_TYPE_SENSE; rq->cmd_flags |= REQ_PREEMPT; - /* NOTE! Save the failed command in "rq->buffer" */ - rq->buffer = (void *) failed_command; + /* NOTE! Save the failed command in "rq->special" */ + rq->special = (void *)failed_command; if (failed_command) ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x", @@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For REQ_TYPE_SENSE, "rq->buffer" points to the original + * For REQ_TYPE_SENSE, "rq->special" points to the original * failed request */ - struct request *failed = (struct request *)rq->buffer; + struct request *failed = (struct request *)rq->special; struct cdrom_info *info = drive->driver_data; void *sense = &info->sense_data; From a1df5169f9bf08f6067029bfb840a05e282b1b97 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0456/2061] ide: add helpers for preparing sense requests This is in preparation of removing the queueing of a sense request out of the IRQ handler path. Use struct request_sense as a general sense buffer for all ATAPI devices ide-{floppy,tape,cd}. tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as it can cause deadlock. Converted to use inline struct request and blk_rq_init(). * Added xfer / cdb len selection depending on device type. * All sense prep logics folded into ide_prep_sense() which never fails. * hwif->rq clearing and sense_rq used handling moved into ide_queue_sense_rq(). * blk_rq_map_kern() conversion is moved to later patch. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++ include/linux/ide.h | 11 ++++++++ 2 files changed, 72 insertions(+) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2894577237b..c6e03485a63 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); +void ide_prep_sense(ide_drive_t *drive, struct request *rq) +{ + struct request_sense *sense = &drive->sense_data; + struct request *sense_rq = &drive->sense_rq; + unsigned int cmd_len, sense_len; + + debug_log("%s: enter\n", __func__); + + switch (drive->media) { + case ide_floppy: + cmd_len = 255; + sense_len = 18; + break; + case ide_tape: + cmd_len = 20; + sense_len = 20; + break; + default: + cmd_len = 18; + sense_len = 18; + } + + BUG_ON(sense_len > sizeof(*sense)); + + if (blk_sense_request(rq) || drive->sense_rq_armed) + return; + + memset(sense, 0, sizeof(*sense)); + + blk_rq_init(rq->q, sense_rq); + sense_rq->rq_disk = rq->rq_disk; + + sense_rq->data = sense; + sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; + sense_rq->cmd[4] = cmd_len; + sense_rq->data_len = sense_len; + + sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_flags |= REQ_PREEMPT; + + if (drive->media == ide_tape) + sense_rq->cmd[13] = REQ_IDETAPE_PC1; + + drive->sense_rq_armed = true; +} +EXPORT_SYMBOL_GPL(ide_prep_sense); + +void ide_queue_sense_rq(ide_drive_t *drive, void *special) +{ + BUG_ON(!drive->sense_rq_armed); + + drive->sense_rq.special = special; + drive->sense_rq_armed = false; + + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, &drive->sense_rq, + ELEVATOR_INSERT_FRONT, 0); +} +EXPORT_SYMBOL_GPL(ide_queue_sense_rq); + /* * Called when an error was detected during the last packet command. * We queue a request sense packet command in the head of the request list. diff --git a/include/linux/ide.h b/include/linux/ide.h index 846a1e13240..a69ccac5641 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -26,6 +26,9 @@ #include #include +/* for request_sense */ +#include + #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300) # define SUPPORT_VLB_SYNC 0 #else @@ -602,6 +605,11 @@ struct ide_drive_s { struct ide_atapi_pc request_sense_pc; struct request request_sense_rq; + + /* current sense rq and buffer */ + bool sense_rq_armed; + struct request sense_rq; + struct request_sense sense_data; }; typedef struct ide_drive_s ide_drive_t; @@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_prep_sense(ide_drive_t *drive, struct request *rq); +void ide_queue_sense_rq(ide_drive_t *drive, void *special); + int ide_cd_expiry(ide_drive_t *); int ide_cd_get_xferlen(struct request *); From 746d5e43274e9ea6cbd58818afc9239d41fb4e1e Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0457/2061] ide-cd: convert to using generic sense request Preallocate a sense request in the ->do_request method and reinitialize it only on demand, in case it's been consumed in the IRQ handler path. The reason for this is that we don't want to be mapping rq to bio in the IRQ path and introduce all kinds of unnecessary hacks to the block layer. tj: * Both user and kernel PC requests expect sense data to be stored in separate storage other than drive->sense_data. Copy sense data to rq->sense on completion if rq->sense is not NULL. This fixes bogus sense data on PC requests. As a result, remove cdrom_queue_request_sense. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-cd.c | 54 ++++++++++---------------------------------- drivers/ide/ide-cd.h | 4 ---- 2 files changed, 12 insertions(+), 46 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index eb3c299d95d..7b21c7eac5b 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, ide_cd_log_error(drive->name, failed_command, sense); } -static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, - struct request *failed_command) -{ - struct cdrom_info *info = drive->driver_data; - struct request *rq = &drive->request_sense_rq; - - ide_debug_log(IDE_DBG_SENSE, "enter"); - - if (sense == NULL) - sense = &info->sense_data; - - memset(sense, 0, 18); - - /* stuff the sense request in front of our current request */ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_ATA_PC; - rq->rq_disk = info->disk; - - rq->data = sense; - rq->cmd[0] = GPCMD_REQUEST_SENSE; - rq->cmd[4] = 18; - rq->data_len = 18; - - rq->cmd_type = REQ_TYPE_SENSE; - rq->cmd_flags |= REQ_PREEMPT; - - /* NOTE! Save the failed command in "rq->special" */ - rq->special = (void *)failed_command; - - if (failed_command) - ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x", - failed_command->cmd[0]); - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* @@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) * failed request */ struct request *failed = (struct request *)rq->special; - struct cdrom_info *info = drive->driver_data; - void *sense = &info->sense_data; + struct request_sense *sense = &drive->sense_data; if (failed) { if (failed->sense) { + /* + * Sense is always read into drive->sense_data. + * Copy back if the failed request has its + * sense pointer set. + */ + memcpy(failed->sense, sense, 18); sense = failed->sense; failed->sense_len = rq->sense_len; } @@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - cdrom_queue_request_sense(drive, NULL, NULL); + ide_queue_sense_rq(drive, NULL); return 1; end_request: @@ -445,7 +412,7 @@ end_request: hwif->rq = NULL; - cdrom_queue_request_sense(drive, rq->sense, rq); + ide_queue_sense_rq(drive, rq); return 1; } else return 2; @@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, goto out_end; } + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index 1d97101099c..93a3cf1b0f3 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -87,10 +87,6 @@ struct cdrom_info { struct atapi_toc *toc; - /* The result of the last successful request sense command - on this device. */ - struct request_sense sense_data; - u8 max_speed; /* Max speed of the drive. */ u8 current_speed; /* Current speed of the drive. */ From 6b544fcc8cd0a04eb42de9d1ecdd345e979d6ada Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0458/2061] ide-atapi: convert ide-{floppy,tape} to using preallocated sense buffer Since we're issuing REQ_TYPE_SENSE now we need to allow those types of rqs in the ->do_request callbacks. As a future improvement, sense_len assignment might be unified across all ATAPI devices. Borislav to check with specs and test. As a result, get rid of ide_queue_pc_head() and drive->request_sense_rq. tj: * Init request sense ide_atapi_pc from sense request. In the longer timer, it would probably better to fold ide_create_request_sense_cmd() into its only current user - ide_floppy_get_format_progress(). * ide_retry_pc() no longer takes @disk. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 48 ++++++++++++---------------------------- drivers/ide/ide-floppy.c | 4 +++- drivers/ide/ide-tape.c | 7 ++++-- include/linux/ide.h | 3 +-- 4 files changed, 23 insertions(+), 39 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index c6e03485a63..972c522516f 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_init_pc); -/* - * Generate a new packet command request in front of the request queue, before - * the current request, so that it will be processed immediately, on the next - * pass through the driver. - */ -static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc, struct request *rq) -{ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_PREEMPT; - rq->special = (char *)pc; - rq->rq_disk = disk; - - if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; - } - - memcpy(rq->cmd, pc->c, 12); - if (drive->media == ide_tape) - rq->cmd[13] = REQ_IDETAPE_PC1; - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - /* * Add a special packet command request to the tail of the request queue, * and wait for it to be serviced. @@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq); /* * Called when an error was detected during the last packet command. - * We queue a request sense packet command in the head of the request list. + * We queue a request sense packet command at the head of the request + * queue. */ -void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk) +void ide_retry_pc(ide_drive_t *drive) { - struct request *rq = &drive->request_sense_rq; + struct request *sense_rq = &drive->sense_rq; struct ide_atapi_pc *pc = &drive->request_sense_pc; (void)ide_read_error(drive); - ide_create_request_sense_cmd(drive, pc); + + /* init pc from sense_rq */ + ide_init_pc(pc); + memcpy(pc->c, sense_rq->cmd, 12); + pc->buf = sense_rq->data; + pc->req_xfer = sense_rq->data_len; + if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_pc_head(drive, disk, pc, rq); + + ide_queue_sense_rq(drive, pc); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) debug_log("[cmd %x]: check condition\n", rq->cmd[0]); /* Retry operation */ - ide_retry_pc(drive, rq->rq_disk); + ide_retry_pc(drive); /* queued, but not started */ return ide_stopped; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 94600331a27..d3302cc891e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); - } else if (blk_special_request(rq)) { + } else if (blk_special_request(rq) || blk_sense_request(rq)) { pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; @@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aadf53cfac6..8324dfa78a3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) printk(KERN_ERR "ide-tape: %s: I/O error, ", tape->name); /* Retry operation */ - ide_retry_pc(drive, tape->disk); + ide_retry_pc(drive); return ide_stopped; } pc->error = 0; @@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - if (!blk_special_request(rq)) { + if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. */ printk(KERN_NOTICE "ide-tape: %s: Unsupported request in " "request queue (%d)\n", drive->name, rq->cmd_type); @@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, BUG(); out: + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/include/linux/ide.h b/include/linux/ide.h index a69ccac5641..9e67ccac3c1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -604,7 +604,6 @@ struct ide_drive_s { unsigned long atapi_flags; struct ide_atapi_pc request_sense_pc; - struct request request_sense_rq; /* current sense rq and buffer */ bool sense_rq_armed; @@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); -void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); void ide_queue_sense_rq(ide_drive_t *drive, void *special); From 5c4be57249e2e09136446597d2fe2a967c6ffef0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0459/2061] ide-cd,atapi: use bio for internal commands Impact: unify request data buffer handling rq->data is used mostly to pass kernel buffer through request queue without using bio. There are only a couple of places which still do this in kernel and converting to bio isn't difficult. This patch converts ide-cd and atapi to use bio instead of rq->data for request sense and internal pc commands. With previous change to unify sense request handling, this is relatively easily achieved by adding blk_rq_map_kern() during sense_rq prep and PC issue. If blk_rq_map_kern() fails for sense, the error is deferred till sense issue and aborts the failed command which triggered the sense. Note that this is a slim possibility as sense prep is done on each command issue, so for the above condition to actually trigger, all preps since the last sense issue till the issue of the request which would require a sense should fail. * do_request functions might sleep now. This should be okay as ide request_fn - do_ide_request() - is invoked only from make_request and plug work. Make sure this is the case by adding might_sleep() to do_ide_request(). * Functions which access the read sense data before the sense request is complete now should access bio_data(sense_rq->bio) as the sense buffer might have been copied during blk_rq_map_kern(). * ide-tape updated to map sg. * cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC special case. Simplified. * tp_ops->output/input_data path dropped from ide_pc_intr(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 52 ++++++++++++++++++++++++----------------- drivers/ide/ide-cd.c | 28 +++++++++++----------- drivers/ide/ide-io.c | 3 +++ drivers/ide/ide-tape.c | 3 +++ include/linux/ide.h | 2 +- 5 files changed, 52 insertions(+), 36 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 972c522516f..5cefe12f562 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->special = (char *)pc; if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; + error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer, + GFP_NOIO); + if (error) + goto put_req; } memcpy(rq->cmd, pc->c, 12); if (drive->media == ide_tape) rq->cmd[13] = REQ_IDETAPE_PC1; error = blk_execute_rq(drive->queue, disk, rq, 0); +put_req: blk_put_request(rq); - return error; } EXPORT_SYMBOL_GPL(ide_queue_pc_tail); @@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) struct request_sense *sense = &drive->sense_data; struct request *sense_rq = &drive->sense_rq; unsigned int cmd_len, sense_len; + int err; debug_log("%s: enter\n", __func__); @@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - sense_rq->rq_disk = rq->rq_disk; - sense_rq->data = sense; + err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, + GFP_NOIO); + if (unlikely(err)) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: failed to map sense buffer\n", + drive->name); + return; + } + + sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->data_len = sense_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; @@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) } EXPORT_SYMBOL_GPL(ide_prep_sense); -void ide_queue_sense_rq(ide_drive_t *drive, void *special) +int ide_queue_sense_rq(ide_drive_t *drive, void *special) { - BUG_ON(!drive->sense_rq_armed); + /* deferred failure from ide_prep_sense() */ + if (!drive->sense_rq_armed) { + printk(KERN_WARNING "%s: failed queue sense request\n", + drive->name); + return -ENOMEM; + } drive->sense_rq.special = special; drive->sense_rq_armed = false; @@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special) elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT, 0); + return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive) /* init pc from sense_rq */ ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); - pc->buf = sense_rq->data; + pc->buf = bio_data(sense_rq->bio); /* pointer to mapped address */ pc->req_xfer = sense_rq->data_len; if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_sense_rq(drive, pc); + if (ide_queue_sense_rq(drive, pc)) + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xferfunc; unsigned int timeout, done; u16 bcount; u8 stat, ireason, dsc = 0; @@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape) + if (drive->media == ide_tape && !rq->bio) done = ide_rq_bytes(rq); /* FIXME */ else done = blk_rq_bytes(rq); @@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - xferfunc = write ? tp_ops->output_data : tp_ops->input_data; - - if (drive->media == ide_floppy && pc->buf == NULL) { + if (drive->media == ide_tape && pc->bh) + done = drive->pc_io_buffers(drive, pc, bcount, write); + else { done = min_t(unsigned int, bcount, cmd->nleft); ide_pio_bytes(drive, cmd, write, done); - } else if (drive->media == ide_tape && pc->bh) { - done = drive->pc_io_buffers(drive, pc, bcount, write); - } else { - done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); - xferfunc(drive, NULL, pc->cur_pos, done); } /* Update the current position */ diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 7b21c7eac5b..392a5bdf2fd 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* * For REQ_TYPE_SENSE, "rq->special" points to the original - * failed request + * failed request. Also, the sense data should be read + * directly from rq which might be different from the original + * sense buffer if it got copied during mapping. */ struct request *failed = (struct request *)rq->special; - struct request_sense *sense = &drive->sense_data; + void *sense = bio_data(rq->bio); if (failed) { if (failed->sense) { @@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - ide_queue_sense_rq(drive, NULL); + return ide_queue_sense_rq(drive, NULL) ? 2 : 1; return 1; end_request: @@ -412,8 +414,7 @@ end_request: hwif->rq = NULL; - ide_queue_sense_rq(drive, rq); - return 1; + return ide_queue_sense_rq(drive, rq) ? 2 : 1; } else return 2; } @@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq->cmd_flags |= cmd_flags; rq->timeout = timeout; if (buffer) { - rq->data = buffer; - rq->data_len = *bufflen; + error = blk_rq_map_kern(drive->queue, rq, buffer, + *bufflen, GFP_NOIO); + if (error) { + blk_put_request(rq); + return error; + } } error = blk_execute_rq(drive->queue, info->disk, rq, 0); @@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) drive->dma = 0; /* sg request */ - if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) { + if (rq->bio) { struct request_queue *q = drive->queue; + char *buf = bio_data(rq->bio); unsigned int alignment; - char *buf; - - if (rq->bio) - buf = bio_data(rq->bio); - else - buf = rq->data; drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9b9e8b1aae5..3245c2dbda3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q) spin_unlock_irq(q->queue_lock); + /* HLD do_request() callback might sleep, make sure it's okay */ + might_sleep(); + if (ide_lock_host(host, hwif)) goto plug_device_2; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8324dfa78a3..9b762a2d5d9 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -850,6 +850,9 @@ out: cmd.rq = rq; + ide_init_sg_cmd(&cmd, pc->req_xfer); + ide_map_sg(drive, &cmd); + return ide_tape_issue_pc(drive, &cmd, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 9e67ccac3c1..1957461ac76 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); -void ide_queue_sense_rq(ide_drive_t *drive, void *special); +int ide_queue_sense_rq(ide_drive_t *drive, void *special); int ide_cd_expiry(ide_drive_t *); From fc38b521dcffcb07447cd98fedc56f495c10b90d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:43 +0900 Subject: [PATCH 0460/2061] ide-pm: don't abuse rq->data Impact: cleanup rq->data usage ide-pm uses rq->data to carry pointer to struct request_pm_state through request queue and rq->special is used to carray pointer to local struct ide_cmd, which isn't necessary. Use rq->special for request_pm_state instead and use local ide_cmd in ide_start_power_step(). Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-pm.c | 38 +++++++++++++++----------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3245c2dbda3..6e3094e2277 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) return execute_drive_cmd(drive, rq); else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 0d8a151c0a0..ba1488bd843 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - struct ide_cmd cmd; int ret; /* call ACPI _GTM only once */ @@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_acpi_get_timing(hwif); memset(&rqpm, 0, sizeof(rqpm)); - memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_SUSPEND; - rq->special = &cmd; - rq->data = &rqpm; + rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) mesg.event = PM_EVENT_FREEZE; @@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - struct ide_cmd cmd; int err; /* call ACPI _PS0 / _STM only once */ @@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev) ide_acpi_exec_tfs(drive); memset(&rqpm, 0, sizeof(rqpm)); - memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; - rq->special = &cmd; - rq->data = &rqpm; + rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; @@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; - struct ide_cmd *cmd = rq->special; - - memset(cmd, 0, sizeof(*cmd)); + struct request_pm_state *pm = rq->special; + struct ide_cmd cmd = { }; switch (pm->pm_step) { case IDE_PM_FLUSH_CACHE: /* Suspend step 1 (flush cache) */ @@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; } if (ata_id_flush_ext_enabled(drive->id)) - cmd->tf.command = ATA_CMD_FLUSH_EXT; + cmd.tf.command = ATA_CMD_FLUSH_EXT; else - cmd->tf.command = ATA_CMD_FLUSH; + cmd.tf.command = ATA_CMD_FLUSH; goto out_do_tf; case IDE_PM_STANDBY: /* Suspend step 2 (standby) */ - cmd->tf.command = ATA_CMD_STANDBYNOW1; + cmd.tf.command = ATA_CMD_STANDBYNOW1; goto out_do_tf; case IDE_PM_RESTORE_PIO: /* Resume step 1 (restore PIO) */ ide_set_max_pio(drive); @@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) ide_complete_power_step(drive, rq); return ide_stopped; case IDE_PM_IDLE: /* Resume step 2 (idle) */ - cmd->tf.command = ATA_CMD_IDLEIMMEDIATE; + cmd.tf.command = ATA_CMD_IDLEIMMEDIATE; goto out_do_tf; case IDE_PM_RESTORE_DMA: /* Resume step 3 (restore DMA) */ /* @@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; out_do_tf: - cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; - cmd->valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; - cmd->protocol = ATA_PROT_NODATA; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; + cmd.protocol = ATA_PROT_NODATA; - return do_rw_taskfile(drive, cmd); + return do_rw_taskfile(drive, &cmd); } /** @@ -181,7 +173,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; if (blk_pm_suspend_request(rq) && pm->pm_step == IDE_PM_START_SUSPEND) From ea7066afcd590e4663e6dc010f93704164050f48 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0461/2061] ide-tape,floppy: fix failed command completion after request sense Impact: fix infinite retry loop After a command failed, ide-tape and floppy inserts REQUEST_SENSE in front of the failed command and according to the result, sets pc->retries, flags and errors. After REQUEST_SENSE is complete, the failed command is again at the front of the queue and if the verdict was to terminate the request, the issue functions tries to complete it directly by calling drive->pc_callback() and returning ide_stopped. However, drive->pc_callback() doesn't complete a request. It only prepares for completion of the request. As a result, this creates an infinite loop where the failed request is retried perpetually. Fix it by actually ending the request by calling ide_complete_rq(). Signed-off-by: Tejun Heo --- drivers/ide/ide-floppy.c | 1 + drivers/ide/ide-tape.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index d3302cc891e..d20704ac318 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 9b762a2d5d9..2b9a13671c5 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, } drive->failed_pc = NULL; drive->pc_callback(drive, 0); + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); From b3071d190d6757b14af002a9d79832f12de61bce Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0462/2061] ide-atapi,tape,floppy: allow ->pc_callback() to change rq->data_len Impact: allow residual count implementation in ->pc_callback() rq->data_len has two duties - carrying the number of input bytes on issue and carrying residual count back to the issuer on completion. ide-atapi completion callback ->pc_callback() is the right place to do this but currently ide-atapi depends on rq->data_len carrying the original request size after calling ->pc_callback() to complete the pc request. This patch makes ide_pc_intr(), ide_tape_issue_pc() and ide_floppy_issue_pc() cache length to complete before calling ->pc_callback() so that it can modify rq->data_len as necessary. Note: As using rq->data_len for two purposes can make cases like this incorrect in subtle ways, future changes will introduce separate field for residual count. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 16 ++++++++++------ drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-tape.c | 5 ++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 5cefe12f562..3df5442de71 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0) dsc = 1; + /* + * ->pc_callback() might change rq->data_len for + * residual count, cache total length. + */ + if (!blk_special_request(rq) && + (drive->media == ide_tape && !rq->bio)) + done = ide_rq_bytes(rq); /* FIXME */ + else + done = blk_rq_bytes(rq); + /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); @@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (blk_special_request(rq)) { rq->errors = 0; - done = blk_rq_bytes(rq); error = 0; } else { @@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape && !rq->bio) - done = ide_rq_bytes(rq); /* FIXME */ - else - done = blk_rq_bytes(rq); - error = uptodate ? 0 : -EIO; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index d20704ac318..537b7c55803 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->pc = pc; if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) { + unsigned int done = blk_rq_bytes(drive->hwif->rq); + if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR)) ide_floppy_report_error(floppy, pc); + /* Giving up */ pc->error = IDE_DRV_ERROR_GENERAL; drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); + ide_complete_rq(drive, -EIO, done); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2b9a13671c5..8226d52504d 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, if (pc->retries > IDETAPE_MAX_PC_RETRIES || (pc->flags & PC_FLAG_ABORT)) { + unsigned int done = blk_rq_bytes(drive->hwif->rq); + /* * We will "abort" retrying a packet command in case legitimate * error code was received (crossing a filemark, or end of the @@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, /* Giving up */ pc->error = IDE_DRV_ERROR_GENERAL; } + drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); + ide_complete_rq(drive, -EIO, done); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); From 35ab8d3251833e4052aa64b09b08195e949518c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0463/2061] ide-tape: use single continuous buffer Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers ide-tape has its own multiple buffer mechanism using struct idetape_bh. It allocates buffer with decreasing order-of-two allocations so that it results in minimum number of segments. However, the implementation is quite complex and works in a way that no other block or ide driver works necessitating a lot of special case handling. The benefit this complex allocation scheme brings is questionable as PIO or DMA the number of segments (16 maximum) doesn't make any noticeable difference and it also doesn't negate the need for multiple order allocation which can fail under memory pressure or high fragmentation although it does lower the highest order necessary by one when the buffer size isn't power of two. As the first step to remove the custom buffer management, this patch makes ide-tape allocate single continous buffer. The maximum order is four. I doubt the change would cause any trouble but if it ever matters, it should be converted to regular sg mechanism like everyone else and even in that case dropping custom buffer handling and moving to standard mechanism first make sense as an intermediate step. This patch makes the first bh to contain the whole buffer and drops multi bh handling code. Following patches will make further changes. This patch has the side effect of killing OOM triggered by allocation path and fixing DMA transfers. Previously, bug in alloc path triggered OOM on command issue and commands were passed to DMA engine without DMA-mapping all the segments. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 254 +++++++++-------------------------------- 1 file changed, 57 insertions(+), 197 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8226d52504d..b2afbc7bcb6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -134,7 +134,6 @@ enum { struct idetape_bh { u32 b_size; atomic_t b_count; - struct idetape_bh *b_reqnext; char *b_data; }; @@ -228,10 +227,6 @@ typedef struct ide_tape_obj { char *b_data; int b_count; - int pages_per_buffer; - /* Wasted space in each stage */ - int excess_bh_size; - /* Measures average tape speed */ unsigned long avg_time; int avg_size; @@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, struct idetape_bh *bh = pc->bh; int count; - while (bcount) { - if (bh == NULL) - break; + if (bcount && bh) { count = min( (unsigned int)(bh->b_size - atomic_read(&bh->b_count)), bcount); @@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, atomic_read(&bh->b_count), count); bcount -= count; atomic_add(count, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) { - bh = bh->b_reqnext; - if (bh) - atomic_set(&bh->b_count, 0); - } + if (atomic_read(&bh->b_count) == bh->b_size) + pc->bh = NULL; } - pc->bh = bh; - return bcount; } @@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, struct idetape_bh *bh = pc->bh; int count; - while (bcount) { - if (bh == NULL) - break; + if (bcount && bh) { count = min((unsigned int)pc->b_count, (unsigned int)bcount); drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count); bcount -= count; pc->b_data += count; pc->b_count -= count; - if (!pc->b_count) { - bh = bh->b_reqnext; - pc->bh = bh; - if (bh) { - pc->b_data = bh->b_data; - pc->b_count = atomic_read(&bh->b_count); - } - } + if (!pc->b_count) + pc->bh = NULL; } return bcount; @@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) { struct idetape_bh *bh = pc->bh; - int count; unsigned int bcount = pc->xferred; if (pc->flags & PC_FLAG_WRITING) return; - while (bcount) { - if (bh == NULL) { + if (bcount) { + if (bh == NULL || bcount > bh->b_size) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return; } - count = min((unsigned int)bh->b_size, (unsigned int)bcount); - atomic_set(&bh->b_count, count); + atomic_set(&bh->b_count, bcount); if (atomic_read(&bh->b_count) == bh->b_size) - bh = bh->b_reqnext; - bcount -= count; + pc->bh = NULL; } - pc->bh = bh; } /* @@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) /* Free data buffers completely. */ static void ide_tape_kfree_buffer(idetape_tape_t *tape) { - struct idetape_bh *prev_bh, *bh = tape->merge_bh; + struct idetape_bh *bh = tape->merge_bh; - while (bh) { - u32 size = bh->b_size; - - while (size) { - unsigned int order = fls(size >> PAGE_SHIFT)-1; - - if (bh->b_data) - free_pages((unsigned long)bh->b_data, order); - - size &= (order-1); - bh->b_data += (1 << order) * PAGE_SIZE; - } - prev_bh = bh; - bh = bh->b_reqnext; - kfree(prev_bh); - } + kfree(bh->b_data); + kfree(bh); } static void ide_tape_handle_dsc(ide_drive_t *); @@ -861,117 +823,50 @@ out: } /* - * The function below uses __get_free_pages to allocate a data buffer of size - * tape->buffer_size (or a bit more). We attempt to combine sequential pages as - * much as possible. - * - * It returns a pointer to the newly allocated buffer, or NULL in case of - * failure. + * It returns a pointer to the newly allocated buffer, or NULL in case + * of failure. */ static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape, - int full, int clear) + int full) { - struct idetape_bh *prev_bh, *bh, *merge_bh; - int pages = tape->pages_per_buffer; - unsigned int order, b_allocd; - char *b_data = NULL; + struct idetape_bh *bh; - merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - bh = merge_bh; - if (bh == NULL) - goto abort; + bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); + if (!bh) + return NULL; - order = fls(pages) - 1; - bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order); - if (!bh->b_data) - goto abort; - b_allocd = (1 << order) * PAGE_SIZE; - pages &= (order-1); - - if (clear) - memset(bh->b_data, 0, b_allocd); - bh->b_reqnext = NULL; - bh->b_size = b_allocd; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - - while (pages) { - order = fls(pages) - 1; - b_data = (char *) __get_free_pages(GFP_KERNEL, order); - if (!b_data) - goto abort; - b_allocd = (1 << order) * PAGE_SIZE; - - if (clear) - memset(b_data, 0, b_allocd); - - /* newly allocated page frames below buffer header or ...*/ - if (bh->b_data == b_data + b_allocd) { - bh->b_size += b_allocd; - bh->b_data -= b_allocd; - if (full) - atomic_add(b_allocd, &bh->b_count); - continue; - } - /* they are above the header */ - if (b_data == bh->b_data + bh->b_size) { - bh->b_size += b_allocd; - if (full) - atomic_add(b_allocd, &bh->b_count); - continue; - } - prev_bh = bh; - bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - if (!bh) { - free_pages((unsigned long) b_data, order); - goto abort; - } - bh->b_reqnext = NULL; - bh->b_data = b_data; - bh->b_size = b_allocd; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - prev_bh->b_reqnext = bh; - - pages &= (order-1); + bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!bh->b_data) { + kfree(bh); + return NULL; } - bh->b_size -= tape->excess_bh_size; - if (full) - atomic_sub(tape->excess_bh_size, &bh->b_count); - return merge_bh; -abort: - ide_tape_kfree_buffer(tape); - return NULL; + bh->b_size = tape->buffer_size; + atomic_set(&bh->b_count, full ? bh->b_size : 0); + + return bh; } static int idetape_copy_stage_from_user(idetape_tape_t *tape, const char __user *buf, int n) { struct idetape_bh *bh = tape->bh; - int count; int ret = 0; - while (n) { - if (bh == NULL) { + if (n) { + if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return 1; } - count = min((unsigned int) - (bh->b_size - atomic_read(&bh->b_count)), - (unsigned int)n); if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf, - count)) + n)) ret = 1; - n -= count; - atomic_add(count, &bh->b_count); - buf += count; - if (atomic_read(&bh->b_count) == bh->b_size) { - bh = bh->b_reqnext; - if (bh) - atomic_set(&bh->b_count, 0); - } + atomic_add(n, &bh->b_count); + if (atomic_read(&bh->b_count) == bh->b_size) + tape->bh = NULL; } - tape->bh = bh; + return ret; } @@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf, int n) { struct idetape_bh *bh = tape->bh; - int count; int ret = 0; - while (n) { - if (bh == NULL) { + if (n) { + if (bh == NULL || n > tape->b_count) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return 1; } - count = min(tape->b_count, n); - if (copy_to_user(buf, tape->b_data, count)) + if (copy_to_user(buf, tape->b_data, n)) ret = 1; - n -= count; - tape->b_data += count; - tape->b_count -= count; - buf += count; - if (!tape->b_count) { - bh = bh->b_reqnext; - tape->bh = bh; - if (bh) { - tape->b_data = bh->b_data; - tape->b_count = atomic_read(&bh->b_count); - } - } + tape->b_data += n; + tape->b_count -= n; + if (!tape->b_count) + tape->bh = NULL; } return ret; } @@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; - int blocks, min; + int blocks; struct idetape_bh *bh; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { @@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) if (tape->merge_bh_size) { blocks = tape->merge_bh_size / tape->blk_size; if (tape->merge_bh_size % tape->blk_size) { - unsigned int i; - + unsigned int i = tape->blk_size - + tape->merge_bh_size % tape->blk_size; blocks++; - i = tape->blk_size - tape->merge_bh_size % - tape->blk_size; - bh = tape->bh->b_reqnext; - while (bh) { - atomic_set(&bh->b_count, 0); - bh = bh->b_reqnext; - } bh = tape->bh; - while (i) { - if (bh == NULL) { - printk(KERN_INFO "ide-tape: bug," - " bh NULL\n"); - break; - } - min = min(i, (unsigned int)(bh->b_size - - atomic_read(&bh->b_count))); + if (bh) { memset(bh->b_data + atomic_read(&bh->b_count), - 0, min); - atomic_add(min, &bh->b_count); - i -= min; - bh = bh->b_reqnext; - } + 0, i); + atomic_add(i, &bh->b_count); + } else + printk(KERN_INFO "ide-tape: bug, bh NULL\n"); } (void) idetape_add_chrdev_write_request(drive, blocks); tape->merge_bh_size = 0; @@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive) " 0 now\n"); tape->merge_bh_size = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); if (!tape->merge_bh) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_READ; @@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; - struct idetape_bh *bh; + struct idetape_bh *bh = tape->merge_bh; int blocks; while (bcount) { unsigned int count; - bh = tape->merge_bh; count = min(tape->buffer_size, bcount); bcount -= count; blocks = count / tape->blk_size; - while (count) { - atomic_set(&bh->b_count, - min(count, (unsigned int)bh->b_size)); - memset(bh->b_data, 0, atomic_read(&bh->b_count)); - count -= atomic_read(&bh->b_count); - bh = bh->b_reqnext; - } + atomic_set(&bh->b_count, count); + memset(bh->b_data, 0, atomic_read(&bh->b_count)); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, tape->merge_bh); } @@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, "should be 0 now\n"); tape->merge_bh_size = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); if (!tape->merge_bh) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_WRITE; @@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor) idetape_tape_t *tape = drive->driver_data; ide_tape_flush_merge_buffer(drive); - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1); if (tape->merge_bh != NULL) { idetape_pad_zeros(drive, tape->blk_size * (tape->user_bs_factor - 1)); @@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) tape->buffer_size = *ctl * tape->blk_size; } buffer_size = tape->buffer_size; - tape->pages_per_buffer = buffer_size / PAGE_SIZE; - if (buffer_size % PAGE_SIZE) { - tape->pages_per_buffer++; - tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE; - } /* select the "best" DSC read/write polling freq */ speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]); From 21d9c5d227593d15630ae83a336d1519653e9b8a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0464/2061] ide-tape: use standard data transfer mechanism Impact: use standard way to transfer data ide-tape uses rq in an interesting way. For r/w requests, rq->special is used to carry a private buffer management structure idetape_bh and rq->nr_sectors and current_nr_sectors are initialized to the number of idetape blocks which isn't necessary 512 bytes. Also, rq->current_nr_sectors is used to report back the residual count in units of idetape blocks. This peculiarity taxes both block layer and ide. ide-atapi has different paths and hooks to accomodate it and what a rq means becomes quite confusing and making changes at the block layer becomes quite difficult and error-prone. This patch makes ide-tape use bio instead. With the previous patch, ide-tape currently is using single contiguos buffer so replacing it isn't difficult. Data buffer is mapped into bio using blk_rq_map_kern() in idetape_queue_rw_tail(). idetape_io_buffers() and idetape_update_buffers() are dropped and pc->bh is set to null to tell ide-atapi to use standard data transfer mechanism and idetape_bh byte counts are updated by the issuer on completion using the residual count. This change also nicely removes the FIXME in ide_pc_intr() where ide-tape rqs need to be completed using ide_rq_bytes() instead of blk_rq_bytes() (although this didn't really matter as the request didn't have bio). Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 6 +-- drivers/ide/ide-tape.c | 112 +++++++++------------------------------- 2 files changed, 24 insertions(+), 94 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 3df5442de71..b9dd4503cbc 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) * ->pc_callback() might change rq->data_len for * residual count, cache total length. */ - if (!blk_special_request(rq) && - (drive->media == ide_tape && !rq->bio)) - done = ide_rq_bytes(rq); /* FIXME */ - else - done = blk_rq_bytes(rq); + done = blk_rq_bytes(rq); /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index b2afbc7bcb6..b373ac87635 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i) return tape; } -static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount) -{ - struct idetape_bh *bh = pc->bh; - int count; - - if (bcount && bh) { - count = min( - (unsigned int)(bh->b_size - atomic_read(&bh->b_count)), - bcount); - drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data + - atomic_read(&bh->b_count), count); - bcount -= count; - atomic_add(count, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) - pc->bh = NULL; - } - - return bcount; -} - -static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount) -{ - struct idetape_bh *bh = pc->bh; - int count; - - if (bcount && bh) { - count = min((unsigned int)pc->b_count, (unsigned int)bcount); - drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count); - bcount -= count; - pc->b_data += count; - pc->b_count -= count; - if (!pc->b_count) - pc->bh = NULL; - } - - return bcount; -} - -static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) -{ - struct idetape_bh *bh = pc->bh; - unsigned int bcount = pc->xferred; - - if (pc->flags & PC_FLAG_WRITING) - return; - if (bcount) { - if (bh == NULL || bcount > bh->b_size) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return; - } - atomic_set(&bh->b_count, bcount); - if (atomic_read(&bh->b_count) == bh->b_size) - pc->bh = NULL; - } -} - /* * called on each failed packet command retry to analyze the request sense. We * currently do not utilize this information. @@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) pc->c[0], tape->sense_key, tape->asc, tape->ascq); /* Correct pc->xferred by asking the tape. */ - if (pc->flags & PC_FLAG_DMA_ERROR) { + if (pc->flags & PC_FLAG_DMA_ERROR) pc->xferred = pc->req_xfer - tape->blk_size * get_unaligned_be32(&sense[3]); - idetape_update_buffers(drive, pc); - } /* * If error was the result of a zero-length read or write command, @@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->current_nr_sectors -= blocks; + rq->data_len -= blocks * tape->blk_size; if (pc->error) { uptodate = 0; @@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive) idetape_postpone_request(drive); } -static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount, int write) -{ - unsigned int bleft; - - if (write) - bleft = idetape_output_buffers(drive, pc, bcount); - else - bleft = idetape_input_buffers(drive, pc, bcount); - - return bcount - bleft; -} - /* * Packet Command Interface * @@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = bh; + pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; @@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct idetape_bh *bh) { idetape_tape_t *tape = drive->driver_data; + size_t size = blocks * tape->blk_size; struct request *rq; - int ret, errors; + int ret; debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd); + BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; @@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, rq->rq_disk = tape->disk; rq->special = (void *)bh; rq->sector = tape->first_frame; - rq->nr_sectors = blocks; - rq->current_nr_sectors = blocks; + + if (size) { + ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size, + __GFP_WAIT); + if (ret) + goto out_put; + } + blk_execute_rq(drive->queue, tape->disk, rq, 0); - errors = rq->errors; - ret = tape->blk_size * (blocks - rq->current_nr_sectors); - blk_put_request(rq); + /* calculate the number of transferred bytes and update bh */ + size -= rq->data_len; + if (cmd == REQ_IDETAPE_READ) + atomic_add(size, &bh->b_count); - if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0) - return 0; + ret = size; + if (rq->errors == IDE_DRV_ERROR_GENERAL) + ret = -EIO; if (tape->merge_bh) idetape_init_merge_buffer(tape); - if (errors == IDE_DRV_ERROR_GENERAL) - return -EIO; +out_put: + blk_put_request(rq); return ret; } @@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) u16 *ctl = (u16 *)&tape->caps[12]; drive->pc_callback = ide_tape_callback; - drive->pc_update_buffers = idetape_update_buffers; - drive->pc_io_buffers = ide_tape_io_buffers; drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP; From 963da55c4b9eeeb2085ca74ba927cf77bce966d4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0465/2061] ide-tape: kill idetape_bh Impact: kill now unnecessary idetape_bh With everything using standard mechanisms, there is no need for idetape_bh anymore. Kill it and use tape->buf, cur and valid to describe data buffer instead. Changes worth mentioning are... * idetape_queue_rq_tail() now always queue tape->buf and and adjusts buffer state properly before completion. * idetape_pad_zeros() clears the buffer only once. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 305 ++++++++++++----------------------------- 1 file changed, 84 insertions(+), 221 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index b373ac87635..d2e9c09b521 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -131,12 +131,6 @@ enum { IDETAPE_DIR_WRITE = (1 << 2), }; -struct idetape_bh { - u32 b_size; - atomic_t b_count; - char *b_data; -}; - /* Tape door status */ #define DOOR_UNLOCKED 0 #define DOOR_LOCKED 1 @@ -218,14 +212,12 @@ typedef struct ide_tape_obj { /* Data buffer size chosen based on the tape's recommendation */ int buffer_size; - /* merge buffer */ - struct idetape_bh *merge_bh; - /* size of the merge buffer */ - int merge_bh_size; - /* pointer to current buffer head within the merge buffer */ - struct idetape_bh *bh; - char *b_data; - int b_count; + /* Staging buffer of buffer_size bytes */ + void *buf; + /* The read/write cursor */ + void *cur; + /* The number of valid bytes in buf */ + size_t valid; /* Measures average tape speed */ unsigned long avg_time; @@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) } } -/* Free data buffers completely. */ -static void ide_tape_kfree_buffer(idetape_tape_t *tape) -{ - struct idetape_bh *bh = tape->merge_bh; - - kfree(bh->b_data); - kfree(bh); -} - static void ide_tape_handle_dsc(ide_drive_t *); static int ide_tape_callback(ide_drive_t *drive, int dsc) @@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, struct ide_atapi_pc *pc, struct request *rq, u8 opcode) { - struct idetape_bh *bh = (struct idetape_bh *)rq->special; - unsigned int length = rq->current_nr_sectors; + unsigned int length = rq->nr_sectors; ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); @@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, if (pc->req_xfer == tape->buffer_size) pc->flags |= PC_FLAG_DMA_OK; - if (opcode == READ_6) { + if (opcode == READ_6) pc->c[0] = READ_6; - atomic_set(&bh->b_count, 0); - } else if (opcode == WRITE_6) { + else if (opcode == WRITE_6) { pc->c[0] = WRITE_6; pc->flags |= PC_FLAG_WRITING; - pc->b_data = bh->b_data; - pc->b_count = atomic_read(&bh->b_count); } memcpy(rq->cmd, pc->c, 12); @@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu," - " current_nr_sectors: %u\n", - (unsigned long long)rq->sector, rq->nr_sectors, - rq->current_nr_sectors); + debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n" + (unsigned long long)rq->sector, rq->nr_sectors); if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. */ @@ -748,89 +725,6 @@ out: return ide_tape_issue_pc(drive, &cmd, pc); } -/* - * It returns a pointer to the newly allocated buffer, or NULL in case - * of failure. - */ -static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape, - int full) -{ - struct idetape_bh *bh; - - bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - if (!bh) - return NULL; - - bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!bh->b_data) { - kfree(bh); - return NULL; - } - - bh->b_size = tape->buffer_size; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - - return bh; -} - -static int idetape_copy_stage_from_user(idetape_tape_t *tape, - const char __user *buf, int n) -{ - struct idetape_bh *bh = tape->bh; - int ret = 0; - - if (n) { - if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return 1; - } - if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf, - n)) - ret = 1; - atomic_add(n, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) - tape->bh = NULL; - } - - return ret; -} - -static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf, - int n) -{ - struct idetape_bh *bh = tape->bh; - int ret = 0; - - if (n) { - if (bh == NULL || n > tape->b_count) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return 1; - } - if (copy_to_user(buf, tape->b_data, n)) - ret = 1; - tape->b_data += n; - tape->b_count -= n; - if (!tape->b_count) - tape->bh = NULL; - } - return ret; -} - -static void idetape_init_merge_buffer(idetape_tape_t *tape) -{ - struct idetape_bh *bh = tape->merge_bh; - tape->bh = tape->merge_bh; - - if (tape->chrdev_dir == IDETAPE_DIR_WRITE) - atomic_set(&bh->b_count, 0); - else { - tape->b_data = bh->b_data; - tape->b_count = atomic_read(&bh->b_count); - } -} - /* * Write a filemark if write_filemark=1. Flush the device buffers without * writing a filemark otherwise. @@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive) return; clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags); - tape->merge_bh_size = 0; - if (tape->merge_bh != NULL) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + tape->valid = 0; + if (tape->buf != NULL) { + kfree(tape->buf); + tape->buf = NULL; } tape->chrdev_dir = IDETAPE_DIR_NONE; @@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive, * Generate a read/write request for the block device interface and wait for it * to be serviced. */ -static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, - struct idetape_bh *bh) +static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks) { idetape_tape_t *tape = drive->driver_data; size_t size = blocks * tape->blk_size; @@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; - rq->special = (void *)bh; rq->sector = tape->first_frame; if (size) { - ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size, + ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, __GFP_WAIT); if (ret) goto out_put; @@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, blk_execute_rq(drive->queue, tape->disk, rq, 0); - /* calculate the number of transferred bytes and update bh */ + /* calculate the number of transferred bytes and update buffer state */ size -= rq->data_len; + tape->cur = tape->buf; if (cmd == REQ_IDETAPE_READ) - atomic_add(size, &bh->b_count); + tape->valid = size; + else + tape->valid = 0; ret = size; if (rq->errors == IDE_DRV_ERROR_GENERAL) ret = -EIO; - - if (tape->merge_bh) - idetape_init_merge_buffer(tape); out_put: blk_put_request(rq); return ret; @@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) /* Queue up a character device originated write request. */ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) { - idetape_tape_t *tape = drive->driver_data; - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, - blocks, tape->merge_bh); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks); } static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; int blocks; - struct idetape_bh *bh; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer" " but we are not writing.\n"); return; } - if (tape->merge_bh_size > tape->buffer_size) { - printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n"); - tape->merge_bh_size = tape->buffer_size; - } - if (tape->merge_bh_size) { - blocks = tape->merge_bh_size / tape->blk_size; - if (tape->merge_bh_size % tape->blk_size) { - unsigned int i = tape->blk_size - - tape->merge_bh_size % tape->blk_size; + if (tape->buf) { + blocks = tape->valid / tape->blk_size; + if (tape->valid % tape->blk_size) { blocks++; - bh = tape->bh; - if (bh) { - memset(bh->b_data + atomic_read(&bh->b_count), - 0, i); - atomic_add(i, &bh->b_count); - } else - printk(KERN_INFO "ide-tape: bug, bh NULL\n"); + memset(tape->buf + tape->valid, 0, + tape->blk_size - tape->valid % tape->blk_size); } (void) idetape_add_chrdev_write_request(drive, blocks); - tape->merge_bh_size = 0; } - if (tape->merge_bh != NULL) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + if (tape->buf != NULL) { + kfree(tape->buf); + tape->buf = NULL; } tape->chrdev_dir = IDETAPE_DIR_NONE; } @@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive) ide_tape_flush_merge_buffer(drive); idetape_flush_tape_buffers(drive); } - if (tape->merge_bh || tape->merge_bh_size) { - printk(KERN_ERR "ide-tape: merge_bh_size should be" - " 0 now\n"); - tape->merge_bh_size = 0; + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); - if (!tape->merge_bh) + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_READ; + tape->cur = tape->buf; /* * Issue a read 0 command to ensure that DSC handshake is @@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive) */ if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { bytes_read = idetape_queue_rw_tail(drive, - REQ_IDETAPE_READ, 0, - tape->merge_bh); + REQ_IDETAPE_READ, 0); if (bytes_read < 0) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; tape->chrdev_dir = IDETAPE_DIR_NONE; return bytes_read; } @@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive) /* called from idetape_chrdev_read() to service a chrdev read request. */ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) { - idetape_tape_t *tape = drive->driver_data; - debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks); /* If we are at a filemark, return a read length of 0 */ @@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) idetape_init_read(drive); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks, - tape->merge_bh); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); } static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; - struct idetape_bh *bh = tape->merge_bh; - int blocks; + + memset(tape->buf, 0, tape->buffer_size); while (bcount) { - unsigned int count; + unsigned int count = min(tape->buffer_size, bcount); - count = min(tape->buffer_size, bcount); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, + count / tape->blk_size); bcount -= count; - blocks = count / tape->blk_size; - atomic_set(&bh->b_count, count); - memset(bh->b_data, 0, atomic_read(&bh->b_count)); - - idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, - tape->merge_bh); } } @@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, } if (tape->chrdev_dir == IDETAPE_DIR_READ) { - tape->merge_bh_size = 0; + tape->valid = 0; if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) ++count; ide_tape_discard_merge_buffer(drive, 0); @@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, return rc; if (count == 0) return (0); - if (tape->merge_bh_size) { - actually_read = min((unsigned int)(tape->merge_bh_size), - (unsigned int)count); - if (idetape_copy_stage_to_user(tape, buf, actually_read)) + if (tape->valid) { + actually_read = min_t(unsigned int, tape->valid, count); + if (copy_to_user(buf, tape->cur, actually_read)) ret = -EFAULT; buf += actually_read; - tape->merge_bh_size -= actually_read; count -= actually_read; + tape->cur += actually_read; + tape->valid -= actually_read; } while (count >= tape->buffer_size) { bytes_read = idetape_add_chrdev_read_request(drive, ctl); if (bytes_read <= 0) goto finish; - if (idetape_copy_stage_to_user(tape, buf, bytes_read)) + if (copy_to_user(buf, tape->cur, bytes_read)) ret = -EFAULT; buf += bytes_read; count -= bytes_read; @@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, if (bytes_read <= 0) goto finish; temp = min((unsigned long)count, (unsigned long)bytes_read); - if (idetape_copy_stage_to_user(tape, buf, temp)) + if (copy_to_user(buf, tape->cur, temp)) ret = -EFAULT; actually_read += temp; - tape->merge_bh_size = bytes_read-temp; + tape->cur += temp; + tape->valid -= temp; } finish: if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { @@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { if (tape->chrdev_dir == IDETAPE_DIR_READ) ide_tape_discard_merge_buffer(drive, 1); - if (tape->merge_bh || tape->merge_bh_size) { - printk(KERN_ERR "ide-tape: merge_bh_size " - "should be 0 now\n"); - tape->merge_bh_size = 0; + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); - if (!tape->merge_bh) + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_WRITE; - idetape_init_merge_buffer(tape); + tape->cur = tape->buf; /* * Issue a write 0 command to ensure that DSC handshake is @@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, */ if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { ssize_t retval = idetape_queue_rw_tail(drive, - REQ_IDETAPE_WRITE, 0, - tape->merge_bh); + REQ_IDETAPE_WRITE, 0); if (retval < 0) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; tape->chrdev_dir = IDETAPE_DIR_NONE; return retval; } @@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } if (count == 0) return (0); - if (tape->merge_bh_size) { - if (tape->merge_bh_size >= tape->buffer_size) { - printk(KERN_ERR "ide-tape: bug: merge buf too big\n"); - tape->merge_bh_size = 0; - } - actually_written = min((unsigned int) - (tape->buffer_size - tape->merge_bh_size), - (unsigned int)count); - if (idetape_copy_stage_from_user(tape, buf, actually_written)) - ret = -EFAULT; + if (tape->valid < tape->buffer_size) { + actually_written = min_t(unsigned int, + tape->buffer_size - tape->valid, + count); + if (copy_from_user(tape->cur, buf, actually_written)) + ret = -EFAULT; buf += actually_written; - tape->merge_bh_size += actually_written; count -= actually_written; + tape->cur += actually_written; + tape->valid += actually_written; - if (tape->merge_bh_size == tape->buffer_size) { + if (tape->valid == tape->buffer_size) { ssize_t retval; - tape->merge_bh_size = 0; retval = idetape_add_chrdev_write_request(drive, ctl); if (retval <= 0) return (retval); @@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } while (count >= tape->buffer_size) { ssize_t retval; - if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size)) + if (copy_from_user(tape->cur, buf, tape->buffer_size)) ret = -EFAULT; buf += tape->buffer_size; count -= tape->buffer_size; @@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } if (count) { actually_written += count; - if (idetape_copy_stage_from_user(tape, buf, count)) + if (copy_from_user(tape->cur, buf, count)) ret = -EFAULT; - tape->merge_bh_size += count; + tape->cur += count; + tape->valid += count; } return ret ? ret : actually_written; } @@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file, idetape_flush_tape_buffers(drive); } if (cmd == MTIOCGET || cmd == MTIOCPOS) { - block_offset = tape->merge_bh_size / + block_offset = tape->valid / (tape->blk_size * tape->user_bs_factor); position = idetape_read_position(drive); if (position < 0) @@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor) idetape_tape_t *tape = drive->driver_data; ide_tape_flush_merge_buffer(drive); - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1); - if (tape->merge_bh != NULL) { + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (tape->buf != NULL) { idetape_pad_zeros(drive, tape->blk_size * (tape->user_bs_factor - 1)); - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; } idetape_write_filemark(drive); idetape_flush_tape_buffers(drive); @@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev) ide_drive_t *drive = tape->drive; struct gendisk *g = tape->disk; - BUG_ON(tape->merge_bh_size); + BUG_ON(tape->valid); drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP; drive->driver_data = NULL; From 88f1b941c5c94016a59144a3c94c9ca31eb16205 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0466/2061] ide-tape: unify r/w init paths Impact: cleanup Read and write init paths are almost identical. Unify them into idetape_init_rw(). Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 110 +++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d2e9c09b521..4da7fa9bca1 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) tape->chrdev_dir = IDETAPE_DIR_NONE; } -static int idetape_init_read(ide_drive_t *drive) +static int idetape_init_rw(ide_drive_t *drive, int dir) { idetape_tape_t *tape = drive->driver_data; - int bytes_read; + int rc; - /* Initialize read operation */ - if (tape->chrdev_dir != IDETAPE_DIR_READ) { - if (tape->chrdev_dir == IDETAPE_DIR_WRITE) { - ide_tape_flush_merge_buffer(drive); - idetape_flush_tape_buffers(drive); - } - if (tape->buf || tape->valid) { - printk(KERN_ERR "ide-tape: valid should be 0 now\n"); - tape->valid = 0; - } - tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!tape->buf) - return -ENOMEM; - tape->chrdev_dir = IDETAPE_DIR_READ; - tape->cur = tape->buf; + BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE); - /* - * Issue a read 0 command to ensure that DSC handshake is - * switched from completion mode to buffer available mode. - * No point in issuing this if DSC overlap isn't supported, some - * drives (Seagate STT3401A) will return an error. - */ - if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { - bytes_read = idetape_queue_rw_tail(drive, - REQ_IDETAPE_READ, 0); - if (bytes_read < 0) { - kfree(tape->buf); - tape->buf = NULL; - tape->chrdev_dir = IDETAPE_DIR_NONE; - return bytes_read; - } + if (tape->chrdev_dir == dir) + return 0; + + if (tape->chrdev_dir == IDETAPE_DIR_READ) + ide_tape_discard_merge_buffer(drive, 1); + else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) { + ide_tape_flush_merge_buffer(drive); + idetape_flush_tape_buffers(drive); + } + + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; + } + + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) + return -ENOMEM; + tape->chrdev_dir = dir; + tape->cur = tape->buf; + + /* + * Issue a 0 rw command to ensure that DSC handshake is + * switched from completion mode to buffer available mode. No + * point in issuing this if DSC overlap isn't supported, some + * drives (Seagate STT3401A) will return an error. + */ + if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { + int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ + : REQ_IDETAPE_WRITE; + + rc = idetape_queue_rw_tail(drive, cmd, 0); + if (rc < 0) { + kfree(tape->buf); + tape->buf = NULL; + tape->chrdev_dir = IDETAPE_DIR_NONE; + return rc; } } @@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) return 0; - idetape_init_read(drive); + idetape_init_rw(drive, IDETAPE_DIR_READ); return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); } @@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, (count % tape->blk_size) == 0) tape->user_bs_factor = count / tape->blk_size; } - rc = idetape_init_read(drive); + rc = idetape_init_rw(drive, IDETAPE_DIR_READ); if (rc < 0) return rc; if (count == 0) @@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ssize_t actually_written = 0; ssize_t ret = 0; u16 ctl = *(u16 *)&tape->caps[12]; + int rc; /* The drive is write protected. */ if (tape->write_prot) @@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); /* Initialize write operation */ - if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { - if (tape->chrdev_dir == IDETAPE_DIR_READ) - ide_tape_discard_merge_buffer(drive, 1); - if (tape->buf || tape->valid) { - printk(KERN_ERR "ide-tape: valid should be 0 now\n"); - tape->valid = 0; - } - tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!tape->buf) - return -ENOMEM; - tape->chrdev_dir = IDETAPE_DIR_WRITE; - tape->cur = tape->buf; - - /* - * Issue a write 0 command to ensure that DSC handshake is - * switched from completion mode to buffer available mode. No - * point in issuing this if DSC overlap isn't supported, some - * drives (Seagate STT3401A) will return an error. - */ - if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { - ssize_t retval = idetape_queue_rw_tail(drive, - REQ_IDETAPE_WRITE, 0); - if (retval < 0) { - kfree(tape->buf); - tape->buf = NULL; - tape->chrdev_dir = IDETAPE_DIR_NONE; - return retval; - } - } - } + rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); + if (rc < 0) + return rc; if (count == 0) return (0); if (tape->valid < tape->buffer_size) { From 6bb11dd14f70228f8dab25fd25dabeb9bc74926d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0467/2061] ide-tape: use byte size instead of sectors on rw issue functions Impact: cleanup Byte size is what most issue functions deal with, make idetape_queue_rw_tail() and its wrappers take byte size instead of sector counts. idetape_chrdev_read() and write() functions are converted to use tape->buffer_size instead of ctl from tape->cap. This cleans up code a little bit and will ease the next r/w reimplementation. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 45 +++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 4da7fa9bca1..d5e9bb286e3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive, * Generate a read/write request for the block device interface and wait for it * to be serviced. */ -static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks) +static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) { idetape_tape_t *tape = drive->driver_data; - size_t size = blocks * tape->blk_size; struct request *rq; int ret; debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd); BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); + BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; @@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) } /* Queue up a character device originated write request. */ -static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) +static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size) { debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size); } static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; - int blocks; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer" @@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) return; } if (tape->buf) { - blocks = tape->valid / tape->blk_size; - if (tape->valid % tape->blk_size) { - blocks++; - memset(tape->buf + tape->valid, 0, - tape->blk_size - tape->valid % tape->blk_size); - } - (void) idetape_add_chrdev_write_request(drive, blocks); - } - if (tape->buf != NULL) { + size_t aligned = roundup(tape->valid, tape->blk_size); + + memset(tape->cur, 0, aligned - tape->valid); + idetape_add_chrdev_write_request(drive, aligned); kfree(tape->buf); tape->buf = NULL; } @@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir) } /* called from idetape_chrdev_read() to service a chrdev read request. */ -static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) +static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size) { - debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks); + debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size); /* If we are at a filemark, return a read length of 0 */ if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) @@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) idetape_init_rw(drive, IDETAPE_DIR_READ); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size); } static void idetape_pad_zeros(ide_drive_t *drive, int bcount) @@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount) while (bcount) { unsigned int count = min(tape->buffer_size, bcount); - idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, - count / tape->blk_size); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count); bcount -= count; } } @@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, ide_drive_t *drive = tape->drive; ssize_t bytes_read, temp, actually_read = 0, rc; ssize_t ret = 0; - u16 ctl = *(u16 *)&tape->caps[12]; debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); @@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, tape->valid -= actually_read; } while (count >= tape->buffer_size) { - bytes_read = idetape_add_chrdev_read_request(drive, ctl); + bytes_read = idetape_add_chrdev_read_request(drive, + tape->buffer_size); if (bytes_read <= 0) goto finish; if (copy_to_user(buf, tape->cur, bytes_read)) @@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, actually_read += bytes_read; } if (count) { - bytes_read = idetape_add_chrdev_read_request(drive, ctl); + bytes_read = idetape_add_chrdev_read_request(drive, + tape->buffer_size); if (bytes_read <= 0) goto finish; temp = min((unsigned long)count, (unsigned long)bytes_read); @@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ide_drive_t *drive = tape->drive; ssize_t actually_written = 0; ssize_t ret = 0; - u16 ctl = *(u16 *)&tape->caps[12]; int rc; /* The drive is write protected. */ @@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->valid == tape->buffer_size) { ssize_t retval; - retval = idetape_add_chrdev_write_request(drive, ctl); + retval = idetape_add_chrdev_write_request(drive, + tape->buffer_size); if (retval <= 0) return (retval); } @@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ret = -EFAULT; buf += tape->buffer_size; count -= tape->buffer_size; - retval = idetape_add_chrdev_write_request(drive, ctl); + retval = idetape_add_chrdev_write_request(drive, + tape->buffer_size); actually_written += tape->buffer_size; if (retval <= 0) return (retval); From 07bd9686c50c2b1f10e48089d4fb836a971f5177 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0468/2061] ide-tape: simplify read/write functions Impact: cleanup idetape_chrdev_read/write() functions are unnecessarily complex when everything can be handled in a single loop. Collapse idetape_add_chrdev_read/write_request() into the rw functions and simplify the implementation. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 147 ++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 98 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d5e9bb286e3..2599579e417 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) pc->flags |= PC_FLAG_WAIT_FOR_DSC; } -/* Queue up a character device originated write request. */ -static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size) -{ - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size); -} - static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; @@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) size_t aligned = roundup(tape->valid, tape->blk_size); memset(tape->cur, 0, aligned - tape->valid); - idetape_add_chrdev_write_request(drive, aligned); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned); kfree(tape->buf); tape->buf = NULL; } @@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir) return 0; } -/* called from idetape_chrdev_read() to service a chrdev read request. */ -static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size) -{ - debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size); - - /* If we are at a filemark, return a read length of 0 */ - if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) - return 0; - - idetape_init_rw(drive, IDETAPE_DIR_READ); - - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size); -} - static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; @@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, { struct ide_tape_obj *tape = file->private_data; ide_drive_t *drive = tape->drive; - ssize_t bytes_read, temp, actually_read = 0, rc; + size_t done = 0; ssize_t ret = 0; + int rc; debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); @@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, (count % tape->blk_size) == 0) tape->user_bs_factor = count / tape->blk_size; } + rc = idetape_init_rw(drive, IDETAPE_DIR_READ); if (rc < 0) return rc; - if (count == 0) - return (0); - if (tape->valid) { - actually_read = min_t(unsigned int, tape->valid, count); - if (copy_to_user(buf, tape->cur, actually_read)) + + while (done < count) { + size_t todo; + + /* refill if staging buffer is empty */ + if (!tape->valid) { + /* If we are at a filemark, nothing more to read */ + if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) + break; + /* read */ + if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, + tape->buffer_size) <= 0) + break; + } + + /* copy out */ + todo = min_t(size_t, count - done, tape->valid); + if (copy_to_user(buf + done, tape->cur, todo)) ret = -EFAULT; - buf += actually_read; - count -= actually_read; - tape->cur += actually_read; - tape->valid -= actually_read; + + tape->cur += todo; + tape->valid -= todo; + done += todo; } - while (count >= tape->buffer_size) { - bytes_read = idetape_add_chrdev_read_request(drive, - tape->buffer_size); - if (bytes_read <= 0) - goto finish; - if (copy_to_user(buf, tape->cur, bytes_read)) - ret = -EFAULT; - buf += bytes_read; - count -= bytes_read; - actually_read += bytes_read; - } - if (count) { - bytes_read = idetape_add_chrdev_read_request(drive, - tape->buffer_size); - if (bytes_read <= 0) - goto finish; - temp = min((unsigned long)count, (unsigned long)bytes_read); - if (copy_to_user(buf, tape->cur, temp)) - ret = -EFAULT; - actually_read += temp; - tape->cur += temp; - tape->valid -= temp; - } -finish: - if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { + + if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name); idetape_space_over_filemarks(drive, MTFSF, 1); return 0; } - return ret ? ret : actually_read; + return ret ? ret : done; } static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, @@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, { struct ide_tape_obj *tape = file->private_data; ide_drive_t *drive = tape->drive; - ssize_t actually_written = 0; + size_t done = 0; ssize_t ret = 0; int rc; @@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); if (rc < 0) return rc; - if (count == 0) - return (0); - if (tape->valid < tape->buffer_size) { - actually_written = min_t(unsigned int, - tape->buffer_size - tape->valid, - count); - if (copy_from_user(tape->cur, buf, actually_written)) - ret = -EFAULT; - buf += actually_written; - count -= actually_written; - tape->cur += actually_written; - tape->valid += actually_written; - if (tape->valid == tape->buffer_size) { - ssize_t retval; - retval = idetape_add_chrdev_write_request(drive, - tape->buffer_size); - if (retval <= 0) - return (retval); - } - } - while (count >= tape->buffer_size) { - ssize_t retval; - if (copy_from_user(tape->cur, buf, tape->buffer_size)) + while (done < count) { + size_t todo; + + /* flush if staging buffer is full */ + if (tape->valid == tape->buffer_size && + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, + tape->buffer_size) <= 0) + return rc; + + /* copy in */ + todo = min_t(size_t, count - done, + tape->buffer_size - tape->valid); + if (copy_from_user(tape->cur, buf + done, todo)) ret = -EFAULT; - buf += tape->buffer_size; - count -= tape->buffer_size; - retval = idetape_add_chrdev_write_request(drive, - tape->buffer_size); - actually_written += tape->buffer_size; - if (retval <= 0) - return (retval); + + tape->cur += todo; + tape->valid += todo; + done += todo; } - if (count) { - actually_written += count; - if (copy_from_user(tape->cur, buf, count)) - ret = -EFAULT; - tape->cur += count; - tape->valid += count; - } - return ret ? ret : actually_written; + + return ret ? ret : done; } static int idetape_write_filemark(ide_drive_t *drive) From 6d7003877c2f0578f1c08f66d05c3f72ef4ae596 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0469/2061] ide-atapi: kill unused fields and callbacks Impact: remove fields and code paths which are no longer necessary Now that ide-tape uses standard mechanisms to transfer data, special case handling for bh handling can be dropped from ide-atapi. Drop the followings. * pc->cur_pos, b_count, bh and b_data * drive->pc_update_buffers() and pc_io_buffers(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 17 ++++------------- drivers/ide/ide-tape.c | 1 - include/linux/ide.h | 12 ------------ 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b9dd4503cbc..afe5a432387 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->name, rq_data_dir(pc->rq) ? "write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; - } else { + } else pc->xferred = pc->req_xfer; - if (drive->pc_update_buffers) - drive->pc_update_buffers(drive, pc); - } debug_log("%s: DMA finished\n", drive->name); } @@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (drive->media == ide_tape && pc->bh) - done = drive->pc_io_buffers(drive, pc, bcount, write); - else { - done = min_t(unsigned int, bcount, cmd->nleft); - ide_pio_bytes(drive, cmd, write, done); - } + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); - /* Update the current position */ + /* Update transferred byte count */ pc->xferred += done; - pc->cur_pos += done; bcount -= done; @@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) /* We haven't transferred any data yet */ pc->xferred = 0; - pc->cur_pos = pc->buf; valid_tf = IDE_VALID_DEVICE; bcount = ((drive->media == ide_tape) ? diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2599579e417..8dfc68892d6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; diff --git a/include/linux/ide.h b/include/linux/ide.h index 1957461ac76..34c128f0a33 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -362,11 +362,7 @@ struct ide_atapi_pc { /* data buffer */ u8 *buf; - /* current buffer position */ - u8 *cur_pos; int buf_size; - /* missing/available data on the current buffer */ - int b_count; /* the corresponding request */ struct request *rq; @@ -379,10 +375,6 @@ struct ide_atapi_pc { */ u8 pc_buf[IDE_PC_BUFFER_SIZE]; - /* idetape only */ - struct idetape_bh *bh; - char *b_data; - unsigned long timeout; }; @@ -595,10 +587,6 @@ struct ide_drive_s { /* callback for packet commands */ int (*pc_callback)(struct ide_drive_s *, int); - void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *); - int (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *, - unsigned int, int); - ide_startstop_t (*irq_handler)(struct ide_drive_s *); unsigned long atapi_flags; From 2c316bb57ad4e9f0f3de2d7ef1ae85530c2a7e69 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0470/2061] ide: drop rq->data handling from ide_map_sg() Impact: remove code path which is no longer necessary All IDE data transfers now use rq->bio. Simplify ide_map_sg() accordingly. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 6e3094e2277..a0309ea661a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) struct scatterlist *sg = hwif->sg_table; struct request *rq = cmd->rq; - if (!rq->bio) { - sg_init_one(sg, rq->data, rq->data_len); - cmd->sg_nents = 1; - } else - cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); + cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); } EXPORT_SYMBOL_GPL(ide_map_sg); From e475eedb09ee9a0fd855f3e923aa9af31c17d141 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 15 Apr 2009 10:50:04 +0000 Subject: [PATCH 0471/2061] clocksource: sh_cmt earlytimer support Add Early Platform Driver support to the sh_cmt driver using the earlytimer class. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 1c92c39a53a..02bae3994ab 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -566,9 +566,19 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) static int __devinit sh_cmt_probe(struct platform_device *pdev) { struct sh_cmt_priv *p = platform_get_drvdata(pdev); + struct sh_cmt_config *cfg = pdev->dev.platform_data; int ret; - p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p) { + pr_info("sh_cmt: %s kept as earlytimer\n", cfg->name); + return 0; + } + + if (is_early_platform_device(pdev)) + p = alloc_bootmem(sizeof(*p)); + else + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { dev_err(&pdev->dev, "failed to allocate driver data\n"); return -ENOMEM; @@ -576,7 +586,10 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev) ret = sh_cmt_setup(p, pdev); if (ret) { - kfree(p); + if (is_early_platform_device(pdev)) + free_bootmem(__pa(p), sizeof(*p)); + else + kfree(p); platform_set_drvdata(pdev, NULL); } @@ -606,6 +619,7 @@ static void __exit sh_cmt_exit(void) platform_driver_unregister(&sh_cmt_device_driver); } +early_platform_init("earlytimer", &sh_cmt_device_driver); module_init(sh_cmt_init); module_exit(sh_cmt_exit); From eaab89197b733d0f81f07d6c44294b674479fda8 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 15 Apr 2009 10:50:12 +0000 Subject: [PATCH 0472/2061] sh: arch earlytimer support Extend the 32-bit SuperH timer code to register and probe the earlytimer class of Early Platform Drivers. This registers the sh_cmt driver if compiled-in. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/time_32.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index c34e1e0f9b0..04b8c6ab166 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -17,6 +17,7 @@ #include #include #include /* for rtc_lock */ +#include #include #include #include @@ -228,6 +229,14 @@ void __init time_init(void) local_timer_setup(smp_processor_id()); #endif + /* + * Make sure all compiled-in early timers register themselves. + * Run probe() for one "earlytimer" device. + */ + early_platform_driver_register_all("earlytimer"); + if (early_platform_driver_probe("earlytimer", 1, 0)) + return; + /* * Find the timer to use as the system timer, it will be * initialized for us. From 87a00dc059e3af46303f1f56b0e8df41af988c7b Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 15 Apr 2009 10:50:21 +0000 Subject: [PATCH 0473/2061] sh: Add plat_early_device_setup() Add a plat_early_device_setup() function to allow processor-specific code to register Early Platform Data. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/include/asm/device.h | 2 ++ arch/sh/kernel/setup.c | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/arch/sh/include/asm/device.h b/arch/sh/include/asm/device.h index efd511d0803..8688a88303e 100644 --- a/arch/sh/include/asm/device.h +++ b/arch/sh/include/asm/device.h @@ -10,3 +10,5 @@ struct platform_device; int platform_resource_setup_memory(struct platform_device *pdev, char *name, unsigned long memsize); +void plat_early_device_setup(void); + diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 04a6004fccc..22b976d4201 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -328,6 +329,10 @@ static int __init parse_elfcorehdr(char *arg) early_param("elfcorehdr", parse_elfcorehdr); #endif +void __init __attribute__ ((weak)) plat_early_device_setup(void) +{ +} + void __init setup_arch(char **cmdline_p) { enable_mmu(); @@ -381,6 +386,8 @@ void __init setup_arch(char **cmdline_p) parse_early_param(); + plat_early_device_setup(); + sh_mv_setup(); /* From 28fde6863e6ed1f4bfb7cef080e67bf439c20628 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 15 Apr 2009 10:50:30 +0000 Subject: [PATCH 0474/2061] sh: Early Platform Data for SuperH Mobile Use plat_early_device_setup() to register Early Platform Data for SuperH Mobile processors. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 10 ++++++++++ arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 10 ++++++++++ arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 10 ++++++++++ arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 10 ++++++++++ 4 files changed, 40 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index c1549382c87..cb5b4db1ca2 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -234,6 +234,16 @@ static int __init sh7343_devices_setup(void) } __initcall(sh7343_devices_setup); +static struct platform_device *sh7343_early_devices[] __initdata = { + &cmt_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7343_early_devices, + ARRAY_SIZE(sh7343_early_devices)); +} + enum { UNUSED = 0, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 93ecf8ed5c6..2a771f48e9e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -226,6 +226,16 @@ static int __init sh7366_devices_setup(void) } __initcall(sh7366_devices_setup); +static struct platform_device *sh7366_early_devices[] __initdata = { + &cmt_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7366_early_devices, + ARRAY_SIZE(sh7366_early_devices)); +} + enum { UNUSED=0, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 0e5d204bc79..8f974d0494c 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -270,6 +270,16 @@ static int __init sh7722_devices_setup(void) } __initcall(sh7722_devices_setup); +static struct platform_device *sh7722_early_devices[] __initdata = { + &cmt_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7722_early_devices, + ARRAY_SIZE(sh7722_early_devices)); +} + enum { UNUSED=0, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index 5338dacbcfb..21a58d63a00 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -281,6 +281,16 @@ static int __init sh7723_devices_setup(void) } __initcall(sh7723_devices_setup); +static struct platform_device *sh7723_early_devices[] __initdata = { + &cmt_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7723_early_devices, + ARRAY_SIZE(sh7723_early_devices)); +} + enum { UNUSED=0, From d9aed8b95f062b2f96e37b0e07373f36a6e7066d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 19 Apr 2009 13:12:04 +0900 Subject: [PATCH 0475/2061] sh: sh7724: Don't default enable the RTC clock. rtc-sh takes care of this now, so no need to have this always enabled. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 8b87ba8f26b..195274a74f8 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -275,7 +275,6 @@ static struct platform_device *sh7724_devices[] __initdata = { static int __init sh7724_devices_setup(void) { - clk_always_enable("rtc0"); /* RTC */ clk_always_enable("vpu0"); /* VPU */ clk_always_enable("veu1"); /* VEU3F1 */ clk_always_enable("veu0"); /* VEU3F0 */ From 8fb2bae4b41eb64f6e233e9bd3f3a789fbb04a06 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 19 Apr 2009 13:14:29 +0900 Subject: [PATCH 0476/2061] sh: sh7724: Register CMT as an early platform device here too. Follows the SH7722/SH7723 changes. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 195274a74f8..8429396acb5 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -288,6 +288,16 @@ static int __init sh7724_devices_setup(void) } device_initcall(sh7724_devices_setup); +static struct platform_device *sh7724_early_devices[] __initdata = { + &cmt_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7724_early_devices, + ARRAY_SIZE(sh7724_early_devices)); +} + enum { UNUSED = 0, From e057a5e5647a1c9d0d0054fbd298bfa04b3d1cb4 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:38:12 +0200 Subject: [PATCH 0477/2061] tracing/core: Add current context on tracing recursion warning In case of tracing recursion detection, we only get the stacktrace. But the current context may be very useful to debug the issue. This patch adds the softirq/hardirq/nmi context with the warning using lockdep context display to have a familiar output. v2: Use printk_once() v3: drop {hardirq,softirq}_context which depend on lockdep, only keep what is part of current->trace_recursion, sufficient to debug the warning source. [ Impact: print context necessary to debug recursion ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index b421b0ea911..bffde630c4e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1495,6 +1495,13 @@ static int trace_recursive_lock(void) if (unlikely(current->trace_recursion & (1 << level))) { /* Disable all tracing before we do anything else */ tracing_off_permanent(); + + printk_once(KERN_WARNING "Tracing recursion: " + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); + WARN_ON_ONCE(1); return -1; } From 9ade1217c9ba39ad2f004a898ddfbb815fd5fe74 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 15:38:25 +0900 Subject: [PATCH 0478/2061] sh: pci: Drop asm-generic/pci.h, so we can use our own fixups. The new PCI code wants its own bus<->resource mappings instead of the generic equivalents, so drop the asm-generic include in preparation. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci.c | 14 ++++++++++++++ arch/sh/include/asm/pci.h | 27 ++++++++++++++++++++++++--- 2 files changed, 38 insertions(+), 3 deletions(-) diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 8aaacc99b71..6d659cd93c9 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -72,6 +72,20 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) pci_read_bridge_bases(bus); } +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + region->start = res->start; + region->end = res->end; +} + +void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + res->start = region->start; + res->end = region->end; +} + void pcibios_align_resource(void *data, struct resource *res, resource_size_t size, resource_size_t align) __attribute__ ((weak)); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index bb2c2fcddc9..69cb615c391 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -146,13 +146,34 @@ int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin); int pciauto_assign_resources(int busno, struct pci_channel *hose); #endif -#endif /* __KERNEL__ */ +extern void pcibios_resource_to_bus(struct pci_dev *dev, + struct pci_bus_region *region, struct resource *res); -/* generic pci stuff */ -#include +extern void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region); + +static inline struct resource * +pcibios_select_root(struct pci_dev *pdev, struct resource *res) +{ + struct resource *root = NULL; + + if (res->flags & IORESOURCE_IO) + root = &ioport_resource; + if (res->flags & IORESOURCE_MEM) + root = &iomem_resource; + + return root; +} + +/* Chances are this interrupt is wired PC-style ... */ +static inline int pci_get_legacy_ide_irq(struct pci_dev *dev, int channel) +{ + return channel ? 15 : 14; +} /* generic DMA-mapping stuff */ #include +#endif /* __KERNEL__ */ #endif /* __ASM_SH_PCI_H */ From 4c5107e44514a9bde74d0af77982705d602d9e39 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 15:43:36 +0900 Subject: [PATCH 0479/2061] sh: pci: Split out new-style PCI core. This splits off a 'pci-new.c' which is aimed at gradually replacing the pci-auto backend and the arch/sh/drivers/pci/pci.c core respectively. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 10 +- arch/sh/drivers/pci/Makefile | 4 +- arch/sh/drivers/pci/pci-new.c | 248 ++++++++++++++++++++++++++++++++++ 3 files changed, 258 insertions(+), 4 deletions(-) create mode 100644 arch/sh/drivers/pci/pci-new.c diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index 7e816ededed..efe8cf965a9 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -18,10 +18,17 @@ config SH_PCIDMA_NONCOHERENT bridge integrated with your SH CPU, refer carefully to the chip specs to see if you can say 'N' here. Otherwise, leave it as 'Y'. +# Temporary config option for transitioning off of PCI_AUTO +config PCI_NEW + bool + depends on PCI + default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ + CPU_SUBTYPE_SH7785 + # This is also board-specific config PCI_AUTO bool - depends on PCI + depends on PCI && !PCI_NEW default y config PCI_AUTO_UPDATE_RESOURCES @@ -34,4 +41,3 @@ config PCI_AUTO_UPDATE_RESOURCES for some reason, you have a board that simply refuses to work with its resources updated beyond what they are when the device is powered up, set this to N. Everyone else will want this as Y. - diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 5003971476e..362a1eec73a 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -2,8 +2,8 @@ # Makefile for the PCI specific kernel interface routines under Linux. # -obj-y += pci.o -obj-$(CONFIG_PCI_AUTO) += pci-auto.o +obj-$(CONFIG_PCI_AUTO) := pci.o pci-auto.o +obj-$(CONFIG_PCI_NEW) := pci-new.o obj-$(CONFIG_CPU_SUBTYPE_SH7751) += pci-sh7751.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7751R) += pci-sh7751.o ops-sh4.o diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c new file mode 100644 index 00000000000..097eb881112 --- /dev/null +++ b/arch/sh/drivers/pci/pci-new.c @@ -0,0 +1,248 @@ +/* + * New-style PCI core. + * + * Copyright (c) 2002 M. R. Brown + * Copyright (c) 2004 - 2009 Paul Mundt + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include +#include +#include +#include + +static int __init pcibios_init(void) +{ + struct pci_channel *p; + struct pci_bus *bus; + int busno; + + /* init channels */ + busno = 0; + for (p = board_pci_channels; p->init; p++) { + if (p->init(p) == 0) + p->enabled = 1; + else + pr_err("Unable to init pci channel %d\n", busno); + busno++; + } + + /* scan the buses */ + busno = 0; + for (p = board_pci_channels; p->init; p++) { + if (p->enabled) { + bus = pci_scan_bus(busno, p->pci_ops, p); + busno = bus->subordinate + 1; + + pci_bus_size_bridges(bus); + pci_bus_assign_resources(bus); + pci_enable_bridges(bus); + } + } + + pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq); + + dma_debug_add_bus(&pci_bus_type); + + return 0; +} +subsys_initcall(pcibios_init); + +static void pcibios_fixup_device_resources(struct pci_dev *dev, + struct pci_bus *bus) +{ + /* Update device resources. */ + struct pci_channel *chan = bus->sysdata; + unsigned long offset = 0; + int i; + + for (i = 0; i < PCI_NUM_RESOURCES; i++) { + if (!dev->resource[i].start) + continue; + if (dev->resource[i].flags & IORESOURCE_PCI_FIXED) + continue; + if (dev->resource[i].flags & IORESOURCE_IO) + offset = chan->io_base; + else if (dev->resource[i].flags & IORESOURCE_MEM) + offset = 0; + + dev->resource[i].start += offset; + dev->resource[i].end += offset; + } +} + + +/* + * Called after each bus is probed, but before its children + * are examined. + */ +void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) +{ + struct pci_dev *dev = bus->self; + struct list_head *ln; + struct pci_channel *chan = bus->sysdata; + + if (!dev) { + bus->resource[0] = chan->io_resource; + bus->resource[1] = chan->mem_resource; + } + + for (ln = bus->devices.next; ln != &bus->devices; ln = ln->next) { + dev = pci_dev_b(ln); + + if ((dev->class >> 8) != PCI_CLASS_BRIDGE_PCI) + pcibios_fixup_device_resources(dev, bus); + } +} + +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + struct pci_channel *chan = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = chan->io_base; + else if (res->flags & IORESOURCE_MEM) + offset = 0; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void __devinit +pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + struct pci_channel *chan = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = chan->io_base; + else if (res->flags & IORESOURCE_MEM) + offset = 0; + + res->start = region->start + offset; + res->end = region->end + offset; +} + +void pcibios_align_resource(void *data, struct resource *res, + resource_size_t size, resource_size_t align) + __attribute__ ((weak)); + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + */ +void pcibios_align_resource(void *data, struct resource *res, + resource_size_t size, resource_size_t align) +{ + if (res->flags & IORESOURCE_IO) { + resource_size_t start = res->start; + + if (start & 0x300) { + start = (start + 0x3ff) & ~0x3ff; + res->start = start; + } + } +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, old_cmd; + int idx; + struct resource *r; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + old_cmd = cmd; + for(idx=0; idx<6; idx++) { + if (!(mask & (1 << idx))) + continue; + r = &dev->resource[idx]; + if (!r->start && r->end) { + printk(KERN_ERR "PCI: Device %s not available because " + "of resource collisions\n", pci_name(dev)); + return -EINVAL; + } + if (r->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (r->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + if (dev->resource[PCI_ROM_RESOURCE].start) + cmd |= PCI_COMMAND_MEMORY; + if (cmd != old_cmd) { + printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n", + pci_name(dev), old_cmd, cmd); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * If we set up a device for bus mastering, we need to check and set + * the latency timer as it may not be properly set. + */ +static unsigned int pcibios_max_latency = 255; + +void pcibios_set_master(struct pci_dev *dev) +{ + u8 lat; + pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); + if (lat < 16) + lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; + else if (lat > pcibios_max_latency) + lat = pcibios_max_latency; + else + return; + printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", + pci_name(dev), lat); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); +} + +void __init pcibios_update_irq(struct pci_dev *dev, int irq) +{ + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + resource_size_t start = pci_resource_start(dev, bar); + resource_size_t len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (unlikely(!len || !start)) + return NULL; + if (maxlen && len > maxlen) + len = maxlen; + + /* + * Presently the IORESOURCE_MEM case is a bit special, most + * SH7751 style PCI controllers have PCI memory at a fixed + * location in the address space where no remapping is desired. + * With the IORESOURCE_MEM case more care has to be taken + * to inhibit page table mapping for legacy cores, but this is + * punted off to __ioremap(). + * -- PFM. + */ + if (flags & IORESOURCE_IO) + return ioport_map(start, len); + if (flags & IORESOURCE_MEM) + return ioremap(start, len); + + return NULL; +} +EXPORT_SYMBOL(pci_iomap); + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + iounmap(addr); +} +EXPORT_SYMBOL(pci_iounmap); + +EXPORT_SYMBOL(board_pci_channels); From 9833385131fc4e8c52f95320ab899051d1c06831 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 15:51:45 +0900 Subject: [PATCH 0480/2061] sh: pci: HAVE_PCI_MMAP support. Derived from the MIPS version, now uses pgprot_noncached(). Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 6 +++--- arch/sh/drivers/pci/pci-lib.c | 26 ++++++++++++++++++++++++++ arch/sh/include/asm/pci.h | 3 +++ 3 files changed, 32 insertions(+), 3 deletions(-) create mode 100644 arch/sh/drivers/pci/pci-lib.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 362a1eec73a..c8eab14843e 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -1,9 +1,9 @@ # # Makefile for the PCI specific kernel interface routines under Linux. # - -obj-$(CONFIG_PCI_AUTO) := pci.o pci-auto.o -obj-$(CONFIG_PCI_NEW) := pci-new.o +obj-y += pci-lib.o +obj-$(CONFIG_PCI_AUTO) += pci.o pci-auto.o +obj-$(CONFIG_PCI_NEW) += pci-new.o obj-$(CONFIG_CPU_SUBTYPE_SH7751) += pci-sh7751.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7751R) += pci-sh7751.o ops-sh4.o diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c new file mode 100644 index 00000000000..1a43a350d57 --- /dev/null +++ b/arch/sh/drivers/pci/pci-lib.c @@ -0,0 +1,26 @@ +#include +#include +#include +#include +#include + +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + /* + * I/O space can be accessed via normal processor loads and stores on + * this platform but for now we elect not to do this and portable + * drivers should not do this anyway. + */ + if (mmap_state == pci_mmap_io) + return -EINVAL; + + /* + * Ignore write-combine; for now only return uncached mappings. + */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 69cb615c391..46afd449739 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -40,6 +40,9 @@ extern struct pci_channel board_pci_channels[]; struct pci_dev; +#define HAVE_PCI_MMAP +extern int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine); extern void pcibios_set_master(struct pci_dev *dev); static inline void pcibios_penalize_isa_irq(int irq, int active) From a3c0e0d0032d5bbfd7dc04827a257c717d432a5b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 16:14:29 +0900 Subject: [PATCH 0481/2061] sh: pci: Consolidate pcibios_align_resource() definitions. This introduces a saner pcibios_align_resource() that can be used regardless of whether pci-auto or pci-new are being used, and consolidates it in pci-lib.c. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7751/pci.c | 9 ++++--- arch/sh/drivers/pci/pci-lib.c | 42 +++++++++++++++++++++++++++++++ arch/sh/drivers/pci/pci-new.c | 23 ----------------- arch/sh/drivers/pci/pci.c | 23 ----------------- arch/sh/include/asm/pci.h | 5 +--- 5 files changed, 49 insertions(+), 53 deletions(-) diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c index 203b2923fe7..9ec64a416b3 100644 --- a/arch/sh/boards/mach-se/7751/pci.c +++ b/arch/sh/boards/mach-se/7751/pci.c @@ -30,6 +30,9 @@ #define PCIC_WRITE(x,v) writel((v), PCI_REG(x)) #define PCIC_READ(x) readl(PCI_REG(x)) +#define xPCIBIOS_MIN_IO board_pci_channels->io_resource->start +#define xPCIBIOS_MIN_MEM board_pci_channels->mem_resource->start + /* * Description: This function sets up and initializes the pcic, sets * up the BARS, maps the DRAM into the address space etc, etc. @@ -97,12 +100,12 @@ int __init pcibios_init_platform(void) * meaning all calls go straight through... use BUG_ON to * catch erroneous assumption. */ - BUG_ON(PCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE); + BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE); - PCIC_WRITE(SH7751_PCIMBR, PCIBIOS_MIN_MEM); + PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM); /* Set IOBR for window containing area specified in pci.h */ - PCIC_WRITE(SH7751_PCIIOBR, (PCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK)); + PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK)); /* All done, may as well say so... */ printk("SH7751 PCI: Finished initialization of the PCI controller\n"); diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c index 1a43a350d57..8ab1a2d1b48 100644 --- a/arch/sh/drivers/pci/pci-lib.c +++ b/arch/sh/drivers/pci/pci-lib.c @@ -4,6 +4,41 @@ #include #include +unsigned long PCIBIOS_MIN_IO = 0x0000; +unsigned long PCIBIOS_MIN_MEM = 0; + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + */ +void pcibios_align_resource(void *data, struct resource *res, + resource_size_t size, resource_size_t align) +{ + struct pci_dev *dev = data; + struct pci_channel *chan = dev->sysdata; + resource_size_t start = res->start; + + if (res->flags & IORESOURCE_IO) { + if (start < PCIBIOS_MIN_IO + chan->io_resource->start) + start = PCIBIOS_MIN_IO + chan->io_resource->start; + + /* + * Put everything into 0x00-0xff region modulo 0x400. + */ + if (start & 0x300) { + start = (start + 0x3ff) & ~0x3ff; + res->start = start; + } + } else if (res->flags & IORESOURCE_MEM) { + if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start) + start = PCIBIOS_MIN_MEM + chan->mem_resource->start; + } + + res->start = start; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { @@ -24,3 +59,10 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, vma->vm_end - vma->vm_start, vma->vm_page_prot); } + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pcibios_resource_to_bus); +EXPORT_SYMBOL(pcibios_bus_to_resource); +EXPORT_SYMBOL(PCIBIOS_MIN_IO); +EXPORT_SYMBOL(PCIBIOS_MIN_MEM); +#endif diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index 097eb881112..4e9251f3d09 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -129,29 +129,6 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, res->end = region->end + offset; } -void pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) - __attribute__ ((weak)); - -/* - * We need to avoid collisions with `mirrored' VGA ports - * and other strange ISA hardware, so we always want the - * addresses to be allocated in the 0x000-0x0ff region - * modulo 0x400. - */ -void pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) -{ - if (res->flags & IORESOURCE_IO) { - resource_size_t start = res->start; - - if (start & 0x300) { - start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } - } -} - int pcibios_enable_device(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index 6d659cd93c9..f670988e033 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -86,29 +86,6 @@ void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, res->end = region->end; } -void pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) - __attribute__ ((weak)); - -/* - * We need to avoid collisions with `mirrored' VGA ports - * and other strange ISA hardware, so we always want the - * addresses to be allocated in the 0x000-0x0ff region - * modulo 0x400. - */ -void pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) -{ - if (res->flags & IORESOURCE_IO) { - resource_size_t start = res->start; - - if (start & 0x300) { - start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } - } -} - int pcibios_enable_device(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 46afd449739..5212bf6dd4b 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -33,10 +33,7 @@ struct pci_channel { */ extern struct pci_channel board_pci_channels[]; -/* ugly as hell, but makes drivers/pci/setup-res.c compile and work */ -#define __PCI_CHAN(bus) ((struct pci_channel *)bus->sysdata) -#define PCIBIOS_MIN_IO __PCI_CHAN(bus)->io_resource->start -#define PCIBIOS_MIN_MEM __PCI_CHAN(bus)->mem_resource->start +extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM; struct pci_dev; From 394b6d2fe624246e258a218dac68d44fe9a8411f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 16:18:46 +0900 Subject: [PATCH 0482/2061] sh: pci: Kill off unused pcibios_fixup(). This is left over cruft that hasn't been used by anything in a long time, kill off bits that weren't purged previously. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-snapgear.c | 5 ----- arch/sh/include/asm/pci.h | 1 - 2 files changed, 6 deletions(-) diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c index dd2c5df2830..b64f2b91be8 100644 --- a/arch/sh/drivers/pci/ops-snapgear.c +++ b/arch/sh/drivers/pci/ops-snapgear.c @@ -85,8 +85,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return irq; } - -void __init pcibios_fixup(void) -{ - /* Nothing to fixup .. */ -} diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 5212bf6dd4b..e8265fd0bb6 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -138,7 +138,6 @@ static inline void __iomem *__get_pci_io_base(unsigned long port, #endif /* Board-specific fixup routines. */ -void pcibios_fixup(void); int pcibios_init_platform(void); int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin); From 0bb34a6bf1f71d5ad2abfda582a2c2794957bc7b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 16:38:00 +0900 Subject: [PATCH 0483/2061] sh: pci: Consolidate pci_iomap() and use the generic I/O base. This consolidates the pci_iomap() definitions and reworks how the I/O port base is handled. PCI channels can register their own I/O map base, or if none is provided, the system-wide generic I/O base is used instead. Functionally nothing changes, while this allows us to kill off lots of I/O address special casing and lookups. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-lib.c | 51 +++++++++++++++++++++++++++++++++++ arch/sh/drivers/pci/pci-new.c | 35 ------------------------ arch/sh/drivers/pci/pci.c | 35 ------------------------ arch/sh/include/asm/pci.h | 22 ++------------- arch/sh/kernel/io.c | 4 --- 5 files changed, 53 insertions(+), 94 deletions(-) diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c index 8ab1a2d1b48..654ffcc67d0 100644 --- a/arch/sh/drivers/pci/pci-lib.c +++ b/arch/sh/drivers/pci/pci-lib.c @@ -60,6 +60,57 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, vma->vm_page_prot); } +static void __iomem *ioport_map_pci(struct pci_dev *dev, + unsigned long port, unsigned int nr) +{ + struct pci_channel *chan = dev->sysdata; + + if (!chan->io_map_base) + chan->io_map_base = generic_io_base; + + return (void __iomem *)(chan->io_map_base + port); +} + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + resource_size_t start = pci_resource_start(dev, bar); + resource_size_t len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (unlikely(!len || !start)) + return NULL; + if (maxlen && len > maxlen) + len = maxlen; + + if (flags & IORESOURCE_IO) + return ioport_map_pci(dev, start, len); + + /* + * Presently the IORESOURCE_MEM case is a bit special, most + * SH7751 style PCI controllers have PCI memory at a fixed + * location in the address space where no remapping is desired. + * With the IORESOURCE_MEM case more care has to be taken + * to inhibit page table mapping for legacy cores, but this is + * punted off to __ioremap(). + * -- PFM. + */ + if (flags & IORESOURCE_MEM) { + if (flags & IORESOURCE_CACHEABLE) + return ioremap(start, len); + + return ioremap_nocache(start, len); + } + + return NULL; +} +EXPORT_SYMBOL(pci_iomap); + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + iounmap(addr); +} +EXPORT_SYMBOL(pci_iounmap); + #ifdef CONFIG_HOTPLUG EXPORT_SYMBOL(pcibios_resource_to_bus); EXPORT_SYMBOL(pcibios_bus_to_resource); diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index 4e9251f3d09..c92e65045c6 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -187,39 +187,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ - resource_size_t start = pci_resource_start(dev, bar); - resource_size_t len = pci_resource_len(dev, bar); - unsigned long flags = pci_resource_flags(dev, bar); - - if (unlikely(!len || !start)) - return NULL; - if (maxlen && len > maxlen) - len = maxlen; - - /* - * Presently the IORESOURCE_MEM case is a bit special, most - * SH7751 style PCI controllers have PCI memory at a fixed - * location in the address space where no remapping is desired. - * With the IORESOURCE_MEM case more care has to be taken - * to inhibit page table mapping for legacy cores, but this is - * punted off to __ioremap(). - * -- PFM. - */ - if (flags & IORESOURCE_IO) - return ioport_map(start, len); - if (flags & IORESOURCE_MEM) - return ioremap(start, len); - - return NULL; -} -EXPORT_SYMBOL(pci_iomap); - -void pci_iounmap(struct pci_dev *dev, void __iomem *addr) -{ - iounmap(addr); -} -EXPORT_SYMBOL(pci_iounmap); - EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index f670988e033..d39f24091ad 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -144,39 +144,4 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ - resource_size_t start = pci_resource_start(dev, bar); - resource_size_t len = pci_resource_len(dev, bar); - unsigned long flags = pci_resource_flags(dev, bar); - - if (unlikely(!len || !start)) - return NULL; - if (maxlen && len > maxlen) - len = maxlen; - - /* - * Presently the IORESOURCE_MEM case is a bit special, most - * SH7751 style PCI controllers have PCI memory at a fixed - * location in the address space where no remapping is desired. - * With the IORESOURCE_MEM case more care has to be taken - * to inhibit page table mapping for legacy cores, but this is - * punted off to __ioremap(). - * -- PFM. - */ - if (flags & IORESOURCE_IO) - return ioport_map(start, len); - if (flags & IORESOURCE_MEM) - return ioremap(start, len); - - return NULL; -} -EXPORT_SYMBOL(pci_iomap); - -void pci_iounmap(struct pci_dev *dev, void __iomem *addr) -{ - iounmap(addr); -} -EXPORT_SYMBOL(pci_iounmap); - EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index e8265fd0bb6..53242828977 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -26,6 +26,8 @@ struct pci_channel { int enabled; unsigned long reg_base; unsigned long io_base; + + unsigned long io_map_base; }; /* @@ -110,31 +112,11 @@ static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) } return 0; } - -static inline void __iomem *__get_pci_io_base(unsigned long port, - unsigned long size) -{ - struct pci_channel *p; - struct resource *res; - - for (p = board_pci_channels; p->init; p++) { - res = p->io_resource; - if (p->enabled && (port >= res->start) && - (port + size) <= (res->end + 1)) - return (void __iomem *)(p->io_base + port); - } - return NULL; -} #else static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) { return 0; } -static inline void __iomem *__get_pci_io_base(unsigned long port, - unsigned long size) -{ - return NULL; -} #endif /* Board-specific fixup routines. */ diff --git a/arch/sh/kernel/io.c b/arch/sh/kernel/io.c index 59fb020718a..4f85fffaa55 100644 --- a/arch/sh/kernel/io.c +++ b/arch/sh/kernel/io.c @@ -70,10 +70,6 @@ void __iomem *ioport_map(unsigned long port, unsigned int nr) if (ret) return ret; - ret = __get_pci_io_base(port, nr); - if (ret) - return ret; - return __ioport_map(port, nr); } EXPORT_SYMBOL(ioport_map); From f3b9aae16219aaeca2dd5a9ca69f7a10faa063df Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 23:39:33 +0200 Subject: [PATCH 0484/2061] tracing/ring-buffer: Add unlock recursion protection on discard The pair of helpers trace_recursive_lock() and trace_recursive_unlock() have been introduced recently to provide generic tracing recursion protection. They are used in a symetric way: - trace_recursive_lock() on buffer reserve - trace_recursive_unlock() on buffer commit However sometimes, we don't commit but discard on entry to the buffer, ie: in case of filter checking. Then we must also unlock the recursion protection on discard time, otherwise the tracing gets definitely deactivated and a warning is raised spuriously, such as: 111.119821] ------------[ cut here ]------------ [ 111.119829] WARNING: at kernel/trace/ring_buffer.c:1498 ring_buffer_lock_reserve+0x1b7/0x1d0() [ 111.119835] Hardware name: AMILO Li 2727 [ 111.119839] Modules linked in: [ 111.119846] Pid: 5731, comm: Xorg Tainted: G W 2.6.30-rc1 #69 [ 111.119851] Call Trace: [ 111.119863] [] warn_slowpath+0xd8/0x130 [ 111.119873] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119882] [] ? __lock_acquire+0x19f/0x1ae0 [ 111.119891] [] ? native_sched_clock+0x20/0x70 [ 111.119899] [] ? put_lock_stats+0xe/0x30 [ 111.119906] [] ? lock_release_holdtime+0xa8/0x150 [ 111.119913] [] ring_buffer_lock_reserve+0x1b7/0x1d0 [ 111.119921] [] trace_buffer_lock_reserve+0x30/0x70 [ 111.119930] [] trace_current_buffer_lock_reserve+0x20/0x30 [ 111.119939] [] ftrace_raw_event_sched_switch+0x58/0x100 [ 111.119948] [] __schedule+0x3a7/0x4cd [ 111.119957] [] ? ftrace_call+0x5/0x2b [ 111.119964] [] ? ftrace_call+0x5/0x2b [ 111.119971] [] schedule+0x18/0x40 [ 111.119977] [] preempt_schedule+0x39/0x60 [ 111.119985] [] _read_unlock+0x53/0x60 [ 111.119993] [] sock_def_readable+0x72/0x80 [ 111.120002] [] unix_stream_sendmsg+0x24d/0x3d0 [ 111.120011] [] sock_aio_write+0x143/0x160 [ 111.120019] [] ? ftrace_call+0x5/0x2b [ 111.120026] [] ? sock_aio_write+0x0/0x160 [ 111.120033] [] ? sock_aio_write+0x0/0x160 [ 111.120042] [] do_sync_readv_writev+0xf3/0x140 [ 111.120049] [] ? ftrace_call+0x5/0x2b [ 111.120057] [] ? autoremove_wake_function+0x0/0x40 [ 111.120067] [] ? cap_file_permission+0x9/0x10 [ 111.120074] [] ? security_file_permission+0x16/0x20 [ 111.120082] [] do_readv_writev+0xd4/0x1f0 [ 111.120089] [] ? ftrace_call+0x5/0x2b [ 111.120097] [] ? ftrace_call+0x5/0x2b [ 111.120105] [] vfs_writev+0x48/0x70 [ 111.120111] [] sys_writev+0x55/0xc0 [ 111.120119] [] system_call_fastpath+0x16/0x1b [ 111.120125] ---[ end trace 15605f4e98d5ccb5 ]--- [ Impact: fix spurious warning triggering tracing shutdown ] Signed-off-by: Frederic Weisbecker --- kernel/trace/ring_buffer.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index bffde630c4e..e145969a8ed 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1642,6 +1642,14 @@ int ring_buffer_unlock_commit(struct ring_buffer *buffer, } EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); +static inline void rb_event_discard(struct ring_buffer_event *event) +{ + event->type = RINGBUF_TYPE_PADDING; + /* time delta must be non zero */ + if (!event->time_delta) + event->time_delta = 1; +} + /** * ring_buffer_event_discard - discard any event in the ring buffer * @event: the event to discard @@ -1656,10 +1664,8 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); */ void ring_buffer_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; - /* time delta must be non zero */ - if (!event->time_delta) - event->time_delta = 1; + rb_event_discard(event); + trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); @@ -1690,7 +1696,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, int cpu; /* The event is discarded regardless */ - ring_buffer_event_discard(event); + rb_event_discard(event); /* * This must only be called if the event has not been @@ -1735,6 +1741,8 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, if (rb_is_commit(cpu_buffer, event)) rb_set_commit_to_write(cpu_buffer); + trace_recursive_unlock(); + /* * Only the last preempt count needs to restore preemption. */ From 99f95f117848088f2708b45c70be73152e78bb8a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 18:24:57 +0900 Subject: [PATCH 0485/2061] sh: pci: Rework fixed region checks in ioremap(). Not all PCI channels have non-translatable memory windows, this is a special property of the on-chip PCIC with its 0xfd00... mapping, handle this explicitly. Signed-off-by: Paul Mundt --- arch/sh/include/asm/pci.h | 27 ++++++++++----------------- arch/sh/mm/ioremap_32.c | 14 +++++++------- 2 files changed, 17 insertions(+), 24 deletions(-) diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 53242828977..82a9369511b 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -90,7 +90,6 @@ static inline void pcibios_penalize_isa_irq(int irq, int active) #define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif -#ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, unsigned long *strategy_parameter) @@ -99,24 +98,18 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, *strategy_parameter = ~0UL; } -static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) -{ - struct pci_channel *p; - struct resource *res; +#ifdef CONFIG_SUPERH32 +/* + * If we're on an SH7751 or SH7780 PCI controller, PCI memory is mapped + * at the end of the address space in a special non-translatable area. + */ +#define PCI_MEM_FIXED_START 0xfd000000 +#define PCI_MEM_FIXED_END (PCI_MEM_FIXED_START + 0x01000000) - for (p = board_pci_channels; p->init; p++) { - res = p->mem_resource; - if (p->enabled && (phys_addr >= res->start) && - (phys_addr + size) <= (res->end + 1)) - return 1; - } - return 0; -} +#define is_pci_memory_fixed_range(s, e) \ + ((s) >= PCI_MEM_FIXED_START && (e) < PCI_MEM_FIXED_END) #else -static inline int __is_pci_memory(unsigned long phys_addr, unsigned long size) -{ - return 0; -} +#define is_pci_memory_fixed_range(s, e) (0) #endif /* Board-specific fixup routines. */ diff --git a/arch/sh/mm/ioremap_32.c b/arch/sh/mm/ioremap_32.c index 7e04cc8f3b9..da2f4186f2c 100644 --- a/arch/sh/mm/ioremap_32.c +++ b/arch/sh/mm/ioremap_32.c @@ -46,17 +46,15 @@ void __iomem *__ioremap(unsigned long phys_addr, unsigned long size, return NULL; /* - * If we're on an SH7751 or SH7780 PCI controller, PCI memory is - * mapped at the end of the address space (typically 0xfd000000) - * in a non-translatable area, so mapping through page tables for - * this area is not only pointless, but also fundamentally - * broken. Just return the physical address instead. + * If we're in the fixed PCI memory range, mapping through page + * tables is not only pointless, but also fundamentally broken. + * Just return the physical address instead. * * For boards that map a small PCI memory aperture somewhere in * P1/P2 space, ioremap() will already do the right thing, * and we'll never get this far. */ - if (__is_pci_memory(phys_addr, size)) + if (is_pci_memory_fixed_range(phys_addr, size)) return (void __iomem *)phys_addr; #if !defined(CONFIG_PMB_FIXED) @@ -121,7 +119,9 @@ void __iounmap(void __iomem *addr) unsigned long seg = PXSEG(vaddr); struct vm_struct *p; - if (seg < P3SEG || vaddr >= P3_ADDR_MAX || __is_pci_memory(vaddr, 0)) + if (seg < P3SEG || vaddr >= P3_ADDR_MAX) + return; + if (is_pci_memory_fixed_range(vaddr, 0)) return; #ifdef CONFIG_PMB From e79066a659b893debe19010179d3f3f015d76d1c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 18:29:22 +0900 Subject: [PATCH 0486/2061] sh: pci: New-style controller registration. This moves off of the board_pci_channels[] approach for bus registration and over to a cleaner register_pci_controller(), all derived from the MIPS code. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-new.c | 103 ++++++++++++++++++++++--------- arch/sh/drivers/pci/pci-sh7780.c | 87 ++++++++++++-------------- arch/sh/include/asm/pci.h | 27 +++++--- 3 files changed, 132 insertions(+), 85 deletions(-) diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index c92e65045c6..78b7292c6aa 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -13,40 +13,90 @@ #include #include #include +#include + +/* + * The PCI controller list. + */ +static struct pci_channel *hose_head, **hose_tail = &hose_head; + +static int pci_initialized; + +static void __devinit pcibios_scanbus(struct pci_channel *hose) +{ + static int next_busno; + struct pci_bus *bus; + + /* Catch botched conversion attempts */ + BUG_ON(hose->init); + + bus = pci_scan_bus(next_busno, hose->pci_ops, hose); + if (bus) { + next_busno = bus->subordinate + 1; + /* Don't allow 8-bit bus number overflow inside the hose - + reserve some space for bridges. */ + if (next_busno > 224) + next_busno = 0; + + pci_bus_size_bridges(bus); + pci_bus_assign_resources(bus); + pci_enable_bridges(bus); + } +} + +static DEFINE_MUTEX(pci_scan_mutex); + +void __devinit register_pci_controller(struct pci_channel *hose) +{ + if (request_resource(&iomem_resource, hose->mem_resource) < 0) + goto out; + if (request_resource(&ioport_resource, hose->io_resource) < 0) { + release_resource(hose->mem_resource); + goto out; + } + + *hose_tail = hose; + hose_tail = &hose->next; + + /* + * Do not panic here but later - this might hapen before console init. + */ + if (!hose->io_map_base) { + printk(KERN_WARNING + "registering PCI controller with io_map_base unset\n"); + } + + /* + * Scan the bus if it is register after the PCI subsystem + * initialization. + */ + if (pci_initialized) { + mutex_lock(&pci_scan_mutex); + pcibios_scanbus(hose); + mutex_unlock(&pci_scan_mutex); + } + + return; + +out: + printk(KERN_WARNING + "Skipping PCI bus scan due to resource conflict\n"); +} static int __init pcibios_init(void) { - struct pci_channel *p; - struct pci_bus *bus; - int busno; + struct pci_channel *hose; - /* init channels */ - busno = 0; - for (p = board_pci_channels; p->init; p++) { - if (p->init(p) == 0) - p->enabled = 1; - else - pr_err("Unable to init pci channel %d\n", busno); - busno++; - } - - /* scan the buses */ - busno = 0; - for (p = board_pci_channels; p->init; p++) { - if (p->enabled) { - bus = pci_scan_bus(busno, p->pci_ops, p); - busno = bus->subordinate + 1; - - pci_bus_size_bridges(bus); - pci_bus_assign_resources(bus); - pci_enable_bridges(bus); - } - } + /* Scan all of the recorded PCI controllers. */ + for (hose = hose_head; hose; hose = hose->next) + pcibios_scanbus(hose); pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq); dma_debug_add_bus(&pci_bus_type); + pci_initialized = 1; + return 0; } subsys_initcall(pcibios_init); @@ -74,7 +124,6 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev, } } - /* * Called after each bus is probed, but before its children * are examined. @@ -186,5 +235,3 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) { pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } - -EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index f02d9dfcf25..4dd6e3b94a6 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -15,11 +15,47 @@ #include #include "pci-sh4.h" -static int __init sh7780_pci_init(struct pci_channel *chan) +extern u8 pci_cache_line_size; + +static struct resource sh7785_io_resource = { + .name = "SH7785_IO", + .start = SH7780_PCI_IO_BASE, + .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, + .flags = IORESOURCE_IO +}; + +static struct resource sh7785_mem_resource = { + .name = "SH7785_mem", + .start = SH7780_PCI_MEMORY_BASE, + .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, + .flags = IORESOURCE_MEM +}; + +static struct pci_channel sh7780_pci_controller = { + .pci_ops = &sh4_pci_ops, + .mem_resource = &sh7785_mem_resource, + .io_resource = &sh7785_io_resource, +}; + +static struct sh4_pci_address_map sh7780_pci_map = { + .window0 = { +#if defined(CONFIG_32BIT) + .base = SH7780_32BIT_DDR_BASE_ADDR, + .size = 0x40000000, +#else + .base = SH7780_CS0_BASE_ADDR, + .size = 0x20000000, +#endif + }, +}; + +static int __init sh7780_pci_init(void) { + struct pci_channel *chan = &sh7780_pci_controller; unsigned int id; const char *type = NULL; int ret; + u32 word; printk(KERN_NOTICE "PCI: Starting intialization.\n"); @@ -54,52 +90,6 @@ static int __init sh7780_pci_init(struct pci_channel *chan) if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; - /* - * Platform specific initialization (BSC registers, and memory space - * mapping) will be called via the platform defined function - * pcibios_init_platform(). - */ - return pcibios_init_platform(); -} - -extern u8 pci_cache_line_size; - -static struct resource sh7785_io_resource = { - .name = "SH7785_IO", - .start = SH7780_PCI_IO_BASE, - .end = SH7780_PCI_IO_BASE + SH7780_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7785_mem_resource = { - .name = "SH7785_mem", - .start = SH7780_PCI_MEMORY_BASE, - .end = SH7780_PCI_MEMORY_BASE + SH7780_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -struct pci_channel board_pci_channels[] = { - { sh7780_pci_init, &sh4_pci_ops, &sh7785_io_resource, &sh7785_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - -static struct sh4_pci_address_map sh7780_pci_map = { - .window0 = { -#if defined(CONFIG_32BIT) - .base = SH7780_32BIT_DDR_BASE_ADDR, - .size = 0x40000000, -#else - .base = SH7780_CS0_BASE_ADDR, - .size = 0x20000000, -#endif - }, -}; - -int __init pcibios_init_platform(void) -{ - struct pci_channel *chan = &board_pci_channels[0]; - u32 word; - /* * Set the class and sub-class codes. */ @@ -153,5 +143,8 @@ int __init pcibios_init_platform(void) __set_io_port_base(SH7780_PCI_IO_BASE); + register_pci_controller(chan); + return 0; } +arch_initcall(sh7780_pci_init); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 82a9369511b..e057ebdb461 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -17,17 +17,22 @@ * external) PCI controllers. */ struct pci_channel { - int (*init)(struct pci_channel *chan); - struct pci_ops *pci_ops; - struct resource *io_resource; - struct resource *mem_resource; - int first_devfn; - int last_devfn; - int enabled; - unsigned long reg_base; - unsigned long io_base; + struct pci_channel *next; - unsigned long io_map_base; + int (*init)(struct pci_channel *chan); + + struct pci_ops *pci_ops; + struct resource *io_resource; + struct resource *mem_resource; + + int first_devfn; + int last_devfn; + int enabled; + + unsigned long reg_base; + unsigned long io_base; + + unsigned long io_map_base; }; /* @@ -35,6 +40,8 @@ struct pci_channel { */ extern struct pci_channel board_pci_channels[]; +extern void register_pci_controller(struct pci_channel *hose); + extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM; struct pci_dev; From 09cfeb133e3cac39b8b9a2cb1d8ab4f77e396248 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 18:42:00 +0900 Subject: [PATCH 0487/2061] sh: pci: Track io and mem_offset per-channel. This implements a per-hose offset for I/O and mem resources. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-new.c | 18 +++++++++--------- arch/sh/drivers/pci/pci-sh7780.c | 2 ++ arch/sh/include/asm/pci.h | 3 +++ 3 files changed, 14 insertions(+), 9 deletions(-) diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index 78b7292c6aa..e8ac8daafc3 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -105,7 +105,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev, struct pci_bus *bus) { /* Update device resources. */ - struct pci_channel *chan = bus->sysdata; + struct pci_channel *hose = bus->sysdata; unsigned long offset = 0; int i; @@ -115,9 +115,9 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev, if (dev->resource[i].flags & IORESOURCE_PCI_FIXED) continue; if (dev->resource[i].flags & IORESOURCE_IO) - offset = chan->io_base; + offset = hose->io_offset; else if (dev->resource[i].flags & IORESOURCE_MEM) - offset = 0; + offset = hose->mem_offset; dev->resource[i].start += offset; dev->resource[i].end += offset; @@ -150,13 +150,13 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, struct resource *res) { - struct pci_channel *chan = dev->sysdata; + struct pci_channel *hose = dev->sysdata; unsigned long offset = 0; if (res->flags & IORESOURCE_IO) - offset = chan->io_base; + offset = hose->io_offset; else if (res->flags & IORESOURCE_MEM) - offset = 0; + offset = hose->mem_offset; region->start = res->start - offset; region->end = res->end - offset; @@ -166,13 +166,13 @@ void __devinit pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, struct pci_bus_region *region) { - struct pci_channel *chan = dev->sysdata; + struct pci_channel *hose = dev->sysdata; unsigned long offset = 0; if (res->flags & IORESOURCE_IO) - offset = chan->io_base; + offset = hose->io_offset; else if (res->flags & IORESOURCE_MEM) - offset = 0; + offset = hose->mem_offset; res->start = region->start + offset; res->end = region->end + offset; diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 4dd6e3b94a6..57a3b870a27 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -34,7 +34,9 @@ static struct resource sh7785_mem_resource = { static struct pci_channel sh7780_pci_controller = { .pci_ops = &sh4_pci_ops, .mem_resource = &sh7785_mem_resource, + .mem_offset = 0x00000000, .io_resource = &sh7785_io_resource, + .io_offset = 0x00000000, }; static struct sh4_pci_address_map sh7780_pci_map = { diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index e057ebdb461..0be20521a1f 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -25,6 +25,9 @@ struct pci_channel { struct resource *io_resource; struct resource *mem_resource; + unsigned long io_offset; + unsigned long mem_offset; + int first_devfn; int last_devfn; int enabled; From 5160d3f782a5e0cdb3bdaa8a891a1fb9d9ab83ec Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 18:47:21 +0900 Subject: [PATCH 0488/2061] sh: pci: Consolidate bus<->resource mapping in pci-lib. Now that the io and mem offsets are tracked accordingly, the pci-new version of the bus<->resource mappers can be used generically. This moves them in to pci-lib. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-lib.c | 31 +++++++++++++++++++++++++++++++ arch/sh/drivers/pci/pci-new.c | 31 ------------------------------- arch/sh/drivers/pci/pci.c | 14 -------------- 3 files changed, 31 insertions(+), 45 deletions(-) diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c index 654ffcc67d0..9fd3af9db46 100644 --- a/arch/sh/drivers/pci/pci-lib.c +++ b/arch/sh/drivers/pci/pci-lib.c @@ -39,6 +39,37 @@ void pcibios_align_resource(void *data, struct resource *res, res->start = start; } +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + struct pci_channel *hose = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = hose->io_offset; + else if (res->flags & IORESOURCE_MEM) + offset = hose->mem_offset; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void __devinit +pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + struct pci_channel *hose = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = hose->io_offset; + else if (res->flags & IORESOURCE_MEM) + offset = hose->mem_offset; + + res->start = region->start + offset; + res->end = region->end + offset; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index e8ac8daafc3..9d426147802 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -147,37 +147,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) } } -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - struct pci_channel *hose = dev->sysdata; - unsigned long offset = 0; - - if (res->flags & IORESOURCE_IO) - offset = hose->io_offset; - else if (res->flags & IORESOURCE_MEM) - offset = hose->mem_offset; - - region->start = res->start - offset; - region->end = res->end - offset; -} - -void __devinit -pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - struct pci_channel *hose = dev->sysdata; - unsigned long offset = 0; - - if (res->flags & IORESOURCE_IO) - offset = hose->io_offset; - else if (res->flags & IORESOURCE_MEM) - offset = hose->mem_offset; - - res->start = region->start + offset; - res->end = region->end + offset; -} - int pcibios_enable_device(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index d39f24091ad..c15a6f0ad50 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -72,20 +72,6 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) pci_read_bridge_bases(bus); } -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - region->start = res->start; - region->end = res->end; -} - -void pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - res->start = region->start; - res->end = region->end; -} - int pcibios_enable_device(struct pci_dev *dev, int mask) { u16 cmd, old_cmd; From 3f8daeacd7ed7a502daf0998e2515cea4f467f21 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 18:53:41 +0900 Subject: [PATCH 0489/2061] sh: pci: Consolidate the remaining common bits. This moves the remaining common bits in to pci-lib. Thereby reducing pci.c/pci-new.c to simple bus fixups and controller registration. As more platforms are moved over, the old code will disappear completely and the pci-new bits will be rolled in to pci-lib, eventually replacing pci.c completely. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-lib.c | 64 +++++++++++++++++++++++++++++++++++ arch/sh/drivers/pci/pci-new.c | 58 ------------------------------- arch/sh/drivers/pci/pci.c | 58 ------------------------------- 3 files changed, 64 insertions(+), 116 deletions(-) diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c index 9fd3af9db46..ea836294584 100644 --- a/arch/sh/drivers/pci/pci-lib.c +++ b/arch/sh/drivers/pci/pci-lib.c @@ -70,6 +70,70 @@ pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, res->end = region->end + offset; } +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, old_cmd; + int idx; + struct resource *r; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + old_cmd = cmd; + for (idx=0; idx < PCI_NUM_RESOURCES; idx++) { + /* Only set up the requested stuff */ + if (!(mask & (1<resource[idx]; + if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if ((idx == PCI_ROM_RESOURCE) && + (!(r->flags & IORESOURCE_ROM_ENABLE))) + continue; + if (!r->start && r->end) { + printk(KERN_ERR "PCI: Device %s not available " + "because of resource collisions\n", + pci_name(dev)); + return -EINVAL; + } + if (r->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (r->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + if (cmd != old_cmd) { + printk("PCI: Enabling device %s (%04x -> %04x)\n", + pci_name(dev), old_cmd, cmd); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * If we set up a device for bus mastering, we need to check and set + * the latency timer as it may not be properly set. + */ +static unsigned int pcibios_max_latency = 255; + +void pcibios_set_master(struct pci_dev *dev) +{ + u8 lat; + pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); + if (lat < 16) + lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; + else if (lat > pcibios_max_latency) + lat = pcibios_max_latency; + else + return; + printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", + pci_name(dev), lat); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); +} + +void __init pcibios_update_irq(struct pci_dev *dev, int irq) +{ + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index 9d426147802..8c0b136eecb 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -146,61 +146,3 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) pcibios_fixup_device_resources(dev, bus); } } - -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - u16 cmd, old_cmd; - int idx; - struct resource *r; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - old_cmd = cmd; - for(idx=0; idx<6; idx++) { - if (!(mask & (1 << idx))) - continue; - r = &dev->resource[idx]; - if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because " - "of resource collisions\n", pci_name(dev)); - return -EINVAL; - } - if (r->flags & IORESOURCE_IO) - cmd |= PCI_COMMAND_IO; - if (r->flags & IORESOURCE_MEM) - cmd |= PCI_COMMAND_MEMORY; - } - if (dev->resource[PCI_ROM_RESOURCE].start) - cmd |= PCI_COMMAND_MEMORY; - if (cmd != old_cmd) { - printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - return 0; -} - -/* - * If we set up a device for bus mastering, we need to check and set - * the latency timer as it may not be properly set. - */ -static unsigned int pcibios_max_latency = 255; - -void pcibios_set_master(struct pci_dev *dev) -{ - u8 lat; - pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); - if (lat < 16) - lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; - else if (lat > pcibios_max_latency) - lat = pcibios_max_latency; - else - return; - printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", - pci_name(dev), lat); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); -} - -void __init pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c index c15a6f0ad50..8c332b2a464 100644 --- a/arch/sh/drivers/pci/pci.c +++ b/arch/sh/drivers/pci/pci.c @@ -72,62 +72,4 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) pci_read_bridge_bases(bus); } -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - u16 cmd, old_cmd; - int idx; - struct resource *r; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - old_cmd = cmd; - for(idx=0; idx<6; idx++) { - if (!(mask & (1 << idx))) - continue; - r = &dev->resource[idx]; - if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available because " - "of resource collisions\n", pci_name(dev)); - return -EINVAL; - } - if (r->flags & IORESOURCE_IO) - cmd |= PCI_COMMAND_IO; - if (r->flags & IORESOURCE_MEM) - cmd |= PCI_COMMAND_MEMORY; - } - if (dev->resource[PCI_ROM_RESOURCE].start) - cmd |= PCI_COMMAND_MEMORY; - if (cmd != old_cmd) { - printk(KERN_INFO "PCI: Enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - return 0; -} - -/* - * If we set up a device for bus mastering, we need to check and set - * the latency timer as it may not be properly set. - */ -static unsigned int pcibios_max_latency = 255; - -void pcibios_set_master(struct pci_dev *dev) -{ - u8 lat; - pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); - if (lat < 16) - lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; - else if (lat > pcibios_max_latency) - lat = pcibios_max_latency; - else - return; - printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", - pci_name(dev), lat); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); -} - -void __init pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - EXPORT_SYMBOL(board_pci_channels); From 5ba7205fc49ff72e88784c94fb661f93e7ae7d36 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:00:32 +0900 Subject: [PATCH 0490/2061] sh: pci: Kill off the now unused hose->io_base. Nothing is using this any more, so kill it off. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/pci-sh7751.c | 8 -------- arch/sh/drivers/pci/pci-sh7780.c | 1 - arch/sh/include/asm/pci.h | 1 - 3 files changed, 10 deletions(-) diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index af8874436d2..4c08fd7f665 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -40,7 +40,6 @@ int __init sh7751_pci_init(struct pci_channel *chan) pr_debug("PCI: Starting intialization.\n"); chan->reg_base = 0xfe200000; - chan->io_base = 0xfe240000; /* check for SH7751/SH7751R hardware */ id = pci_read_reg(chan, SH7751_PCICONF0); @@ -136,13 +135,6 @@ int __init sh7751_pcic_init(struct pci_channel *chan, pr_debug("PCI: Setting upper bits of Memory window to 0x%x\n", word); pci_write_reg(chan, word , SH4_PCIMBR); - /* Map IO space into PCI IO window: - * IO addresses will be translated to the PCI IO window base address - */ - pr_debug("PCI: Mapping IO address 0x%x - 0x%x to base 0x%lx\n", - chan->io_resource->start, chan->io_resource->end, - chan->io_base + chan->io_resource->start); - /* Make sure the MSB's of IO window are set to access PCI space * correctly */ word = chan->io_resource->start & SH4_PCIIOBR_MASK; diff --git a/arch/sh/drivers/pci/pci-sh7780.c b/arch/sh/drivers/pci/pci-sh7780.c index 57a3b870a27..ae13ff925c6 100644 --- a/arch/sh/drivers/pci/pci-sh7780.c +++ b/arch/sh/drivers/pci/pci-sh7780.c @@ -62,7 +62,6 @@ static int __init sh7780_pci_init(void) printk(KERN_NOTICE "PCI: Starting intialization.\n"); chan->reg_base = 0xfe040000; - chan->io_base = 0xfe200000; /* Enable CPU access to the PCIC registers. */ __raw_writel(PCIECR_ENBL, PCIECR); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index 0be20521a1f..f36c7899295 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -33,7 +33,6 @@ struct pci_channel { int enabled; unsigned long reg_base; - unsigned long io_base; unsigned long io_map_base; }; From bb3396477bf46c7cae6bd04b969580277957966e Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:26:45 +0900 Subject: [PATCH 0491/2061] sh: pci: Kill off superfluous lboxre2 pci fixups. This is a verbatim copy of the r2d one, use that instead. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 2 +- arch/sh/drivers/pci/fixups-lboxre2.c | 42 ---------------------------- 2 files changed, 1 insertion(+), 43 deletions(-) delete mode 100644 arch/sh/drivers/pci/fixups-lboxre2.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index c8eab14843e..4cac866d55d 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -22,5 +22,5 @@ obj-$(CONFIG_SH_SDK7780) += fixups-sdk7780.o obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += fixups-sdk7780.o obj-$(CONFIG_SH_TITAN) += ops-titan.o obj-$(CONFIG_SH_LANDISK) += ops-landisk.o -obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-lboxre2.o +obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-rts7751r2d.o obj-$(CONFIG_SH_CAYMAN) += ops-cayman.o diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c deleted file mode 100644 index a82011d03cb..00000000000 --- a/arch/sh/drivers/pci/fixups-lboxre2.c +++ /dev/null @@ -1,42 +0,0 @@ -/* - * arch/sh/drivers/pci/fixups-lboxre2.c - * - * L-BOX RE2 PCI fixups - * - * Copyright (C) 2007 Nobuhiro Iwamatsu - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include "pci-sh4.h" - -#define PCIMCR_MRSET_OFF 0xBFFFFFFF -#define PCIMCR_RFSH_OFF 0xFFFFFFFB - -int pci_fixup_pcic(struct pci_channel *chan) -{ - unsigned long bcr1, mcr; - - bcr1 = ctrl_inl(SH7751_BCR1); - bcr1 |= 0x40080000; /* Enable Bit 19 BREQEN, set PCIC to slave */ - pci_write_reg(chan, bcr1, SH4_PCIBCR1); - - /* Enable all interrupts, so we known what to fix */ - pci_write_reg(chan, 0x0000c3ff, SH4_PCIINTM); - pci_write_reg(chan, 0x0000380f, SH4_PCIAINTM); - pci_write_reg(chan, 0xfb900047, SH7751_PCICONF1); - pci_write_reg(chan, 0xab000001, SH7751_PCICONF4); - - mcr = ctrl_inl(SH7751_MCR); - mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF; - pci_write_reg(chan, mcr, SH4_PCIMCR); - - pci_write_reg(chan, 0x0c000000, SH7751_PCICONF5); - pci_write_reg(chan, 0xd0000000, SH7751_PCICONF6); - pci_write_reg(chan, 0x0c000000, SH4_PCILAR0); - pci_write_reg(chan, 0x00000000, SH4_PCILAR1); - - return 0; -} From d556fcc101c4b0d57ac9742ab806a6bfed78eac1 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:31:20 +0900 Subject: [PATCH 0492/2061] sh: pci: Flag the dreamcast BBA as IORESOURCE_PCI_FIXED. This isn't a real BAR, so prevent any attempts to move it, as we don't wish to encourage a bus luck by overzealous PCI initialization code. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/fixups-dreamcast.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c index 2bf85cf091e..48c6381fffa 100644 --- a/arch/sh/drivers/pci/fixups-dreamcast.c +++ b/arch/sh/drivers/pci/fixups-dreamcast.c @@ -41,6 +41,13 @@ static void __init gapspci_fixup_resources(struct pci_dev *dev) */ dev->resource[1].start = p->io_resource->start + 0x100; dev->resource[1].end = dev->resource[1].start + 0x200 - 1; + + /* + * This is not a normal BAR, prevent any attempts to move + * the BAR, as this will result in a bus lock. + */ + dev->resource[1].flags |= IORESOURCE_PCI_FIXED; + /* * Redirect dma memory allocations to special memory window. */ From c563bf0965c86cc6087b09c34ca063fe96d6deca Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:39:57 +0900 Subject: [PATCH 0493/2061] sh: pci: Kill off dead references to is_pci_ioaddr and friends. Some old boards are still using this in their I/O routines, kill it off. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7751/io.c | 16 ---------------- arch/sh/boards/mach-snapgear/io.c | 16 ---------------- arch/sh/boards/mach-systemh/io.c | 16 ---------------- arch/sh/boards/mach-titan/io.c | 18 ------------------ 4 files changed, 66 deletions(-) diff --git a/arch/sh/boards/mach-se/7751/io.c b/arch/sh/boards/mach-se/7751/io.c index 6287ae57031..6e75bd4459e 100644 --- a/arch/sh/boards/mach-se/7751/io.c +++ b/arch/sh/boards/mach-se/7751/io.c @@ -34,8 +34,6 @@ unsigned char sh7751se_inb(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned char *)pci_ioaddr(port); else return (*port2adr(port)) & 0xff; } @@ -46,8 +44,6 @@ unsigned char sh7751se_inb_p(unsigned long port) if (PXSEG(port)) v = *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - v = *(volatile unsigned char *)pci_ioaddr(port); else v = (*port2adr(port)) & 0xff; ctrl_delay(); @@ -58,8 +54,6 @@ unsigned short sh7751se_inw(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned short *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned short *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else @@ -71,8 +65,6 @@ unsigned int sh7751se_inl(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned long *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned int *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else @@ -85,8 +77,6 @@ void sh7751se_outb(unsigned char value, unsigned long port) if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else *(port2adr(port)) = value; } @@ -95,8 +85,6 @@ void sh7751se_outb_p(unsigned char value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else *(port2adr(port)) = value; ctrl_delay(); @@ -106,8 +94,6 @@ void sh7751se_outw(unsigned short value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned short *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned short *)pci_ioaddr(port)) = value; else if (port >= 0x2000) *port2adr(port) = value; else @@ -118,8 +104,6 @@ void sh7751se_outl(unsigned int value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned long *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned long*)pci_ioaddr(port)) = value; else maybebadio(port); } diff --git a/arch/sh/boards/mach-snapgear/io.c b/arch/sh/boards/mach-snapgear/io.c index 0f482426455..476650e42db 100644 --- a/arch/sh/boards/mach-snapgear/io.c +++ b/arch/sh/boards/mach-snapgear/io.c @@ -36,8 +36,6 @@ unsigned char snapgear_inb(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned char *)pci_ioaddr(port); else return (*port2adr(port)) & 0xff; } @@ -48,8 +46,6 @@ unsigned char snapgear_inb_p(unsigned long port) if (PXSEG(port)) v = *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - v = *(volatile unsigned char *)pci_ioaddr(port); else v = (*port2adr(port))&0xff; ctrl_delay(); @@ -60,8 +56,6 @@ unsigned short snapgear_inw(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned short *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned short *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else @@ -73,8 +67,6 @@ unsigned int snapgear_inl(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned long *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned int *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else @@ -87,8 +79,6 @@ void snapgear_outb(unsigned char value, unsigned long port) if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else *(port2adr(port)) = value; } @@ -97,8 +87,6 @@ void snapgear_outb_p(unsigned char value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else *(port2adr(port)) = value; ctrl_delay(); @@ -108,8 +96,6 @@ void snapgear_outw(unsigned short value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned short *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned short *)pci_ioaddr(port)) = value; else if (port >= 0x2000) *port2adr(port) = value; else @@ -120,8 +106,6 @@ void snapgear_outl(unsigned int value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned long *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned long*)pci_ioaddr(port)) = value; else maybebadio(port); } diff --git a/arch/sh/boards/mach-systemh/io.c b/arch/sh/boards/mach-systemh/io.c index dec3db0ee93..15577ff1f71 100644 --- a/arch/sh/boards/mach-systemh/io.c +++ b/arch/sh/boards/mach-systemh/io.c @@ -35,8 +35,6 @@ unsigned char sh7751systemh_inb(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned char *)pci_ioaddr(port); else if (port <= 0x3F1) return *(volatile unsigned char *)ETHER_IOMAP(port); else @@ -49,8 +47,6 @@ unsigned char sh7751systemh_inb_p(unsigned long port) if (PXSEG(port)) v = *(volatile unsigned char *)port; - else if (is_pci_ioaddr(port)) - v = *(volatile unsigned char *)pci_ioaddr(port); else if (port <= 0x3F1) v = *(volatile unsigned char *)ETHER_IOMAP(port); else @@ -63,8 +59,6 @@ unsigned short sh7751systemh_inw(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned short *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned short *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else if (port <= 0x3F1) @@ -78,8 +72,6 @@ unsigned int sh7751systemh_inl(unsigned long port) { if (PXSEG(port)) return *(volatile unsigned long *)port; - else if (is_pci_ioaddr(port)) - return *(volatile unsigned int *)pci_ioaddr(port); else if (port >= 0x2000) return *port2adr(port); else if (port <= 0x3F1) @@ -94,8 +86,6 @@ void sh7751systemh_outb(unsigned char value, unsigned long port) if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else if (port <= 0x3F1) *(volatile unsigned char *)ETHER_IOMAP(port) = value; else @@ -106,8 +96,6 @@ void sh7751systemh_outb_p(unsigned char value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned char *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned char*)pci_ioaddr(port)) = value; else if (port <= 0x3F1) *(volatile unsigned char *)ETHER_IOMAP(port) = value; else @@ -119,8 +107,6 @@ void sh7751systemh_outw(unsigned short value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned short *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned short *)pci_ioaddr(port)) = value; else if (port >= 0x2000) *port2adr(port) = value; else if (port <= 0x3F1) @@ -133,8 +119,6 @@ void sh7751systemh_outl(unsigned int value, unsigned long port) { if (PXSEG(port)) *(volatile unsigned long *)port = value; - else if (is_pci_ioaddr(port)) - *((unsigned long*)pci_ioaddr(port)) = value; else maybebadio(port); } diff --git a/arch/sh/boards/mach-titan/io.c b/arch/sh/boards/mach-titan/io.c index 053b3ed2ed8..0130e9826ac 100644 --- a/arch/sh/boards/mach-titan/io.c +++ b/arch/sh/boards/mach-titan/io.c @@ -17,8 +17,6 @@ u8 titan_inb(unsigned long port) { if (PXSEG(port)) return ctrl_inb(port); - else if (is_pci_ioaddr(port)) - return ctrl_inb(pci_ioaddr(port)); return ctrl_inw(port2adr(port)) & 0xff; } @@ -28,8 +26,6 @@ u8 titan_inb_p(unsigned long port) if (PXSEG(port)) v = ctrl_inb(port); - else if (is_pci_ioaddr(port)) - v = ctrl_inb(pci_ioaddr(port)); else v = ctrl_inw(port2adr(port)) & 0xff; ctrl_delay(); @@ -40,8 +36,6 @@ u16 titan_inw(unsigned long port) { if (PXSEG(port)) return ctrl_inw(port); - else if (is_pci_ioaddr(port)) - return ctrl_inw(pci_ioaddr(port)); else if (port >= 0x2000) return ctrl_inw(port2adr(port)); else @@ -53,8 +47,6 @@ u32 titan_inl(unsigned long port) { if (PXSEG(port)) return ctrl_inl(port); - else if (is_pci_ioaddr(port)) - return ctrl_inl(pci_ioaddr(port)); else if (port >= 0x2000) return ctrl_inw(port2adr(port)); else @@ -66,8 +58,6 @@ void titan_outb(u8 value, unsigned long port) { if (PXSEG(port)) ctrl_outb(value, port); - else if (is_pci_ioaddr(port)) - ctrl_outb(value, pci_ioaddr(port)); else ctrl_outw(value, port2adr(port)); } @@ -76,8 +66,6 @@ void titan_outb_p(u8 value, unsigned long port) { if (PXSEG(port)) ctrl_outb(value, port); - else if (is_pci_ioaddr(port)) - ctrl_outb(value, pci_ioaddr(port)); else ctrl_outw(value, port2adr(port)); ctrl_delay(); @@ -87,8 +75,6 @@ void titan_outw(u16 value, unsigned long port) { if (PXSEG(port)) ctrl_outw(value, port); - else if (is_pci_ioaddr(port)) - ctrl_outw(value, pci_ioaddr(port)); else if (port >= 0x2000) ctrl_outw(value, port2adr(port)); else @@ -99,8 +85,6 @@ void titan_outl(u32 value, unsigned long port) { if (PXSEG(port)) ctrl_outl(value, port); - else if (is_pci_ioaddr(port)) - ctrl_outl(value, pci_ioaddr(port)); else maybebadio(port); } @@ -119,8 +103,6 @@ void __iomem *titan_ioport_map(unsigned long port, unsigned int size) { if (PXSEG(port)) return (void __iomem *)port; - else if (is_pci_ioaddr(port)) - return (void __iomem *)pci_ioaddr(port); return (void __iomem *)port2adr(port); } From 0e75148108914062cb46ad3dc8f054a628018df7 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:48:48 +0900 Subject: [PATCH 0494/2061] sh: pci: Consolidate pcibios_setup() in pci-lib. This wasn't really being used for anything useful, so just stub it in pci-lib. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-dreamcast.c | 6 ----- arch/sh/drivers/pci/ops-sh4.c | 35 +++++++++-------------------- arch/sh/drivers/pci/ops-sh5.c | 5 ----- arch/sh/drivers/pci/pci-lib.c | 5 +++++ 4 files changed, 16 insertions(+), 35 deletions(-) diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c index 5bc81d5b660..529aa4f27a8 100644 --- a/arch/sh/drivers/pci/ops-dreamcast.c +++ b/arch/sh/drivers/pci/ops-dreamcast.c @@ -154,12 +154,6 @@ static int __init gapspci_init(struct pci_channel *chan) return 0; } -/* Haven't done anything here as yet */ -char * __devinit pcibios_setup(char *str) -{ - return str; -} - struct pci_channel board_pci_channels[] = { { gapspci_init, &gapspci_pci_ops, &gapspci_io_resource, &gapspci_mem_resource, 0, 1 }, diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index 2a7f7b50ff0..7cc1fccf1c5 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -106,30 +106,27 @@ struct pci_ops sh4_pci_ops = { * Not really related to pci_ops, but it's common and not worth shoving * somewhere else for now.. */ -static unsigned int pci_probe = PCI_PROBE_CONF1; - int __init sh4_pci_check_direct(struct pci_channel *chan) { /* * Check if configuration works. */ - if (pci_probe & PCI_PROBE_CONF1) { - unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR); + unsigned int tmp = pci_read_reg(chan, SH4_PCIPAR); - pci_write_reg(chan, P1SEG, SH4_PCIPAR); - - if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) { - pci_write_reg(chan, tmp, SH4_PCIPAR); - printk(KERN_INFO "PCI: Using configuration type 1\n"); - request_region(chan->reg_base + SH4_PCIPAR, 8, - "PCI conf1"); - return 0; - } + pci_write_reg(chan, P1SEG, SH4_PCIPAR); + if (pci_read_reg(chan, SH4_PCIPAR) == P1SEG) { pci_write_reg(chan, tmp, SH4_PCIPAR); + printk(KERN_INFO "PCI: Using configuration type 1\n"); + request_region(chan->reg_base + SH4_PCIPAR, 8, + "PCI conf1"); + return 0; } - pr_debug("PCI: pci_check_direct failed\n"); + pci_write_reg(chan, tmp, SH4_PCIPAR); + + printk(KERN_ERR "PCI: %s failed\n", __func__); + return -EINVAL; } @@ -155,16 +152,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); -char * __devinit pcibios_setup(char *str) -{ - if (!strcmp(str, "off")) { - pci_probe = 0; - return NULL; - } - - return str; -} - int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan) { /* Nothing to do. */ diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c index 729e38a6fe0..b10e2d5f425 100644 --- a/arch/sh/drivers/pci/ops-sh5.c +++ b/arch/sh/drivers/pci/ops-sh5.c @@ -42,11 +42,6 @@ static void __init pci_fixup_ide_bases(struct pci_dev *d) } DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); -char * __devinit pcibios_setup(char *str) -{ - return str; -} - static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c index ea836294584..f072acafce6 100644 --- a/arch/sh/drivers/pci/pci-lib.c +++ b/arch/sh/drivers/pci/pci-lib.c @@ -134,6 +134,11 @@ void __init pcibios_update_irq(struct pci_dev *dev, int irq) pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); } +char * __devinit pcibios_setup(char *str) +{ + return str; +} + int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, enum pci_mmap_state mmap_state, int write_combine) { From 0db38cea69fc478a5c25b3c915ec680cc5538783 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 19:54:47 +0900 Subject: [PATCH 0495/2061] sh: pci: Kill off legacy ide quirks. These fixups seem to have bitrotted a bit since their introduction in the 2.4 days. As we never had much use for them in the first place, and nothing is using them any more, kill them off the rest of the way. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/ops-sh4.c | 22 ---------------------- arch/sh/drivers/pci/ops-sh5.c | 20 -------------------- 2 files changed, 42 deletions(-) diff --git a/arch/sh/drivers/pci/ops-sh4.c b/arch/sh/drivers/pci/ops-sh4.c index 7cc1fccf1c5..78bebebdc99 100644 --- a/arch/sh/drivers/pci/ops-sh4.c +++ b/arch/sh/drivers/pci/ops-sh4.c @@ -130,28 +130,6 @@ int __init sh4_pci_check_direct(struct pci_channel *chan) return -EINVAL; } -/* Handle generic fixups */ -static void __init pci_fixup_ide_bases(struct pci_dev *d) -{ - int i; - - /* - * PCI IDE controllers use non-standard I/O port decoding, respect it. - */ - if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE) - return; - pr_debug("PCI: IDE base address fixup for %s\n", pci_name(d)); - for(i = 0; i < 4; i++) { - struct resource *r = &d->resource[i]; - - if ((r->start & ~0x80) == 0x374) { - r->start |= 2; - r->end = r->start; - } - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); - int __attribute__((weak)) pci_fixup_pcic(struct pci_channel *chan) { /* Nothing to do. */ diff --git a/arch/sh/drivers/pci/ops-sh5.c b/arch/sh/drivers/pci/ops-sh5.c index b10e2d5f425..4ce95a001b8 100644 --- a/arch/sh/drivers/pci/ops-sh5.c +++ b/arch/sh/drivers/pci/ops-sh5.c @@ -22,26 +22,6 @@ #include #include "pci-sh5.h" -static void __init pci_fixup_ide_bases(struct pci_dev *d) -{ - int i; - - /* - * PCI IDE controllers use non-standard I/O port decoding, respect it. - */ - if ((d->class >> 8) != PCI_CLASS_STORAGE_IDE) - return; - printk("PCI: IDE base address fixup for %s\n", pci_name(d)); - for(i=0; i<4; i++) { - struct resource *r = &d->resource[i]; - if ((r->start & ~0x80) == 0x374) { - r->start |= 2; - r->end = r->start; - } - } -} -DECLARE_PCI_FIXUP_HEADER(PCI_ANY_ID, PCI_ANY_ID, pci_fixup_ide_bases); - static int sh5pci_read(struct pci_bus *bus, unsigned int devfn, int where, int size, u32 *val) { From 3444f5ec49bc6cb901ffea38e085db1d76e1189c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 20:22:05 +0900 Subject: [PATCH 0496/2061] sh: pci: Tidy up the dreamcast PCI support. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 3 +- arch/sh/drivers/pci/ops-dreamcast.c | 98 +++----------------- arch/sh/drivers/pci/pci-dreamcast.c | 105 ++++++++++++++++++++++ arch/sh/include/mach-dreamcast/mach/pci.h | 2 + 4 files changed, 119 insertions(+), 89 deletions(-) create mode 100644 arch/sh/drivers/pci/pci-dreamcast.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 4cac866d55d..2160a06b6c1 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -12,7 +12,8 @@ obj-$(CONFIG_CPU_SUBTYPE_SH7780) += pci-sh7780.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7785) += pci-sh7780.o ops-sh4.o obj-$(CONFIG_CPU_SH5) += pci-sh5.o ops-sh5.o -obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o +obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ + pci-dreamcast.o obj-$(CONFIG_SH_SECUREEDGE5410) += ops-snapgear.o obj-$(CONFIG_SH_RTS7751R2D) += ops-rts7751r2d.o fixups-rts7751r2d.o obj-$(CONFIG_SH_SH03) += ops-sh03.o fixups-sh03.o diff --git a/arch/sh/drivers/pci/ops-dreamcast.c b/arch/sh/drivers/pci/ops-dreamcast.c index 529aa4f27a8..e83d0d3aabe 100644 --- a/arch/sh/drivers/pci/ops-dreamcast.c +++ b/arch/sh/drivers/pci/ops-dreamcast.c @@ -1,15 +1,9 @@ /* - * arch/sh/drivers/pci/ops-dreamcast.c - * * PCI operations for the Sega Dreamcast * * Copyright (C) 2001, 2002 M. R. Brown * Copyright (C) 2002, 2003 Paul Mundt * - * This file originally bore the message (with enclosed-$): - * Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp - * Dreamcast PCI: Supports SEGA Broadband Adaptor only. - * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. @@ -23,25 +17,10 @@ #include #include #include - -#include -#include +#include +#include #include -static struct resource gapspci_io_resource = { - .name = "GAPSPCI IO", - .start = GAPSPCI_BBA_CONFIG, - .end = GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1, - .flags = IORESOURCE_IO, -}; - -static struct resource gapspci_mem_resource = { - .name = "GAPSPCI mem", - .start = GAPSPCI_DMA_BASE, - .end = GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1, - .flags = IORESOURCE_MEM, -}; - /* * The !gapspci_config_access case really shouldn't happen, ever, unless * someone implicitly messes around with the last devfn value.. otherwise we @@ -76,10 +55,10 @@ static int gapspci_read(struct pci_bus *bus, unsigned int devfn, int where, int return PCIBIOS_DEVICE_NOT_FOUND; switch (size) { - case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break; - case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break; - case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break; - } + case 1: *val = inb(GAPSPCI_BBA_CONFIG+where); break; + case 2: *val = inw(GAPSPCI_BBA_CONFIG+where); break; + case 4: *val = inl(GAPSPCI_BBA_CONFIG+where); break; + } return PCIBIOS_SUCCESSFUL; } @@ -90,72 +69,15 @@ static int gapspci_write(struct pci_bus *bus, unsigned int devfn, int where, int return PCIBIOS_DEVICE_NOT_FOUND; switch (size) { - case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break; - case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break; - case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break; + case 1: outb(( u8)val, GAPSPCI_BBA_CONFIG+where); break; + case 2: outw((u16)val, GAPSPCI_BBA_CONFIG+where); break; + case 4: outl((u32)val, GAPSPCI_BBA_CONFIG+where); break; } return PCIBIOS_SUCCESSFUL; } -static struct pci_ops gapspci_pci_ops = { +struct pci_ops gapspci_pci_ops = { .read = gapspci_read, .write = gapspci_write, }; - -/* - * gapspci init - */ - -static int __init gapspci_init(struct pci_channel *chan) -{ - char idbuf[16]; - int i; - - /* - * FIXME: All of this wants documenting to some degree, - * even some basic register definitions would be nice. - * - * I haven't seen anything this ugly since.. maple. - */ - - for (i=0; i<16; i++) - idbuf[i] = inb(GAPSPCI_REGS+i); - - if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16)) - return -ENODEV; - - outl(0x5a14a501, GAPSPCI_REGS+0x18); - - for (i=0; i<1000000; i++) - ; - - if (inl(GAPSPCI_REGS+0x18) != 1) - return -EINVAL; - - outl(0x01000000, GAPSPCI_REGS+0x20); - outl(0x01000000, GAPSPCI_REGS+0x24); - - outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28); - outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c); - - outl(1, GAPSPCI_REGS+0x14); - outl(1, GAPSPCI_REGS+0x34); - - /* Setting Broadband Adapter */ - outw(0xf900, GAPSPCI_BBA_CONFIG+0x06); - outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30); - outb(0x00, GAPSPCI_BBA_CONFIG+0x3c); - outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d); - outw(0x0006, GAPSPCI_BBA_CONFIG+0x04); - outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10); - outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14); - - return 0; -} - -struct pci_channel board_pci_channels[] = { - { gapspci_init, &gapspci_pci_ops, &gapspci_io_resource, - &gapspci_mem_resource, 0, 1 }, - { 0, } -}; diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c new file mode 100644 index 00000000000..0897be5053d --- /dev/null +++ b/arch/sh/drivers/pci/pci-dreamcast.c @@ -0,0 +1,105 @@ +/* + * PCI support for the Sega Dreamcast + * + * Copyright (C) 2001, 2002 M. R. Brown + * Copyright (C) 2002, 2003 Paul Mundt + * + * This file originally bore the message (with enclosed-$): + * Id: pci.c,v 1.3 2003/05/04 19:29:46 lethal Exp + * Dreamcast PCI: Supports SEGA Broadband Adaptor only. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +static struct resource gapspci_io_resource = { + .name = "GAPSPCI IO", + .start = GAPSPCI_BBA_CONFIG, + .end = GAPSPCI_BBA_CONFIG + GAPSPCI_BBA_CONFIG_SIZE - 1, + .flags = IORESOURCE_IO, +}; + +static struct resource gapspci_mem_resource = { + .name = "GAPSPCI mem", + .start = GAPSPCI_DMA_BASE, + .end = GAPSPCI_DMA_BASE + GAPSPCI_DMA_SIZE - 1, + .flags = IORESOURCE_MEM, +}; + +/* + * gapspci init + */ + +static int __init gapspci_init(struct pci_channel *chan) +{ + char idbuf[16]; + int i; + + /* + * FIXME: All of this wants documenting to some degree, + * even some basic register definitions would be nice. + * + * I haven't seen anything this ugly since.. maple. + */ + + for (i=0; i<16; i++) + idbuf[i] = inb(GAPSPCI_REGS+i); + + if (strncmp(idbuf, "GAPSPCI_BRIDGE_2", 16)) + return -ENODEV; + + outl(0x5a14a501, GAPSPCI_REGS+0x18); + + for (i=0; i<1000000; i++) + cpu_relax(); + + if (inl(GAPSPCI_REGS+0x18) != 1) + return -EINVAL; + + outl(0x01000000, GAPSPCI_REGS+0x20); + outl(0x01000000, GAPSPCI_REGS+0x24); + + outl(GAPSPCI_DMA_BASE, GAPSPCI_REGS+0x28); + outl(GAPSPCI_DMA_BASE+GAPSPCI_DMA_SIZE, GAPSPCI_REGS+0x2c); + + outl(1, GAPSPCI_REGS+0x14); + outl(1, GAPSPCI_REGS+0x34); + + /* Setting Broadband Adapter */ + outw(0xf900, GAPSPCI_BBA_CONFIG+0x06); + outl(0x00000000, GAPSPCI_BBA_CONFIG+0x30); + outb(0x00, GAPSPCI_BBA_CONFIG+0x3c); + outb(0xf0, GAPSPCI_BBA_CONFIG+0x0d); + outw(0x0006, GAPSPCI_BBA_CONFIG+0x04); + outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10); + outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14); + + return 0; +} + +struct pci_channel board_pci_channels[] = { + { + .init = gapspci_init, + .pci_ops = &gapspci_pci_ops, + .io_resource = &gapspci_io_resource, + .mem_resource = &gapspci_mem_resource, + .first_devfn = 0, + .last_devfn = 1, + }, { + .init = NULL, + } +}; diff --git a/arch/sh/include/mach-dreamcast/mach/pci.h b/arch/sh/include/mach-dreamcast/mach/pci.h index 75fc9009e09..0314d975e62 100644 --- a/arch/sh/include/mach-dreamcast/mach/pci.h +++ b/arch/sh/include/mach-dreamcast/mach/pci.h @@ -21,5 +21,7 @@ #define GAPSPCI_IRQ HW_EVENT_EXTERNAL +extern struct pci_ops gapspci_pci_ops; + #endif /* __ASM_SH_DREAMCAST_PCI_H */ From 48e4237d96fdcb1237b63bcddb37771f97452eec Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 20:40:48 +0900 Subject: [PATCH 0497/2061] sh: pci: Convert the SH-5 code over to the new interface. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 2 +- arch/sh/drivers/pci/ops-cayman.c | 11 ------ arch/sh/drivers/pci/pci-sh5.c | 60 ++++++++++++-------------------- arch/sh/drivers/pci/pci-sh5.h | 4 --- 4 files changed, 24 insertions(+), 53 deletions(-) diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index efe8cf965a9..f9fb1d1b623 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -23,7 +23,7 @@ config PCI_NEW bool depends on PCI default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ - CPU_SUBTYPE_SH7785 + CPU_SUBTYPE_SH7785 || CPU_SH5 # This is also board-specific config PCI_AUTO diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/ops-cayman.c index cbaaf0234b7..b68b61d22c6 100644 --- a/arch/sh/drivers/pci/ops-cayman.c +++ b/arch/sh/drivers/pci/ops-cayman.c @@ -75,14 +75,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin) return result; } - -struct pci_channel board_pci_channels[] = { - { sh5_pci_init, &sh5_pci_ops, NULL, NULL, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - -int __init pcibios_init_platform(void) -{ - return sh5pci_init(__pa(memory_start), - __pa(memory_end) - __pa(memory_start)); -} diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c index 7750da27628..cf431852213 100644 --- a/arch/sh/drivers/pci/pci-sh5.c +++ b/arch/sh/drivers/pci/pci-sh5.c @@ -27,12 +27,6 @@ unsigned long pcicr_virt; unsigned long PCI_IO_AREA; -int __init sh5_pci_init(struct pci_channel *chan) -{ - pr_debug("PCI: Starting intialization.\n"); - return pcibios_init_platform(); -} - /* Rounds a number UP to the nearest power of two. Used for * sizing the PCI window. */ @@ -95,8 +89,21 @@ static irqreturn_t pcish5_serr_irq(int irq, void *dev_id) return IRQ_NONE; } -int __init sh5pci_init(unsigned long memStart, unsigned long memSize) +static struct resource sh5_io_resource = { /* place holder */ }; +static struct resource sh5_mem_resource = { /* place holder */ }; + +static struct pci_channel sh5pci_controller = { + .pci_ops = &sh5_pci_ops, + .mem_resource = &sh5_mem_resource, + .mem_offset = 0x00000000, + .io_resource = &sh5_io_resource, + .io_offset = 0x00000000, +}; + +static int __init sh5pci_init(void) { + unsigned long memStart = __pa(memory_start); + unsigned long memSize = __pa(memory_end) - memStart; u32 lsr0; u32 uval; @@ -203,35 +210,14 @@ int __init sh5pci_init(unsigned long memStart, unsigned long memSize) SH5PCI_WRITE(AINTM, ~0); SH5PCI_WRITE(PINTM, ~0); + sh5_io_resource.start = PCI_IO_AREA; + sh5_io_resource.end = PCI_IO_AREA + 0x10000; + + sh5_mem_resource.start = memStart; + sh5_mem_resource.end = memStart + memSize; + + register_pci_controller(&sh5pci_controller); + return 0; } - -#define xPCIBIOS_MIN_IO board_pci_channels->io_resource->start -#define xPCIBIOS_MIN_MEM board_pci_channels->mem_resource->start - -void __devinit pcibios_fixup_bus(struct pci_bus *bus) -{ - struct pci_dev *dev = bus->self; - int i; - - if (dev) { - for (i= 0; i < 3; i++) { - bus->resource[i] = - &dev->resource[PCI_BRIDGE_RESOURCES+i]; - bus->resource[i]->name = bus->name; - } - bus->resource[0]->flags |= IORESOURCE_IO; - bus->resource[1]->flags |= IORESOURCE_MEM; - - /* For now, propagate host limits to the bus; - * we'll adjust them later. */ - bus->resource[0]->end = 64*1024 - 1 ; - bus->resource[1]->end = xPCIBIOS_MIN_MEM+(256*1024*1024)-1; - bus->resource[0]->start = xPCIBIOS_MIN_IO; - bus->resource[1]->start = xPCIBIOS_MIN_MEM; - - /* Turn off downstream PF memory address range by default */ - bus->resource[2]->start = 1024*1024; - bus->resource[2]->end = bus->resource[2]->start - 1; - } -} +arch_initcall(sh5pci_init); diff --git a/arch/sh/drivers/pci/pci-sh5.h b/arch/sh/drivers/pci/pci-sh5.h index af09f384c7d..f277628221f 100644 --- a/arch/sh/drivers/pci/pci-sh5.h +++ b/arch/sh/drivers/pci/pci-sh5.h @@ -107,8 +107,4 @@ extern unsigned long pcicr_virt; extern struct pci_ops sh5_pci_ops; -/* arch/sh/drivers/pci/pci-sh5.c */ -int sh5_pci_init(struct pci_channel *chan); -int sh5pci_init(unsigned long memStart, unsigned long memSize); - #endif /* __PCI_SH5_H */ From a5b08047129f214af1899bd9088605c7adc21ed5 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 20:41:45 +0900 Subject: [PATCH 0498/2061] sh: pci: Rename ops-cayman -> fixups-cayman. Now that ops-cayman.c only contains IRQ routing fixups, rename it. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 2 +- arch/sh/drivers/pci/{ops-cayman.c => fixups-cayman.c} | 0 2 files changed, 1 insertion(+), 1 deletion(-) rename arch/sh/drivers/pci/{ops-cayman.c => fixups-cayman.c} (100%) diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index 2160a06b6c1..cb2190c3e36 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -24,4 +24,4 @@ obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += fixups-sdk7780.o obj-$(CONFIG_SH_TITAN) += ops-titan.o obj-$(CONFIG_SH_LANDISK) += ops-landisk.o obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-rts7751r2d.o -obj-$(CONFIG_SH_CAYMAN) += ops-cayman.o +obj-$(CONFIG_SH_CAYMAN) += fixups-cayman.o diff --git a/arch/sh/drivers/pci/ops-cayman.c b/arch/sh/drivers/pci/fixups-cayman.c similarity index 100% rename from arch/sh/drivers/pci/ops-cayman.c rename to arch/sh/drivers/pci/fixups-cayman.c From 757e3c16f8bafa2a470aebf9b04671c5d4d18f49 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:11:07 +0900 Subject: [PATCH 0499/2061] sh: pci: Rewrite SH7751 PCI support to follow SH7780. This follows the similar sort of scheme that the refactored SH7780 code uses, using a 64MB CS3 mapping to handle the window0 case, and simply discarding window1. This vastly simplifies the code, and allows most of the board-specific setup to go die. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 3 +- arch/sh/drivers/pci/Makefile | 2 +- arch/sh/drivers/pci/ops-landisk.c | 31 ------ arch/sh/drivers/pci/ops-lboxre2.c | 33 ------ arch/sh/drivers/pci/ops-rts7751r2d.c | 34 ------- arch/sh/drivers/pci/ops-sh03.c | 45 --------- arch/sh/drivers/pci/ops-snapgear.c | 49 --------- arch/sh/drivers/pci/ops-titan.c | 36 ------- arch/sh/drivers/pci/pci-sh7751.c | 145 ++++++++++++++------------- arch/sh/drivers/pci/pci-sh7751.h | 13 +-- 10 files changed, 81 insertions(+), 310 deletions(-) delete mode 100644 arch/sh/drivers/pci/ops-sh03.c diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index f9fb1d1b623..5aaee3c707b 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -23,7 +23,8 @@ config PCI_NEW bool depends on PCI default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ - CPU_SUBTYPE_SH7785 || CPU_SH5 + CPU_SUBTYPE_SH7785 || CPU_SH5 || \ + CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R # This is also board-specific config PCI_AUTO diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index cb2190c3e36..b8667de19ec 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -16,7 +16,7 @@ obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ pci-dreamcast.o obj-$(CONFIG_SH_SECUREEDGE5410) += ops-snapgear.o obj-$(CONFIG_SH_RTS7751R2D) += ops-rts7751r2d.o fixups-rts7751r2d.o -obj-$(CONFIG_SH_SH03) += ops-sh03.o fixups-sh03.o +obj-$(CONFIG_SH_SH03) += fixups-sh03.o obj-$(CONFIG_SH_HIGHLANDER) += fixups-r7780rp.o obj-$(CONFIG_SH_SH7785LCR) += fixups-r7780rp.o obj-$(CONFIG_SH_SDK7780) += fixups-sdk7780.o diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/ops-landisk.c index 178b77828aa..bb1a6bb5149 100644 --- a/arch/sh/drivers/pci/ops-landisk.c +++ b/arch/sh/drivers/pci/ops-landisk.c @@ -15,37 +15,6 @@ #include #include "pci-sh4.h" -static struct resource sh7751_io_resource = { - .name = "SH7751 IO", - .start = SH7751_PCI_IO_BASE, - .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751 mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0x3ff}, - {NULL, NULL, NULL, 0, 0}, -}; - -static struct sh4_pci_address_map sh7751_pci_map = { - .window0 = { - .base = SH7751_CS3_BASE_ADDR, - .size = (64 << 20), /* 64MB */ - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); -} - int pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { /* diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/ops-lboxre2.c index 91cabd84f02..6db2c209737 100644 --- a/arch/sh/drivers/pci/ops-lboxre2.c +++ b/arch/sh/drivers/pci/ops-lboxre2.c @@ -21,36 +21,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return lboxre2_irq_tab[slot]; } - -static struct resource sh7751_io_resource = { - .name = "SH7751_IO", - .start = SH7751_PCI_IO_BASE , - .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751_mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops sh7751_pci_ops; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - -static struct sh4_pci_address_map sh7751_pci_map = { - .window0 = { - .base = SH7751_CS3_BASE_ADDR, - .size = 0x04000000, - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); -} diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c index 96b916c0d6c..d950b8ab25f 100644 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ b/arch/sh/drivers/pci/ops-rts7751r2d.c @@ -29,37 +29,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { return rts7751r2d_irq_tab[slot]; } - -static struct resource sh7751_io_resource = { - .name = "SH7751_IO", - .start = 0x4000, - .end = SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751_mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops sh7751_pci_ops; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - -static struct sh4_pci_address_map sh7751_pci_map = { - .window0 = { - .base = SH7751_CS3_BASE_ADDR, - .size = 0x04000000, - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); -} - diff --git a/arch/sh/drivers/pci/ops-sh03.c b/arch/sh/drivers/pci/ops-sh03.c deleted file mode 100644 index 0218135f0bb..00000000000 --- a/arch/sh/drivers/pci/ops-sh03.c +++ /dev/null @@ -1,45 +0,0 @@ -/* - * linux/arch/sh/drivers/pci/ops-sh03.c - * - * PCI initialization for the Interface CTP/PCI-SH03 board - */ - -#include -#include -#include -#include -#include -#include -#include "pci-sh7751.h" - -/* - * Description: This function sets up and initializes the pcic, sets - * up the BARS, maps the DRAM into the address space etc, etc. - */ -int __init pcibios_init_platform(void) -{ - __set_io_port_base(SH7751_PCI_IO_BASE); - return 1; -} - -static struct resource sh7751_io_resource = { - .name = "SH03 IO", - .start = SH7751_PCI_IO_BASE, - .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH03 mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops sh4_pci_ops; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/ops-snapgear.c index b64f2b91be8..5a39ecc1adb 100644 --- a/arch/sh/drivers/pci/ops-snapgear.c +++ b/arch/sh/drivers/pci/ops-snapgear.c @@ -18,55 +18,6 @@ #include #include "pci-sh4.h" -#define SNAPGEAR_PCI_IO 0x4000 -#define SNAPGEAR_PCI_MEM 0xfd000000 - -/* PCI: default LOCAL memory window sizes (seen from PCI bus) */ -#define SNAPGEAR_LSR0_SIZE (64*(1<<20)) //64MB -#define SNAPGEAR_LSR1_SIZE (64*(1<<20)) //64MB - -static struct resource sh7751_io_resource = { - .name = "SH7751 IO", - .start = SNAPGEAR_PCI_IO, - .end = SNAPGEAR_PCI_IO + (64*1024) - 1, /* 64KiB I/O */ - .flags = IORESOURCE_IO, -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751 mem", - .start = SNAPGEAR_PCI_MEM, - .end = SNAPGEAR_PCI_MEM + (64*1024*1024) - 1, /* 64MiB mem */ - .flags = IORESOURCE_MEM, -}; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { 0, } -}; - -static struct sh4_pci_address_map sh7751_pci_map = { - .window0 = { - .base = SH7751_CS2_BASE_ADDR, - .size = SNAPGEAR_LSR0_SIZE, - }, - - .window1 = { - .base = SH7751_CS2_BASE_ADDR, - .size = SNAPGEAR_LSR1_SIZE, - }, -}; - -/* - * Initialize the SnapGear PCI interface - * Setup hardware to be Central Funtion - * Copy the BSR regs to the PCI interface - * Setup PCI windows into local RAM - */ -int __init pcibios_init_platform(void) -{ - return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); -} - int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) { int irq = -1; diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/ops-titan.c index e45bb62bf8c..3a79fa8254a 100644 --- a/arch/sh/drivers/pci/ops-titan.c +++ b/arch/sh/drivers/pci/ops-titan.c @@ -36,39 +36,3 @@ int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) return irq; } - -static struct resource sh7751_io_resource = { - .name = "SH7751_IO", - .start = SH7751_PCI_IO_BASE, - .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751_mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -struct pci_channel board_pci_channels[] = { - { sh7751_pci_init, &sh4_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - -static struct sh4_pci_address_map sh7751_pci_map = { - .window0 = { - .base = SH7751_CS2_BASE_ADDR, - .size = SH7751_MEM_REGION_SIZE*2, /* cs2 and cs3 */ - }, - - .window1 = { - .base = SH7751_CS2_BASE_ADDR, - .size = SH7751_MEM_REGION_SIZE*2, - }, -}; - -int __init pcibios_init_platform(void) -{ - return sh7751_pcic_init(&board_pci_channels[0], &sh7751_pci_map); -} diff --git a/arch/sh/drivers/pci/pci-sh7751.c b/arch/sh/drivers/pci/pci-sh7751.c index 4c08fd7f665..c4fa0bb1397 100644 --- a/arch/sh/drivers/pci/pci-sh7751.c +++ b/arch/sh/drivers/pci/pci-sh7751.c @@ -1,43 +1,86 @@ /* - * Low-Level PCI Support for the SH7751 + * Low-Level PCI Support for the SH7751 * - * Dustin McIntire (dustin@sensoria.com) - * Derived from arch/i386/kernel/pci-*.c which bore the message: - * (c) 1999--2000 Martin Mares + * Copyright (C) 2003 - 2009 Paul Mundt + * Copyright (C) 2001 Dustin McIntire * - * Ported to the new API by Paul Mundt - * With cleanup by Paul van Gool - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. + * With cleanup by Paul van Gool , 2003. * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. */ -#undef DEBUG - #include #include #include #include -#include +#include #include "pci-sh4.h" #include -#include -/* - * Initialization. Try all known PCI access methods. Note that we support - * using both PCI BIOS and direct access: in such cases, we use I/O ports - * to access config space. - * - * Note that the platform specific initialization (BSC registers, and memory - * space mapping) will be called via the platform defined function - * pcibios_init_platform(). - */ -int __init sh7751_pci_init(struct pci_channel *chan) +static int __init __area_sdram_check(struct pci_channel *chan, + unsigned int area) { + unsigned long word; + + word = __raw_readl(SH7751_BCR1); + /* check BCR for SDRAM in area */ + if (((word >> area) & 1) == 0) { + printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%lx\n", + area, word); + return 0; + } + pci_write_reg(chan, word, SH4_PCIBCR1); + + word = __raw_readw(SH7751_BCR2); + /* check BCR2 for 32bit SDRAM interface*/ + if (((word >> (area << 1)) & 0x3) != 0x3) { + printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%lx\n", + area, word); + return 0; + } + pci_write_reg(chan, word, SH4_PCIBCR2); + + return 1; +} + +static struct resource sh7751_io_resource = { + .name = "SH7751_IO", + .start = SH7751_PCI_IO_BASE, + .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, + .flags = IORESOURCE_IO +}; + +static struct resource sh7751_mem_resource = { + .name = "SH7785_mem", + .start = SH7751_PCI_MEMORY_BASE, + .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, + .flags = IORESOURCE_MEM +}; + +static struct pci_channel sh7751_pci_controller = { + .pci_ops = &sh4_pci_ops, + .mem_resource = &sh7751_mem_resource, + .mem_offset = 0x00000000, + .io_resource = &sh7751_io_resource, + .io_offset = 0x00000000, +}; + +static struct sh4_pci_address_map sh7751_pci_map = { + .window0 = { + .base = SH7751_CS3_BASE_ADDR, + .size = 0x04000000, + }, +}; + +static int __init sh7751_pci_init(void) +{ + struct pci_channel *chan = &sh7751_pci_controller; unsigned int id; + u32 word, reg; int ret; - pr_debug("PCI: Starting intialization.\n"); + printk(KERN_NOTICE "PCI: Starting intialization.\n"); chan->reg_base = 0xfe200000; @@ -52,41 +95,6 @@ int __init sh7751_pci_init(struct pci_channel *chan) if ((ret = sh4_pci_check_direct(chan)) != 0) return ret; - return pcibios_init_platform(); -} - -static int __init __area_sdram_check(struct pci_channel *chan, - unsigned int area) -{ - u32 word; - - word = ctrl_inl(SH7751_BCR1); - /* check BCR for SDRAM in area */ - if (((word >> area) & 1) == 0) { - printk("PCI: Area %d is not configured for SDRAM. BCR1=0x%x\n", - area, word); - return 0; - } - pci_write_reg(chan, word, SH4_PCIBCR1); - - word = (u16)ctrl_inw(SH7751_BCR2); - /* check BCR2 for 32bit SDRAM interface*/ - if (((word >> (area << 1)) & 0x3) != 0x3) { - printk("PCI: Area %d is not 32 bit SDRAM. BCR2=0x%x\n", - area, word); - return 0; - } - pci_write_reg(chan, word, SH4_PCIBCR2); - - return 1; -} - -int __init sh7751_pcic_init(struct pci_channel *chan, - struct sh4_pci_address_map *map) -{ - u32 reg; - u32 word; - /* Set the BCR's to enable PCI access */ reg = ctrl_inl(SH7751_BCR1); reg |= 0x80000; @@ -112,21 +120,13 @@ int __init sh7751_pcic_init(struct pci_channel *chan, /* Set IO and Mem windows to local address * Make PCI and local address the same for easy 1 to 1 mapping - * Window0 = map->window0.size @ non-cached area base = SDRAM - * Window1 = map->window1.size @ cached area base = SDRAM */ - word = map->window0.size - 1; + word = sh7751_pci_map.window0.size - 1; pci_write_reg(chan, word, SH4_PCILSR0); - word = map->window1.size - 1; - pci_write_reg(chan, word, SH4_PCILSR1); /* Set the values on window 0 PCI config registers */ - word = P2SEGADDR(map->window0.base); + word = P2SEGADDR(sh7751_pci_map.window0.base); pci_write_reg(chan, word, SH4_PCILAR0); pci_write_reg(chan, word, SH7751_PCICONF5); - /* Set the values on window 1 PCI config registers */ - word = PHYSADDR(map->window1.base); - pci_write_reg(chan, word, SH4_PCILAR1); - pci_write_reg(chan, word, SH7751_PCICONF6); /* Set the local 16MB PCI memory space window to * the lowest PCI mapped address @@ -144,7 +144,7 @@ int __init sh7751_pcic_init(struct pci_channel *chan, /* Set PCI WCRx, BCRx's, copy from BSC locations */ /* check BCR for SDRAM in specified area */ - switch (map->window0.base) { + switch (sh7751_pci_map.window0.base) { case SH7751_CS0_BASE_ADDR: word = __area_sdram_check(chan, 0); break; case SH7751_CS1_BASE_ADDR: word = __area_sdram_check(chan, 1); break; case SH7751_CS2_BASE_ADDR: word = __area_sdram_check(chan, 2); break; @@ -179,5 +179,10 @@ int __init sh7751_pcic_init(struct pci_channel *chan, word = SH4_PCICR_PREFIX | SH4_PCICR_CFIN | SH4_PCICR_ARBM; pci_write_reg(chan, word, SH4_PCICR); + __set_io_port_base(SH7751_PCI_IO_BASE); + + register_pci_controller(chan); + return 0; } +arch_initcall(sh7751_pci_init); diff --git a/arch/sh/drivers/pci/pci-sh7751.h b/arch/sh/drivers/pci/pci-sh7751.h index c390dd2f5e1..4983a4d2035 100644 --- a/arch/sh/drivers/pci/pci-sh7751.h +++ b/arch/sh/drivers/pci/pci-sh7751.h @@ -57,7 +57,7 @@ #define SH7751_PCICONF2_SCC 0x00FF0000 /* Sub-Class Code */ #define SH7751_PCICONF2_RLPI 0x0000FF00 /* Programming Interface */ #define SH7751_PCICONF2_REV 0x000000FF /* Revision ID */ -#define SH7751_PCICONF3 0xC /* PCI Config Reg 3 */ +#define SH7751_PCICONF3 0xC /* PCI Config Reg 3 */ #define SH7751_PCICONF3_BIST7 0x80000000 /* Bist Supported */ #define SH7751_PCICONF3_BIST6 0x40000000 /* Bist Executing */ #define SH7751_PCICONF3_BIST3_0 0x0F000000 /* Bist Passed */ @@ -72,12 +72,12 @@ #define SH7751_PCICONF5_BASE 0xFFFFFFF0 /* Mem Space Base Addr */ #define SH7751_PCICONF5_LAP 0x00000008 /* Prefetch Enabled */ #define SH7751_PCICONF5_LAT 0x00000006 /* Local Memory type */ - #define SH7751_PCICONF5_ASI 0x00000001 /* Address Space Type */ + #define SH7751_PCICONF5_ASI 0x00000001 /* Address Space Type */ #define SH7751_PCICONF6 0x18 /* PCI Config Reg 6 */ #define SH7751_PCICONF6_BASE 0xFFFFFFF0 /* Mem Space Base Addr */ #define SH7751_PCICONF6_LAP 0x00000008 /* Prefetch Enabled */ #define SH7751_PCICONF6_LAT 0x00000006 /* Local Memory type */ - #define SH7751_PCICONF6_ASI 0x00000001 /* Address Space Type */ + #define SH7751_PCICONF6_ASI 0x00000001 /* Address Space Type */ /* PCICONF7 - PCICONF10 are undefined */ #define SH7751_PCICONF11 0x2C /* PCI Config Reg 11 */ #define SH7751_PCICONF11_SSID 0xFFFF0000 /* Subsystem ID */ @@ -126,11 +126,4 @@ #define SH7751_CS5_BASE_ADDR (SH7751_CS4_BASE_ADDR + SH7751_MEM_REGION_SIZE) #define SH7751_CS6_BASE_ADDR (SH7751_CS5_BASE_ADDR + SH7751_MEM_REGION_SIZE) -struct sh4_pci_address_map; - -/* arch/sh/drivers/pci/pci-sh7751.c */ -int sh7751_pci_init(struct pci_channel *chan); -int sh7751_pcic_init(struct pci_channel *chan, - struct sh4_pci_address_map *map); - #endif /* _PCI_SH7751_H_ */ From 84972ec0c206b73ffb74e5114b574186ce6166b8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:17:10 +0900 Subject: [PATCH 0500/2061] sh: pci: Rename SH7751 platform ops files to fixups. None of these contain pci_ops, only IRQ routing bits, rename them accordingly. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 8 ++++---- arch/sh/drivers/pci/{ops-landisk.c => fixups-landisk.c} | 0 arch/sh/drivers/pci/{ops-lboxre2.c => fixups-lboxre2.c} | 0 arch/sh/drivers/pci/{ops-snapgear.c => fixups-snapgear.c} | 0 arch/sh/drivers/pci/{ops-titan.c => fixups-titan.c} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename arch/sh/drivers/pci/{ops-landisk.c => fixups-landisk.c} (100%) rename arch/sh/drivers/pci/{ops-lboxre2.c => fixups-lboxre2.c} (100%) rename arch/sh/drivers/pci/{ops-snapgear.c => fixups-snapgear.c} (100%) rename arch/sh/drivers/pci/{ops-titan.c => fixups-titan.c} (100%) diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index b8667de19ec..abd931c1496 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -14,14 +14,14 @@ obj-$(CONFIG_CPU_SH5) += pci-sh5.o ops-sh5.o obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ pci-dreamcast.o -obj-$(CONFIG_SH_SECUREEDGE5410) += ops-snapgear.o +obj-$(CONFIG_SH_SECUREEDGE5410) += fixups-snapgear.o obj-$(CONFIG_SH_RTS7751R2D) += ops-rts7751r2d.o fixups-rts7751r2d.o obj-$(CONFIG_SH_SH03) += fixups-sh03.o obj-$(CONFIG_SH_HIGHLANDER) += fixups-r7780rp.o obj-$(CONFIG_SH_SH7785LCR) += fixups-r7780rp.o obj-$(CONFIG_SH_SDK7780) += fixups-sdk7780.o obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += fixups-sdk7780.o -obj-$(CONFIG_SH_TITAN) += ops-titan.o -obj-$(CONFIG_SH_LANDISK) += ops-landisk.o -obj-$(CONFIG_SH_LBOX_RE2) += ops-lboxre2.o fixups-rts7751r2d.o +obj-$(CONFIG_SH_TITAN) += fixups-titan.o +obj-$(CONFIG_SH_LANDISK) += fixups-landisk.o +obj-$(CONFIG_SH_LBOX_RE2) += fixups-lboxre2.o fixups-rts7751r2d.o obj-$(CONFIG_SH_CAYMAN) += fixups-cayman.o diff --git a/arch/sh/drivers/pci/ops-landisk.c b/arch/sh/drivers/pci/fixups-landisk.c similarity index 100% rename from arch/sh/drivers/pci/ops-landisk.c rename to arch/sh/drivers/pci/fixups-landisk.c diff --git a/arch/sh/drivers/pci/ops-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c similarity index 100% rename from arch/sh/drivers/pci/ops-lboxre2.c rename to arch/sh/drivers/pci/fixups-lboxre2.c diff --git a/arch/sh/drivers/pci/ops-snapgear.c b/arch/sh/drivers/pci/fixups-snapgear.c similarity index 100% rename from arch/sh/drivers/pci/ops-snapgear.c rename to arch/sh/drivers/pci/fixups-snapgear.c diff --git a/arch/sh/drivers/pci/ops-titan.c b/arch/sh/drivers/pci/fixups-titan.c similarity index 100% rename from arch/sh/drivers/pci/ops-titan.c rename to arch/sh/drivers/pci/fixups-titan.c From 37c8ac361efdbae969eff1a603bc39412d3e873f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:27:15 +0900 Subject: [PATCH 0501/2061] sh: pci: Consolidate lboxre2 and r2d IRQ fixups. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 4 ++-- arch/sh/drivers/pci/fixups-lboxre2.c | 23 ------------------ arch/sh/drivers/pci/fixups-rts7751r2d.c | 25 +++++++++++++++++++- arch/sh/drivers/pci/ops-rts7751r2d.c | 31 ------------------------- 4 files changed, 26 insertions(+), 57 deletions(-) delete mode 100644 arch/sh/drivers/pci/fixups-lboxre2.c delete mode 100644 arch/sh/drivers/pci/ops-rts7751r2d.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index abd931c1496..fbebd73b22f 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -15,7 +15,7 @@ obj-$(CONFIG_CPU_SH5) += pci-sh5.o ops-sh5.o obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ pci-dreamcast.o obj-$(CONFIG_SH_SECUREEDGE5410) += fixups-snapgear.o -obj-$(CONFIG_SH_RTS7751R2D) += ops-rts7751r2d.o fixups-rts7751r2d.o +obj-$(CONFIG_SH_RTS7751R2D) += fixups-rts7751r2d.o obj-$(CONFIG_SH_SH03) += fixups-sh03.o obj-$(CONFIG_SH_HIGHLANDER) += fixups-r7780rp.o obj-$(CONFIG_SH_SH7785LCR) += fixups-r7780rp.o @@ -23,5 +23,5 @@ obj-$(CONFIG_SH_SDK7780) += fixups-sdk7780.o obj-$(CONFIG_SH_7780_SOLUTION_ENGINE) += fixups-sdk7780.o obj-$(CONFIG_SH_TITAN) += fixups-titan.o obj-$(CONFIG_SH_LANDISK) += fixups-landisk.o -obj-$(CONFIG_SH_LBOX_RE2) += fixups-lboxre2.o fixups-rts7751r2d.o +obj-$(CONFIG_SH_LBOX_RE2) += fixups-rts7751r2d.o obj-$(CONFIG_SH_CAYMAN) += fixups-cayman.o diff --git a/arch/sh/drivers/pci/fixups-lboxre2.c b/arch/sh/drivers/pci/fixups-lboxre2.c deleted file mode 100644 index 6db2c209737..00000000000 --- a/arch/sh/drivers/pci/fixups-lboxre2.c +++ /dev/null @@ -1,23 +0,0 @@ -/* - * linux/arch/sh/drivers/pci/ops-lboxre2.c - * - * Copyright (C) 2007 Nobuhiro Iwamatsu - * - * PCI initialization for the NTT COMWARE L-BOX RE2 - */ -#include -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -static char lboxre2_irq_tab[] __initdata = { - IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return lboxre2_irq_tab[slot]; -} diff --git a/arch/sh/drivers/pci/fixups-rts7751r2d.c b/arch/sh/drivers/pci/fixups-rts7751r2d.c index 38852334d47..052b354236d 100644 --- a/arch/sh/drivers/pci/fixups-rts7751r2d.c +++ b/arch/sh/drivers/pci/fixups-rts7751r2d.c @@ -1,21 +1,44 @@ /* * arch/sh/drivers/pci/fixups-rts7751r2d.c * - * RTS7751R2D PCI fixups + * RTS7751R2D / LBOXRE2 PCI fixups * * Copyright (C) 2003 Lineo uSolutions, Inc. * Copyright (C) 2004 Paul Mundt + * Copyright (C) 2007 Nobuhiro Iwamatsu * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. */ #include +#include +#include #include "pci-sh4.h" +#include #define PCIMCR_MRSET_OFF 0xBFFFFFFF #define PCIMCR_RFSH_OFF 0xFFFFFFFB +static u8 rts7751r2d_irq_tab[] __initdata = { + IRQ_PCI_INTA, + IRQ_PCI_INTB, + IRQ_PCI_INTC, + IRQ_PCI_INTD, +}; + +static char lboxre2_irq_tab[] __initdata = { + IRQ_ETH0, IRQ_ETH1, IRQ_INTA, IRQ_INTD, +}; + +int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) +{ + if (mach_is_lboxre2()) + return lboxre2_irq_tab[slot]; + else + return rts7751r2d_irq_tab[slot]; +} + int pci_fixup_pcic(struct pci_channel *chan) { unsigned long bcr1, mcr; diff --git a/arch/sh/drivers/pci/ops-rts7751r2d.c b/arch/sh/drivers/pci/ops-rts7751r2d.c deleted file mode 100644 index d950b8ab25f..00000000000 --- a/arch/sh/drivers/pci/ops-rts7751r2d.c +++ /dev/null @@ -1,31 +0,0 @@ -/* - * linux/arch/sh/drivers/pci/ops-rts7751r2d.c - * - * Author: Ian DaSilva (idasilva@mvista.com) - * - * Highly leveraged from pci-bigsur.c, written by Dustin McIntire. - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * PCI initialization for the Renesas SH7751R RTS7751R2D board - */ -#include -#include -#include -#include -#include -#include -#include "pci-sh4.h" - -static u8 rts7751r2d_irq_tab[] __initdata = { - IRQ_PCI_INTA, - IRQ_PCI_INTB, - IRQ_PCI_INTC, - IRQ_PCI_INTD, -}; - -int __init pcibios_map_platform_irq(struct pci_dev *pdev, u8 slot, u8 pin) -{ - return rts7751r2d_irq_tab[slot]; -} From 951a681bda844491de8699b3bdc6c3899cbd4c9f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:34:36 +0900 Subject: [PATCH 0502/2061] sh: pci: Convert dreamcast to new-style interface. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 5 +---- arch/sh/drivers/pci/fixups-dreamcast.c | 2 +- arch/sh/drivers/pci/pci-dreamcast.c | 27 ++++++++++++-------------- 3 files changed, 14 insertions(+), 20 deletions(-) diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index 5aaee3c707b..1d53496b149 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -20,11 +20,8 @@ config SH_PCIDMA_NONCOHERENT # Temporary config option for transitioning off of PCI_AUTO config PCI_NEW - bool + def_bool y depends on PCI - default y if CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7780 || \ - CPU_SUBTYPE_SH7785 || CPU_SH5 || \ - CPU_SUBTYPE_SH7751 || CPU_SUBTYPE_SH7751R # This is also board-specific config PCI_AUTO diff --git a/arch/sh/drivers/pci/fixups-dreamcast.c b/arch/sh/drivers/pci/fixups-dreamcast.c index 48c6381fffa..ed7f489936f 100644 --- a/arch/sh/drivers/pci/fixups-dreamcast.c +++ b/arch/sh/drivers/pci/fixups-dreamcast.c @@ -30,7 +30,7 @@ static void __init gapspci_fixup_resources(struct pci_dev *dev) { - struct pci_channel *p = board_pci_channels; + struct pci_channel *p = dev->sysdata; printk(KERN_NOTICE "PCI: Fixing up device %s\n", pci_name(dev)); diff --git a/arch/sh/drivers/pci/pci-dreamcast.c b/arch/sh/drivers/pci/pci-dreamcast.c index 0897be5053d..210f9d4af14 100644 --- a/arch/sh/drivers/pci/pci-dreamcast.c +++ b/arch/sh/drivers/pci/pci-dreamcast.c @@ -21,7 +21,6 @@ #include #include #include - #include #include #include @@ -40,11 +39,19 @@ static struct resource gapspci_mem_resource = { .flags = IORESOURCE_MEM, }; +static struct pci_channel dreamcast_pci_controller = { + .pci_ops = &gapspci_pci_ops, + .io_resource = &gapspci_io_resource, + .io_offset = 0x00000000, + .mem_resource = &gapspci_mem_resource, + .mem_offset = 0x00000000, +}; + /* * gapspci init */ -static int __init gapspci_init(struct pci_channel *chan) +static int __init gapspci_init(void) { char idbuf[16]; int i; @@ -88,18 +95,8 @@ static int __init gapspci_init(struct pci_channel *chan) outl(0x00002001, GAPSPCI_BBA_CONFIG+0x10); outl(0x01000000, GAPSPCI_BBA_CONFIG+0x14); + register_pci_controller(&dreamcast_pci_controller); + return 0; } - -struct pci_channel board_pci_channels[] = { - { - .init = gapspci_init, - .pci_ops = &gapspci_pci_ops, - .io_resource = &gapspci_io_resource, - .mem_resource = &gapspci_mem_resource, - .first_devfn = 0, - .last_devfn = 1, - }, { - .init = NULL, - } -}; +arch_initcall(gapspci_init); From 2d5efc190eb415dbff79ffab4f8ea731ab0288a9 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:42:59 +0900 Subject: [PATCH 0503/2061] sh: pci: Move the se7751 fixups in to arch/sh/drivers/pci/. The se7751 was still doing the PCI fixups in its own board directory, so we move it over to arch/sh/drivers/pci/ with the rest of the board fixups. It has bitrotted significantly over the years, so will still likely need a bit of work to bring back up to date. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-se/7751/Makefile | 2 - arch/sh/boards/mach-se/7751/pci.c | 150 --------------------------- arch/sh/drivers/pci/Makefile | 1 + arch/sh/drivers/pci/fixups-se7751.c | 111 ++++++++++++++++++++ 4 files changed, 112 insertions(+), 152 deletions(-) delete mode 100644 arch/sh/boards/mach-se/7751/pci.c create mode 100644 arch/sh/drivers/pci/fixups-se7751.c diff --git a/arch/sh/boards/mach-se/7751/Makefile b/arch/sh/boards/mach-se/7751/Makefile index dbc29f3a9de..e6f4341bfe6 100644 --- a/arch/sh/boards/mach-se/7751/Makefile +++ b/arch/sh/boards/mach-se/7751/Makefile @@ -3,5 +3,3 @@ # obj-y := setup.o io.o irq.o - -obj-$(CONFIG_PCI) += pci.o diff --git a/arch/sh/boards/mach-se/7751/pci.c b/arch/sh/boards/mach-se/7751/pci.c deleted file mode 100644 index 9ec64a416b3..00000000000 --- a/arch/sh/boards/mach-se/7751/pci.c +++ /dev/null @@ -1,150 +0,0 @@ -/* - * linux/arch/sh/boards/se/7751/pci.c - * - * Author: Ian DaSilva (idasilva@mvista.com) - * - * Highly leveraged from pci-bigsur.c, written by Dustin McIntire. - * - * May be copied or modified under the terms of the GNU General Public - * License. See linux/COPYING for more information. - * - * PCI initialization for the Hitachi SH7751 Solution Engine board (MS7751SE01) - */ - -#include -#include -#include -#include -#include - -#include -#include "../../../drivers/pci/pci-sh7751.h" - -#define PCIMCR_MRSET_OFF 0xBFFFFFFF -#define PCIMCR_RFSH_OFF 0xFFFFFFFB - -/* - * Only long word accesses of the PCIC's internal local registers and the - * configuration registers from the CPU is supported. - */ -#define PCIC_WRITE(x,v) writel((v), PCI_REG(x)) -#define PCIC_READ(x) readl(PCI_REG(x)) - -#define xPCIBIOS_MIN_IO board_pci_channels->io_resource->start -#define xPCIBIOS_MIN_MEM board_pci_channels->mem_resource->start - -/* - * Description: This function sets up and initializes the pcic, sets - * up the BARS, maps the DRAM into the address space etc, etc. - */ -int __init pcibios_init_platform(void) -{ - unsigned long bcr1, wcr1, wcr2, wcr3, mcr; - unsigned short bcr2; - - /* - * Initialize the slave bus controller on the pcic. The values used - * here should not be hardcoded, but they should be taken from the bsc - * on the processor, to make this function as generic as possible. - * (i.e. Another sbc may usr different SDRAM timing settings -- in order - * for the pcic to work, its settings need to be exactly the same.) - */ - bcr1 = (*(volatile unsigned long*)(SH7751_BCR1)); - bcr2 = (*(volatile unsigned short*)(SH7751_BCR2)); - wcr1 = (*(volatile unsigned long*)(SH7751_WCR1)); - wcr2 = (*(volatile unsigned long*)(SH7751_WCR2)); - wcr3 = (*(volatile unsigned long*)(SH7751_WCR3)); - mcr = (*(volatile unsigned long*)(SH7751_MCR)); - - bcr1 = bcr1 | 0x00080000; /* Enable Bit 19, BREQEN */ - (*(volatile unsigned long*)(SH7751_BCR1)) = bcr1; - - bcr1 = bcr1 | 0x40080000; /* Enable Bit 19 BREQEN, set PCIC to slave */ - PCIC_WRITE(SH7751_PCIBCR1, bcr1); /* PCIC BCR1 */ - PCIC_WRITE(SH7751_PCIBCR2, bcr2); /* PCIC BCR2 */ - PCIC_WRITE(SH7751_PCIWCR1, wcr1); /* PCIC WCR1 */ - PCIC_WRITE(SH7751_PCIWCR2, wcr2); /* PCIC WCR2 */ - PCIC_WRITE(SH7751_PCIWCR3, wcr3); /* PCIC WCR3 */ - mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF; - PCIC_WRITE(SH7751_PCIMCR, mcr); /* PCIC MCR */ - - - /* Enable all interrupts, so we know what to fix */ - PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff); - PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f); - - /* Set up standard PCI config registers */ - PCIC_WRITE(SH7751_PCICONF1, 0xF39000C7); /* Bus Master, Mem & I/O access */ - PCIC_WRITE(SH7751_PCICONF2, 0x00000000); /* PCI Class code & Revision ID */ - PCIC_WRITE(SH7751_PCICONF4, 0xab000001); /* PCI I/O address (local regs) */ - PCIC_WRITE(SH7751_PCICONF5, 0x0c000000); /* PCI MEM address (local RAM) */ - PCIC_WRITE(SH7751_PCICONF6, 0xd0000000); /* PCI MEM address (unused) */ - PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */ - PCIC_WRITE(SH7751_PCILSR0, 0x03f00000); /* MEM (full 64M exposed) */ - PCIC_WRITE(SH7751_PCILSR1, 0x00000000); /* MEM (unused) */ - PCIC_WRITE(SH7751_PCILAR0, 0x0c000000); /* MEM (direct map from PCI) */ - PCIC_WRITE(SH7751_PCILAR1, 0x00000000); /* MEM (unused) */ - - /* Now turn it on... */ - PCIC_WRITE(SH7751_PCICR, 0xa5000001); - - /* - * Set PCIMBR and PCIIOBR here, assuming a single window - * (16M MEM, 256K IO) is enough. If a larger space is - * needed, the readx/writex and inx/outx functions will - * have to do more (e.g. setting registers for each call). - */ - - /* - * Set the MBR so PCI address is one-to-one with window, - * meaning all calls go straight through... use BUG_ON to - * catch erroneous assumption. - */ - BUG_ON(xPCIBIOS_MIN_MEM != SH7751_PCI_MEMORY_BASE); - - PCIC_WRITE(SH7751_PCIMBR, xPCIBIOS_MIN_MEM); - - /* Set IOBR for window containing area specified in pci.h */ - PCIC_WRITE(SH7751_PCIIOBR, (xPCIBIOS_MIN_IO & SH7751_PCIIOBR_MASK)); - - /* All done, may as well say so... */ - printk("SH7751 PCI: Finished initialization of the PCI controller\n"); - - return 1; -} - -int __init pcibios_map_platform_irq(u8 slot, u8 pin) -{ - switch (slot) { - case 0: return 13; - case 1: return 13; /* AMD Ethernet controller */ - case 2: return -1; - case 3: return -1; - case 4: return -1; - default: - printk("PCI: Bad IRQ mapping request for slot %d\n", slot); - return -1; - } -} - -static struct resource sh7751_io_resource = { - .name = "SH7751 IO", - .start = SH7751_PCI_IO_BASE, - .end = SH7751_PCI_IO_BASE + SH7751_PCI_IO_SIZE - 1, - .flags = IORESOURCE_IO -}; - -static struct resource sh7751_mem_resource = { - .name = "SH7751 mem", - .start = SH7751_PCI_MEMORY_BASE, - .end = SH7751_PCI_MEMORY_BASE + SH7751_PCI_MEM_SIZE - 1, - .flags = IORESOURCE_MEM -}; - -extern struct pci_ops sh7751_pci_ops; - -struct pci_channel board_pci_channels[] = { - { &sh7751_pci_ops, &sh7751_io_resource, &sh7751_mem_resource, 0, 0xff }, - { NULL, NULL, NULL, 0, 0 }, -}; - diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index fbebd73b22f..e388a70d146 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_CPU_SH5) += pci-sh5.o ops-sh5.o obj-$(CONFIG_SH_DREAMCAST) += ops-dreamcast.o fixups-dreamcast.o \ pci-dreamcast.o obj-$(CONFIG_SH_SECUREEDGE5410) += fixups-snapgear.o +obj-$(CONFIG_SH_7751_SOLUTION_ENGINE) += fixups-se7751.o obj-$(CONFIG_SH_RTS7751R2D) += fixups-rts7751r2d.o obj-$(CONFIG_SH_SH03) += fixups-sh03.o obj-$(CONFIG_SH_HIGHLANDER) += fixups-r7780rp.o diff --git a/arch/sh/drivers/pci/fixups-se7751.c b/arch/sh/drivers/pci/fixups-se7751.c new file mode 100644 index 00000000000..475fa9f0fe2 --- /dev/null +++ b/arch/sh/drivers/pci/fixups-se7751.c @@ -0,0 +1,111 @@ +#include +#include +#include +#include +#include +#include +#include "pci-sh4.h" + +int __init pcibios_map_platform_irq(u8 slot, u8 pin) +{ + switch (slot) { + case 0: return 13; + case 1: return 13; /* AMD Ethernet controller */ + case 2: return -1; + case 3: return -1; + case 4: return -1; + default: + printk("PCI: Bad IRQ mapping request for slot %d\n", slot); + return -1; + } +} + +#define PCIMCR_MRSET_OFF 0xBFFFFFFF +#define PCIMCR_RFSH_OFF 0xFFFFFFFB + +/* + * Only long word accesses of the PCIC's internal local registers and the + * configuration registers from the CPU is supported. + */ +#define PCIC_WRITE(x,v) writel((v), PCI_REG(x)) +#define PCIC_READ(x) readl(PCI_REG(x)) + +/* + * Description: This function sets up and initializes the pcic, sets + * up the BARS, maps the DRAM into the address space etc, etc. + */ +int pci_fixup_pcic(struct pci_channel *chan) +{ + unsigned long bcr1, wcr1, wcr2, wcr3, mcr; + unsigned short bcr2; + + /* + * Initialize the slave bus controller on the pcic. The values used + * here should not be hardcoded, but they should be taken from the bsc + * on the processor, to make this function as generic as possible. + * (i.e. Another sbc may usr different SDRAM timing settings -- in order + * for the pcic to work, its settings need to be exactly the same.) + */ + bcr1 = (*(volatile unsigned long*)(SH7751_BCR1)); + bcr2 = (*(volatile unsigned short*)(SH7751_BCR2)); + wcr1 = (*(volatile unsigned long*)(SH7751_WCR1)); + wcr2 = (*(volatile unsigned long*)(SH7751_WCR2)); + wcr3 = (*(volatile unsigned long*)(SH7751_WCR3)); + mcr = (*(volatile unsigned long*)(SH7751_MCR)); + + bcr1 = bcr1 | 0x00080000; /* Enable Bit 19, BREQEN */ + (*(volatile unsigned long*)(SH7751_BCR1)) = bcr1; + + bcr1 = bcr1 | 0x40080000; /* Enable Bit 19 BREQEN, set PCIC to slave */ + PCIC_WRITE(SH7751_PCIBCR1, bcr1); /* PCIC BCR1 */ + PCIC_WRITE(SH7751_PCIBCR2, bcr2); /* PCIC BCR2 */ + PCIC_WRITE(SH7751_PCIWCR1, wcr1); /* PCIC WCR1 */ + PCIC_WRITE(SH7751_PCIWCR2, wcr2); /* PCIC WCR2 */ + PCIC_WRITE(SH7751_PCIWCR3, wcr3); /* PCIC WCR3 */ + mcr = (mcr & PCIMCR_MRSET_OFF) & PCIMCR_RFSH_OFF; + PCIC_WRITE(SH7751_PCIMCR, mcr); /* PCIC MCR */ + + + /* Enable all interrupts, so we know what to fix */ + PCIC_WRITE(SH7751_PCIINTM, 0x0000c3ff); + PCIC_WRITE(SH7751_PCIAINTM, 0x0000380f); + + /* Set up standard PCI config registers */ + PCIC_WRITE(SH7751_PCICONF1, 0xF39000C7); /* Bus Master, Mem & I/O access */ + PCIC_WRITE(SH7751_PCICONF2, 0x00000000); /* PCI Class code & Revision ID */ + PCIC_WRITE(SH7751_PCICONF4, 0xab000001); /* PCI I/O address (local regs) */ + PCIC_WRITE(SH7751_PCICONF5, 0x0c000000); /* PCI MEM address (local RAM) */ + PCIC_WRITE(SH7751_PCICONF6, 0xd0000000); /* PCI MEM address (unused) */ + PCIC_WRITE(SH7751_PCICONF11, 0x35051054); /* PCI Subsystem ID & Vendor ID */ + PCIC_WRITE(SH7751_PCILSR0, 0x03f00000); /* MEM (full 64M exposed) */ + PCIC_WRITE(SH7751_PCILSR1, 0x00000000); /* MEM (unused) */ + PCIC_WRITE(SH7751_PCILAR0, 0x0c000000); /* MEM (direct map from PCI) */ + PCIC_WRITE(SH7751_PCILAR1, 0x00000000); /* MEM (unused) */ + + /* Now turn it on... */ + PCIC_WRITE(SH7751_PCICR, 0xa5000001); + + /* + * Set PCIMBR and PCIIOBR here, assuming a single window + * (16M MEM, 256K IO) is enough. If a larger space is + * needed, the readx/writex and inx/outx functions will + * have to do more (e.g. setting registers for each call). + */ + + /* + * Set the MBR so PCI address is one-to-one with window, + * meaning all calls go straight through... use BUG_ON to + * catch erroneous assumption. + */ + BUG_ON(chan->mem_resource->start != SH7751_PCI_MEMORY_BASE); + + PCIC_WRITE(SH7751_PCIMBR, chan->mem_resource->start); + + /* Set IOBR for window containing area specified in pci.h */ + PCIC_WRITE(SH7751_PCIIOBR, (chan->io_resource->start & SH7751_PCIIOBR_MASK)); + + /* All done, may as well say so... */ + printk("SH7751 PCI: Finished initialization of the PCI controller\n"); + + return 1; +} From 805fcc88999162b361ef0b0ce25782ef65f147d7 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:46:42 +0900 Subject: [PATCH 0504/2061] sh: pci: Kill off the last remnants of the now unused pci-auto code. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 18 -- arch/sh/drivers/pci/Makefile | 1 - arch/sh/drivers/pci/pci-auto.c | 546 --------------------------------- arch/sh/drivers/pci/pci.c | 75 ----- arch/sh/include/asm/pci.h | 16 - 5 files changed, 656 deletions(-) delete mode 100644 arch/sh/drivers/pci/pci-auto.c delete mode 100644 arch/sh/drivers/pci/pci.c diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index 1d53496b149..ea903a984f0 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -18,24 +18,6 @@ config SH_PCIDMA_NONCOHERENT bridge integrated with your SH CPU, refer carefully to the chip specs to see if you can say 'N' here. Otherwise, leave it as 'Y'. -# Temporary config option for transitioning off of PCI_AUTO config PCI_NEW def_bool y depends on PCI - -# This is also board-specific -config PCI_AUTO - bool - depends on PCI && !PCI_NEW - default y - -config PCI_AUTO_UPDATE_RESOURCES - bool - depends on PCI_AUTO - default y if !SH_DREAMCAST - help - Selecting this option will cause the PCI auto code to leave your - BAR values alone. Otherwise they will be updated automatically. If - for some reason, you have a board that simply refuses to work - with its resources updated beyond what they are when the device - is powered up, set this to N. Everyone else will want this as Y. diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index e388a70d146..a8350ccfa9d 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -2,7 +2,6 @@ # Makefile for the PCI specific kernel interface routines under Linux. # obj-y += pci-lib.o -obj-$(CONFIG_PCI_AUTO) += pci.o pci-auto.o obj-$(CONFIG_PCI_NEW) += pci-new.o obj-$(CONFIG_CPU_SUBTYPE_SH7751) += pci-sh7751.o ops-sh4.o diff --git a/arch/sh/drivers/pci/pci-auto.c b/arch/sh/drivers/pci/pci-auto.c deleted file mode 100644 index 1d715ec405b..00000000000 --- a/arch/sh/drivers/pci/pci-auto.c +++ /dev/null @@ -1,546 +0,0 @@ -/* - * PCI autoconfiguration library - * - * Author: Matt Porter - * - * Copyright 2000, 2001 MontaVista Software Inc. - * Copyright 2001 Bradley D. LaRonde - * Copyright 2003 Paul Mundt - * - * This program is free software; you can redistribute it and/or modify it - * under the terms of the GNU General Public License as published by the - * Free Software Foundation; either version 2 of the License, or (at your - * option) any later version. - */ - -/* - * Modified for MIPS by Jun Sun, jsun@mvista.com - * - * . Simplify the interface between pci_auto and the rest: a single function. - * . Assign resources from low address to upper address. - * . change most int to u32. - * - * Further modified to include it as mips generic code, ppopov@mvista.com. - * - * 2001-10-26 Bradley D. LaRonde - * - Add a top_bus argument to the "early config" functions so that - * they can set a fake parent bus pointer to convince the underlying - * pci ops to use type 1 configuration for sub busses. - * - Set bridge base and limit registers correctly. - * - Align io and memory base properly before and after bridge setup. - * - Don't fall through to pci_setup_bars for bridge. - * - Reformat the debug output to look more like lspci's output. - * - * Cloned for SuperH by M. R. Brown, mrbrown@0xd6.org - * - * 2003-08-05 Paul Mundt - * - Don't update the BAR values on systems that already have valid addresses - * and don't want these updated for whatever reason, by way of a new config - * option check. However, we still read in the old BAR values so that they - * can still be reported through the debug output. - */ - -#include -#include -#include -#include - -#define DEBUG -#ifdef DEBUG -#define DBG(x...) printk(x) -#else -#define DBG(x...) -#endif - -/* - * These functions are used early on before PCI scanning is done - * and all of the pci_dev and pci_bus structures have been created. - */ -static struct pci_dev *fake_pci_dev(struct pci_channel *hose, - int top_bus, int busnr, int devfn) -{ - static struct pci_dev dev; - static struct pci_bus bus; - - dev.bus = &bus; - dev.sysdata = hose; - dev.devfn = devfn; - bus.number = busnr; - bus.ops = hose->pci_ops; - bus.sysdata = hose; - - if(busnr != top_bus) - /* Fake a parent bus structure. */ - bus.parent = &bus; - else - bus.parent = NULL; - - return &dev; -} - -#define EARLY_PCI_OP(rw, size, type) \ -static int early_##rw##_config_##size(struct pci_channel *hose, \ - int top_bus, int bus, int devfn, int offset, type value) \ -{ \ - return pci_##rw##_config_##size( \ - fake_pci_dev(hose, top_bus, bus, devfn), \ - offset, value); \ -} - -EARLY_PCI_OP(read, byte, u8 *) -EARLY_PCI_OP(read, word, u16 *) -EARLY_PCI_OP(read, dword, u32 *) -EARLY_PCI_OP(write, byte, u8) -EARLY_PCI_OP(write, word, u16) -EARLY_PCI_OP(write, dword, u32) - -static struct resource *io_resource_inuse; -static struct resource *mem_resource_inuse; - -static u32 pciauto_lower_iospc; -static u32 pciauto_upper_iospc; - -static u32 pciauto_lower_memspc; -static u32 pciauto_upper_memspc; - -static void __init -pciauto_setup_bars(struct pci_channel *hose, - int top_bus, - int current_bus, - int pci_devfn, - int bar_limit) -{ - u32 bar_response, bar_size, bar_value; - u32 bar, addr_mask, bar_nr = 0; - u32 * upper_limit; - u32 * lower_limit; - int found_mem64 = 0; - - for (bar = PCI_BASE_ADDRESS_0; bar <= bar_limit; bar+=4) { - u32 bar_addr; - - /* Read the old BAR value */ - early_read_config_dword(hose, top_bus, - current_bus, - pci_devfn, - bar, - &bar_addr); - - /* Tickle the BAR and get the response */ - early_write_config_dword(hose, top_bus, - current_bus, - pci_devfn, - bar, - 0xffffffff); - - early_read_config_dword(hose, top_bus, - current_bus, - pci_devfn, - bar, - &bar_response); - - /* - * Write the old BAR value back out, only update the BAR - * if we implicitly want resources to be updated, which - * is done by the generic code further down. -- PFM. - */ - early_write_config_dword(hose, top_bus, - current_bus, - pci_devfn, - bar, - bar_addr); - - /* If BAR is not implemented go to the next BAR */ - if (!bar_response) - continue; - - /* - * Workaround for a BAR that doesn't use its upper word, - * like the ALi 1535D+ PCI DC-97 Controller Modem (M5457). - * bdl - */ - if (!(bar_response & 0xffff0000)) - bar_response |= 0xffff0000; - -retry: - /* Check the BAR type and set our address mask */ - if (bar_response & PCI_BASE_ADDRESS_SPACE) { - addr_mask = PCI_BASE_ADDRESS_IO_MASK; - upper_limit = &pciauto_upper_iospc; - lower_limit = &pciauto_lower_iospc; - DBG(" I/O"); - } else { - if ((bar_response & PCI_BASE_ADDRESS_MEM_TYPE_MASK) == - PCI_BASE_ADDRESS_MEM_TYPE_64) - found_mem64 = 1; - - addr_mask = PCI_BASE_ADDRESS_MEM_MASK; - upper_limit = &pciauto_upper_memspc; - lower_limit = &pciauto_lower_memspc; - DBG(" Mem"); - } - - - /* Calculate requested size */ - bar_size = ~(bar_response & addr_mask) + 1; - - /* Allocate a base address */ - bar_value = ((*lower_limit - 1) & ~(bar_size - 1)) + bar_size; - - if ((bar_value + bar_size) > *upper_limit) { - if (bar_response & PCI_BASE_ADDRESS_SPACE) { - if (io_resource_inuse->child) { - io_resource_inuse = - io_resource_inuse->child; - pciauto_lower_iospc = - io_resource_inuse->start; - pciauto_upper_iospc = - io_resource_inuse->end + 1; - goto retry; - } - - } else { - if (mem_resource_inuse->child) { - mem_resource_inuse = - mem_resource_inuse->child; - pciauto_lower_memspc = - mem_resource_inuse->start; - pciauto_upper_memspc = - mem_resource_inuse->end + 1; - goto retry; - } - } - DBG(" unavailable -- skipping, value %x size %x\n", - bar_value, bar_size); - continue; - } - - if (bar_value < *lower_limit || (bar_value + bar_size) >= *upper_limit) { - DBG(" unavailable -- skipping, value %x size %x\n", - bar_value, bar_size); - continue; - } - -#ifdef CONFIG_PCI_AUTO_UPDATE_RESOURCES - /* Write it out and update our limit */ - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - bar, bar_value); -#endif - - *lower_limit = bar_value + bar_size; - - /* - * If we are a 64-bit decoder then increment to the - * upper 32 bits of the bar and force it to locate - * in the lower 4GB of memory. - */ - if (found_mem64) { - bar += 4; - early_write_config_dword(hose, top_bus, - current_bus, - pci_devfn, - bar, - 0x00000000); - } - - DBG(" at 0x%.8x [size=0x%x]\n", bar_value, bar_size); - - bar_nr++; - } - -} - -static void __init -pciauto_prescan_setup_bridge(struct pci_channel *hose, - int top_bus, - int current_bus, - int pci_devfn, - int sub_bus) -{ - /* Configure bus number registers */ - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_PRIMARY_BUS, current_bus); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SECONDARY_BUS, sub_bus + 1); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SUBORDINATE_BUS, 0xff); - - /* Align memory and I/O to 1MB and 4KB boundaries. */ - pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1)) - & ~(0x100000 - 1); - pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1)) - & ~(0x1000 - 1); - - /* Set base (lower limit) of address range behind bridge. */ - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_MEMORY_BASE, pciauto_lower_memspc >> 16); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_IO_BASE, (pciauto_lower_iospc & 0x0000f000) >> 8); - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_IO_BASE_UPPER16, pciauto_lower_iospc >> 16); - - /* We don't support prefetchable memory for now, so disable */ - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_PREF_MEMORY_BASE, 0); - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_PREF_MEMORY_LIMIT, 0); -} - -static void __init -pciauto_postscan_setup_bridge(struct pci_channel *hose, - int top_bus, - int current_bus, - int pci_devfn, - int sub_bus) -{ - u32 temp; - - /* - * [jsun] we always bump up baselines a little, so that if there - * nothing behind P2P bridge, we don't wind up overlapping IO/MEM - * spaces. - */ - pciauto_lower_memspc += 1; - pciauto_lower_iospc += 1; - - /* Configure bus number registers */ - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SUBORDINATE_BUS, sub_bus); - - /* Set upper limit of address range behind bridge. */ - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_MEMORY_LIMIT, pciauto_lower_memspc >> 16); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_IO_LIMIT, (pciauto_lower_iospc & 0x0000f000) >> 8); - early_write_config_word(hose, top_bus, current_bus, pci_devfn, - PCI_IO_LIMIT_UPPER16, pciauto_lower_iospc >> 16); - - /* Align memory and I/O to 1MB and 4KB boundaries. */ - pciauto_lower_memspc = (pciauto_lower_memspc + (0x100000 - 1)) - & ~(0x100000 - 1); - pciauto_lower_iospc = (pciauto_lower_iospc + (0x1000 - 1)) - & ~(0x1000 - 1); - - /* Enable memory and I/O accesses, enable bus master */ - early_read_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, &temp); - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY - | PCI_COMMAND_MASTER); -} - -static void __init -pciauto_prescan_setup_cardbus_bridge(struct pci_channel *hose, - int top_bus, - int current_bus, - int pci_devfn, - int sub_bus) -{ - /* Configure bus number registers */ - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_PRIMARY_BUS, current_bus); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SECONDARY_BUS, sub_bus + 1); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SUBORDINATE_BUS, 0xff); - - /* Align memory and I/O to 4KB and 4 byte boundaries. */ - pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1)) - & ~(0x1000 - 1); - pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1)) - & ~(0x4 - 1); - - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_CB_MEMORY_BASE_0, pciauto_lower_memspc); - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_CB_IO_BASE_0, pciauto_lower_iospc); -} - -static void __init -pciauto_postscan_setup_cardbus_bridge(struct pci_channel *hose, - int top_bus, - int current_bus, - int pci_devfn, - int sub_bus) -{ - u32 temp; - - /* - * [jsun] we always bump up baselines a little, so that if there - * nothing behind P2P bridge, we don't wind up overlapping IO/MEM - * spaces. - */ - pciauto_lower_memspc += 1; - pciauto_lower_iospc += 1; - - /* - * Configure subordinate bus number. The PCI subsystem - * bus scan will renumber buses (reserving three additional - * for this PCI<->CardBus bridge for the case where a CardBus - * adapter contains a P2P or CB2CB bridge. - */ - - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_SUBORDINATE_BUS, sub_bus); - - /* - * Reserve an additional 4MB for mem space and 16KB for - * I/O space. This should cover any additional space - * requirement of unusual CardBus devices with - * additional bridges that can consume more address space. - * - * Although pcmcia-cs currently will reprogram bridge - * windows, the goal is to add an option to leave them - * alone and use the bridge window ranges as the regions - * that are searched for free resources upon hot-insertion - * of a device. This will allow a PCI<->CardBus bridge - * configured by this routine to happily live behind a - * P2P bridge in a system. - */ - /* Align memory and I/O to 4KB and 4 byte boundaries. */ - pciauto_lower_memspc = (pciauto_lower_memspc + (0x1000 - 1)) - & ~(0x1000 - 1); - pciauto_lower_iospc = (pciauto_lower_iospc + (0x4 - 1)) - & ~(0x4 - 1); - /* Set up memory and I/O filter limits, assume 32-bit I/O space */ - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_CB_MEMORY_LIMIT_0, pciauto_lower_memspc - 1); - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_CB_IO_LIMIT_0, pciauto_lower_iospc - 1); - - /* Enable memory and I/O accesses, enable bus master */ - early_read_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, &temp); - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, temp | PCI_COMMAND_IO | PCI_COMMAND_MEMORY | - PCI_COMMAND_MASTER); -} - -#define PCIAUTO_IDE_MODE_MASK 0x05 - -static int __init -pciauto_bus_scan(struct pci_channel *hose, int top_bus, int current_bus) -{ - int sub_bus; - u32 pci_devfn, pci_class, cmdstat, found_multi=0; - unsigned short vid, did; - unsigned char header_type; - int devfn_start = 0; - int devfn_stop = 0xff; - - sub_bus = current_bus; - - if (hose->first_devfn) - devfn_start = hose->first_devfn; - if (hose->last_devfn) - devfn_stop = hose->last_devfn; - - for (pci_devfn=devfn_start; pci_devfn> 16, vid, did); - if (pci_class & 0xff) - DBG(" (rev %.2x)", pci_class & 0xff); - DBG("\n"); - - if ((pci_class >> 16) == PCI_CLASS_BRIDGE_PCI) { - DBG(" Bridge: primary=%.2x, secondary=%.2x\n", - current_bus, sub_bus + 1); - pciauto_prescan_setup_bridge(hose, top_bus, current_bus, - pci_devfn, sub_bus); - DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n", - sub_bus + 1, - pciauto_lower_iospc, pciauto_lower_memspc); - sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1); - DBG("Back to bus %.2x\n", current_bus); - pciauto_postscan_setup_bridge(hose, top_bus, current_bus, - pci_devfn, sub_bus); - continue; - } else if ((pci_class >> 16) == PCI_CLASS_BRIDGE_CARDBUS) { - DBG(" CARDBUS Bridge: primary=%.2x, secondary=%.2x\n", - current_bus, sub_bus + 1); - DBG("PCI Autoconfig: Found CardBus bridge, device %d function %d\n", PCI_SLOT(pci_devfn), PCI_FUNC(pci_devfn)); - /* Place CardBus Socket/ExCA registers */ - pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_0); - - pciauto_prescan_setup_cardbus_bridge(hose, top_bus, - current_bus, pci_devfn, sub_bus); - - DBG("Scanning sub bus %.2x, I/O 0x%.8x, Mem 0x%.8x\n", - sub_bus + 1, - pciauto_lower_iospc, pciauto_lower_memspc); - sub_bus = pciauto_bus_scan(hose, top_bus, sub_bus+1); - DBG("Back to bus %.2x, sub_bus is %x\n", current_bus, sub_bus); - pciauto_postscan_setup_cardbus_bridge(hose, top_bus, - current_bus, pci_devfn, sub_bus); - continue; - } else if ((pci_class >> 16) == PCI_CLASS_STORAGE_IDE) { - - unsigned char prg_iface; - - early_read_config_byte(hose, top_bus, current_bus, - pci_devfn, PCI_CLASS_PROG, &prg_iface); - if (!(prg_iface & PCIAUTO_IDE_MODE_MASK)) { - DBG("Skipping legacy mode IDE controller\n"); - continue; - } - } - - /* - * Found a peripheral, enable some standard - * settings - */ - early_read_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, &cmdstat); - early_write_config_dword(hose, top_bus, current_bus, pci_devfn, - PCI_COMMAND, cmdstat | PCI_COMMAND_IO | - PCI_COMMAND_MEMORY | - PCI_COMMAND_MASTER); - early_write_config_byte(hose, top_bus, current_bus, pci_devfn, - PCI_LATENCY_TIMER, 0x80); - - /* Allocate PCI I/O and/or memory space */ - pciauto_setup_bars(hose, top_bus, current_bus, pci_devfn, PCI_BASE_ADDRESS_5); - } - return sub_bus; -} - -int __init -pciauto_assign_resources(int busno, struct pci_channel *hose) -{ - /* setup resource limits */ - io_resource_inuse = hose->io_resource; - mem_resource_inuse = hose->mem_resource; - - pciauto_lower_iospc = io_resource_inuse->start; - pciauto_upper_iospc = io_resource_inuse->end + 1; - pciauto_lower_memspc = mem_resource_inuse->start; - pciauto_upper_memspc = mem_resource_inuse->end + 1; - DBG("Autoconfig PCI channel 0x%p\n", hose); - DBG("Scanning bus %.2x, I/O 0x%.8x:0x%.8x, Mem 0x%.8x:0x%.8x\n", - busno, pciauto_lower_iospc, pciauto_upper_iospc, - pciauto_lower_memspc, pciauto_upper_memspc); - - return pciauto_bus_scan(hose, busno, busno); -} diff --git a/arch/sh/drivers/pci/pci.c b/arch/sh/drivers/pci/pci.c deleted file mode 100644 index 8c332b2a464..00000000000 --- a/arch/sh/drivers/pci/pci.c +++ /dev/null @@ -1,75 +0,0 @@ -/* - * arch/sh/drivers/pci/pci.c - * - * Copyright (c) 2002 M. R. Brown - * Copyright (c) 2004 - 2006 Paul Mundt - * - * These functions are collected here to reduce duplication of common - * code amongst the many platform-specific PCI support code files. - * - * These routines require the following board-specific routines: - * void pcibios_fixup_irqs(); - * - * See include/asm-sh/pci.h for more information. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include - -static int __init pcibios_init(void) -{ - struct pci_channel *p; - struct pci_bus *bus; - int busno; - - /* init channels */ - busno = 0; - for (p = board_pci_channels; p->init; p++) { - if (p->init(p) == 0) - p->enabled = 1; - else - pr_err("Unable to init pci channel %d\n", busno); - busno++; - } - -#ifdef CONFIG_PCI_AUTO - /* assign resources */ - busno = 0; - for (p = board_pci_channels; p->init; p++) - if (p->enabled) - busno = pciauto_assign_resources(busno, p) + 1; -#endif - - /* scan the buses */ - busno = 0; - for (p = board_pci_channels; p->init; p++) { - if (p->enabled) { - bus = pci_scan_bus(busno, p->pci_ops, p); - busno = bus->subordinate + 1; - } - } - - pci_fixup_irqs(pci_common_swizzle, pcibios_map_platform_irq); - - dma_debug_add_bus(&pci_bus_type); - - return 0; -} -subsys_initcall(pcibios_init); - -/* - * Called after each bus is probed, but before its children - * are examined. - */ -void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) -{ - pci_read_bridge_bases(bus); -} - -EXPORT_SYMBOL(board_pci_channels); diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index f36c7899295..f910121559b 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -19,8 +19,6 @@ struct pci_channel { struct pci_channel *next; - int (*init)(struct pci_channel *chan); - struct pci_ops *pci_ops; struct resource *io_resource; struct resource *mem_resource; @@ -28,20 +26,11 @@ struct pci_channel { unsigned long io_offset; unsigned long mem_offset; - int first_devfn; - int last_devfn; - int enabled; - unsigned long reg_base; unsigned long io_map_base; }; -/* - * Each board initializes this array and terminates it with a NULL entry. - */ -extern struct pci_channel board_pci_channels[]; - extern void register_pci_controller(struct pci_channel *hose); extern unsigned long PCIBIOS_MIN_IO, PCIBIOS_MIN_MEM; @@ -122,13 +111,8 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, #endif /* Board-specific fixup routines. */ -int pcibios_init_platform(void); int pcibios_map_platform_irq(struct pci_dev *dev, u8 slot, u8 pin); -#ifdef CONFIG_PCI_AUTO -int pciauto_assign_resources(int busno, struct pci_channel *hose); -#endif - extern void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, struct resource *res); From 35bcfffd86aac933205fbf8401e3e7e4dde68264 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:51:19 +0900 Subject: [PATCH 0505/2061] sh: pci: Roll pci-lib in to pci-new. Now that the pci-auto cruft is gone, pci-lib can go away. Roll it back in to pci-new.c where it originally split off from. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Makefile | 1 - arch/sh/drivers/pci/pci-lib.c | 219 -------------------------------- arch/sh/drivers/pci/pci-new.c | 226 +++++++++++++++++++++++++++++++++- 3 files changed, 221 insertions(+), 225 deletions(-) delete mode 100644 arch/sh/drivers/pci/pci-lib.c diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index a8350ccfa9d..eacac7f475d 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -1,7 +1,6 @@ # # Makefile for the PCI specific kernel interface routines under Linux. # -obj-y += pci-lib.o obj-$(CONFIG_PCI_NEW) += pci-new.o obj-$(CONFIG_CPU_SUBTYPE_SH7751) += pci-sh7751.o ops-sh4.o diff --git a/arch/sh/drivers/pci/pci-lib.c b/arch/sh/drivers/pci/pci-lib.c deleted file mode 100644 index f072acafce6..00000000000 --- a/arch/sh/drivers/pci/pci-lib.c +++ /dev/null @@ -1,219 +0,0 @@ -#include -#include -#include -#include -#include - -unsigned long PCIBIOS_MIN_IO = 0x0000; -unsigned long PCIBIOS_MIN_MEM = 0; - -/* - * We need to avoid collisions with `mirrored' VGA ports - * and other strange ISA hardware, so we always want the - * addresses to be allocated in the 0x000-0x0ff region - * modulo 0x400. - */ -void pcibios_align_resource(void *data, struct resource *res, - resource_size_t size, resource_size_t align) -{ - struct pci_dev *dev = data; - struct pci_channel *chan = dev->sysdata; - resource_size_t start = res->start; - - if (res->flags & IORESOURCE_IO) { - if (start < PCIBIOS_MIN_IO + chan->io_resource->start) - start = PCIBIOS_MIN_IO + chan->io_resource->start; - - /* - * Put everything into 0x00-0xff region modulo 0x400. - */ - if (start & 0x300) { - start = (start + 0x3ff) & ~0x3ff; - res->start = start; - } - } else if (res->flags & IORESOURCE_MEM) { - if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start) - start = PCIBIOS_MIN_MEM + chan->mem_resource->start; - } - - res->start = start; -} - -void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, - struct resource *res) -{ - struct pci_channel *hose = dev->sysdata; - unsigned long offset = 0; - - if (res->flags & IORESOURCE_IO) - offset = hose->io_offset; - else if (res->flags & IORESOURCE_MEM) - offset = hose->mem_offset; - - region->start = res->start - offset; - region->end = res->end - offset; -} - -void __devinit -pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, - struct pci_bus_region *region) -{ - struct pci_channel *hose = dev->sysdata; - unsigned long offset = 0; - - if (res->flags & IORESOURCE_IO) - offset = hose->io_offset; - else if (res->flags & IORESOURCE_MEM) - offset = hose->mem_offset; - - res->start = region->start + offset; - res->end = region->end + offset; -} - -int pcibios_enable_device(struct pci_dev *dev, int mask) -{ - u16 cmd, old_cmd; - int idx; - struct resource *r; - - pci_read_config_word(dev, PCI_COMMAND, &cmd); - old_cmd = cmd; - for (idx=0; idx < PCI_NUM_RESOURCES; idx++) { - /* Only set up the requested stuff */ - if (!(mask & (1<resource[idx]; - if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM))) - continue; - if ((idx == PCI_ROM_RESOURCE) && - (!(r->flags & IORESOURCE_ROM_ENABLE))) - continue; - if (!r->start && r->end) { - printk(KERN_ERR "PCI: Device %s not available " - "because of resource collisions\n", - pci_name(dev)); - return -EINVAL; - } - if (r->flags & IORESOURCE_IO) - cmd |= PCI_COMMAND_IO; - if (r->flags & IORESOURCE_MEM) - cmd |= PCI_COMMAND_MEMORY; - } - if (cmd != old_cmd) { - printk("PCI: Enabling device %s (%04x -> %04x)\n", - pci_name(dev), old_cmd, cmd); - pci_write_config_word(dev, PCI_COMMAND, cmd); - } - return 0; -} - -/* - * If we set up a device for bus mastering, we need to check and set - * the latency timer as it may not be properly set. - */ -static unsigned int pcibios_max_latency = 255; - -void pcibios_set_master(struct pci_dev *dev) -{ - u8 lat; - pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); - if (lat < 16) - lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; - else if (lat > pcibios_max_latency) - lat = pcibios_max_latency; - else - return; - printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", - pci_name(dev), lat); - pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); -} - -void __init pcibios_update_irq(struct pci_dev *dev, int irq) -{ - pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); -} - -char * __devinit pcibios_setup(char *str) -{ - return str; -} - -int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, - enum pci_mmap_state mmap_state, int write_combine) -{ - /* - * I/O space can be accessed via normal processor loads and stores on - * this platform but for now we elect not to do this and portable - * drivers should not do this anyway. - */ - if (mmap_state == pci_mmap_io) - return -EINVAL; - - /* - * Ignore write-combine; for now only return uncached mappings. - */ - vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); - - return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, - vma->vm_end - vma->vm_start, - vma->vm_page_prot); -} - -static void __iomem *ioport_map_pci(struct pci_dev *dev, - unsigned long port, unsigned int nr) -{ - struct pci_channel *chan = dev->sysdata; - - if (!chan->io_map_base) - chan->io_map_base = generic_io_base; - - return (void __iomem *)(chan->io_map_base + port); -} - -void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) -{ - resource_size_t start = pci_resource_start(dev, bar); - resource_size_t len = pci_resource_len(dev, bar); - unsigned long flags = pci_resource_flags(dev, bar); - - if (unlikely(!len || !start)) - return NULL; - if (maxlen && len > maxlen) - len = maxlen; - - if (flags & IORESOURCE_IO) - return ioport_map_pci(dev, start, len); - - /* - * Presently the IORESOURCE_MEM case is a bit special, most - * SH7751 style PCI controllers have PCI memory at a fixed - * location in the address space where no remapping is desired. - * With the IORESOURCE_MEM case more care has to be taken - * to inhibit page table mapping for legacy cores, but this is - * punted off to __ioremap(). - * -- PFM. - */ - if (flags & IORESOURCE_MEM) { - if (flags & IORESOURCE_CACHEABLE) - return ioremap(start, len); - - return ioremap_nocache(start, len); - } - - return NULL; -} -EXPORT_SYMBOL(pci_iomap); - -void pci_iounmap(struct pci_dev *dev, void __iomem *addr) -{ - iounmap(addr); -} -EXPORT_SYMBOL(pci_iounmap); - -#ifdef CONFIG_HOTPLUG -EXPORT_SYMBOL(pcibios_resource_to_bus); -EXPORT_SYMBOL(pcibios_bus_to_resource); -EXPORT_SYMBOL(PCIBIOS_MIN_IO); -EXPORT_SYMBOL(PCIBIOS_MIN_MEM); -#endif diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci-new.c index 8c0b136eecb..54d77cbb8b3 100644 --- a/arch/sh/drivers/pci/pci-new.c +++ b/arch/sh/drivers/pci/pci-new.c @@ -1,20 +1,28 @@ /* * New-style PCI core. * - * Copyright (c) 2002 M. R. Brown * Copyright (c) 2004 - 2009 Paul Mundt + * Copyright (c) 2002 M. R. Brown + * + * Modelled after arch/mips/pci/pci.c: + * Copyright (C) 2003, 04 Ralf Baechle (ralf@linux-mips.org) * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. */ #include +#include #include #include +#include #include #include #include +unsigned long PCIBIOS_MIN_IO = 0x0000; +unsigned long PCIBIOS_MIN_MEM = 0; + /* * The PCI controller list. */ @@ -27,9 +35,6 @@ static void __devinit pcibios_scanbus(struct pci_channel *hose) static int next_busno; struct pci_bus *bus; - /* Catch botched conversion attempts */ - BUG_ON(hose->init); - bus = pci_scan_bus(next_busno, hose->pci_ops, hose); if (bus) { next_busno = bus->subordinate + 1; @@ -128,7 +133,7 @@ static void pcibios_fixup_device_resources(struct pci_dev *dev, * Called after each bus is probed, but before its children * are examined. */ -void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) +void __devinit pcibios_fixup_bus(struct pci_bus *bus) { struct pci_dev *dev = bus->self; struct list_head *ln; @@ -146,3 +151,214 @@ void __devinit __weak pcibios_fixup_bus(struct pci_bus *bus) pcibios_fixup_device_resources(dev, bus); } } + +/* + * We need to avoid collisions with `mirrored' VGA ports + * and other strange ISA hardware, so we always want the + * addresses to be allocated in the 0x000-0x0ff region + * modulo 0x400. + */ +void pcibios_align_resource(void *data, struct resource *res, + resource_size_t size, resource_size_t align) +{ + struct pci_dev *dev = data; + struct pci_channel *chan = dev->sysdata; + resource_size_t start = res->start; + + if (res->flags & IORESOURCE_IO) { + if (start < PCIBIOS_MIN_IO + chan->io_resource->start) + start = PCIBIOS_MIN_IO + chan->io_resource->start; + + /* + * Put everything into 0x00-0xff region modulo 0x400. + */ + if (start & 0x300) { + start = (start + 0x3ff) & ~0x3ff; + res->start = start; + } + } else if (res->flags & IORESOURCE_MEM) { + if (start < PCIBIOS_MIN_MEM + chan->mem_resource->start) + start = PCIBIOS_MIN_MEM + chan->mem_resource->start; + } + + res->start = start; +} + +void pcibios_resource_to_bus(struct pci_dev *dev, struct pci_bus_region *region, + struct resource *res) +{ + struct pci_channel *hose = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = hose->io_offset; + else if (res->flags & IORESOURCE_MEM) + offset = hose->mem_offset; + + region->start = res->start - offset; + region->end = res->end - offset; +} + +void __devinit +pcibios_bus_to_resource(struct pci_dev *dev, struct resource *res, + struct pci_bus_region *region) +{ + struct pci_channel *hose = dev->sysdata; + unsigned long offset = 0; + + if (res->flags & IORESOURCE_IO) + offset = hose->io_offset; + else if (res->flags & IORESOURCE_MEM) + offset = hose->mem_offset; + + res->start = region->start + offset; + res->end = region->end + offset; +} + +int pcibios_enable_device(struct pci_dev *dev, int mask) +{ + u16 cmd, old_cmd; + int idx; + struct resource *r; + + pci_read_config_word(dev, PCI_COMMAND, &cmd); + old_cmd = cmd; + for (idx=0; idx < PCI_NUM_RESOURCES; idx++) { + /* Only set up the requested stuff */ + if (!(mask & (1<resource[idx]; + if (!(r->flags & (IORESOURCE_IO | IORESOURCE_MEM))) + continue; + if ((idx == PCI_ROM_RESOURCE) && + (!(r->flags & IORESOURCE_ROM_ENABLE))) + continue; + if (!r->start && r->end) { + printk(KERN_ERR "PCI: Device %s not available " + "because of resource collisions\n", + pci_name(dev)); + return -EINVAL; + } + if (r->flags & IORESOURCE_IO) + cmd |= PCI_COMMAND_IO; + if (r->flags & IORESOURCE_MEM) + cmd |= PCI_COMMAND_MEMORY; + } + if (cmd != old_cmd) { + printk("PCI: Enabling device %s (%04x -> %04x)\n", + pci_name(dev), old_cmd, cmd); + pci_write_config_word(dev, PCI_COMMAND, cmd); + } + return 0; +} + +/* + * If we set up a device for bus mastering, we need to check and set + * the latency timer as it may not be properly set. + */ +static unsigned int pcibios_max_latency = 255; + +void pcibios_set_master(struct pci_dev *dev) +{ + u8 lat; + pci_read_config_byte(dev, PCI_LATENCY_TIMER, &lat); + if (lat < 16) + lat = (64 <= pcibios_max_latency) ? 64 : pcibios_max_latency; + else if (lat > pcibios_max_latency) + lat = pcibios_max_latency; + else + return; + printk(KERN_INFO "PCI: Setting latency timer of device %s to %d\n", + pci_name(dev), lat); + pci_write_config_byte(dev, PCI_LATENCY_TIMER, lat); +} + +void __init pcibios_update_irq(struct pci_dev *dev, int irq) +{ + pci_write_config_byte(dev, PCI_INTERRUPT_LINE, irq); +} + +char * __devinit pcibios_setup(char *str) +{ + return str; +} + +int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma, + enum pci_mmap_state mmap_state, int write_combine) +{ + /* + * I/O space can be accessed via normal processor loads and stores on + * this platform but for now we elect not to do this and portable + * drivers should not do this anyway. + */ + if (mmap_state == pci_mmap_io) + return -EINVAL; + + /* + * Ignore write-combine; for now only return uncached mappings. + */ + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + + return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff, + vma->vm_end - vma->vm_start, + vma->vm_page_prot); +} + +static void __iomem *ioport_map_pci(struct pci_dev *dev, + unsigned long port, unsigned int nr) +{ + struct pci_channel *chan = dev->sysdata; + + if (!chan->io_map_base) + chan->io_map_base = generic_io_base; + + return (void __iomem *)(chan->io_map_base + port); +} + +void __iomem *pci_iomap(struct pci_dev *dev, int bar, unsigned long maxlen) +{ + resource_size_t start = pci_resource_start(dev, bar); + resource_size_t len = pci_resource_len(dev, bar); + unsigned long flags = pci_resource_flags(dev, bar); + + if (unlikely(!len || !start)) + return NULL; + if (maxlen && len > maxlen) + len = maxlen; + + if (flags & IORESOURCE_IO) + return ioport_map_pci(dev, start, len); + + /* + * Presently the IORESOURCE_MEM case is a bit special, most + * SH7751 style PCI controllers have PCI memory at a fixed + * location in the address space where no remapping is desired. + * With the IORESOURCE_MEM case more care has to be taken + * to inhibit page table mapping for legacy cores, but this is + * punted off to __ioremap(). + * -- PFM. + */ + if (flags & IORESOURCE_MEM) { + if (flags & IORESOURCE_CACHEABLE) + return ioremap(start, len); + + return ioremap_nocache(start, len); + } + + return NULL; +} +EXPORT_SYMBOL(pci_iomap); + +void pci_iounmap(struct pci_dev *dev, void __iomem *addr) +{ + iounmap(addr); +} +EXPORT_SYMBOL(pci_iounmap); + +#ifdef CONFIG_HOTPLUG +EXPORT_SYMBOL(pcibios_resource_to_bus); +EXPORT_SYMBOL(pcibios_bus_to_resource); +EXPORT_SYMBOL(PCIBIOS_MIN_IO); +EXPORT_SYMBOL(PCIBIOS_MIN_MEM); +#endif From cf242007a1eb29dcf93d1cb34713ec9b3f4a0e1b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 20 Apr 2009 21:53:33 +0900 Subject: [PATCH 0506/2061] sh: pci: Rename pci-new.c to pci.c. pci-new.c is now in a state to replace the old pci.c, rename it accordingly. Signed-off-by: Paul Mundt --- arch/sh/drivers/pci/Kconfig | 4 ---- arch/sh/drivers/pci/Makefile | 2 +- arch/sh/drivers/pci/{pci-new.c => pci.c} | 0 3 files changed, 1 insertion(+), 5 deletions(-) rename arch/sh/drivers/pci/{pci-new.c => pci.c} (100%) diff --git a/arch/sh/drivers/pci/Kconfig b/arch/sh/drivers/pci/Kconfig index ea903a984f0..e8db585a663 100644 --- a/arch/sh/drivers/pci/Kconfig +++ b/arch/sh/drivers/pci/Kconfig @@ -17,7 +17,3 @@ config SH_PCIDMA_NONCOHERENT code will not have to flush the CPU's caches. If you have a PCI host bridge integrated with your SH CPU, refer carefully to the chip specs to see if you can say 'N' here. Otherwise, leave it as 'Y'. - -config PCI_NEW - def_bool y - depends on PCI diff --git a/arch/sh/drivers/pci/Makefile b/arch/sh/drivers/pci/Makefile index eacac7f475d..d2ffc477549 100644 --- a/arch/sh/drivers/pci/Makefile +++ b/arch/sh/drivers/pci/Makefile @@ -1,7 +1,7 @@ # # Makefile for the PCI specific kernel interface routines under Linux. # -obj-$(CONFIG_PCI_NEW) += pci-new.o +obj-y += pci.o obj-$(CONFIG_CPU_SUBTYPE_SH7751) += pci-sh7751.o ops-sh4.o obj-$(CONFIG_CPU_SUBTYPE_SH7751R) += pci-sh7751.o ops-sh4.o diff --git a/arch/sh/drivers/pci/pci-new.c b/arch/sh/drivers/pci/pci.c similarity index 100% rename from arch/sh/drivers/pci/pci-new.c rename to arch/sh/drivers/pci/pci.c From 9ae5b8790037d05d32746f521af146c32089bfec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:27:58 -0400 Subject: [PATCH 0507/2061] tracing: change branch profiling to a choice selection This patch makes the branch profiling into a choice selection: None - no branch profiling likely/unlikely - only profile likely/unlikely branches all - profile all branches The all profiler will also enable the likely/unlikely branches. This does not change the way the profiler works or the dependencies between the profilers. What this patch does, is keep the branch profiling from being selected by an allyesconfig make. The branch profiler is very intrusive and it is known to break various architecture builds when selected as an allyesconfig. [ Impact: prevent branch profiler from being selected in allyesconfig ] Reported-by: Heiko Carstens Reported-by: Al Viro Reported-by: Stephen Rothwell Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 39 +++++++++++++++++++++++++++++++++------ 1 file changed, 33 insertions(+), 6 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 57981d338d1..3ee28db69be 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -212,8 +212,36 @@ config BOOT_TRACER to enable this on bootup. config TRACE_BRANCH_PROFILING - bool "Trace likely/unlikely profiler" + bool select TRACING + +choice + prompt "Branch Profiling" + default BRANCH_PROFILE_NONE + help + The branch profiling is a software profiler. It will add hooks + into the C conditionals to test which path a branch takes. + + The likely/unlikely profiler only looks at the conditions that + are annotated with a likely or unlikely macro. + + The "all branch" profiler will profile every if statement in the + kernel. This profiler will also enable the likely/unlikely + profiler as well. + + Either of the above profilers add a bit of overhead to the system. + If unsure choose "No branch profiling". + +config BRANCH_PROFILE_NONE + bool "No branch profiling" + help + No branch profiling. Branch profiling adds a bit of overhead. + Only enable it if you want to analyse the branching behavior. + Otherwise keep it disabled. + +config PROFILE_ANNOTATED_BRANCHES + bool "Trace likely/unlikely profiler" + select TRACE_BRANCH_PROFILING help This tracer profiles all the the likely and unlikely macros in the kernel. It will display the results in: @@ -223,11 +251,9 @@ config TRACE_BRANCH_PROFILING Note: this will add a significant overhead, only turn this on if you need to profile the system's use of these macros. - Say N if unsure. - config PROFILE_ALL_BRANCHES bool "Profile all if conditionals" - depends on TRACE_BRANCH_PROFILING + select TRACE_BRANCH_PROFILING help This tracer profiles all branch conditions. Every if () taken in the kernel is recorded whether it hit or miss. @@ -235,11 +261,12 @@ config PROFILE_ALL_BRANCHES /debugfs/tracing/profile_branch + This option also enables the likely/unlikely profiler. + This configuration, when enabled, will impose a great overhead on the system. This should only be enabled when the system is to be analyzed - - Say N if unsure. +endchoice config TRACING_BRANCHES bool From 4ed9f0716e46bb9646f26e73f4a1b5b24db7947a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:47:36 -0400 Subject: [PATCH 0508/2061] tracing: create menuconfig for tracing infrastructure During testing we often use randconfig to test various kernels. The current configuration set up does not give an easy way to disable all tracing with a single config. The case where randconfig would test all tracing disabled is very unlikely. This patch adds a config option to enable or disable all tracing. It is hooked into the tracing menu just like other submenus are done. [ Impact: allow randconfig to easily produce all traces disabled ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3ee28db69be..3fa36d2bc29 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -77,7 +77,12 @@ config TRACING_SUPPORT if TRACING_SUPPORT -menu "Tracers" +menuconfig FTRACE + bool "Tracers" + help + Enable the kernel tracing infrastructure. + +if FTRACE config FUNCTION_TRACER bool "Kernel Function Tracer" @@ -462,7 +467,7 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. -endmenu +endif # FTRACE endif # TRACING_SUPPORT From d24e473e5b2ca86d1288b9416227ccc603313d0f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 13:32:07 +0200 Subject: [PATCH 0509/2061] perf_counter: copy in Git's top Makefile We'd like to have a similar user-space structure as Git has, for the perfcounter tools - so copy in Git's toplevel makefile as-is. We'll strip it down in subsequent commits to make it fit the perfcounters code. The Git version used: 66996ec: Sync with 1.6.2.4 Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 1735 ++++++++++++++++++++++++++- 1 file changed, 1724 insertions(+), 11 deletions(-) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 1dd37ee7dbd..6e0838b03ad 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -1,18 +1,1731 @@ -BINS = kerneltop perfstat perf-record perf-report +# The default target of this Makefile is... +all:: -all: $(BINS) +# Define V=1 to have a more verbose compile. +# +# Define SNPRINTF_RETURNS_BOGUS if your are on a system which snprintf() +# or vsnprintf() return -1 instead of number of characters which would +# have been written to the final string if enough space had been available. +# +# Define FREAD_READS_DIRECTORIES if your are on a system which succeeds +# when attempting to read from an fopen'ed directory. +# +# Define NO_OPENSSL environment variable if you do not have OpenSSL. +# This also implies MOZILLA_SHA1. +# +# Define NO_CURL if you do not have libcurl installed. git-http-pull and +# git-http-push are not built, and you cannot use http:// and https:// +# transports. +# +# Define CURLDIR=/foo/bar if your curl header and library files are in +# /foo/bar/include and /foo/bar/lib directories. +# +# Define NO_EXPAT if you do not have expat installed. git-http-push is +# not built, and you cannot push using http:// and https:// transports. +# +# Define EXPATDIR=/foo/bar if your expat header and library files are in +# /foo/bar/include and /foo/bar/lib directories. +# +# Define NO_D_INO_IN_DIRENT if you don't have d_ino in your struct dirent. +# +# Define NO_D_TYPE_IN_DIRENT if your platform defines DT_UNKNOWN but lacks +# d_type in struct dirent (latest Cygwin -- will be fixed soonish). +# +# Define NO_C99_FORMAT if your formatted IO functions (printf/scanf et.al.) +# do not support the 'size specifiers' introduced by C99, namely ll, hh, +# j, z, t. (representing long long int, char, intmax_t, size_t, ptrdiff_t). +# some C compilers supported these specifiers prior to C99 as an extension. +# +# Define NO_STRCASESTR if you don't have strcasestr. +# +# Define NO_MEMMEM if you don't have memmem. +# +# Define NO_STRLCPY if you don't have strlcpy. +# +# Define NO_STRTOUMAX if you don't have strtoumax in the C library. +# If your compiler also does not support long long or does not have +# strtoull, define NO_STRTOULL. +# +# Define NO_SETENV if you don't have setenv in the C library. +# +# Define NO_UNSETENV if you don't have unsetenv in the C library. +# +# Define NO_MKDTEMP if you don't have mkdtemp in the C library. +# +# Define NO_SYS_SELECT_H if you don't have sys/select.h. +# +# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link. +# Enable it on Windows. By default, symrefs are still used. +# +# Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability +# tests. These tests take up a significant amount of the total test time +# but are not needed unless you plan to talk to SVN repos. +# +# Define NO_FINK if you are building on Darwin/Mac OS X, have Fink +# installed in /sw, but don't want GIT to link against any libraries +# installed there. If defined you may specify your own (or Fink's) +# include directories and library directories by defining CFLAGS +# and LDFLAGS appropriately. +# +# Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X, +# have DarwinPorts installed in /opt/local, but don't want GIT to +# link against any libraries installed there. If defined you may +# specify your own (or DarwinPort's) include directories and +# library directories by defining CFLAGS and LDFLAGS appropriately. +# +# Define PPC_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine optimized for PowerPC. +# +# Define ARM_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine optimized for ARM. +# +# Define MOZILLA_SHA1 environment variable when running make to make use of +# a bundled SHA1 routine coming from Mozilla. It is GPL'd and should be fast +# on non-x86 architectures (e.g. PowerPC), while the OpenSSL version (default +# choice) has very fast version optimized for i586. +# +# Define NEEDS_SSL_WITH_CRYPTO if you need -lcrypto with -lssl (Darwin). +# +# Define NEEDS_LIBICONV if linking with libc is not enough (Darwin). +# +# Define NEEDS_SOCKET if linking with libc is not enough (SunOS, +# Patrick Mauritz). +# +# Define NO_MMAP if you want to avoid mmap. +# +# Define NO_PTHREADS if you do not have or do not want to use Pthreads. +# +# Define NO_PREAD if you have a problem with pread() system call (e.g. +# cygwin.dll before v1.5.22). +# +# Define NO_FAST_WORKING_DIRECTORY if accessing objects in pack files is +# generally faster on your platform than accessing the working directory. +# +# Define NO_TRUSTABLE_FILEMODE if your filesystem may claim to support +# the executable mode bit, but doesn't really do so. +# +# Define NO_IPV6 if you lack IPv6 support and getaddrinfo(). +# +# Define NO_SOCKADDR_STORAGE if your platform does not have struct +# sockaddr_storage. +# +# Define NO_ICONV if your libc does not properly support iconv. +# +# Define OLD_ICONV if your library has an old iconv(), where the second +# (input buffer pointer) parameter is declared with type (const char **). +# +# Define NO_DEFLATE_BOUND if your zlib does not have deflateBound. +# +# Define NO_R_TO_GCC_LINKER if your gcc does not like "-R/path/lib" +# that tells runtime paths to dynamic libraries; +# "-Wl,-rpath=/path/lib" is used instead. +# +# Define USE_NSEC below if you want git to care about sub-second file mtimes +# and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and +# it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely +# randomly break unless your underlying filesystem supports those sub-second +# times (my ext3 doesn't). +# +# Define USE_ST_TIMESPEC if your "struct stat" uses "st_ctimespec" instead of +# "st_ctim" +# +# Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec" +# available. This automatically turns USE_NSEC off. +# +# Define USE_STDEV below if you want git to care about the underlying device +# change being considered an inode change from the update-index perspective. +# +# Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks +# field that counts the on-disk footprint in 512-byte blocks. +# +# Define ASCIIDOC8 if you want to format documentation with AsciiDoc 8 +# +# Define DOCBOOK_XSL_172 if you want to format man pages with DocBook XSL v1.72. +# +# Define NO_PERL_MAKEMAKER if you cannot use Makefiles generated by perl's +# MakeMaker (e.g. using ActiveState under Cygwin). +# +# Define NO_PERL if you do not want Perl scripts or libraries at all. +# +# Define NO_TCLTK if you do not want Tcl/Tk GUI. +# +# The TCL_PATH variable governs the location of the Tcl interpreter +# used to optimize git-gui for your system. Only used if NO_TCLTK +# is not set. Defaults to the bare 'tclsh'. +# +# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter. +# If not set it defaults to the bare 'wish'. If it is set to the empty +# string then NO_TCLTK will be forced (this is used by configure script). +# +# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit +# parallel delta searching when packing objects. +# +# Define INTERNAL_QSORT to use Git's implementation of qsort(), which +# is a simplified version of the merge sort used in glibc. This is +# recommended if Git triggers O(n^2) behavior in your platform's qsort(). +# +# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call +# your external grep (e.g., if your system lacks grep, if its grep is +# broken, or spawning external process is slower than built-in grep git has). -kerneltop: kerneltop.c ../../include/linux/perf_counter.h - cc -O6 -Wall -lrt -o $@ $< +GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE + @$(SHELL_PATH) ./GIT-VERSION-GEN +-include GIT-VERSION-FILE -perf-record: perf-record.c ../../include/linux/perf_counter.h - cc -O6 -Wall -lrt -o $@ $< +uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') +uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') +uname_O := $(shell sh -c 'uname -o 2>/dev/null || echo not') +uname_R := $(shell sh -c 'uname -r 2>/dev/null || echo not') +uname_P := $(shell sh -c 'uname -p 2>/dev/null || echo not') +uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') -perf-report: perf-report.cc ../../include/linux/perf_counter.h - g++ -O6 -Wall -lrt -o $@ $< +# CFLAGS and LDFLAGS are for the users to override from the command line. -perfstat: kerneltop - ln -sf kerneltop perfstat +CFLAGS = -g -O2 -Wall +LDFLAGS = +ALL_CFLAGS = $(CFLAGS) +ALL_LDFLAGS = $(LDFLAGS) +STRIP ?= strip + +# Among the variables below, these: +# gitexecdir +# template_dir +# mandir +# infodir +# htmldir +# ETC_GITCONFIG (but not sysconfdir) +# can be specified as a relative path some/where/else; +# this is interpreted as relative to $(prefix) and "git" at +# runtime figures out where they are based on the path to the executable. +# This can help installing the suite in a relocatable way. + +prefix = $(HOME) +bindir_relative = bin +bindir = $(prefix)/$(bindir_relative) +mandir = share/man +infodir = share/info +gitexecdir = libexec/git-core +sharedir = $(prefix)/share +template_dir = share/git-core/templates +htmldir = share/doc/git-doc +ifeq ($(prefix),/usr) +sysconfdir = /etc +ETC_GITCONFIG = $(sysconfdir)/gitconfig +else +sysconfdir = $(prefix)/etc +ETC_GITCONFIG = etc/gitconfig +endif +lib = lib +# DESTDIR= + +# default configuration for gitweb +GITWEB_CONFIG = gitweb_config.perl +GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf +GITWEB_HOME_LINK_STR = projects +GITWEB_SITENAME = +GITWEB_PROJECTROOT = /pub/git +GITWEB_PROJECT_MAXDEPTH = 2007 +GITWEB_EXPORT_OK = +GITWEB_STRICT_EXPORT = +GITWEB_BASE_URL = +GITWEB_LIST = +GITWEB_HOMETEXT = indextext.html +GITWEB_CSS = gitweb.css +GITWEB_LOGO = git-logo.png +GITWEB_FAVICON = git-favicon.png +GITWEB_SITE_HEADER = +GITWEB_SITE_FOOTER = + +export prefix bindir sharedir sysconfdir + +CC = gcc +AR = ar +RM = rm -f +TAR = tar +FIND = find +INSTALL = install +RPMBUILD = rpmbuild +TCL_PATH = tclsh +TCLTK_PATH = wish +PTHREAD_LIBS = -lpthread + +export TCL_PATH TCLTK_PATH + +# sparse is architecture-neutral, which means that we need to tell it +# explicitly what architecture to check for. Fix this up for yours.. +SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ + + + +### --- END CONFIGURATION SECTION --- + +# Those must not be GNU-specific; they are shared with perl/ which may +# be built by a different compiler. (Note that this is an artifact now +# but it still might be nice to keep that distinction.) +BASIC_CFLAGS = +BASIC_LDFLAGS = + +# Guard against environment variables +BUILTIN_OBJS = +BUILT_INS = +COMPAT_CFLAGS = +COMPAT_OBJS = +LIB_H = +LIB_OBJS = +PROGRAMS = +SCRIPT_PERL = +SCRIPT_SH = +TEST_PROGRAMS = + +SCRIPT_SH += git-am.sh +SCRIPT_SH += git-bisect.sh +SCRIPT_SH += git-difftool--helper.sh +SCRIPT_SH += git-filter-branch.sh +SCRIPT_SH += git-lost-found.sh +SCRIPT_SH += git-merge-octopus.sh +SCRIPT_SH += git-merge-one-file.sh +SCRIPT_SH += git-merge-resolve.sh +SCRIPT_SH += git-mergetool.sh +SCRIPT_SH += git-mergetool--lib.sh +SCRIPT_SH += git-parse-remote.sh +SCRIPT_SH += git-pull.sh +SCRIPT_SH += git-quiltimport.sh +SCRIPT_SH += git-rebase--interactive.sh +SCRIPT_SH += git-rebase.sh +SCRIPT_SH += git-repack.sh +SCRIPT_SH += git-request-pull.sh +SCRIPT_SH += git-sh-setup.sh +SCRIPT_SH += git-stash.sh +SCRIPT_SH += git-submodule.sh +SCRIPT_SH += git-web--browse.sh + +SCRIPT_PERL += git-add--interactive.perl +SCRIPT_PERL += git-difftool.perl +SCRIPT_PERL += git-archimport.perl +SCRIPT_PERL += git-cvsexportcommit.perl +SCRIPT_PERL += git-cvsimport.perl +SCRIPT_PERL += git-cvsserver.perl +SCRIPT_PERL += git-relink.perl +SCRIPT_PERL += git-send-email.perl +SCRIPT_PERL += git-svn.perl + +SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \ + $(patsubst %.perl,%,$(SCRIPT_PERL)) \ + git-instaweb + +# Empty... +EXTRA_PROGRAMS = + +# ... and all the rest that could be moved out of bindir to gitexecdir +PROGRAMS += $(EXTRA_PROGRAMS) +PROGRAMS += git-fast-import$X +PROGRAMS += git-hash-object$X +PROGRAMS += git-index-pack$X +PROGRAMS += git-merge-index$X +PROGRAMS += git-merge-tree$X +PROGRAMS += git-mktag$X +PROGRAMS += git-mktree$X +PROGRAMS += git-pack-redundant$X +PROGRAMS += git-patch-id$X +PROGRAMS += git-shell$X +PROGRAMS += git-show-index$X +PROGRAMS += git-unpack-file$X +PROGRAMS += git-update-server-info$X +PROGRAMS += git-upload-pack$X +PROGRAMS += git-var$X + +# List built-in command $C whose implementation cmd_$C() is not in +# builtin-$C.o but is linked in as part of some other command. +BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS)) + +BUILT_INS += git-cherry$X +BUILT_INS += git-cherry-pick$X +BUILT_INS += git-format-patch$X +BUILT_INS += git-fsck-objects$X +BUILT_INS += git-get-tar-commit-id$X +BUILT_INS += git-init$X +BUILT_INS += git-merge-subtree$X +BUILT_INS += git-peek-remote$X +BUILT_INS += git-repo-config$X +BUILT_INS += git-show$X +BUILT_INS += git-stage$X +BUILT_INS += git-status$X +BUILT_INS += git-whatchanged$X + +# what 'all' will build and 'install' will install, in gitexecdir +ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) + +# what 'all' will build but not install in gitexecdir +OTHER_PROGRAMS = git$X +ifndef NO_PERL +OTHER_PROGRAMS += gitweb/gitweb.cgi +endif + +# Set paths to tools early so that they can be used for version tests. +ifndef SHELL_PATH + SHELL_PATH = /bin/sh +endif +ifndef PERL_PATH + PERL_PATH = /usr/bin/perl +endif + +export PERL_PATH + +LIB_FILE=libgit.a +XDIFF_LIB=xdiff/lib.a + +LIB_H += archive.h +LIB_H += attr.h +LIB_H += blob.h +LIB_H += builtin.h +LIB_H += cache.h +LIB_H += cache-tree.h +LIB_H += commit.h +LIB_H += compat/cygwin.h +LIB_H += compat/mingw.h +LIB_H += csum-file.h +LIB_H += decorate.h +LIB_H += delta.h +LIB_H += diffcore.h +LIB_H += diff.h +LIB_H += dir.h +LIB_H += fsck.h +LIB_H += git-compat-util.h +LIB_H += graph.h +LIB_H += grep.h +LIB_H += hash.h +LIB_H += help.h +LIB_H += levenshtein.h +LIB_H += list-objects.h +LIB_H += ll-merge.h +LIB_H += log-tree.h +LIB_H += mailmap.h +LIB_H += merge-recursive.h +LIB_H += object.h +LIB_H += pack.h +LIB_H += pack-refs.h +LIB_H += pack-revindex.h +LIB_H += parse-options.h +LIB_H += patch-ids.h +LIB_H += pkt-line.h +LIB_H += progress.h +LIB_H += quote.h +LIB_H += reflog-walk.h +LIB_H += refs.h +LIB_H += remote.h +LIB_H += rerere.h +LIB_H += revision.h +LIB_H += run-command.h +LIB_H += sha1-lookup.h +LIB_H += sideband.h +LIB_H += sigchain.h +LIB_H += strbuf.h +LIB_H += string-list.h +LIB_H += tag.h +LIB_H += transport.h +LIB_H += tree.h +LIB_H += tree-walk.h +LIB_H += unpack-trees.h +LIB_H += userdiff.h +LIB_H += utf8.h +LIB_H += wt-status.h + +LIB_OBJS += abspath.o +LIB_OBJS += alias.o +LIB_OBJS += alloc.o +LIB_OBJS += archive.o +LIB_OBJS += archive-tar.o +LIB_OBJS += archive-zip.o +LIB_OBJS += attr.o +LIB_OBJS += base85.o +LIB_OBJS += bisect.o +LIB_OBJS += blob.o +LIB_OBJS += branch.o +LIB_OBJS += bundle.o +LIB_OBJS += cache-tree.o +LIB_OBJS += color.o +LIB_OBJS += combine-diff.o +LIB_OBJS += commit.o +LIB_OBJS += config.o +LIB_OBJS += connect.o +LIB_OBJS += convert.o +LIB_OBJS += copy.o +LIB_OBJS += csum-file.o +LIB_OBJS += ctype.o +LIB_OBJS += date.o +LIB_OBJS += decorate.o +LIB_OBJS += diffcore-break.o +LIB_OBJS += diffcore-delta.o +LIB_OBJS += diffcore-order.o +LIB_OBJS += diffcore-pickaxe.o +LIB_OBJS += diffcore-rename.o +LIB_OBJS += diff-delta.o +LIB_OBJS += diff-lib.o +LIB_OBJS += diff-no-index.o +LIB_OBJS += diff.o +LIB_OBJS += dir.o +LIB_OBJS += editor.o +LIB_OBJS += entry.o +LIB_OBJS += environment.o +LIB_OBJS += exec_cmd.o +LIB_OBJS += fsck.o +LIB_OBJS += graph.o +LIB_OBJS += grep.o +LIB_OBJS += hash.o +LIB_OBJS += help.o +LIB_OBJS += ident.o +LIB_OBJS += levenshtein.o +LIB_OBJS += list-objects.o +LIB_OBJS += ll-merge.o +LIB_OBJS += lockfile.o +LIB_OBJS += log-tree.o +LIB_OBJS += mailmap.o +LIB_OBJS += match-trees.o +LIB_OBJS += merge-file.o +LIB_OBJS += merge-recursive.o +LIB_OBJS += name-hash.o +LIB_OBJS += object.o +LIB_OBJS += pack-check.o +LIB_OBJS += pack-refs.o +LIB_OBJS += pack-revindex.o +LIB_OBJS += pack-write.o +LIB_OBJS += pager.o +LIB_OBJS += parse-options.o +LIB_OBJS += patch-delta.o +LIB_OBJS += patch-ids.o +LIB_OBJS += path.o +LIB_OBJS += pkt-line.o +LIB_OBJS += preload-index.o +LIB_OBJS += pretty.o +LIB_OBJS += progress.o +LIB_OBJS += quote.o +LIB_OBJS += reachable.o +LIB_OBJS += read-cache.o +LIB_OBJS += reflog-walk.o +LIB_OBJS += refs.o +LIB_OBJS += remote.o +LIB_OBJS += rerere.o +LIB_OBJS += revision.o +LIB_OBJS += run-command.o +LIB_OBJS += server-info.o +LIB_OBJS += setup.o +LIB_OBJS += sha1-lookup.o +LIB_OBJS += sha1_file.o +LIB_OBJS += sha1_name.o +LIB_OBJS += shallow.o +LIB_OBJS += sideband.o +LIB_OBJS += sigchain.o +LIB_OBJS += strbuf.o +LIB_OBJS += string-list.o +LIB_OBJS += symlinks.o +LIB_OBJS += tag.o +LIB_OBJS += trace.o +LIB_OBJS += transport.o +LIB_OBJS += tree-diff.o +LIB_OBJS += tree.o +LIB_OBJS += tree-walk.o +LIB_OBJS += unpack-trees.o +LIB_OBJS += usage.o +LIB_OBJS += userdiff.o +LIB_OBJS += utf8.o +LIB_OBJS += walker.o +LIB_OBJS += wrapper.o +LIB_OBJS += write_or_die.o +LIB_OBJS += ws.o +LIB_OBJS += wt-status.o +LIB_OBJS += xdiff-interface.o + +BUILTIN_OBJS += builtin-add.o +BUILTIN_OBJS += builtin-annotate.o +BUILTIN_OBJS += builtin-apply.o +BUILTIN_OBJS += builtin-archive.o +BUILTIN_OBJS += builtin-bisect--helper.o +BUILTIN_OBJS += builtin-blame.o +BUILTIN_OBJS += builtin-branch.o +BUILTIN_OBJS += builtin-bundle.o +BUILTIN_OBJS += builtin-cat-file.o +BUILTIN_OBJS += builtin-check-attr.o +BUILTIN_OBJS += builtin-check-ref-format.o +BUILTIN_OBJS += builtin-checkout-index.o +BUILTIN_OBJS += builtin-checkout.o +BUILTIN_OBJS += builtin-clean.o +BUILTIN_OBJS += builtin-clone.o +BUILTIN_OBJS += builtin-commit-tree.o +BUILTIN_OBJS += builtin-commit.o +BUILTIN_OBJS += builtin-config.o +BUILTIN_OBJS += builtin-count-objects.o +BUILTIN_OBJS += builtin-describe.o +BUILTIN_OBJS += builtin-diff-files.o +BUILTIN_OBJS += builtin-diff-index.o +BUILTIN_OBJS += builtin-diff-tree.o +BUILTIN_OBJS += builtin-diff.o +BUILTIN_OBJS += builtin-fast-export.o +BUILTIN_OBJS += builtin-fetch--tool.o +BUILTIN_OBJS += builtin-fetch-pack.o +BUILTIN_OBJS += builtin-fetch.o +BUILTIN_OBJS += builtin-fmt-merge-msg.o +BUILTIN_OBJS += builtin-for-each-ref.o +BUILTIN_OBJS += builtin-fsck.o +BUILTIN_OBJS += builtin-gc.o +BUILTIN_OBJS += builtin-grep.o +BUILTIN_OBJS += builtin-help.o +BUILTIN_OBJS += builtin-init-db.o +BUILTIN_OBJS += builtin-log.o +BUILTIN_OBJS += builtin-ls-files.o +BUILTIN_OBJS += builtin-ls-remote.o +BUILTIN_OBJS += builtin-ls-tree.o +BUILTIN_OBJS += builtin-mailinfo.o +BUILTIN_OBJS += builtin-mailsplit.o +BUILTIN_OBJS += builtin-merge.o +BUILTIN_OBJS += builtin-merge-base.o +BUILTIN_OBJS += builtin-merge-file.o +BUILTIN_OBJS += builtin-merge-ours.o +BUILTIN_OBJS += builtin-merge-recursive.o +BUILTIN_OBJS += builtin-mv.o +BUILTIN_OBJS += builtin-name-rev.o +BUILTIN_OBJS += builtin-pack-objects.o +BUILTIN_OBJS += builtin-pack-refs.o +BUILTIN_OBJS += builtin-prune-packed.o +BUILTIN_OBJS += builtin-prune.o +BUILTIN_OBJS += builtin-push.o +BUILTIN_OBJS += builtin-read-tree.o +BUILTIN_OBJS += builtin-receive-pack.o +BUILTIN_OBJS += builtin-reflog.o +BUILTIN_OBJS += builtin-remote.o +BUILTIN_OBJS += builtin-rerere.o +BUILTIN_OBJS += builtin-reset.o +BUILTIN_OBJS += builtin-rev-list.o +BUILTIN_OBJS += builtin-rev-parse.o +BUILTIN_OBJS += builtin-revert.o +BUILTIN_OBJS += builtin-rm.o +BUILTIN_OBJS += builtin-send-pack.o +BUILTIN_OBJS += builtin-shortlog.o +BUILTIN_OBJS += builtin-show-branch.o +BUILTIN_OBJS += builtin-show-ref.o +BUILTIN_OBJS += builtin-stripspace.o +BUILTIN_OBJS += builtin-symbolic-ref.o +BUILTIN_OBJS += builtin-tag.o +BUILTIN_OBJS += builtin-tar-tree.o +BUILTIN_OBJS += builtin-unpack-objects.o +BUILTIN_OBJS += builtin-update-index.o +BUILTIN_OBJS += builtin-update-ref.o +BUILTIN_OBJS += builtin-upload-archive.o +BUILTIN_OBJS += builtin-verify-pack.o +BUILTIN_OBJS += builtin-verify-tag.o +BUILTIN_OBJS += builtin-write-tree.o + +GITLIBS = $(LIB_FILE) $(XDIFF_LIB) +EXTLIBS = + +# +# Platform specific tweaks +# + +# We choose to avoid "if .. else if .. else .. endif endif" +# because maintaining the nesting to match is a pain. If +# we had "elif" things would have been much nicer... + +ifeq ($(uname_S),Linux) + NO_STRLCPY = YesPlease + THREADED_DELTA_SEARCH = YesPlease +endif +ifeq ($(uname_S),GNU/kFreeBSD) + NO_STRLCPY = YesPlease + THREADED_DELTA_SEARCH = YesPlease +endif +ifeq ($(uname_S),UnixWare) + CC = cc + NEEDS_SOCKET = YesPlease + NEEDS_NSL = YesPlease + NEEDS_SSL_WITH_CRYPTO = YesPlease + NEEDS_LIBICONV = YesPlease + SHELL_PATH = /usr/local/bin/bash + NO_IPV6 = YesPlease + NO_HSTRERROR = YesPlease + BASIC_CFLAGS += -Kthread + BASIC_CFLAGS += -I/usr/local/include + BASIC_LDFLAGS += -L/usr/local/lib + INSTALL = ginstall + TAR = gtar + NO_STRCASESTR = YesPlease + NO_MEMMEM = YesPlease +endif +ifeq ($(uname_S),SCO_SV) + ifeq ($(uname_R),3.2) + CFLAGS = -O2 + endif + ifeq ($(uname_R),5) + CC = cc + BASIC_CFLAGS += -Kthread + endif + NEEDS_SOCKET = YesPlease + NEEDS_NSL = YesPlease + NEEDS_SSL_WITH_CRYPTO = YesPlease + NEEDS_LIBICONV = YesPlease + SHELL_PATH = /usr/bin/bash + NO_IPV6 = YesPlease + NO_HSTRERROR = YesPlease + BASIC_CFLAGS += -I/usr/local/include + BASIC_LDFLAGS += -L/usr/local/lib + NO_STRCASESTR = YesPlease + NO_MEMMEM = YesPlease + INSTALL = ginstall + TAR = gtar +endif +ifeq ($(uname_S),Darwin) + NEEDS_SSL_WITH_CRYPTO = YesPlease + NEEDS_LIBICONV = YesPlease + ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2) + OLD_ICONV = UnfortunatelyYes + endif + ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2) + NO_STRLCPY = YesPlease + endif + NO_MEMMEM = YesPlease + THREADED_DELTA_SEARCH = YesPlease + USE_ST_TIMESPEC = YesPlease +endif +ifeq ($(uname_S),SunOS) + NEEDS_SOCKET = YesPlease + NEEDS_NSL = YesPlease + SHELL_PATH = /bin/bash + NO_STRCASESTR = YesPlease + NO_MEMMEM = YesPlease + NO_HSTRERROR = YesPlease + NO_MKDTEMP = YesPlease + OLD_ICONV = UnfortunatelyYes + ifeq ($(uname_R),5.8) + NO_UNSETENV = YesPlease + NO_SETENV = YesPlease + NO_C99_FORMAT = YesPlease + NO_STRTOUMAX = YesPlease + endif + ifeq ($(uname_R),5.9) + NO_UNSETENV = YesPlease + NO_SETENV = YesPlease + NO_C99_FORMAT = YesPlease + NO_STRTOUMAX = YesPlease + endif + INSTALL = ginstall + TAR = gtar + BASIC_CFLAGS += -D__EXTENSIONS__ +endif +ifeq ($(uname_O),Cygwin) + NO_D_TYPE_IN_DIRENT = YesPlease + NO_D_INO_IN_DIRENT = YesPlease + NO_STRCASESTR = YesPlease + NO_MEMMEM = YesPlease + NO_SYMLINK_HEAD = YesPlease + NEEDS_LIBICONV = YesPlease + NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes + NO_TRUSTABLE_FILEMODE = UnfortunatelyYes + OLD_ICONV = UnfortunatelyYes + # There are conflicting reports about this. + # On some boxes NO_MMAP is needed, and not so elsewhere. + # Try commenting this out if you suspect MMAP is more efficient + NO_MMAP = YesPlease + NO_IPV6 = YesPlease + X = .exe +endif +ifeq ($(uname_S),FreeBSD) + NEEDS_LIBICONV = YesPlease + NO_MEMMEM = YesPlease + BASIC_CFLAGS += -I/usr/local/include + BASIC_LDFLAGS += -L/usr/local/lib + DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease + USE_ST_TIMESPEC = YesPlease + THREADED_DELTA_SEARCH = YesPlease + ifeq ($(shell expr "$(uname_R)" : '4\.'),2) + PTHREAD_LIBS = -pthread + NO_UINTMAX_T = YesPlease + NO_STRTOUMAX = YesPlease + endif +endif +ifeq ($(uname_S),OpenBSD) + NO_STRCASESTR = YesPlease + NO_MEMMEM = YesPlease + NEEDS_LIBICONV = YesPlease + BASIC_CFLAGS += -I/usr/local/include + BASIC_LDFLAGS += -L/usr/local/lib + THREADED_DELTA_SEARCH = YesPlease +endif +ifeq ($(uname_S),NetBSD) + ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2) + NEEDS_LIBICONV = YesPlease + endif + BASIC_CFLAGS += -I/usr/pkg/include + BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib + THREADED_DELTA_SEARCH = YesPlease +endif +ifeq ($(uname_S),AIX) + NO_STRCASESTR=YesPlease + NO_MEMMEM = YesPlease + NO_MKDTEMP = YesPlease + NO_STRLCPY = YesPlease + NO_NSEC = YesPlease + FREAD_READS_DIRECTORIES = UnfortunatelyYes + INTERNAL_QSORT = UnfortunatelyYes + NEEDS_LIBICONV=YesPlease + BASIC_CFLAGS += -D_LARGE_FILES + ifneq ($(shell expr "$(uname_V)" : '[1234]'),1) + THREADED_DELTA_SEARCH = YesPlease + else + NO_PTHREADS = YesPlease + endif +endif +ifeq ($(uname_S),GNU) + # GNU/Hurd + NO_STRLCPY=YesPlease +endif +ifeq ($(uname_S),IRIX64) + NO_IPV6=YesPlease + NO_SETENV=YesPlease + NO_STRCASESTR=YesPlease + NO_MEMMEM = YesPlease + NO_STRLCPY = YesPlease + NO_SOCKADDR_STORAGE=YesPlease + SHELL_PATH=/usr/gnu/bin/bash + BASIC_CFLAGS += -DPATH_MAX=1024 + # for now, build 32-bit version + BASIC_LDFLAGS += -L/usr/lib32 +endif +ifeq ($(uname_S),HP-UX) + NO_IPV6=YesPlease + NO_SETENV=YesPlease + NO_STRCASESTR=YesPlease + NO_MEMMEM = YesPlease + NO_STRLCPY = YesPlease + NO_MKDTEMP = YesPlease + NO_UNSETENV = YesPlease + NO_HSTRERROR = YesPlease + NO_SYS_SELECT_H = YesPlease + SNPRINTF_RETURNS_BOGUS = YesPlease +endif +ifneq (,$(findstring CYGWIN,$(uname_S))) + COMPAT_OBJS += compat/cygwin.o +endif +ifneq (,$(findstring MINGW,$(uname_S))) + NO_PREAD = YesPlease + NO_OPENSSL = YesPlease + NO_CURL = YesPlease + NO_SYMLINK_HEAD = YesPlease + NO_IPV6 = YesPlease + NO_SETENV = YesPlease + NO_UNSETENV = YesPlease + NO_STRCASESTR = YesPlease + NO_STRLCPY = YesPlease + NO_MEMMEM = YesPlease + NO_PTHREADS = YesPlease + NEEDS_LIBICONV = YesPlease + OLD_ICONV = YesPlease + NO_C99_FORMAT = YesPlease + NO_STRTOUMAX = YesPlease + NO_MKDTEMP = YesPlease + SNPRINTF_RETURNS_BOGUS = YesPlease + NO_SVN_TESTS = YesPlease + NO_PERL_MAKEMAKER = YesPlease + RUNTIME_PREFIX = YesPlease + NO_POSIX_ONLY_PROGRAMS = YesPlease + NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease + NO_NSEC = YesPlease + USE_WIN32_MMAP = YesPlease + COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch + COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1 + COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\" + COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o + EXTLIBS += -lws2_32 + X = .exe +endif +ifneq (,$(findstring arm,$(uname_M))) + ARM_SHA1 = YesPlease +endif + +-include config.mak.autogen +-include config.mak + +ifeq ($(uname_S),Darwin) + ifndef NO_FINK + ifeq ($(shell test -d /sw/lib && echo y),y) + BASIC_CFLAGS += -I/sw/include + BASIC_LDFLAGS += -L/sw/lib + endif + endif + ifndef NO_DARWIN_PORTS + ifeq ($(shell test -d /opt/local/lib && echo y),y) + BASIC_CFLAGS += -I/opt/local/include + BASIC_LDFLAGS += -L/opt/local/lib + endif + endif + PTHREAD_LIBS = +endif + +ifndef CC_LD_DYNPATH + ifdef NO_R_TO_GCC_LINKER + # Some gcc does not accept and pass -R to the linker to specify + # the runtime dynamic library path. + CC_LD_DYNPATH = -Wl,-rpath, + else + CC_LD_DYNPATH = -R + endif +endif + +ifdef NO_CURL + BASIC_CFLAGS += -DNO_CURL +else + ifdef CURLDIR + # Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case. + BASIC_CFLAGS += -I$(CURLDIR)/include + CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl + else + CURL_LIBCURL = -lcurl + endif + BUILTIN_OBJS += builtin-http-fetch.o + EXTLIBS += $(CURL_LIBCURL) + LIB_OBJS += http.o http-walker.o + curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p) + ifeq "$(curl_check)" "070908" + ifndef NO_EXPAT + PROGRAMS += git-http-push$X + endif + endif + ifndef NO_EXPAT + ifdef EXPATDIR + BASIC_CFLAGS += -I$(EXPATDIR)/include + EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat + else + EXPAT_LIBEXPAT = -lexpat + endif + endif +endif + +ifdef ZLIB_PATH + BASIC_CFLAGS += -I$(ZLIB_PATH)/include + EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib) +endif +EXTLIBS += -lz + +ifndef NO_POSIX_ONLY_PROGRAMS + PROGRAMS += git-daemon$X + PROGRAMS += git-imap-send$X +endif +ifndef NO_OPENSSL + OPENSSL_LIBSSL = -lssl + ifdef OPENSSLDIR + BASIC_CFLAGS += -I$(OPENSSLDIR)/include + OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib) + else + OPENSSL_LINK = + endif +else + BASIC_CFLAGS += -DNO_OPENSSL + MOZILLA_SHA1 = 1 + OPENSSL_LIBSSL = +endif +ifdef NEEDS_SSL_WITH_CRYPTO + LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl +else + LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto +endif +ifdef NEEDS_LIBICONV + ifdef ICONVDIR + BASIC_CFLAGS += -I$(ICONVDIR)/include + ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib) + else + ICONV_LINK = + endif + EXTLIBS += $(ICONV_LINK) -liconv +endif +ifdef NEEDS_SOCKET + EXTLIBS += -lsocket +endif +ifdef NEEDS_NSL + EXTLIBS += -lnsl +endif +ifdef NO_D_TYPE_IN_DIRENT + BASIC_CFLAGS += -DNO_D_TYPE_IN_DIRENT +endif +ifdef NO_D_INO_IN_DIRENT + BASIC_CFLAGS += -DNO_D_INO_IN_DIRENT +endif +ifdef NO_ST_BLOCKS_IN_STRUCT_STAT + BASIC_CFLAGS += -DNO_ST_BLOCKS_IN_STRUCT_STAT +endif +ifdef USE_NSEC + BASIC_CFLAGS += -DUSE_NSEC +endif +ifdef USE_ST_TIMESPEC + BASIC_CFLAGS += -DUSE_ST_TIMESPEC +endif +ifdef NO_NSEC + BASIC_CFLAGS += -DNO_NSEC +endif +ifdef NO_C99_FORMAT + BASIC_CFLAGS += -DNO_C99_FORMAT +endif +ifdef SNPRINTF_RETURNS_BOGUS + COMPAT_CFLAGS += -DSNPRINTF_RETURNS_BOGUS + COMPAT_OBJS += compat/snprintf.o +endif +ifdef FREAD_READS_DIRECTORIES + COMPAT_CFLAGS += -DFREAD_READS_DIRECTORIES + COMPAT_OBJS += compat/fopen.o +endif +ifdef NO_SYMLINK_HEAD + BASIC_CFLAGS += -DNO_SYMLINK_HEAD +endif +ifdef NO_STRCASESTR + COMPAT_CFLAGS += -DNO_STRCASESTR + COMPAT_OBJS += compat/strcasestr.o +endif +ifdef NO_STRLCPY + COMPAT_CFLAGS += -DNO_STRLCPY + COMPAT_OBJS += compat/strlcpy.o +endif +ifdef NO_STRTOUMAX + COMPAT_CFLAGS += -DNO_STRTOUMAX + COMPAT_OBJS += compat/strtoumax.o +endif +ifdef NO_STRTOULL + COMPAT_CFLAGS += -DNO_STRTOULL +endif +ifdef NO_SETENV + COMPAT_CFLAGS += -DNO_SETENV + COMPAT_OBJS += compat/setenv.o +endif +ifdef NO_MKDTEMP + COMPAT_CFLAGS += -DNO_MKDTEMP + COMPAT_OBJS += compat/mkdtemp.o +endif +ifdef NO_UNSETENV + COMPAT_CFLAGS += -DNO_UNSETENV + COMPAT_OBJS += compat/unsetenv.o +endif +ifdef NO_SYS_SELECT_H + BASIC_CFLAGS += -DNO_SYS_SELECT_H +endif +ifdef NO_MMAP + COMPAT_CFLAGS += -DNO_MMAP + COMPAT_OBJS += compat/mmap.o +else + ifdef USE_WIN32_MMAP + COMPAT_CFLAGS += -DUSE_WIN32_MMAP + COMPAT_OBJS += compat/win32mmap.o + endif +endif +ifdef NO_PREAD + COMPAT_CFLAGS += -DNO_PREAD + COMPAT_OBJS += compat/pread.o +endif +ifdef NO_FAST_WORKING_DIRECTORY + BASIC_CFLAGS += -DNO_FAST_WORKING_DIRECTORY +endif +ifdef NO_TRUSTABLE_FILEMODE + BASIC_CFLAGS += -DNO_TRUSTABLE_FILEMODE +endif +ifdef NO_IPV6 + BASIC_CFLAGS += -DNO_IPV6 +endif +ifdef NO_UINTMAX_T + BASIC_CFLAGS += -Duintmax_t=uint32_t +endif +ifdef NO_SOCKADDR_STORAGE +ifdef NO_IPV6 + BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in +else + BASIC_CFLAGS += -Dsockaddr_storage=sockaddr_in6 +endif +endif +ifdef NO_INET_NTOP + LIB_OBJS += compat/inet_ntop.o +endif +ifdef NO_INET_PTON + LIB_OBJS += compat/inet_pton.o +endif + +ifdef NO_ICONV + BASIC_CFLAGS += -DNO_ICONV +endif + +ifdef OLD_ICONV + BASIC_CFLAGS += -DOLD_ICONV +endif + +ifdef NO_DEFLATE_BOUND + BASIC_CFLAGS += -DNO_DEFLATE_BOUND +endif + +ifdef PPC_SHA1 + SHA1_HEADER = "ppc/sha1.h" + LIB_OBJS += ppc/sha1.o ppc/sha1ppc.o +else +ifdef ARM_SHA1 + SHA1_HEADER = "arm/sha1.h" + LIB_OBJS += arm/sha1.o arm/sha1_arm.o +else +ifdef MOZILLA_SHA1 + SHA1_HEADER = "mozilla-sha1/sha1.h" + LIB_OBJS += mozilla-sha1/sha1.o +else + SHA1_HEADER = + EXTLIBS += $(LIB_4_CRYPTO) +endif +endif +endif +ifdef NO_PERL_MAKEMAKER + export NO_PERL_MAKEMAKER +endif +ifdef NO_HSTRERROR + COMPAT_CFLAGS += -DNO_HSTRERROR + COMPAT_OBJS += compat/hstrerror.o +endif +ifdef NO_MEMMEM + COMPAT_CFLAGS += -DNO_MEMMEM + COMPAT_OBJS += compat/memmem.o +endif +ifdef INTERNAL_QSORT + COMPAT_CFLAGS += -DINTERNAL_QSORT + COMPAT_OBJS += compat/qsort.o +endif +ifdef RUNTIME_PREFIX + COMPAT_CFLAGS += -DRUNTIME_PREFIX +endif + +ifdef NO_PTHREADS + THREADED_DELTA_SEARCH = + BASIC_CFLAGS += -DNO_PTHREADS +else + EXTLIBS += $(PTHREAD_LIBS) +endif + +ifdef THREADED_DELTA_SEARCH + BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH + LIB_OBJS += thread-utils.o +endif +ifdef DIR_HAS_BSD_GROUP_SEMANTICS + COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS +endif +ifdef NO_EXTERNAL_GREP + BASIC_CFLAGS += -DNO_EXTERNAL_GREP +endif + +ifeq ($(TCLTK_PATH),) +NO_TCLTK=NoThanks +endif + +ifeq ($(PERL_PATH),) +NO_PERL=NoThanks +endif + +QUIET_SUBDIR0 = +$(MAKE) -C # space to separate -C and subdir +QUIET_SUBDIR1 = + +ifneq ($(findstring $(MAKEFLAGS),w),w) +PRINT_DIR = --no-print-directory +else # "make -w" +NO_SUBDIR = : +endif + +ifneq ($(findstring $(MAKEFLAGS),s),s) +ifndef V + QUIET_CC = @echo ' ' CC $@; + QUIET_AR = @echo ' ' AR $@; + QUIET_LINK = @echo ' ' LINK $@; + QUIET_BUILT_IN = @echo ' ' BUILTIN $@; + QUIET_GEN = @echo ' ' GEN $@; + QUIET_SUBDIR0 = +@subdir= + QUIET_SUBDIR1 = ;$(NO_SUBDIR) echo ' ' SUBDIR $$subdir; \ + $(MAKE) $(PRINT_DIR) -C $$subdir + export V + export QUIET_GEN + export QUIET_BUILT_IN +endif +endif + +ifdef ASCIIDOC8 + export ASCIIDOC8 +endif + +# Shell quote (do not use $(call) to accommodate ancient setups); + +SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER)) +ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG)) + +DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) +bindir_SQ = $(subst ','\'',$(bindir)) +bindir_relative_SQ = $(subst ','\'',$(bindir_relative)) +mandir_SQ = $(subst ','\'',$(mandir)) +infodir_SQ = $(subst ','\'',$(infodir)) +gitexecdir_SQ = $(subst ','\'',$(gitexecdir)) +template_dir_SQ = $(subst ','\'',$(template_dir)) +htmldir_SQ = $(subst ','\'',$(htmldir)) +prefix_SQ = $(subst ','\'',$(prefix)) + +SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) +PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) +TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH)) + +LIBS = $(GITLIBS) $(EXTLIBS) + +BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ + $(COMPAT_CFLAGS) +LIB_OBJS += $(COMPAT_OBJS) + +ALL_CFLAGS += $(BASIC_CFLAGS) +ALL_LDFLAGS += $(BASIC_LDFLAGS) + +export TAR INSTALL DESTDIR SHELL_PATH + + +### Build rules + +SHELL = $(SHELL_PATH) + +all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS +ifneq (,$X) + $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';) +endif + +all:: +ifndef NO_TCLTK + $(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all + $(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all +endif +ifndef NO_PERL + $(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all +endif + $(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1) + +please_set_SHELL_PATH_to_a_more_modern_shell: + @$$(:) + +shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell + +strip: $(PROGRAMS) git$X + $(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X + +git.o: git.c common-cmds.h GIT-CFLAGS + $(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \ + '-DGIT_HTML_PATH="$(htmldir_SQ)"' \ + $(ALL_CFLAGS) -c $(filter %.c,$^) + +git$X: git.o $(BUILTIN_OBJS) $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \ + $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS) + +builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ + '-DGIT_HTML_PATH="$(htmldir_SQ)"' \ + '-DGIT_MAN_PATH="$(mandir_SQ)"' \ + '-DGIT_INFO_PATH="$(infodir_SQ)"' $< + +$(BUILT_INS): git$X + $(QUIET_BUILT_IN)$(RM) $@ && \ + ln git$X $@ 2>/dev/null || \ + ln -s git$X $@ 2>/dev/null || \ + cp git$X $@ + +common-cmds.h: ./generate-cmdlist.sh command-list.txt + +common-cmds.h: $(wildcard Documentation/git-*.txt) + $(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@ + +$(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh + $(QUIET_GEN)$(RM) $@ $@+ && \ + sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ + -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \ + -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ + -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ + $@.sh >$@+ && \ + chmod +x $@+ && \ + mv $@+ $@ + +ifndef NO_PERL +$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak + +perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL + $(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F) + +$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl + $(QUIET_GEN)$(RM) $@ $@+ && \ + INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \ + sed -e '1{' \ + -e ' s|#!.*perl|#!$(PERL_PATH_SQ)|' \ + -e ' h' \ + -e ' s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \ + -e ' H' \ + -e ' x' \ + -e '}' \ + -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \ + -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + $@.perl >$@+ && \ + chmod +x $@+ && \ + mv $@+ $@ + +gitweb/gitweb.cgi: gitweb/gitweb.perl + $(QUIET_GEN)$(RM) $@ $@+ && \ + sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \ + -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \ + -e 's|++GIT_BINDIR++|$(bindir)|g' \ + -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \ + -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \ + -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \ + -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \ + -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \ + -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \ + -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \ + -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \ + -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \ + -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \ + -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \ + -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \ + -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \ + -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \ + -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \ + -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \ + $< >$@+ && \ + chmod +x $@+ && \ + mv $@+ $@ + +git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css + $(QUIET_GEN)$(RM) $@ $@+ && \ + sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ + -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ + -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \ + -e '/@@GITWEB_CGI@@/d' \ + -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \ + -e '/@@GITWEB_CSS@@/d' \ + -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ + $@.sh > $@+ && \ + chmod +x $@+ && \ + mv $@+ $@ +else # NO_PERL +$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh + $(QUIET_GEN)$(RM) $@ $@+ && \ + sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ + -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \ + unimplemented.sh >$@+ && \ + chmod +x $@+ && \ + mv $@+ $@ +endif # NO_PERL + +configure: configure.ac + $(QUIET_GEN)$(RM) $@ $<+ && \ + sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + $< > $<+ && \ + autoconf -o $@ $<+ && \ + $(RM) $<+ + +# These can record GIT_VERSION +git.o git.spec \ + $(patsubst %.sh,%,$(SCRIPT_SH)) \ + $(patsubst %.perl,%,$(SCRIPT_PERL)) \ + : GIT-VERSION-FILE + +%.o: %.c GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< +%.s: %.c GIT-CFLAGS + $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $< +%.o: %.S + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< + +exec_cmd.o: exec_cmd.c GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ + '-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \ + '-DBINDIR="$(bindir_relative_SQ)"' \ + '-DPREFIX="$(prefix_SQ)"' \ + $< + +builtin-init-db.o: builtin-init-db.c GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $< + +config.o: config.c GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $< + +http.o: http.c GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $< + +ifdef NO_EXPAT +http-walker.o: http-walker.c http.h GIT-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $< +endif + +git-%$X: %.o $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) + +git-imap-send$X: imap-send.o $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ + $(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL) + +http.o http-walker.o http-push.o transport.o: http.h + +git-http-push$X: revision.o http.o http-push.o $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ + $(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT) + +$(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) +$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) +builtin-revert.o wt-status.o: wt-status.h + +$(LIB_FILE): $(LIB_OBJS) + $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) + +XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \ + xdiff/xmerge.o xdiff/xpatience.o +$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \ + xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h + +$(XDIFF_LIB): $(XDIFF_OBJS) + $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS) + + +doc: + $(MAKE) -C Documentation all + +man: + $(MAKE) -C Documentation man + +html: + $(MAKE) -C Documentation html + +info: + $(MAKE) -C Documentation info + +pdf: + $(MAKE) -C Documentation pdf + +TAGS: + $(RM) TAGS + $(FIND) . -name '*.[hcS]' -print | xargs etags -a + +tags: + $(RM) tags + $(FIND) . -name '*.[hcS]' -print | xargs ctags -a + +cscope: + $(RM) cscope* + $(FIND) . -name '*.[hcS]' -print | xargs cscope -b + +### Detect prefix changes +TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\ + $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ) + +GIT-CFLAGS: .FORCE-GIT-CFLAGS + @FLAGS='$(TRACK_CFLAGS)'; \ + if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \ + echo 1>&2 " * new build flags or prefix"; \ + echo "$$FLAGS" >GIT-CFLAGS; \ + fi + +# We need to apply sq twice, once to protect from the shell +# that runs GIT-BUILD-OPTIONS, and then again to protect it +# and the first level quoting from the shell that runs "echo". +GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS + @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@ + @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@ + @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ + @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@ + +### Detect Tck/Tk interpreter path changes +ifndef NO_TCLTK +TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)') + +GIT-GUI-VARS: .FORCE-GIT-GUI-VARS + @VARS='$(TRACK_VARS)'; \ + if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \ + echo 1>&2 " * new Tcl/Tk interpreter location"; \ + echo "$$VARS" >$@; \ + fi + +.PHONY: .FORCE-GIT-GUI-VARS +endif + +### Testing rules + +TEST_PROGRAMS += test-chmtime$X +TEST_PROGRAMS += test-ctype$X +TEST_PROGRAMS += test-date$X +TEST_PROGRAMS += test-delta$X +TEST_PROGRAMS += test-dump-cache-tree$X +TEST_PROGRAMS += test-genrandom$X +TEST_PROGRAMS += test-match-trees$X +TEST_PROGRAMS += test-parse-options$X +TEST_PROGRAMS += test-path-utils$X +TEST_PROGRAMS += test-sha1$X +TEST_PROGRAMS += test-sigchain$X + +all:: $(TEST_PROGRAMS) + +# GNU make supports exporting all variables by "export" without parameters. +# However, the environment gets quite big, and some programs have problems +# with that. + +export NO_SVN_TESTS + +test: all + $(MAKE) -C t/ all + +test-ctype$X: ctype.o + +test-date$X: date.o ctype.o + +test-delta$X: diff-delta.o patch-delta.o + +test-parse-options$X: parse-options.o + +.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS)) + +test-%$X: test-%.o $(GITLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) + +check-sha1:: test-sha1$X + ./test-sha1.sh + +check: common-cmds.h + if sparse; \ + then \ + for i in *.c; \ + do \ + sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \ + done; \ + else \ + echo 2>&1 "Did you mean 'make test'?"; \ + exit 1; \ + fi + +remove-dashes: + ./fixup-builtins $(BUILT_INS) $(PROGRAMS) $(SCRIPTS) + +### Installation rules + +ifneq ($(filter /%,$(firstword $(template_dir))),) +template_instdir = $(template_dir) +else +template_instdir = $(prefix)/$(template_dir) +endif +export template_instdir + +ifneq ($(filter /%,$(firstword $(gitexecdir))),) +gitexec_instdir = $(gitexecdir) +else +gitexec_instdir = $(prefix)/$(gitexecdir) +endif +gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir)) +export gitexec_instdir + +install: all + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' + $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' + $(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)' + $(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install + $(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install +ifndef NO_TCLTK + $(MAKE) -C gitk-git install + $(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install +endif +ifneq (,$X) + $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';) +endif + bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \ + execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \ + { $(RM) "$$execdir/git-add$X" && \ + ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \ + cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \ + { for p in $(filter-out git-add$X,$(BUILT_INS)); do \ + $(RM) "$$execdir/$$p" && \ + ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \ + ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \ + cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \ + done } && \ + ./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X" + +install-doc: + $(MAKE) -C Documentation install + +install-man: + $(MAKE) -C Documentation install-man + +install-html: + $(MAKE) -C Documentation install-html + +install-info: + $(MAKE) -C Documentation install-info + +install-pdf: + $(MAKE) -C Documentation install-pdf + +quick-install-doc: + $(MAKE) -C Documentation quick-install + +quick-install-man: + $(MAKE) -C Documentation quick-install-man + +quick-install-html: + $(MAKE) -C Documentation quick-install-html + + + +### Maintainer's dist rules + +git.spec: git.spec.in + sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+ + mv $@+ $@ + +GIT_TARNAME=git-$(GIT_VERSION) +dist: git.spec git-archive$(X) configure + ./git-archive --format=tar \ + --prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar + @mkdir -p $(GIT_TARNAME) + @cp git.spec configure $(GIT_TARNAME) + @echo $(GIT_VERSION) > $(GIT_TARNAME)/version + @$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version + $(TAR) rf $(GIT_TARNAME).tar \ + $(GIT_TARNAME)/git.spec \ + $(GIT_TARNAME)/configure \ + $(GIT_TARNAME)/version \ + $(GIT_TARNAME)/git-gui/version + @$(RM) -r $(GIT_TARNAME) + gzip -f -9 $(GIT_TARNAME).tar + +rpm: dist + $(RPMBUILD) -ta $(GIT_TARNAME).tar.gz + +htmldocs = git-htmldocs-$(GIT_VERSION) +manpages = git-manpages-$(GIT_VERSION) +dist-doc: + $(RM) -r .doc-tmp-dir + mkdir .doc-tmp-dir + $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc + cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar . + gzip -n -9 -f $(htmldocs).tar + : + $(RM) -r .doc-tmp-dir + mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7 + $(MAKE) -C Documentation DESTDIR=./ \ + man1dir=../.doc-tmp-dir/man1 \ + man5dir=../.doc-tmp-dir/man5 \ + man7dir=../.doc-tmp-dir/man7 \ + install + cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar . + gzip -n -9 -f $(manpages).tar + $(RM) -r .doc-tmp-dir + +### Cleaning rules + +distclean: clean + $(RM) configure clean: - rm $(BINS) + $(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \ + $(LIB_FILE) $(XDIFF_LIB) + $(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X + $(RM) $(TEST_PROGRAMS) + $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope* + $(RM) -r autom4te.cache + $(RM) config.log config.mak.autogen config.mak.append config.status config.cache + $(RM) -r $(GIT_TARNAME) .doc-tmp-dir + $(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz + $(RM) $(htmldocs).tar.gz $(manpages).tar.gz + $(MAKE) -C Documentation/ clean +ifndef NO_PERL + $(RM) gitweb/gitweb.cgi + $(MAKE) -C perl clean +endif + $(MAKE) -C templates/ clean + $(MAKE) -C t/ clean +ifndef NO_TCLTK + $(MAKE) -C gitk-git clean + $(MAKE) -C git-gui clean +endif + $(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS + +.PHONY: all install clean strip +.PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell +.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS +.PHONY: .FORCE-GIT-BUILD-OPTIONS + +### Check documentation +# +check-docs:: + @(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \ + do \ + case "$$v" in \ + git-merge-octopus | git-merge-ours | git-merge-recursive | \ + git-merge-resolve | git-merge-subtree | \ + git-fsck-objects | git-init-db | \ + git-?*--?* ) continue ;; \ + esac ; \ + test -f "Documentation/$$v.txt" || \ + echo "no doc: $$v"; \ + sed -e '/^#/d' command-list.txt | \ + grep -q "^$$v[ ]" || \ + case "$$v" in \ + git) ;; \ + *) echo "no link: $$v";; \ + esac ; \ + done; \ + ( \ + sed -e '/^#/d' \ + -e 's/[ ].*//' \ + -e 's/^/listed /' command-list.txt; \ + ls -1 Documentation/git*txt | \ + sed -e 's|Documentation/|documented |' \ + -e 's/\.txt//'; \ + ) | while read how cmd; \ + do \ + case "$$how,$$cmd" in \ + *,git-citool | \ + *,git-gui | \ + *,git-help | \ + documented,gitattributes | \ + documented,gitignore | \ + documented,gitmodules | \ + documented,gitcli | \ + documented,git-tools | \ + documented,gitcore-tutorial | \ + documented,gitcvs-migration | \ + documented,gitdiffcore | \ + documented,gitglossary | \ + documented,githooks | \ + documented,gitrepository-layout | \ + documented,gittutorial | \ + documented,gittutorial-2 | \ + sentinel,not,matching,is,ok ) continue ;; \ + esac; \ + case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \ + *" $$cmd "*) ;; \ + *) echo "removed but $$how: $$cmd" ;; \ + esac; \ + done ) | sort + +### Make sure built-ins do not have dups and listed in git.c +# +check-builtins:: + ./check-builtins.sh + +### Test suite coverage testing +# +.PHONY: coverage coverage-clean coverage-build coverage-report + +coverage: + $(MAKE) coverage-build + $(MAKE) coverage-report + +coverage-clean: + rm -f *.gcda *.gcno + +COVERAGE_CFLAGS = $(CFLAGS) -O0 -ftest-coverage -fprofile-arcs +COVERAGE_LDFLAGS = $(CFLAGS) -O0 -lgcov + +coverage-build: coverage-clean + $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" all + $(MAKE) CFLAGS="$(COVERAGE_CFLAGS)" LDFLAGS="$(COVERAGE_LDFLAGS)" \ + -j1 test + +coverage-report: + gcov -b *.c + grep '^function.*called 0 ' *.c.gcov \ + | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \ + | tee coverage-untested-functions From 0780060124011b94af55830939c86cc0916be0f5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 15:00:56 +0200 Subject: [PATCH 0510/2061] perf_counter tools: add in basic glue from Git First very raw version at having a central 'perf' command and a list of subcommands: perf top perf stat perf record perf report ... This is done by picking up Git's collection of utility functions, and hacking them to build fine in this new environment. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/.gitignore | 179 +++ Documentation/perf_counter/Makefile | 1025 ++---------- Documentation/perf_counter/PERF-BUILD-OPTIONS | 4 + Documentation/perf_counter/PERF-CFLAGS | 1 + Documentation/perf_counter/PERF-VERSION-FILE | 1 + Documentation/perf_counter/PERF-VERSION-GEN | 42 + Documentation/perf_counter/abspath.c | 117 ++ Documentation/perf_counter/alias.c | 77 + Documentation/perf_counter/builtin-help.c | 463 ++++++ Documentation/perf_counter/builtin-top.c | 1411 +++++++++++++++++ Documentation/perf_counter/builtin.h | 18 + Documentation/perf_counter/cache.h | 97 ++ Documentation/perf_counter/command-list.txt | 4 + Documentation/perf_counter/config.c | 966 +++++++++++ Documentation/perf_counter/ctype.c | 26 + Documentation/perf_counter/exec_cmd.c | 165 ++ Documentation/perf_counter/exec_cmd.h | 13 + .../perf_counter/generate-cmdlist.sh | 24 + Documentation/perf_counter/help.c | 366 +++++ Documentation/perf_counter/help.h | 29 + Documentation/perf_counter/levenshtein.c | 84 + Documentation/perf_counter/levenshtein.h | 8 + Documentation/perf_counter/parse-options.c | 495 ++++++ Documentation/perf_counter/parse-options.h | 172 ++ Documentation/perf_counter/path.c | 392 +++++ Documentation/perf_counter/perf.c | 411 +++++ Documentation/perf_counter/quote.c | 478 ++++++ Documentation/perf_counter/quote.h | 68 + Documentation/perf_counter/run-command.c | 395 +++++ Documentation/perf_counter/run-command.h | 93 ++ Documentation/perf_counter/strbuf.c | 359 +++++ Documentation/perf_counter/strbuf.h | 137 ++ Documentation/perf_counter/usage.c | 80 + Documentation/perf_counter/util.h | 394 +++++ Documentation/perf_counter/wrapper.c | 206 +++ 35 files changed, 7947 insertions(+), 853 deletions(-) create mode 100644 Documentation/perf_counter/.gitignore create mode 100644 Documentation/perf_counter/PERF-BUILD-OPTIONS create mode 100644 Documentation/perf_counter/PERF-CFLAGS create mode 100644 Documentation/perf_counter/PERF-VERSION-FILE create mode 100755 Documentation/perf_counter/PERF-VERSION-GEN create mode 100644 Documentation/perf_counter/abspath.c create mode 100644 Documentation/perf_counter/alias.c create mode 100644 Documentation/perf_counter/builtin-help.c create mode 100644 Documentation/perf_counter/builtin-top.c create mode 100644 Documentation/perf_counter/builtin.h create mode 100644 Documentation/perf_counter/cache.h create mode 100644 Documentation/perf_counter/command-list.txt create mode 100644 Documentation/perf_counter/config.c create mode 100644 Documentation/perf_counter/ctype.c create mode 100644 Documentation/perf_counter/exec_cmd.c create mode 100644 Documentation/perf_counter/exec_cmd.h create mode 100755 Documentation/perf_counter/generate-cmdlist.sh create mode 100644 Documentation/perf_counter/help.c create mode 100644 Documentation/perf_counter/help.h create mode 100644 Documentation/perf_counter/levenshtein.c create mode 100644 Documentation/perf_counter/levenshtein.h create mode 100644 Documentation/perf_counter/parse-options.c create mode 100644 Documentation/perf_counter/parse-options.h create mode 100644 Documentation/perf_counter/path.c create mode 100644 Documentation/perf_counter/perf.c create mode 100644 Documentation/perf_counter/quote.c create mode 100644 Documentation/perf_counter/quote.h create mode 100644 Documentation/perf_counter/run-command.c create mode 100644 Documentation/perf_counter/run-command.h create mode 100644 Documentation/perf_counter/strbuf.c create mode 100644 Documentation/perf_counter/strbuf.h create mode 100644 Documentation/perf_counter/usage.c create mode 100644 Documentation/perf_counter/util.h create mode 100644 Documentation/perf_counter/wrapper.c diff --git a/Documentation/perf_counter/.gitignore b/Documentation/perf_counter/.gitignore new file mode 100644 index 00000000000..41c0b20a76c --- /dev/null +++ b/Documentation/perf_counter/.gitignore @@ -0,0 +1,179 @@ +GIT-BUILD-OPTIONS +GIT-CFLAGS +GIT-GUI-VARS +GIT-VERSION-FILE +git +git-add +git-add--interactive +git-am +git-annotate +git-apply +git-archimport +git-archive +git-bisect +git-bisect--helper +git-blame +git-branch +git-bundle +git-cat-file +git-check-attr +git-check-ref-format +git-checkout +git-checkout-index +git-cherry +git-cherry-pick +git-clean +git-clone +git-commit +git-commit-tree +git-config +git-count-objects +git-cvsexportcommit +git-cvsimport +git-cvsserver +git-daemon +git-diff +git-diff-files +git-diff-index +git-diff-tree +git-difftool +git-difftool--helper +git-describe +git-fast-export +git-fast-import +git-fetch +git-fetch--tool +git-fetch-pack +git-filter-branch +git-fmt-merge-msg +git-for-each-ref +git-format-patch +git-fsck +git-fsck-objects +git-gc +git-get-tar-commit-id +git-grep +git-hash-object +git-help +git-http-fetch +git-http-push +git-imap-send +git-index-pack +git-init +git-init-db +git-instaweb +git-log +git-lost-found +git-ls-files +git-ls-remote +git-ls-tree +git-mailinfo +git-mailsplit +git-merge +git-merge-base +git-merge-index +git-merge-file +git-merge-tree +git-merge-octopus +git-merge-one-file +git-merge-ours +git-merge-recursive +git-merge-resolve +git-merge-subtree +git-mergetool +git-mergetool--lib +git-mktag +git-mktree +git-name-rev +git-mv +git-pack-redundant +git-pack-objects +git-pack-refs +git-parse-remote +git-patch-id +git-peek-remote +git-prune +git-prune-packed +git-pull +git-push +git-quiltimport +git-read-tree +git-rebase +git-rebase--interactive +git-receive-pack +git-reflog +git-relink +git-remote +git-repack +git-repo-config +git-request-pull +git-rerere +git-reset +git-rev-list +git-rev-parse +git-revert +git-rm +git-send-email +git-send-pack +git-sh-setup +git-shell +git-shortlog +git-show +git-show-branch +git-show-index +git-show-ref +git-stage +git-stash +git-status +git-stripspace +git-submodule +git-svn +git-symbolic-ref +git-tag +git-tar-tree +git-unpack-file +git-unpack-objects +git-update-index +git-update-ref +git-update-server-info +git-upload-archive +git-upload-pack +git-var +git-verify-pack +git-verify-tag +git-web--browse +git-whatchanged +git-write-tree +git-core-*/?* +gitk-wish +gitweb/gitweb.cgi +test-chmtime +test-ctype +test-date +test-delta +test-dump-cache-tree +test-genrandom +test-match-trees +test-parse-options +test-path-utils +test-sha1 +test-sigchain +common-cmds.h +*.tar.gz +*.dsc +*.deb +git.spec +*.exe +*.[aos] +*.py[co] +config.mak +autom4te.cache +config.cache +config.log +config.status +config.mak.autogen +config.mak.append +configure +tags +TAGS +cscope* diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 6e0838b03ad..11809b943fc 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -13,16 +13,9 @@ all:: # Define NO_OPENSSL environment variable if you do not have OpenSSL. # This also implies MOZILLA_SHA1. # -# Define NO_CURL if you do not have libcurl installed. git-http-pull and -# git-http-push are not built, and you cannot use http:// and https:// -# transports. -# # Define CURLDIR=/foo/bar if your curl header and library files are in # /foo/bar/include and /foo/bar/lib directories. # -# Define NO_EXPAT if you do not have expat installed. git-http-push is -# not built, and you cannot push using http:// and https:// transports. -# # Define EXPATDIR=/foo/bar if your expat header and library files are in # /foo/bar/include and /foo/bar/lib directories. # @@ -40,8 +33,6 @@ all:: # # Define NO_MEMMEM if you don't have memmem. # -# Define NO_STRLCPY if you don't have strlcpy. -# # Define NO_STRTOUMAX if you don't have strtoumax in the C library. # If your compiler also does not support long long or does not have # strtoull, define NO_STRTOULL. @@ -54,7 +45,7 @@ all:: # # Define NO_SYS_SELECT_H if you don't have sys/select.h. # -# Define NO_SYMLINK_HEAD if you never want .git/HEAD to be a symbolic link. +# Define NO_SYMLINK_HEAD if you never want .perf/HEAD to be a symbolic link. # Enable it on Windows. By default, symrefs are still used. # # Define NO_SVN_TESTS if you want to skip time-consuming SVN interoperability @@ -62,13 +53,13 @@ all:: # but are not needed unless you plan to talk to SVN repos. # # Define NO_FINK if you are building on Darwin/Mac OS X, have Fink -# installed in /sw, but don't want GIT to link against any libraries +# installed in /sw, but don't want PERF to link against any libraries # installed there. If defined you may specify your own (or Fink's) # include directories and library directories by defining CFLAGS # and LDFLAGS appropriately. # # Define NO_DARWIN_PORTS if you are building on Darwin/Mac OS X, -# have DarwinPorts installed in /opt/local, but don't want GIT to +# have DarwinPorts installed in /opt/local, but don't want PERF to # link against any libraries installed there. If defined you may # specify your own (or DarwinPort's) include directories and # library directories by defining CFLAGS and LDFLAGS appropriately. @@ -120,7 +111,7 @@ all:: # that tells runtime paths to dynamic libraries; # "-Wl,-rpath=/path/lib" is used instead. # -# Define USE_NSEC below if you want git to care about sub-second file mtimes +# Define USE_NSEC below if you want perf to care about sub-second file mtimes # and ctimes. Note that you need recent glibc (at least 2.2.4) for this, and # it will BREAK YOUR LOCAL DIFFS! show-diff and anything using it will likely # randomly break unless your underlying filesystem supports those sub-second @@ -132,7 +123,7 @@ all:: # Define NO_NSEC if your "struct stat" does not have "st_ctim.tv_nsec" # available. This automatically turns USE_NSEC off. # -# Define USE_STDEV below if you want git to care about the underlying device +# Define USE_STDEV below if you want perf to care about the underlying device # change being considered an inode change from the update-index perspective. # # Define NO_ST_BLOCKS_IN_STRUCT_STAT if your platform does not have st_blocks @@ -150,27 +141,24 @@ all:: # Define NO_TCLTK if you do not want Tcl/Tk GUI. # # The TCL_PATH variable governs the location of the Tcl interpreter -# used to optimize git-gui for your system. Only used if NO_TCLTK +# used to optimize perf-gui for your system. Only used if NO_TCLTK # is not set. Defaults to the bare 'tclsh'. # # The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter. # If not set it defaults to the bare 'wish'. If it is set to the empty # string then NO_TCLTK will be forced (this is used by configure script). # -# Define THREADED_DELTA_SEARCH if you have pthreads and wish to exploit -# parallel delta searching when packing objects. -# # Define INTERNAL_QSORT to use Git's implementation of qsort(), which # is a simplified version of the merge sort used in glibc. This is # recommended if Git triggers O(n^2) behavior in your platform's qsort(). # -# Define NO_EXTERNAL_GREP if you don't want "git grep" to ever call +# Define NO_EXTERNAL_GREP if you don't want "perf grep" to ever call # your external grep (e.g., if your system lacks grep, if its grep is -# broken, or spawning external process is slower than built-in grep git has). +# broken, or spawning external process is slower than built-in grep perf has). -GIT-VERSION-FILE: .FORCE-GIT-VERSION-FILE - @$(SHELL_PATH) ./GIT-VERSION-GEN --include GIT-VERSION-FILE +PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE + @$(SHELL_PATH) ./PERF-VERSION-GEN +-include PERF-VERSION-FILE uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') uname_M := $(shell sh -c 'uname -m 2>/dev/null || echo not') @@ -182,20 +170,20 @@ uname_V := $(shell sh -c 'uname -v 2>/dev/null || echo not') # CFLAGS and LDFLAGS are for the users to override from the command line. CFLAGS = -g -O2 -Wall -LDFLAGS = +LDFLAGS = -lpthread -lrt ALL_CFLAGS = $(CFLAGS) ALL_LDFLAGS = $(LDFLAGS) STRIP ?= strip # Among the variables below, these: -# gitexecdir +# perfexecdir # template_dir # mandir # infodir # htmldir -# ETC_GITCONFIG (but not sysconfdir) +# ETC_PERFCONFIG (but not sysconfdir) # can be specified as a relative path some/where/else; -# this is interpreted as relative to $(prefix) and "git" at +# this is interpreted as relative to $(prefix) and "perf" at # runtime figures out where they are based on the path to the executable. # This can help installing the suite in a relocatable way. @@ -204,38 +192,20 @@ bindir_relative = bin bindir = $(prefix)/$(bindir_relative) mandir = share/man infodir = share/info -gitexecdir = libexec/git-core +perfexecdir = libexec/perf-core sharedir = $(prefix)/share -template_dir = share/git-core/templates -htmldir = share/doc/git-doc +template_dir = share/perf-core/templates +htmldir = share/doc/perf-doc ifeq ($(prefix),/usr) sysconfdir = /etc -ETC_GITCONFIG = $(sysconfdir)/gitconfig +ETC_PERFCONFIG = $(sysconfdir)/perfconfig else sysconfdir = $(prefix)/etc -ETC_GITCONFIG = etc/gitconfig +ETC_PERFCONFIG = etc/perfconfig endif lib = lib # DESTDIR= -# default configuration for gitweb -GITWEB_CONFIG = gitweb_config.perl -GITWEB_CONFIG_SYSTEM = /etc/gitweb.conf -GITWEB_HOME_LINK_STR = projects -GITWEB_SITENAME = -GITWEB_PROJECTROOT = /pub/git -GITWEB_PROJECT_MAXDEPTH = 2007 -GITWEB_EXPORT_OK = -GITWEB_STRICT_EXPORT = -GITWEB_BASE_URL = -GITWEB_LIST = -GITWEB_HOMETEXT = indextext.html -GITWEB_CSS = gitweb.css -GITWEB_LOGO = git-logo.png -GITWEB_FAVICON = git-favicon.png -GITWEB_SITE_HEADER = -GITWEB_SITE_FOOTER = - export prefix bindir sharedir sysconfdir CC = gcc @@ -277,89 +247,46 @@ SCRIPT_PERL = SCRIPT_SH = TEST_PROGRAMS = -SCRIPT_SH += git-am.sh -SCRIPT_SH += git-bisect.sh -SCRIPT_SH += git-difftool--helper.sh -SCRIPT_SH += git-filter-branch.sh -SCRIPT_SH += git-lost-found.sh -SCRIPT_SH += git-merge-octopus.sh -SCRIPT_SH += git-merge-one-file.sh -SCRIPT_SH += git-merge-resolve.sh -SCRIPT_SH += git-mergetool.sh -SCRIPT_SH += git-mergetool--lib.sh -SCRIPT_SH += git-parse-remote.sh -SCRIPT_SH += git-pull.sh -SCRIPT_SH += git-quiltimport.sh -SCRIPT_SH += git-rebase--interactive.sh -SCRIPT_SH += git-rebase.sh -SCRIPT_SH += git-repack.sh -SCRIPT_SH += git-request-pull.sh -SCRIPT_SH += git-sh-setup.sh -SCRIPT_SH += git-stash.sh -SCRIPT_SH += git-submodule.sh -SCRIPT_SH += git-web--browse.sh +# +# No scripts right now: +# -SCRIPT_PERL += git-add--interactive.perl -SCRIPT_PERL += git-difftool.perl -SCRIPT_PERL += git-archimport.perl -SCRIPT_PERL += git-cvsexportcommit.perl -SCRIPT_PERL += git-cvsimport.perl -SCRIPT_PERL += git-cvsserver.perl -SCRIPT_PERL += git-relink.perl -SCRIPT_PERL += git-send-email.perl -SCRIPT_PERL += git-svn.perl +# SCRIPT_SH += perf-am.sh + +# +# No Perl scripts right now: +# + +# SCRIPT_PERL += perf-add--interactive.perl SCRIPTS = $(patsubst %.sh,%,$(SCRIPT_SH)) \ - $(patsubst %.perl,%,$(SCRIPT_PERL)) \ - git-instaweb + $(patsubst %.perl,%,$(SCRIPT_PERL)) # Empty... EXTRA_PROGRAMS = -# ... and all the rest that could be moved out of bindir to gitexecdir +# ... and all the rest that could be moved out of bindir to perfexecdir PROGRAMS += $(EXTRA_PROGRAMS) -PROGRAMS += git-fast-import$X -PROGRAMS += git-hash-object$X -PROGRAMS += git-index-pack$X -PROGRAMS += git-merge-index$X -PROGRAMS += git-merge-tree$X -PROGRAMS += git-mktag$X -PROGRAMS += git-mktree$X -PROGRAMS += git-pack-redundant$X -PROGRAMS += git-patch-id$X -PROGRAMS += git-shell$X -PROGRAMS += git-show-index$X -PROGRAMS += git-unpack-file$X -PROGRAMS += git-update-server-info$X -PROGRAMS += git-upload-pack$X -PROGRAMS += git-var$X + +# +# None right now: +# +# PROGRAMS += perf-fast-import$X # List built-in command $C whose implementation cmd_$C() is not in # builtin-$C.o but is linked in as part of some other command. -BUILT_INS += $(patsubst builtin-%.o,git-%$X,$(BUILTIN_OBJS)) +BUILT_INS += $(patsubst builtin-%.o,perf-%$X,$(BUILTIN_OBJS)) -BUILT_INS += git-cherry$X -BUILT_INS += git-cherry-pick$X -BUILT_INS += git-format-patch$X -BUILT_INS += git-fsck-objects$X -BUILT_INS += git-get-tar-commit-id$X -BUILT_INS += git-init$X -BUILT_INS += git-merge-subtree$X -BUILT_INS += git-peek-remote$X -BUILT_INS += git-repo-config$X -BUILT_INS += git-show$X -BUILT_INS += git-stage$X -BUILT_INS += git-status$X -BUILT_INS += git-whatchanged$X +# +# None right now: +# +# BUILT_INS += perf-init $X -# what 'all' will build and 'install' will install, in gitexecdir +# what 'all' will build and 'install' will install, in perfexecdir ALL_PROGRAMS = $(PROGRAMS) $(SCRIPTS) -# what 'all' will build but not install in gitexecdir -OTHER_PROGRAMS = git$X -ifndef NO_PERL -OTHER_PROGRAMS += gitweb/gitweb.cgi -endif +# what 'all' will build but not install in perfexecdir +OTHER_PROGRAMS = perf$X # Set paths to tools early so that they can be used for version tests. ifndef SHELL_PATH @@ -371,250 +298,34 @@ endif export PERL_PATH -LIB_FILE=libgit.a -XDIFF_LIB=xdiff/lib.a +LIB_FILE=libperf.a -LIB_H += archive.h -LIB_H += attr.h -LIB_H += blob.h -LIB_H += builtin.h -LIB_H += cache.h -LIB_H += cache-tree.h -LIB_H += commit.h -LIB_H += compat/cygwin.h -LIB_H += compat/mingw.h -LIB_H += csum-file.h -LIB_H += decorate.h -LIB_H += delta.h -LIB_H += diffcore.h -LIB_H += diff.h -LIB_H += dir.h -LIB_H += fsck.h -LIB_H += git-compat-util.h -LIB_H += graph.h -LIB_H += grep.h -LIB_H += hash.h -LIB_H += help.h +LIB_H += ../../include/linux/perf_counter.h LIB_H += levenshtein.h -LIB_H += list-objects.h -LIB_H += ll-merge.h -LIB_H += log-tree.h -LIB_H += mailmap.h -LIB_H += merge-recursive.h -LIB_H += object.h -LIB_H += pack.h -LIB_H += pack-refs.h -LIB_H += pack-revindex.h LIB_H += parse-options.h -LIB_H += patch-ids.h -LIB_H += pkt-line.h -LIB_H += progress.h LIB_H += quote.h -LIB_H += reflog-walk.h -LIB_H += refs.h -LIB_H += remote.h -LIB_H += rerere.h -LIB_H += revision.h -LIB_H += run-command.h -LIB_H += sha1-lookup.h -LIB_H += sideband.h -LIB_H += sigchain.h LIB_H += strbuf.h -LIB_H += string-list.h -LIB_H += tag.h -LIB_H += transport.h -LIB_H += tree.h -LIB_H += tree-walk.h -LIB_H += unpack-trees.h -LIB_H += userdiff.h -LIB_H += utf8.h -LIB_H += wt-status.h +LIB_H += run-command.h LIB_OBJS += abspath.o LIB_OBJS += alias.o -LIB_OBJS += alloc.o -LIB_OBJS += archive.o -LIB_OBJS += archive-tar.o -LIB_OBJS += archive-zip.o -LIB_OBJS += attr.o -LIB_OBJS += base85.o -LIB_OBJS += bisect.o -LIB_OBJS += blob.o -LIB_OBJS += branch.o -LIB_OBJS += bundle.o -LIB_OBJS += cache-tree.o -LIB_OBJS += color.o -LIB_OBJS += combine-diff.o -LIB_OBJS += commit.o LIB_OBJS += config.o -LIB_OBJS += connect.o -LIB_OBJS += convert.o -LIB_OBJS += copy.o -LIB_OBJS += csum-file.o LIB_OBJS += ctype.o -LIB_OBJS += date.o -LIB_OBJS += decorate.o -LIB_OBJS += diffcore-break.o -LIB_OBJS += diffcore-delta.o -LIB_OBJS += diffcore-order.o -LIB_OBJS += diffcore-pickaxe.o -LIB_OBJS += diffcore-rename.o -LIB_OBJS += diff-delta.o -LIB_OBJS += diff-lib.o -LIB_OBJS += diff-no-index.o -LIB_OBJS += diff.o -LIB_OBJS += dir.o -LIB_OBJS += editor.o -LIB_OBJS += entry.o -LIB_OBJS += environment.o LIB_OBJS += exec_cmd.o -LIB_OBJS += fsck.o -LIB_OBJS += graph.o -LIB_OBJS += grep.o -LIB_OBJS += hash.o LIB_OBJS += help.o -LIB_OBJS += ident.o LIB_OBJS += levenshtein.o -LIB_OBJS += list-objects.o -LIB_OBJS += ll-merge.o -LIB_OBJS += lockfile.o -LIB_OBJS += log-tree.o -LIB_OBJS += mailmap.o -LIB_OBJS += match-trees.o -LIB_OBJS += merge-file.o -LIB_OBJS += merge-recursive.o -LIB_OBJS += name-hash.o -LIB_OBJS += object.o -LIB_OBJS += pack-check.o -LIB_OBJS += pack-refs.o -LIB_OBJS += pack-revindex.o -LIB_OBJS += pack-write.o -LIB_OBJS += pager.o LIB_OBJS += parse-options.o -LIB_OBJS += patch-delta.o -LIB_OBJS += patch-ids.o LIB_OBJS += path.o -LIB_OBJS += pkt-line.o -LIB_OBJS += preload-index.o -LIB_OBJS += pretty.o -LIB_OBJS += progress.o -LIB_OBJS += quote.o -LIB_OBJS += reachable.o -LIB_OBJS += read-cache.o -LIB_OBJS += reflog-walk.o -LIB_OBJS += refs.o -LIB_OBJS += remote.o -LIB_OBJS += rerere.o -LIB_OBJS += revision.o LIB_OBJS += run-command.o -LIB_OBJS += server-info.o -LIB_OBJS += setup.o -LIB_OBJS += sha1-lookup.o -LIB_OBJS += sha1_file.o -LIB_OBJS += sha1_name.o -LIB_OBJS += shallow.o -LIB_OBJS += sideband.o -LIB_OBJS += sigchain.o +LIB_OBJS += quote.o LIB_OBJS += strbuf.o -LIB_OBJS += string-list.o -LIB_OBJS += symlinks.o -LIB_OBJS += tag.o -LIB_OBJS += trace.o -LIB_OBJS += transport.o -LIB_OBJS += tree-diff.o -LIB_OBJS += tree.o -LIB_OBJS += tree-walk.o -LIB_OBJS += unpack-trees.o LIB_OBJS += usage.o -LIB_OBJS += userdiff.o -LIB_OBJS += utf8.o -LIB_OBJS += walker.o LIB_OBJS += wrapper.o -LIB_OBJS += write_or_die.o -LIB_OBJS += ws.o -LIB_OBJS += wt-status.o -LIB_OBJS += xdiff-interface.o -BUILTIN_OBJS += builtin-add.o -BUILTIN_OBJS += builtin-annotate.o -BUILTIN_OBJS += builtin-apply.o -BUILTIN_OBJS += builtin-archive.o -BUILTIN_OBJS += builtin-bisect--helper.o -BUILTIN_OBJS += builtin-blame.o -BUILTIN_OBJS += builtin-branch.o -BUILTIN_OBJS += builtin-bundle.o -BUILTIN_OBJS += builtin-cat-file.o -BUILTIN_OBJS += builtin-check-attr.o -BUILTIN_OBJS += builtin-check-ref-format.o -BUILTIN_OBJS += builtin-checkout-index.o -BUILTIN_OBJS += builtin-checkout.o -BUILTIN_OBJS += builtin-clean.o -BUILTIN_OBJS += builtin-clone.o -BUILTIN_OBJS += builtin-commit-tree.o -BUILTIN_OBJS += builtin-commit.o -BUILTIN_OBJS += builtin-config.o -BUILTIN_OBJS += builtin-count-objects.o -BUILTIN_OBJS += builtin-describe.o -BUILTIN_OBJS += builtin-diff-files.o -BUILTIN_OBJS += builtin-diff-index.o -BUILTIN_OBJS += builtin-diff-tree.o -BUILTIN_OBJS += builtin-diff.o -BUILTIN_OBJS += builtin-fast-export.o -BUILTIN_OBJS += builtin-fetch--tool.o -BUILTIN_OBJS += builtin-fetch-pack.o -BUILTIN_OBJS += builtin-fetch.o -BUILTIN_OBJS += builtin-fmt-merge-msg.o -BUILTIN_OBJS += builtin-for-each-ref.o -BUILTIN_OBJS += builtin-fsck.o -BUILTIN_OBJS += builtin-gc.o -BUILTIN_OBJS += builtin-grep.o BUILTIN_OBJS += builtin-help.o -BUILTIN_OBJS += builtin-init-db.o -BUILTIN_OBJS += builtin-log.o -BUILTIN_OBJS += builtin-ls-files.o -BUILTIN_OBJS += builtin-ls-remote.o -BUILTIN_OBJS += builtin-ls-tree.o -BUILTIN_OBJS += builtin-mailinfo.o -BUILTIN_OBJS += builtin-mailsplit.o -BUILTIN_OBJS += builtin-merge.o -BUILTIN_OBJS += builtin-merge-base.o -BUILTIN_OBJS += builtin-merge-file.o -BUILTIN_OBJS += builtin-merge-ours.o -BUILTIN_OBJS += builtin-merge-recursive.o -BUILTIN_OBJS += builtin-mv.o -BUILTIN_OBJS += builtin-name-rev.o -BUILTIN_OBJS += builtin-pack-objects.o -BUILTIN_OBJS += builtin-pack-refs.o -BUILTIN_OBJS += builtin-prune-packed.o -BUILTIN_OBJS += builtin-prune.o -BUILTIN_OBJS += builtin-push.o -BUILTIN_OBJS += builtin-read-tree.o -BUILTIN_OBJS += builtin-receive-pack.o -BUILTIN_OBJS += builtin-reflog.o -BUILTIN_OBJS += builtin-remote.o -BUILTIN_OBJS += builtin-rerere.o -BUILTIN_OBJS += builtin-reset.o -BUILTIN_OBJS += builtin-rev-list.o -BUILTIN_OBJS += builtin-rev-parse.o -BUILTIN_OBJS += builtin-revert.o -BUILTIN_OBJS += builtin-rm.o -BUILTIN_OBJS += builtin-send-pack.o -BUILTIN_OBJS += builtin-shortlog.o -BUILTIN_OBJS += builtin-show-branch.o -BUILTIN_OBJS += builtin-show-ref.o -BUILTIN_OBJS += builtin-stripspace.o -BUILTIN_OBJS += builtin-symbolic-ref.o -BUILTIN_OBJS += builtin-tag.o -BUILTIN_OBJS += builtin-tar-tree.o -BUILTIN_OBJS += builtin-unpack-objects.o -BUILTIN_OBJS += builtin-update-index.o -BUILTIN_OBJS += builtin-update-ref.o -BUILTIN_OBJS += builtin-upload-archive.o -BUILTIN_OBJS += builtin-verify-pack.o -BUILTIN_OBJS += builtin-verify-tag.o -BUILTIN_OBJS += builtin-write-tree.o +BUILTIN_OBJS += builtin-top.o -GITLIBS = $(LIB_FILE) $(XDIFF_LIB) +PERFLIBS = $(LIB_FILE) EXTLIBS = # @@ -625,221 +336,6 @@ EXTLIBS = # because maintaining the nesting to match is a pain. If # we had "elif" things would have been much nicer... -ifeq ($(uname_S),Linux) - NO_STRLCPY = YesPlease - THREADED_DELTA_SEARCH = YesPlease -endif -ifeq ($(uname_S),GNU/kFreeBSD) - NO_STRLCPY = YesPlease - THREADED_DELTA_SEARCH = YesPlease -endif -ifeq ($(uname_S),UnixWare) - CC = cc - NEEDS_SOCKET = YesPlease - NEEDS_NSL = YesPlease - NEEDS_SSL_WITH_CRYPTO = YesPlease - NEEDS_LIBICONV = YesPlease - SHELL_PATH = /usr/local/bin/bash - NO_IPV6 = YesPlease - NO_HSTRERROR = YesPlease - BASIC_CFLAGS += -Kthread - BASIC_CFLAGS += -I/usr/local/include - BASIC_LDFLAGS += -L/usr/local/lib - INSTALL = ginstall - TAR = gtar - NO_STRCASESTR = YesPlease - NO_MEMMEM = YesPlease -endif -ifeq ($(uname_S),SCO_SV) - ifeq ($(uname_R),3.2) - CFLAGS = -O2 - endif - ifeq ($(uname_R),5) - CC = cc - BASIC_CFLAGS += -Kthread - endif - NEEDS_SOCKET = YesPlease - NEEDS_NSL = YesPlease - NEEDS_SSL_WITH_CRYPTO = YesPlease - NEEDS_LIBICONV = YesPlease - SHELL_PATH = /usr/bin/bash - NO_IPV6 = YesPlease - NO_HSTRERROR = YesPlease - BASIC_CFLAGS += -I/usr/local/include - BASIC_LDFLAGS += -L/usr/local/lib - NO_STRCASESTR = YesPlease - NO_MEMMEM = YesPlease - INSTALL = ginstall - TAR = gtar -endif -ifeq ($(uname_S),Darwin) - NEEDS_SSL_WITH_CRYPTO = YesPlease - NEEDS_LIBICONV = YesPlease - ifeq ($(shell expr "$(uname_R)" : '[15678]\.'),2) - OLD_ICONV = UnfortunatelyYes - endif - ifeq ($(shell expr "$(uname_R)" : '[15]\.'),2) - NO_STRLCPY = YesPlease - endif - NO_MEMMEM = YesPlease - THREADED_DELTA_SEARCH = YesPlease - USE_ST_TIMESPEC = YesPlease -endif -ifeq ($(uname_S),SunOS) - NEEDS_SOCKET = YesPlease - NEEDS_NSL = YesPlease - SHELL_PATH = /bin/bash - NO_STRCASESTR = YesPlease - NO_MEMMEM = YesPlease - NO_HSTRERROR = YesPlease - NO_MKDTEMP = YesPlease - OLD_ICONV = UnfortunatelyYes - ifeq ($(uname_R),5.8) - NO_UNSETENV = YesPlease - NO_SETENV = YesPlease - NO_C99_FORMAT = YesPlease - NO_STRTOUMAX = YesPlease - endif - ifeq ($(uname_R),5.9) - NO_UNSETENV = YesPlease - NO_SETENV = YesPlease - NO_C99_FORMAT = YesPlease - NO_STRTOUMAX = YesPlease - endif - INSTALL = ginstall - TAR = gtar - BASIC_CFLAGS += -D__EXTENSIONS__ -endif -ifeq ($(uname_O),Cygwin) - NO_D_TYPE_IN_DIRENT = YesPlease - NO_D_INO_IN_DIRENT = YesPlease - NO_STRCASESTR = YesPlease - NO_MEMMEM = YesPlease - NO_SYMLINK_HEAD = YesPlease - NEEDS_LIBICONV = YesPlease - NO_FAST_WORKING_DIRECTORY = UnfortunatelyYes - NO_TRUSTABLE_FILEMODE = UnfortunatelyYes - OLD_ICONV = UnfortunatelyYes - # There are conflicting reports about this. - # On some boxes NO_MMAP is needed, and not so elsewhere. - # Try commenting this out if you suspect MMAP is more efficient - NO_MMAP = YesPlease - NO_IPV6 = YesPlease - X = .exe -endif -ifeq ($(uname_S),FreeBSD) - NEEDS_LIBICONV = YesPlease - NO_MEMMEM = YesPlease - BASIC_CFLAGS += -I/usr/local/include - BASIC_LDFLAGS += -L/usr/local/lib - DIR_HAS_BSD_GROUP_SEMANTICS = YesPlease - USE_ST_TIMESPEC = YesPlease - THREADED_DELTA_SEARCH = YesPlease - ifeq ($(shell expr "$(uname_R)" : '4\.'),2) - PTHREAD_LIBS = -pthread - NO_UINTMAX_T = YesPlease - NO_STRTOUMAX = YesPlease - endif -endif -ifeq ($(uname_S),OpenBSD) - NO_STRCASESTR = YesPlease - NO_MEMMEM = YesPlease - NEEDS_LIBICONV = YesPlease - BASIC_CFLAGS += -I/usr/local/include - BASIC_LDFLAGS += -L/usr/local/lib - THREADED_DELTA_SEARCH = YesPlease -endif -ifeq ($(uname_S),NetBSD) - ifeq ($(shell expr "$(uname_R)" : '[01]\.'),2) - NEEDS_LIBICONV = YesPlease - endif - BASIC_CFLAGS += -I/usr/pkg/include - BASIC_LDFLAGS += -L/usr/pkg/lib $(CC_LD_DYNPATH)/usr/pkg/lib - THREADED_DELTA_SEARCH = YesPlease -endif -ifeq ($(uname_S),AIX) - NO_STRCASESTR=YesPlease - NO_MEMMEM = YesPlease - NO_MKDTEMP = YesPlease - NO_STRLCPY = YesPlease - NO_NSEC = YesPlease - FREAD_READS_DIRECTORIES = UnfortunatelyYes - INTERNAL_QSORT = UnfortunatelyYes - NEEDS_LIBICONV=YesPlease - BASIC_CFLAGS += -D_LARGE_FILES - ifneq ($(shell expr "$(uname_V)" : '[1234]'),1) - THREADED_DELTA_SEARCH = YesPlease - else - NO_PTHREADS = YesPlease - endif -endif -ifeq ($(uname_S),GNU) - # GNU/Hurd - NO_STRLCPY=YesPlease -endif -ifeq ($(uname_S),IRIX64) - NO_IPV6=YesPlease - NO_SETENV=YesPlease - NO_STRCASESTR=YesPlease - NO_MEMMEM = YesPlease - NO_STRLCPY = YesPlease - NO_SOCKADDR_STORAGE=YesPlease - SHELL_PATH=/usr/gnu/bin/bash - BASIC_CFLAGS += -DPATH_MAX=1024 - # for now, build 32-bit version - BASIC_LDFLAGS += -L/usr/lib32 -endif -ifeq ($(uname_S),HP-UX) - NO_IPV6=YesPlease - NO_SETENV=YesPlease - NO_STRCASESTR=YesPlease - NO_MEMMEM = YesPlease - NO_STRLCPY = YesPlease - NO_MKDTEMP = YesPlease - NO_UNSETENV = YesPlease - NO_HSTRERROR = YesPlease - NO_SYS_SELECT_H = YesPlease - SNPRINTF_RETURNS_BOGUS = YesPlease -endif -ifneq (,$(findstring CYGWIN,$(uname_S))) - COMPAT_OBJS += compat/cygwin.o -endif -ifneq (,$(findstring MINGW,$(uname_S))) - NO_PREAD = YesPlease - NO_OPENSSL = YesPlease - NO_CURL = YesPlease - NO_SYMLINK_HEAD = YesPlease - NO_IPV6 = YesPlease - NO_SETENV = YesPlease - NO_UNSETENV = YesPlease - NO_STRCASESTR = YesPlease - NO_STRLCPY = YesPlease - NO_MEMMEM = YesPlease - NO_PTHREADS = YesPlease - NEEDS_LIBICONV = YesPlease - OLD_ICONV = YesPlease - NO_C99_FORMAT = YesPlease - NO_STRTOUMAX = YesPlease - NO_MKDTEMP = YesPlease - SNPRINTF_RETURNS_BOGUS = YesPlease - NO_SVN_TESTS = YesPlease - NO_PERL_MAKEMAKER = YesPlease - RUNTIME_PREFIX = YesPlease - NO_POSIX_ONLY_PROGRAMS = YesPlease - NO_ST_BLOCKS_IN_STRUCT_STAT = YesPlease - NO_NSEC = YesPlease - USE_WIN32_MMAP = YesPlease - COMPAT_CFLAGS += -D__USE_MINGW_ACCESS -DNOGDI -Icompat -Icompat/regex -Icompat/fnmatch - COMPAT_CFLAGS += -DSNPRINTF_SIZE_CORR=1 - COMPAT_CFLAGS += -DSTRIP_EXTENSION=\".exe\" - COMPAT_OBJS += compat/mingw.o compat/fnmatch/fnmatch.o compat/regex/regex.o compat/winansi.o - EXTLIBS += -lws2_32 - X = .exe -endif -ifneq (,$(findstring arm,$(uname_M))) - ARM_SHA1 = YesPlease -endif - -include config.mak.autogen -include config.mak @@ -869,72 +365,12 @@ ifndef CC_LD_DYNPATH endif endif -ifdef NO_CURL - BASIC_CFLAGS += -DNO_CURL -else - ifdef CURLDIR - # Try "-Wl,-rpath=$(CURLDIR)/$(lib)" in such a case. - BASIC_CFLAGS += -I$(CURLDIR)/include - CURL_LIBCURL = -L$(CURLDIR)/$(lib) $(CC_LD_DYNPATH)$(CURLDIR)/$(lib) -lcurl - else - CURL_LIBCURL = -lcurl - endif - BUILTIN_OBJS += builtin-http-fetch.o - EXTLIBS += $(CURL_LIBCURL) - LIB_OBJS += http.o http-walker.o - curl_check := $(shell (echo 070908; curl-config --vernum) | sort -r | sed -ne 2p) - ifeq "$(curl_check)" "070908" - ifndef NO_EXPAT - PROGRAMS += git-http-push$X - endif - endif - ifndef NO_EXPAT - ifdef EXPATDIR - BASIC_CFLAGS += -I$(EXPATDIR)/include - EXPAT_LIBEXPAT = -L$(EXPATDIR)/$(lib) $(CC_LD_DYNPATH)$(EXPATDIR)/$(lib) -lexpat - else - EXPAT_LIBEXPAT = -lexpat - endif - endif -endif - ifdef ZLIB_PATH BASIC_CFLAGS += -I$(ZLIB_PATH)/include EXTLIBS += -L$(ZLIB_PATH)/$(lib) $(CC_LD_DYNPATH)$(ZLIB_PATH)/$(lib) endif EXTLIBS += -lz -ifndef NO_POSIX_ONLY_PROGRAMS - PROGRAMS += git-daemon$X - PROGRAMS += git-imap-send$X -endif -ifndef NO_OPENSSL - OPENSSL_LIBSSL = -lssl - ifdef OPENSSLDIR - BASIC_CFLAGS += -I$(OPENSSLDIR)/include - OPENSSL_LINK = -L$(OPENSSLDIR)/$(lib) $(CC_LD_DYNPATH)$(OPENSSLDIR)/$(lib) - else - OPENSSL_LINK = - endif -else - BASIC_CFLAGS += -DNO_OPENSSL - MOZILLA_SHA1 = 1 - OPENSSL_LIBSSL = -endif -ifdef NEEDS_SSL_WITH_CRYPTO - LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -lssl -else - LIB_4_CRYPTO = $(OPENSSL_LINK) -lcrypto -endif -ifdef NEEDS_LIBICONV - ifdef ICONVDIR - BASIC_CFLAGS += -I$(ICONVDIR)/include - ICONV_LINK = -L$(ICONVDIR)/$(lib) $(CC_LD_DYNPATH)$(ICONVDIR)/$(lib) - else - ICONV_LINK = - endif - EXTLIBS += $(ICONV_LINK) -liconv -endif ifdef NEEDS_SOCKET EXTLIBS += -lsocket endif @@ -977,10 +413,6 @@ ifdef NO_STRCASESTR COMPAT_CFLAGS += -DNO_STRCASESTR COMPAT_OBJS += compat/strcasestr.o endif -ifdef NO_STRLCPY - COMPAT_CFLAGS += -DNO_STRLCPY - COMPAT_OBJS += compat/strlcpy.o -endif ifdef NO_STRTOUMAX COMPAT_CFLAGS += -DNO_STRTOUMAX COMPAT_OBJS += compat/strtoumax.o @@ -1090,17 +522,6 @@ ifdef RUNTIME_PREFIX COMPAT_CFLAGS += -DRUNTIME_PREFIX endif -ifdef NO_PTHREADS - THREADED_DELTA_SEARCH = - BASIC_CFLAGS += -DNO_PTHREADS -else - EXTLIBS += $(PTHREAD_LIBS) -endif - -ifdef THREADED_DELTA_SEARCH - BASIC_CFLAGS += -DTHREADED_DELTA_SEARCH - LIB_OBJS += thread-utils.o -endif ifdef DIR_HAS_BSD_GROUP_SEMANTICS COMPAT_CFLAGS += -DDIR_HAS_BSD_GROUP_SEMANTICS endif @@ -1148,14 +569,14 @@ endif # Shell quote (do not use $(call) to accommodate ancient setups); SHA1_HEADER_SQ = $(subst ','\'',$(SHA1_HEADER)) -ETC_GITCONFIG_SQ = $(subst ','\'',$(ETC_GITCONFIG)) +ETC_PERFCONFIG_SQ = $(subst ','\'',$(ETC_PERFCONFIG)) DESTDIR_SQ = $(subst ','\'',$(DESTDIR)) bindir_SQ = $(subst ','\'',$(bindir)) bindir_relative_SQ = $(subst ','\'',$(bindir_relative)) mandir_SQ = $(subst ','\'',$(mandir)) infodir_SQ = $(subst ','\'',$(infodir)) -gitexecdir_SQ = $(subst ','\'',$(gitexecdir)) +perfexecdir_SQ = $(subst ','\'',$(perfexecdir)) template_dir_SQ = $(subst ','\'',$(template_dir)) htmldir_SQ = $(subst ','\'',$(htmldir)) prefix_SQ = $(subst ','\'',$(prefix)) @@ -1164,7 +585,7 @@ SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH)) -LIBS = $(GITLIBS) $(EXTLIBS) +LIBS = $(PERFLIBS) $(EXTLIBS) BASIC_CFLAGS += -DSHA1_HEADER='$(SHA1_HEADER_SQ)' \ $(COMPAT_CFLAGS) @@ -1180,15 +601,15 @@ export TAR INSTALL DESTDIR SHELL_PATH SHELL = $(SHELL_PATH) -all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) GIT-BUILD-OPTIONS +all:: shell_compatibility_test $(ALL_PROGRAMS) $(BUILT_INS) $(OTHER_PROGRAMS) PERF-BUILD-OPTIONS ifneq (,$X) - $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), test '$p' -ef '$p$X' || $(RM) '$p';) + $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), test '$p' -ef '$p$X' || $(RM) '$p';) endif all:: ifndef NO_TCLTK - $(QUIET_SUBDIR0)git-gui $(QUIET_SUBDIR1) gitexecdir='$(gitexec_instdir_SQ)' all - $(QUIET_SUBDIR0)gitk-git $(QUIET_SUBDIR1) all + $(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all + $(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all endif ifndef NO_PERL $(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all @@ -1200,33 +621,33 @@ please_set_SHELL_PATH_to_a_more_modern_shell: shell_compatibility_test: please_set_SHELL_PATH_to_a_more_modern_shell -strip: $(PROGRAMS) git$X - $(STRIP) $(STRIP_OPTS) $(PROGRAMS) git$X +strip: $(PROGRAMS) perf$X + $(STRIP) $(STRIP_OPTS) $(PROGRAMS) perf$X -git.o: git.c common-cmds.h GIT-CFLAGS - $(QUIET_CC)$(CC) -DGIT_VERSION='"$(GIT_VERSION)"' \ - '-DGIT_HTML_PATH="$(htmldir_SQ)"' \ +perf.o: perf.c common-cmds.h PERF-CFLAGS + $(QUIET_CC)$(CC) -DPERF_VERSION='"$(PERF_VERSION)"' \ + '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ $(ALL_CFLAGS) -c $(filter %.c,$^) -git$X: git.o $(BUILTIN_OBJS) $(GITLIBS) - $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ git.o \ +perf$X: perf.o $(BUILTIN_OBJS) $(PERFLIBS) + $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ perf.o \ $(BUILTIN_OBJS) $(ALL_LDFLAGS) $(LIBS) -builtin-help.o: builtin-help.c common-cmds.h GIT-CFLAGS +builtin-help.o: builtin-help.c common-cmds.h PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ - '-DGIT_HTML_PATH="$(htmldir_SQ)"' \ - '-DGIT_MAN_PATH="$(mandir_SQ)"' \ - '-DGIT_INFO_PATH="$(infodir_SQ)"' $< + '-DPERF_HTML_PATH="$(htmldir_SQ)"' \ + '-DPERF_MAN_PATH="$(mandir_SQ)"' \ + '-DPERF_INFO_PATH="$(infodir_SQ)"' $< -$(BUILT_INS): git$X +$(BUILT_INS): perf$X $(QUIET_BUILT_IN)$(RM) $@ && \ - ln git$X $@ 2>/dev/null || \ - ln -s git$X $@ 2>/dev/null || \ - cp git$X $@ + ln perf$X $@ 2>/dev/null || \ + ln -s perf$X $@ 2>/dev/null || \ + cp perf$X $@ common-cmds.h: ./generate-cmdlist.sh command-list.txt -common-cmds.h: $(wildcard Documentation/git-*.txt) +common-cmds.h: $(wildcard Documentation/perf-*.txt) $(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh @@ -1234,152 +655,55 @@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ -e 's|@SHELL_PATH@|$(SHELL_PATH_SQ)|' \ -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ - -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ $@.sh >$@+ && \ chmod +x $@+ && \ mv $@+ $@ -ifndef NO_PERL -$(patsubst %.perl,%,$(SCRIPT_PERL)): perl/perl.mak - -perl/perl.mak: GIT-CFLAGS perl/Makefile perl/Makefile.PL - $(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' $(@F) - -$(patsubst %.perl,%,$(SCRIPT_PERL)): % : %.perl - $(QUIET_GEN)$(RM) $@ $@+ && \ - INSTLIBDIR=`MAKEFLAGS= $(MAKE) -C perl -s --no-print-directory instlibdir` && \ - sed -e '1{' \ - -e ' s|#!.*perl|#!$(PERL_PATH_SQ)|' \ - -e ' h' \ - -e ' s=.*=use lib (split(/:/, $$ENV{GITPERLLIB} || "@@INSTLIBDIR@@"));=' \ - -e ' H' \ - -e ' x' \ - -e '}' \ - -e 's|@@INSTLIBDIR@@|'"$$INSTLIBDIR"'|g' \ - -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ - $@.perl >$@+ && \ - chmod +x $@+ && \ - mv $@+ $@ - -gitweb/gitweb.cgi: gitweb/gitweb.perl - $(QUIET_GEN)$(RM) $@ $@+ && \ - sed -e '1s|#!.*perl|#!$(PERL_PATH_SQ)|' \ - -e 's|++GIT_VERSION++|$(GIT_VERSION)|g' \ - -e 's|++GIT_BINDIR++|$(bindir)|g' \ - -e 's|++GITWEB_CONFIG++|$(GITWEB_CONFIG)|g' \ - -e 's|++GITWEB_CONFIG_SYSTEM++|$(GITWEB_CONFIG_SYSTEM)|g' \ - -e 's|++GITWEB_HOME_LINK_STR++|$(GITWEB_HOME_LINK_STR)|g' \ - -e 's|++GITWEB_SITENAME++|$(GITWEB_SITENAME)|g' \ - -e 's|++GITWEB_PROJECTROOT++|$(GITWEB_PROJECTROOT)|g' \ - -e 's|"++GITWEB_PROJECT_MAXDEPTH++"|$(GITWEB_PROJECT_MAXDEPTH)|g' \ - -e 's|++GITWEB_EXPORT_OK++|$(GITWEB_EXPORT_OK)|g' \ - -e 's|++GITWEB_STRICT_EXPORT++|$(GITWEB_STRICT_EXPORT)|g' \ - -e 's|++GITWEB_BASE_URL++|$(GITWEB_BASE_URL)|g' \ - -e 's|++GITWEB_LIST++|$(GITWEB_LIST)|g' \ - -e 's|++GITWEB_HOMETEXT++|$(GITWEB_HOMETEXT)|g' \ - -e 's|++GITWEB_CSS++|$(GITWEB_CSS)|g' \ - -e 's|++GITWEB_LOGO++|$(GITWEB_LOGO)|g' \ - -e 's|++GITWEB_FAVICON++|$(GITWEB_FAVICON)|g' \ - -e 's|++GITWEB_SITE_HEADER++|$(GITWEB_SITE_HEADER)|g' \ - -e 's|++GITWEB_SITE_FOOTER++|$(GITWEB_SITE_FOOTER)|g' \ - $< >$@+ && \ - chmod +x $@+ && \ - mv $@+ $@ - -git-instaweb: git-instaweb.sh gitweb/gitweb.cgi gitweb/gitweb.css - $(QUIET_GEN)$(RM) $@ $@+ && \ - sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ - -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ - -e 's/@@NO_CURL@@/$(NO_CURL)/g' \ - -e '/@@GITWEB_CGI@@/r gitweb/gitweb.cgi' \ - -e '/@@GITWEB_CGI@@/d' \ - -e '/@@GITWEB_CSS@@/r gitweb/gitweb.css' \ - -e '/@@GITWEB_CSS@@/d' \ - -e 's|@@PERL@@|$(PERL_PATH_SQ)|g' \ - $@.sh > $@+ && \ - chmod +x $@+ && \ - mv $@+ $@ -else # NO_PERL -$(patsubst %.perl,%,$(SCRIPT_PERL)) git-instaweb: % : unimplemented.sh - $(QUIET_GEN)$(RM) $@ $@+ && \ - sed -e '1s|#!.*/sh|#!$(SHELL_PATH_SQ)|' \ - -e 's|@@REASON@@|NO_PERL=$(NO_PERL)|g' \ - unimplemented.sh >$@+ && \ - chmod +x $@+ && \ - mv $@+ $@ -endif # NO_PERL - configure: configure.ac $(QUIET_GEN)$(RM) $@ $<+ && \ - sed -e 's/@@GIT_VERSION@@/$(GIT_VERSION)/g' \ + sed -e 's/@@PERF_VERSION@@/$(PERF_VERSION)/g' \ $< > $<+ && \ autoconf -o $@ $<+ && \ $(RM) $<+ -# These can record GIT_VERSION -git.o git.spec \ +# These can record PERF_VERSION +perf.o perf.spec \ $(patsubst %.sh,%,$(SCRIPT_SH)) \ $(patsubst %.perl,%,$(SCRIPT_PERL)) \ - : GIT-VERSION-FILE + : PERF-VERSION-FILE -%.o: %.c GIT-CFLAGS +%.o: %.c PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< -%.s: %.c GIT-CFLAGS +%.s: %.c PERF-CFLAGS $(QUIET_CC)$(CC) -S $(ALL_CFLAGS) $< %.o: %.S $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< -exec_cmd.o: exec_cmd.c GIT-CFLAGS +exec_cmd.o: exec_cmd.c PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ - '-DGIT_EXEC_PATH="$(gitexecdir_SQ)"' \ + '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \ '-DBINDIR="$(bindir_relative_SQ)"' \ '-DPREFIX="$(prefix_SQ)"' \ $< -builtin-init-db.o: builtin-init-db.c GIT-CFLAGS - $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_GIT_TEMPLATE_DIR='"$(template_dir_SQ)"' $< +builtin-init-db.o: builtin-init-db.c PERF-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $< -config.o: config.c GIT-CFLAGS - $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_GITCONFIG='"$(ETC_GITCONFIG_SQ)"' $< +config.o: config.c PERF-CFLAGS + $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< -http.o: http.c GIT-CFLAGS - $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DGIT_USER_AGENT='"git/$(GIT_VERSION)"' $< - -ifdef NO_EXPAT -http-walker.o: http-walker.c http.h GIT-CFLAGS - $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DNO_EXPAT $< -endif - -git-%$X: %.o $(GITLIBS) +perf-%$X: %.o $(PERFLIBS) $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) -git-imap-send$X: imap-send.o $(GITLIBS) - $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ - $(LIBS) $(OPENSSL_LINK) $(OPENSSL_LIBSSL) - -http.o http-walker.o http-push.o transport.o: http.h - -git-http-push$X: revision.o http.o http-push.o $(GITLIBS) - $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) \ - $(LIBS) $(CURL_LIBCURL) $(EXPAT_LIBEXPAT) - $(LIB_OBJS) $(BUILTIN_OBJS): $(LIB_H) -$(patsubst git-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) +$(patsubst perf-%$X,%.o,$(PROGRAMS)): $(LIB_H) $(wildcard */*.h) builtin-revert.o wt-status.o: wt-status.h $(LIB_FILE): $(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) -XDIFF_OBJS=xdiff/xdiffi.o xdiff/xprepare.o xdiff/xutils.o xdiff/xemit.o \ - xdiff/xmerge.o xdiff/xpatience.o -$(XDIFF_OBJS): xdiff/xinclude.h xdiff/xmacros.h xdiff/xdiff.h xdiff/xtypes.h \ - xdiff/xutils.h xdiff/xprepare.h xdiff/xdiffi.h xdiff/xemit.h - -$(XDIFF_LIB): $(XDIFF_OBJS) - $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(XDIFF_OBJS) - - doc: $(MAKE) -C Documentation all @@ -1409,19 +733,19 @@ cscope: ### Detect prefix changes TRACK_CFLAGS = $(subst ','\'',$(ALL_CFLAGS)):\ - $(bindir_SQ):$(gitexecdir_SQ):$(template_dir_SQ):$(prefix_SQ) + $(bindir_SQ):$(perfexecdir_SQ):$(template_dir_SQ):$(prefix_SQ) -GIT-CFLAGS: .FORCE-GIT-CFLAGS +PERF-CFLAGS: .FORCE-PERF-CFLAGS @FLAGS='$(TRACK_CFLAGS)'; \ - if test x"$$FLAGS" != x"`cat GIT-CFLAGS 2>/dev/null`" ; then \ + if test x"$$FLAGS" != x"`cat PERF-CFLAGS 2>/dev/null`" ; then \ echo 1>&2 " * new build flags or prefix"; \ - echo "$$FLAGS" >GIT-CFLAGS; \ + echo "$$FLAGS" >PERF-CFLAGS; \ fi # We need to apply sq twice, once to protect from the shell -# that runs GIT-BUILD-OPTIONS, and then again to protect it +# that runs PERF-BUILD-OPTIONS, and then again to protect it # and the first level quoting from the shell that runs "echo". -GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS +PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS @echo SHELL_PATH=\''$(subst ','\'',$(SHELL_PATH_SQ))'\' >$@ @echo TAR=\''$(subst ','\'',$(subst ','\'',$(TAR)))'\' >>$@ @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ @@ -1431,14 +755,14 @@ GIT-BUILD-OPTIONS: .FORCE-GIT-BUILD-OPTIONS ifndef NO_TCLTK TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)') -GIT-GUI-VARS: .FORCE-GIT-GUI-VARS +PERF-GUI-VARS: .FORCE-PERF-GUI-VARS @VARS='$(TRACK_VARS)'; \ if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \ echo 1>&2 " * new Tcl/Tk interpreter location"; \ echo "$$VARS" >$@; \ fi -.PHONY: .FORCE-GIT-GUI-VARS +.PHONY: .FORCE-PERF-GUI-VARS endif ### Testing rules @@ -1476,7 +800,7 @@ test-parse-options$X: parse-options.o .PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS)) -test-%$X: test-%.o $(GITLIBS) +test-%$X: test-%.o $(PERFLIBS) $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) check-sha1:: test-sha1$X @@ -1506,40 +830,40 @@ template_instdir = $(prefix)/$(template_dir) endif export template_instdir -ifneq ($(filter /%,$(firstword $(gitexecdir))),) -gitexec_instdir = $(gitexecdir) +ifneq ($(filter /%,$(firstword $(perfexecdir))),) +perfexec_instdir = $(perfexecdir) else -gitexec_instdir = $(prefix)/$(gitexecdir) +perfexec_instdir = $(prefix)/$(perfexecdir) endif -gitexec_instdir_SQ = $(subst ','\'',$(gitexec_instdir)) -export gitexec_instdir +perfexec_instdir_SQ = $(subst ','\'',$(perfexec_instdir)) +export perfexec_instdir install: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' - $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' - $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' - $(INSTALL) git$X git-upload-pack$X git-receive-pack$X git-upload-archive$X git-shell$X git-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)' + $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' + $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' + $(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)' $(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install $(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install ifndef NO_TCLTK - $(MAKE) -C gitk-git install - $(MAKE) -C git-gui gitexecdir='$(gitexec_instdir_SQ)' install + $(MAKE) -C perfk-perf install + $(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install endif ifneq (,$X) - $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) git$X)), $(RM) '$(DESTDIR_SQ)$(gitexec_instdir_SQ)/$p';) + $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';) endif bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \ - execdir=$$(cd '$(DESTDIR_SQ)$(gitexec_instdir_SQ)' && pwd) && \ - { $(RM) "$$execdir/git-add$X" && \ - ln "$$bindir/git$X" "$$execdir/git-add$X" 2>/dev/null || \ - cp "$$bindir/git$X" "$$execdir/git-add$X"; } && \ - { for p in $(filter-out git-add$X,$(BUILT_INS)); do \ + execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \ + { $(RM) "$$execdir/perf-add$X" && \ + ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \ + cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \ + { for p in $(filter-out perf-add$X,$(BUILT_INS)); do \ $(RM) "$$execdir/$$p" && \ - ln "$$execdir/git-add$X" "$$execdir/$$p" 2>/dev/null || \ - ln -s "git-add$X" "$$execdir/$$p" 2>/dev/null || \ - cp "$$execdir/git-add$X" "$$execdir/$$p" || exit; \ + ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \ + ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \ + cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \ done } && \ - ./check_bindir "z$$bindir" "z$$execdir" "$$bindir/git-add$X" + ./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X" install-doc: $(MAKE) -C Documentation install @@ -1569,31 +893,31 @@ quick-install-html: ### Maintainer's dist rules -git.spec: git.spec.in - sed -e 's/@@VERSION@@/$(GIT_VERSION)/g' < $< > $@+ +perf.spec: perf.spec.in + sed -e 's/@@VERSION@@/$(PERF_VERSION)/g' < $< > $@+ mv $@+ $@ -GIT_TARNAME=git-$(GIT_VERSION) -dist: git.spec git-archive$(X) configure - ./git-archive --format=tar \ - --prefix=$(GIT_TARNAME)/ HEAD^{tree} > $(GIT_TARNAME).tar - @mkdir -p $(GIT_TARNAME) - @cp git.spec configure $(GIT_TARNAME) - @echo $(GIT_VERSION) > $(GIT_TARNAME)/version - @$(MAKE) -C git-gui TARDIR=../$(GIT_TARNAME)/git-gui dist-version - $(TAR) rf $(GIT_TARNAME).tar \ - $(GIT_TARNAME)/git.spec \ - $(GIT_TARNAME)/configure \ - $(GIT_TARNAME)/version \ - $(GIT_TARNAME)/git-gui/version - @$(RM) -r $(GIT_TARNAME) - gzip -f -9 $(GIT_TARNAME).tar +PERF_TARNAME=perf-$(PERF_VERSION) +dist: perf.spec perf-archive$(X) configure + ./perf-archive --format=tar \ + --prefix=$(PERF_TARNAME)/ HEAD^{tree} > $(PERF_TARNAME).tar + @mkdir -p $(PERF_TARNAME) + @cp perf.spec configure $(PERF_TARNAME) + @echo $(PERF_VERSION) > $(PERF_TARNAME)/version + @$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version + $(TAR) rf $(PERF_TARNAME).tar \ + $(PERF_TARNAME)/perf.spec \ + $(PERF_TARNAME)/configure \ + $(PERF_TARNAME)/version \ + $(PERF_TARNAME)/perf-gui/version + @$(RM) -r $(PERF_TARNAME) + gzip -f -9 $(PERF_TARNAME).tar rpm: dist - $(RPMBUILD) -ta $(GIT_TARNAME).tar.gz + $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz -htmldocs = git-htmldocs-$(GIT_VERSION) -manpages = git-manpages-$(GIT_VERSION) +htmldocs = perf-htmldocs-$(PERF_VERSION) +manpages = perf-manpages-$(PERF_VERSION) dist-doc: $(RM) -r .doc-tmp-dir mkdir .doc-tmp-dir @@ -1618,51 +942,46 @@ distclean: clean $(RM) configure clean: - $(RM) *.o mozilla-sha1/*.o arm/*.o ppc/*.o compat/*.o xdiff/*.o \ - $(LIB_FILE) $(XDIFF_LIB) - $(RM) $(ALL_PROGRAMS) $(BUILT_INS) git$X + $(RM) *.o $(LIB_FILE) + $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X $(RM) $(TEST_PROGRAMS) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope* $(RM) -r autom4te.cache $(RM) config.log config.mak.autogen config.mak.append config.status config.cache - $(RM) -r $(GIT_TARNAME) .doc-tmp-dir - $(RM) $(GIT_TARNAME).tar.gz git-core_$(GIT_VERSION)-*.tar.gz + $(RM) -r $(PERF_TARNAME) .doc-tmp-dir + $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz $(RM) $(htmldocs).tar.gz $(manpages).tar.gz $(MAKE) -C Documentation/ clean -ifndef NO_PERL - $(RM) gitweb/gitweb.cgi - $(MAKE) -C perl clean -endif $(MAKE) -C templates/ clean $(MAKE) -C t/ clean ifndef NO_TCLTK - $(MAKE) -C gitk-git clean - $(MAKE) -C git-gui clean + $(MAKE) -C perfk-perf clean + $(MAKE) -C perf-gui clean endif - $(RM) GIT-VERSION-FILE GIT-CFLAGS GIT-GUI-VARS GIT-BUILD-OPTIONS + $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS .PHONY: all install clean strip .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell -.PHONY: .FORCE-GIT-VERSION-FILE TAGS tags cscope .FORCE-GIT-CFLAGS -.PHONY: .FORCE-GIT-BUILD-OPTIONS +.PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS +.PHONY: .FORCE-PERF-BUILD-OPTIONS ### Check documentation # check-docs:: - @(for v in $(ALL_PROGRAMS) $(BUILT_INS) git gitk; \ + @(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \ do \ case "$$v" in \ - git-merge-octopus | git-merge-ours | git-merge-recursive | \ - git-merge-resolve | git-merge-subtree | \ - git-fsck-objects | git-init-db | \ - git-?*--?* ) continue ;; \ + perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \ + perf-merge-resolve | perf-merge-subtree | \ + perf-fsck-objects | perf-init-db | \ + perf-?*--?* ) continue ;; \ esac ; \ test -f "Documentation/$$v.txt" || \ echo "no doc: $$v"; \ sed -e '/^#/d' command-list.txt | \ grep -q "^$$v[ ]" || \ case "$$v" in \ - git) ;; \ + perf) ;; \ *) echo "no link: $$v";; \ esac ; \ done; \ @@ -1670,37 +989,37 @@ check-docs:: sed -e '/^#/d' \ -e 's/[ ].*//' \ -e 's/^/listed /' command-list.txt; \ - ls -1 Documentation/git*txt | \ + ls -1 Documentation/perf*txt | \ sed -e 's|Documentation/|documented |' \ -e 's/\.txt//'; \ ) | while read how cmd; \ do \ case "$$how,$$cmd" in \ - *,git-citool | \ - *,git-gui | \ - *,git-help | \ - documented,gitattributes | \ - documented,gitignore | \ - documented,gitmodules | \ - documented,gitcli | \ - documented,git-tools | \ - documented,gitcore-tutorial | \ - documented,gitcvs-migration | \ - documented,gitdiffcore | \ - documented,gitglossary | \ - documented,githooks | \ - documented,gitrepository-layout | \ - documented,gittutorial | \ - documented,gittutorial-2 | \ + *,perf-citool | \ + *,perf-gui | \ + *,perf-help | \ + documented,perfattributes | \ + documented,perfignore | \ + documented,perfmodules | \ + documented,perfcli | \ + documented,perf-tools | \ + documented,perfcore-tutorial | \ + documented,perfcvs-migration | \ + documented,perfdiffcore | \ + documented,perfglossary | \ + documented,perfhooks | \ + documented,perfrepository-layout | \ + documented,perftutorial | \ + documented,perftutorial-2 | \ sentinel,not,matching,is,ok ) continue ;; \ esac; \ - case " $(ALL_PROGRAMS) $(BUILT_INS) git gitk " in \ + case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \ *" $$cmd "*) ;; \ *) echo "removed but $$how: $$cmd" ;; \ esac; \ done ) | sort -### Make sure built-ins do not have dups and listed in git.c +### Make sure built-ins do not have dups and listed in perf.c # check-builtins:: ./check-builtins.sh diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS new file mode 100644 index 00000000000..46d8d6ceb2f --- /dev/null +++ b/Documentation/perf_counter/PERF-BUILD-OPTIONS @@ -0,0 +1,4 @@ +SHELL_PATH='/bin/sh' +TAR='tar' +NO_CURL='' +NO_PERL='' diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS new file mode 100644 index 00000000000..f24906ca688 --- /dev/null +++ b/Documentation/perf_counter/PERF-CFLAGS @@ -0,0 +1 @@ +-g -O2 -Wall -DSHA1_HEADER='' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE new file mode 100644 index 00000000000..328e244c0c8 --- /dev/null +++ b/Documentation/perf_counter/PERF-VERSION-FILE @@ -0,0 +1 @@ +PERF_VERSION = 0.0.1.PERF diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/PERF-VERSION-GEN new file mode 100755 index 00000000000..c561d1538c0 --- /dev/null +++ b/Documentation/perf_counter/PERF-VERSION-GEN @@ -0,0 +1,42 @@ +#!/bin/sh + +GVF=PERF-VERSION-FILE +DEF_VER=v0.0.1.PERF + +LF=' +' + +# First see if there is a version file (included in release tarballs), +# then try git-describe, then default. +if test -f version +then + VN=$(cat version) || VN="$DEF_VER" +elif test -d .git -o -f .git && + VN=$(git describe --abbrev=4 HEAD 2>/dev/null) && + case "$VN" in + *$LF*) (exit 1) ;; + v[0-9]*) + git update-index -q --refresh + test -z "$(git diff-index --name-only HEAD --)" || + VN="$VN-dirty" ;; + esac +then + VN=$(echo "$VN" | sed -e 's/-/./g'); +else + VN="$DEF_VER" +fi + +VN=$(expr "$VN" : v*'\(.*\)') + +if test -r $GVF +then + VC=$(sed -e 's/^PERF_VERSION = //' <$GVF) +else + VC=unset +fi +test "$VN" = "$VC" || { + echo >&2 "PERF_VERSION = $VN" + echo "PERF_VERSION = $VN" >$GVF +} + + diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/abspath.c new file mode 100644 index 00000000000..649f34f8336 --- /dev/null +++ b/Documentation/perf_counter/abspath.c @@ -0,0 +1,117 @@ +#include "cache.h" + +/* + * Do not use this for inspecting *tracked* content. When path is a + * symlink to a directory, we do not want to say it is a directory when + * dealing with tracked content in the working tree. + */ +int is_directory(const char *path) +{ + struct stat st; + return (!stat(path, &st) && S_ISDIR(st.st_mode)); +} + +/* We allow "recursive" symbolic links. Only within reason, though. */ +#define MAXDEPTH 5 + +const char *make_absolute_path(const char *path) +{ + static char bufs[2][PATH_MAX + 1], *buf = bufs[0], *next_buf = bufs[1]; + char cwd[1024] = ""; + int buf_index = 1, len; + + int depth = MAXDEPTH; + char *last_elem = NULL; + struct stat st; + + if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) + die ("Too long path: %.*s", 60, path); + + while (depth--) { + if (!is_directory(buf)) { + char *last_slash = strrchr(buf, '/'); + if (last_slash) { + *last_slash = '\0'; + last_elem = xstrdup(last_slash + 1); + } else { + last_elem = xstrdup(buf); + *buf = '\0'; + } + } + + if (*buf) { + if (!*cwd && !getcwd(cwd, sizeof(cwd))) + die ("Could not get current working directory"); + + if (chdir(buf)) + die ("Could not switch to '%s'", buf); + } + if (!getcwd(buf, PATH_MAX)) + die ("Could not get current working directory"); + + if (last_elem) { + int len = strlen(buf); + if (len + strlen(last_elem) + 2 > PATH_MAX) + die ("Too long path name: '%s/%s'", + buf, last_elem); + buf[len] = '/'; + strcpy(buf + len + 1, last_elem); + free(last_elem); + last_elem = NULL; + } + + if (!lstat(buf, &st) && S_ISLNK(st.st_mode)) { + len = readlink(buf, next_buf, PATH_MAX); + if (len < 0) + die ("Invalid symlink: %s", buf); + if (PATH_MAX <= len) + die("symbolic link too long: %s", buf); + next_buf[len] = '\0'; + buf = next_buf; + buf_index = 1 - buf_index; + next_buf = bufs[buf_index]; + } else + break; + } + + if (*cwd && chdir(cwd)) + die ("Could not change back to '%s'", cwd); + + return buf; +} + +static const char *get_pwd_cwd(void) +{ + static char cwd[PATH_MAX + 1]; + char *pwd; + struct stat cwd_stat, pwd_stat; + if (getcwd(cwd, PATH_MAX) == NULL) + return NULL; + pwd = getenv("PWD"); + if (pwd && strcmp(pwd, cwd)) { + stat(cwd, &cwd_stat); + if (!stat(pwd, &pwd_stat) && + pwd_stat.st_dev == cwd_stat.st_dev && + pwd_stat.st_ino == cwd_stat.st_ino) { + strlcpy(cwd, pwd, PATH_MAX); + } + } + return cwd; +} + +const char *make_nonrelative_path(const char *path) +{ + static char buf[PATH_MAX + 1]; + + if (is_absolute_path(path)) { + if (strlcpy(buf, path, PATH_MAX) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } else { + const char *cwd = get_pwd_cwd(); + if (!cwd) + die("Cannot determine the current working directory"); + if (snprintf(buf, PATH_MAX, "%s/%s", cwd, path) >= PATH_MAX) + die("Too long path: %.*s", 60, path); + } + return buf; +} diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/alias.c new file mode 100644 index 00000000000..9b3dd2b428d --- /dev/null +++ b/Documentation/perf_counter/alias.c @@ -0,0 +1,77 @@ +#include "cache.h" + +static const char *alias_key; +static char *alias_val; + +static int alias_lookup_cb(const char *k, const char *v, void *cb) +{ + if (!prefixcmp(k, "alias.") && !strcmp(k+6, alias_key)) { + if (!v) + return config_error_nonbool(k); + alias_val = strdup(v); + return 0; + } + return 0; +} + +char *alias_lookup(const char *alias) +{ + alias_key = alias; + alias_val = NULL; + perf_config(alias_lookup_cb, NULL); + return alias_val; +} + +int split_cmdline(char *cmdline, const char ***argv) +{ + int src, dst, count = 0, size = 16; + char quoted = 0; + + *argv = malloc(sizeof(char*) * size); + + /* split alias_string */ + (*argv)[count++] = cmdline; + for (src = dst = 0; cmdline[src];) { + char c = cmdline[src]; + if (!quoted && isspace(c)) { + cmdline[dst++] = 0; + while (cmdline[++src] + && isspace(cmdline[src])) + ; /* skip */ + if (count >= size) { + size += 16; + *argv = realloc(*argv, sizeof(char*) * size); + } + (*argv)[count++] = cmdline + dst; + } else if (!quoted && (c == '\'' || c == '"')) { + quoted = c; + src++; + } else if (c == quoted) { + quoted = 0; + src++; + } else { + if (c == '\\' && quoted != '\'') { + src++; + c = cmdline[src]; + if (!c) { + free(*argv); + *argv = NULL; + return error("cmdline ends with \\"); + } + } + cmdline[dst++] = c; + src++; + } + } + + cmdline[dst] = 0; + + if (quoted) { + free(*argv); + *argv = NULL; + return error("unclosed quote"); + } + + return count; +} + diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c new file mode 100644 index 00000000000..125fcc2f490 --- /dev/null +++ b/Documentation/perf_counter/builtin-help.c @@ -0,0 +1,463 @@ +/* + * builtin-help.c + * + * Builtin help command + */ +#include "cache.h" +#include "builtin.h" +#include "exec_cmd.h" +#include "common-cmds.h" +#include "parse-options.h" +#include "run-command.h" +#include "help.h" + +static struct man_viewer_list { + struct man_viewer_list *next; + char name[FLEX_ARRAY]; +} *man_viewer_list; + +static struct man_viewer_info_list { + struct man_viewer_info_list *next; + const char *info; + char name[FLEX_ARRAY]; +} *man_viewer_info_list; + +enum help_format { + HELP_FORMAT_MAN, + HELP_FORMAT_INFO, + HELP_FORMAT_WEB, +}; + +static int show_all = 0; +static enum help_format help_format = HELP_FORMAT_MAN; +static struct option builtin_help_options[] = { + OPT_BOOLEAN('a', "all", &show_all, "print all available commands"), + OPT_SET_INT('m', "man", &help_format, "show man page", HELP_FORMAT_MAN), + OPT_SET_INT('w', "web", &help_format, "show manual in web browser", + HELP_FORMAT_WEB), + OPT_SET_INT('i', "info", &help_format, "show info page", + HELP_FORMAT_INFO), + OPT_END(), +}; + +static const char * const builtin_help_usage[] = { + "perf help [--all] [--man|--web|--info] [command]", + NULL +}; + +static enum help_format parse_help_format(const char *format) +{ + if (!strcmp(format, "man")) + return HELP_FORMAT_MAN; + if (!strcmp(format, "info")) + return HELP_FORMAT_INFO; + if (!strcmp(format, "web") || !strcmp(format, "html")) + return HELP_FORMAT_WEB; + die("unrecognized help format '%s'", format); +} + +static const char *get_man_viewer_info(const char *name) +{ + struct man_viewer_info_list *viewer; + + for (viewer = man_viewer_info_list; viewer; viewer = viewer->next) + { + if (!strcasecmp(name, viewer->name)) + return viewer->info; + } + return NULL; +} + +static int check_emacsclient_version(void) +{ + struct strbuf buffer = STRBUF_INIT; + struct child_process ec_process; + const char *argv_ec[] = { "emacsclient", "--version", NULL }; + int version; + + /* emacsclient prints its version number on stderr */ + memset(&ec_process, 0, sizeof(ec_process)); + ec_process.argv = argv_ec; + ec_process.err = -1; + ec_process.stdout_to_stderr = 1; + if (start_command(&ec_process)) { + fprintf(stderr, "Failed to start emacsclient.\n"); + return -1; + } + strbuf_read(&buffer, ec_process.err, 20); + close(ec_process.err); + + /* + * Don't bother checking return value, because "emacsclient --version" + * seems to always exits with code 1. + */ + finish_command(&ec_process); + + if (prefixcmp(buffer.buf, "emacsclient")) { + fprintf(stderr, "Failed to parse emacsclient version.\n"); + strbuf_release(&buffer); + return -1; + } + + strbuf_remove(&buffer, 0, strlen("emacsclient")); + version = atoi(buffer.buf); + + if (version < 22) { + fprintf(stderr, + "emacsclient version '%d' too old (< 22).\n", + version); + strbuf_release(&buffer); + return -1; + } + + strbuf_release(&buffer); + return 0; +} + +static void exec_woman_emacs(const char* path, const char *page) +{ + if (!check_emacsclient_version()) { + /* This works only with emacsclient version >= 22. */ + struct strbuf man_page = STRBUF_INIT; + + if (!path) + path = "emacsclient"; + strbuf_addf(&man_page, "(woman \"%s\")", page); + execlp(path, "emacsclient", "-e", man_page.buf, NULL); + warning("failed to exec '%s': %s", path, strerror(errno)); + } +} + +static void exec_man_konqueror(const char* path, const char *page) +{ + const char *display = getenv("DISPLAY"); + if (display && *display) { + struct strbuf man_page = STRBUF_INIT; + const char *filename = "kfmclient"; + + /* It's simpler to launch konqueror using kfmclient. */ + if (path) { + const char *file = strrchr(path, '/'); + if (file && !strcmp(file + 1, "konqueror")) { + char *new = strdup(path); + char *dest = strrchr(new, '/'); + + /* strlen("konqueror") == strlen("kfmclient") */ + strcpy(dest + 1, "kfmclient"); + path = new; + } + if (file) + filename = file; + } else + path = "kfmclient"; + strbuf_addf(&man_page, "man:%s(1)", page); + execlp(path, filename, "newTab", man_page.buf, NULL); + warning("failed to exec '%s': %s", path, strerror(errno)); + } +} + +static void exec_man_man(const char* path, const char *page) +{ + if (!path) + path = "man"; + execlp(path, "man", page, NULL); + warning("failed to exec '%s': %s", path, strerror(errno)); +} + +static void exec_man_cmd(const char *cmd, const char *page) +{ + struct strbuf shell_cmd = STRBUF_INIT; + strbuf_addf(&shell_cmd, "%s %s", cmd, page); + execl("/bin/sh", "sh", "-c", shell_cmd.buf, NULL); + warning("failed to exec '%s': %s", cmd, strerror(errno)); +} + +static void add_man_viewer(const char *name) +{ + struct man_viewer_list **p = &man_viewer_list; + size_t len = strlen(name); + + while (*p) + p = &((*p)->next); + *p = calloc(1, (sizeof(**p) + len + 1)); + strncpy((*p)->name, name, len); +} + +static int supported_man_viewer(const char *name, size_t len) +{ + return (!strncasecmp("man", name, len) || + !strncasecmp("woman", name, len) || + !strncasecmp("konqueror", name, len)); +} + +static void do_add_man_viewer_info(const char *name, + size_t len, + const char *value) +{ + struct man_viewer_info_list *new = calloc(1, sizeof(*new) + len + 1); + + strncpy(new->name, name, len); + new->info = strdup(value); + new->next = man_viewer_info_list; + man_viewer_info_list = new; +} + +static int add_man_viewer_path(const char *name, + size_t len, + const char *value) +{ + if (supported_man_viewer(name, len)) + do_add_man_viewer_info(name, len, value); + else + warning("'%s': path for unsupported man viewer.\n" + "Please consider using 'man..cmd' instead.", + name); + + return 0; +} + +static int add_man_viewer_cmd(const char *name, + size_t len, + const char *value) +{ + if (supported_man_viewer(name, len)) + warning("'%s': cmd for supported man viewer.\n" + "Please consider using 'man..path' instead.", + name); + else + do_add_man_viewer_info(name, len, value); + + return 0; +} + +static int add_man_viewer_info(const char *var, const char *value) +{ + const char *name = var + 4; + const char *subkey = strrchr(name, '.'); + + if (!subkey) + return error("Config with no key for man viewer: %s", name); + + if (!strcmp(subkey, ".path")) { + if (!value) + return config_error_nonbool(var); + return add_man_viewer_path(name, subkey - name, value); + } + if (!strcmp(subkey, ".cmd")) { + if (!value) + return config_error_nonbool(var); + return add_man_viewer_cmd(name, subkey - name, value); + } + + warning("'%s': unsupported man viewer sub key.", subkey); + return 0; +} + +static int perf_help_config(const char *var, const char *value, void *cb) +{ + if (!strcmp(var, "help.format")) { + if (!value) + return config_error_nonbool(var); + help_format = parse_help_format(value); + return 0; + } + if (!strcmp(var, "man.viewer")) { + if (!value) + return config_error_nonbool(var); + add_man_viewer(value); + return 0; + } + if (!prefixcmp(var, "man.")) + return add_man_viewer_info(var, value); + + return perf_default_config(var, value, cb); +} + +static struct cmdnames main_cmds, other_cmds; + +void list_common_cmds_help(void) +{ + int i, longest = 0; + + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + if (longest < strlen(common_cmds[i].name)) + longest = strlen(common_cmds[i].name); + } + + puts("The most commonly used perf commands are:"); + for (i = 0; i < ARRAY_SIZE(common_cmds); i++) { + printf(" %s ", common_cmds[i].name); + mput_char(' ', longest - strlen(common_cmds[i].name)); + puts(common_cmds[i].help); + } +} + +static int is_perf_command(const char *s) +{ + return is_in_cmdlist(&main_cmds, s) || + is_in_cmdlist(&other_cmds, s); +} + +static const char *prepend(const char *prefix, const char *cmd) +{ + size_t pre_len = strlen(prefix); + size_t cmd_len = strlen(cmd); + char *p = malloc(pre_len + cmd_len + 1); + memcpy(p, prefix, pre_len); + strcpy(p + pre_len, cmd); + return p; +} + +static const char *cmd_to_page(const char *perf_cmd) +{ + if (!perf_cmd) + return "perf"; + else if (!prefixcmp(perf_cmd, "perf")) + return perf_cmd; + else if (is_perf_command(perf_cmd)) + return prepend("perf-", perf_cmd); + else + return prepend("perf", perf_cmd); +} + +static void setup_man_path(void) +{ + struct strbuf new_path = STRBUF_INIT; + const char *old_path = getenv("MANPATH"); + + /* We should always put ':' after our path. If there is no + * old_path, the ':' at the end will let 'man' to try + * system-wide paths after ours to find the manual page. If + * there is old_path, we need ':' as delimiter. */ + strbuf_addstr(&new_path, system_path(PERF_MAN_PATH)); + strbuf_addch(&new_path, ':'); + if (old_path) + strbuf_addstr(&new_path, old_path); + + setenv("MANPATH", new_path.buf, 1); + + strbuf_release(&new_path); +} + +static void exec_viewer(const char *name, const char *page) +{ + const char *info = get_man_viewer_info(name); + + if (!strcasecmp(name, "man")) + exec_man_man(info, page); + else if (!strcasecmp(name, "woman")) + exec_woman_emacs(info, page); + else if (!strcasecmp(name, "konqueror")) + exec_man_konqueror(info, page); + else if (info) + exec_man_cmd(info, page); + else + warning("'%s': unknown man viewer.", name); +} + +static void show_man_page(const char *perf_cmd) +{ + struct man_viewer_list *viewer; + const char *page = cmd_to_page(perf_cmd); + const char *fallback = getenv("PERF_MAN_VIEWER"); + + setup_man_path(); + for (viewer = man_viewer_list; viewer; viewer = viewer->next) + { + exec_viewer(viewer->name, page); /* will return when unable */ + } + if (fallback) + exec_viewer(fallback, page); + exec_viewer("man", page); + die("no man viewer handled the request"); +} + +static void show_info_page(const char *perf_cmd) +{ + const char *page = cmd_to_page(perf_cmd); + setenv("INFOPATH", system_path(PERF_INFO_PATH), 1); + execlp("info", "info", "perfman", page, NULL); +} + +static void get_html_page_path(struct strbuf *page_path, const char *page) +{ + struct stat st; + const char *html_path = system_path(PERF_HTML_PATH); + + /* Check that we have a perf documentation directory. */ + if (stat(mkpath("%s/perf.html", html_path), &st) + || !S_ISREG(st.st_mode)) + die("'%s': not a documentation directory.", html_path); + + strbuf_init(page_path, 0); + strbuf_addf(page_path, "%s/%s.html", html_path, page); +} + +/* + * If open_html is not defined in a platform-specific way (see for + * example compat/mingw.h), we use the script web--browse to display + * HTML. + */ +#ifndef open_html +void open_html(const char *path) +{ + execl_perf_cmd("web--browse", "-c", "help.browser", path, NULL); +} +#endif + +static void show_html_page(const char *perf_cmd) +{ + const char *page = cmd_to_page(perf_cmd); + struct strbuf page_path; /* it leaks but we exec bellow */ + + get_html_page_path(&page_path, page); + + open_html(page_path.buf); +} + +int cmd_help(int argc, const char **argv, const char *prefix) +{ + int nonperf; + const char *alias; + load_command_list("perf-", &main_cmds, &other_cmds); + + /* setup_perf_directory_gently(&nonperf); */ + perf_config(perf_help_config, NULL); + + argc = parse_options(argc, argv, builtin_help_options, + builtin_help_usage, 0); + + if (show_all) { + printf("usage: %s\n\n", perf_usage_string); + list_commands("perf commands", &main_cmds, &other_cmds); + printf("%s\n", perf_more_info_string); + return 0; + } + + if (!argv[0]) { + printf("usage: %s\n\n", perf_usage_string); + list_common_cmds_help(); + printf("\n%s\n", perf_more_info_string); + return 0; + } + + alias = alias_lookup(argv[0]); + if (alias && !is_perf_command(argv[0])) { + printf("`perf %s' is aliased to `%s'\n", argv[0], alias); + return 0; + } + + switch (help_format) { + case HELP_FORMAT_MAN: + show_man_page(argv[0]); + break; + case HELP_FORMAT_INFO: + show_info_page(argv[0]); + break; + case HELP_FORMAT_WEB: + show_html_page(argv[0]); + break; + } + + return 0; +} diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c new file mode 100644 index 00000000000..9d2c769e5f8 --- /dev/null +++ b/Documentation/perf_counter/builtin-top.c @@ -0,0 +1,1411 @@ +/* + * kerneltop.c: show top kernel functions - performance counters showcase + + Build with: + + cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt + + Sample output: + +------------------------------------------------------------------------------ + KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) +------------------------------------------------------------------------------ + + weight RIP kernel function + ______ ________________ _______________ + + 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev + 33.00 - ffffffff804cb740 : sock_alloc_send_skb + 31.26 - ffffffff804ce808 : skb_push + 22.43 - ffffffff80510004 : tcp_established_options + 19.00 - ffffffff8027d250 : find_get_page + 15.76 - ffffffff804e4fc9 : eth_type_trans + 15.20 - ffffffff804d8baa : dst_release + 14.86 - ffffffff804cf5d8 : skb_release_head_state + 14.00 - ffffffff802217d5 : read_hpet + 12.00 - ffffffff804ffb7f : __ip_local_out + 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish + 8.54 - ffffffff805001a3 : ip_queue_xmit + */ + +/* + * perfstat: /usr/bin/time -alike performance counter statistics utility + + It summarizes the counter events of all tasks (and child tasks), + covering all CPUs that the command (or workload) executes on. + It only counts the per-task events of the workload started, + independent of how many other tasks run on those CPUs. + + Sample output: + + $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null + + Performance counter stats for 'ls': + + 163516953 instructions + 2295 cache-misses + 2855182 branch-misses + */ + + /* + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * + * Improvements and fixes by: + * + * Arjan van de Ven + * Yanmin Zhang + * Wu Fengguang + * Mike Galbraith + * Paul Mackerras + * + * Released under the GPL v2. (and only v2, not any later version) + */ + +#include "util.h" + +#define _GNU_SOURCE +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../include/linux/perf_counter.h" + + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +#ifdef __x86_64__ +#define __NR_perf_counter_open 295 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __i386__ +#define __NR_perf_counter_open 333 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +asmlinkage int sys_perf_counter_open( + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags) +{ + return syscall( + __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); +} + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) + +static int run_perfstat = 0; +static int system_wide = 0; + +static int nr_counters = 0; +static __u64 event_id[MAX_COUNTERS] = { + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), +}; +static int default_interval = 100000; +static int event_count[MAX_COUNTERS]; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static __u64 count_filter = 100; + +static int tid = -1; +static int profile_cpu = -1; +static int nr_cpus = 0; +static int nmi = 1; +static unsigned int realtime_prio = 0; +static int group = 0; +static unsigned int page_size; +static unsigned int mmap_pages = 16; +static int use_mmap = 0; +static int use_munmap = 0; + +static char *vmlinux; + +static char *sym_filter; +static unsigned long filter_start; +static unsigned long filter_end; + +static int delay_secs = 2; +static int zero; +static int dump_symtab; + +static int scale; + +struct source_line { + uint64_t EIP; + unsigned long count; + char *line; + struct source_line *next; +}; + +static struct source_line *lines; +static struct source_line **lines_tail; + +const unsigned int default_count[] = { + 1000000, + 1000000, + 10000, + 10000, + 1000000, + 10000, +}; + +static char *hw_event_names[] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names[] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", + "minor faults", + "major faults", +}; + +struct event_symbol { + __u64 event; + char *symbol; +}; + +static struct event_symbol event_symbols[] = { + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, + + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, +}; + +#define __PERF_COUNTER_FIELD(config, name) \ + ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) +#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) +#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) +#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) + +static void display_events_help(void) +{ + unsigned int i; + __u64 e; + + printf( + " -e EVENT --event=EVENT # symbolic-name abbreviations"); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + int type, id; + + e = event_symbols[i].event; + type = PERF_COUNTER_TYPE(e); + id = PERF_COUNTER_ID(e); + + printf("\n %d:%d: %-20s", + type, id, event_symbols[i].symbol); + } + + printf("\n" + " rNNN: raw PMU events (eventsel+umask)\n\n"); +} + +static void display_perfstat_help(void) +{ + printf( + "Usage: perfstat [] \n\n" + "PerfStat Options (up to %d event types can be specified):\n\n", + MAX_COUNTERS); + + display_events_help(); + + printf( + " -l # scale counter values\n" + " -a # system-wide collection\n"); + exit(0); +} + +static void display_help(void) +{ + if (run_perfstat) + return display_perfstat_help(); + + printf( + "Usage: kerneltop []\n" + " Or: kerneltop -S [] COMMAND [ARGS]\n\n" + "KernelTop Options (up to %d event types can be specified at once):\n\n", + MAX_COUNTERS); + + display_events_help(); + + printf( + " -S --stat # perfstat COMMAND\n" + " -a # system-wide collection (for perfstat)\n\n" + " -c CNT --count=CNT # event period to sample\n\n" + " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" + " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" + " -l # show scale factor for RR events\n" + " -d delay --delay= # sampling/display delay [default: 2]\n" + " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" + " -r prio --realtime= # event acquisition runs with SCHED_FIFO policy\n" + " -s symbol --symbol= # function to be showed annotated one-shot\n" + " -x path --vmlinux= # the vmlinux binary, required for -s use\n" + " -z --zero # zero counts after display\n" + " -D --dump_symtab # dump symbol table to stderr on startup\n" + " -m pages --mmap_pages= # number of mmap data pages\n" + " -M --mmap_info # print mmap info stream\n" + " -U --munmap_info # print munmap info stream\n" + ); + + exit(0); +} + +static char *event_name(int ctr) +{ + __u64 config = event_id[ctr]; + int type = PERF_COUNTER_TYPE(config); + int id = PERF_COUNTER_ID(config); + static char buf[32]; + + if (PERF_COUNTER_RAW(config)) { + sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); + return buf; + } + + switch (type) { + case PERF_TYPE_HARDWARE: + if (id < PERF_HW_EVENTS_MAX) + return hw_event_names[id]; + return "unknown-hardware"; + + case PERF_TYPE_SOFTWARE: + if (id < PERF_SW_EVENTS_MAX) + return sw_event_names[id]; + return "unknown-software"; + + default: + break; + } + + return "unknown"; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static __u64 match_event_symbols(char *str) +{ + __u64 config, id; + int type; + unsigned int i; + + if (sscanf(str, "r%llx", &config) == 1) + return config | PERF_COUNTER_RAW_MASK; + + if (sscanf(str, "%d:%llu", &type, &id) == 2) + return EID(type, id); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return ~0ULL; +} + +static int parse_events(char *str) +{ + __u64 config; + +again: + if (nr_counters == MAX_COUNTERS) + return -1; + + config = match_event_symbols(str); + if (config == ~0ULL) + return -1; + + event_id[nr_counters] = config; + nr_counters++; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } + + return 0; +} + + +/* + * perfstat + */ + +char fault_here[1000000]; + +static void create_perfstat_counter(int counter) +{ + struct perf_counter_hw_event hw_event; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.config = event_id[counter]; + hw_event.record_type = 0; + hw_event.nmi = 0; + if (scale) + hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + if (system_wide) { + int cpu; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); + if (fd[cpu][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[cpu][counter], strerror(errno)); + exit(-1); + } + } + } else { + hw_event.inherit = 1; + hw_event.disabled = 1; + + fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); + if (fd[0][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[0][counter], strerror(errno)); + exit(-1); + } + } +} + +int do_perfstat(int argc, char *argv[]) +{ + unsigned long long t0, t1; + int counter; + ssize_t res; + int status; + int pid; + + if (!system_wide) + nr_cpus = 1; + + for (counter = 0; counter < nr_counters; counter++) + create_perfstat_counter(counter); + + argc -= optind; + argv += optind; + + if (!argc) + display_help(); + + /* + * Enable counters and exec the command: + */ + t0 = rdclock(); + prctl(PR_TASK_PERF_COUNTERS_ENABLE); + + if ((pid = fork()) < 0) + perror("failed to fork"); + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } + } + while (wait(&status) >= 0) + ; + prctl(PR_TASK_PERF_COUNTERS_DISABLE); + t1 = rdclock(); + + fflush(stdout); + + fprintf(stderr, "\n"); + fprintf(stderr, " Performance counter stats for \'%s\':\n", + argv[0]); + fprintf(stderr, "\n"); + + for (counter = 0; counter < nr_counters; counter++) { + int cpu, nv; + __u64 count[3], single_count[3]; + int scaled; + + count[0] = count[1] = count[2] = 0; + nv = scale ? 3 : 1; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + res = read(fd[cpu][counter], + single_count, nv * sizeof(__u64)); + assert(res == nv * sizeof(__u64)); + + count[0] += single_count[0]; + if (scale) { + count[1] += single_count[1]; + count[2] += single_count[2]; + } + } + + scaled = 0; + if (scale) { + if (count[2] == 0) { + fprintf(stderr, " %14s %-20s\n", + "", event_name(counter)); + continue; + } + if (count[2] < count[1]) { + scaled = 1; + count[0] = (unsigned long long) + ((double)count[0] * count[1] / count[2] + 0.5); + } + } + + if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || + event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { + + double msecs = (double)count[0] / 1000000; + + fprintf(stderr, " %14.6f %-20s (msecs)", + msecs, event_name(counter)); + } else { + fprintf(stderr, " %14Ld %-20s (events)", + count[0], event_name(counter)); + } + if (scaled) + fprintf(stderr, " (scaled from %.2f%%)", + (double) count[2] / count[1] * 100); + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); + fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", + (double)(t1-t0)/1e6); + fprintf(stderr, "\n"); + + return 0; +} + +/* + * Symbols + */ + +static uint64_t min_ip; +static uint64_t max_ip = -1ll; + +struct sym_entry { + unsigned long long addr; + char *sym; + unsigned long count[MAX_COUNTERS]; + int skip; + struct source_line *source; +}; + +#define MAX_SYMS 100000 + +static int sym_table_count; + +struct sym_entry *sym_filter_entry; + +static struct sym_entry sym_table[MAX_SYMS]; + +static void show_details(struct sym_entry *sym); + +/* + * Ordering weight: count-1 * count-2 * ... / count-n + */ +static double sym_weight(const struct sym_entry *sym) +{ + double weight; + int counter; + + weight = sym->count[0]; + + for (counter = 1; counter < nr_counters-1; counter++) + weight *= sym->count[counter]; + + weight /= (sym->count[counter] + 1); + + return weight; +} + +static int compare(const void *__sym1, const void *__sym2) +{ + const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; + + return sym_weight(sym1) < sym_weight(sym2); +} + +static long events; +static long userspace_events; +static const char CONSOLE_CLEAR[] = ""; + +static struct sym_entry tmp[MAX_SYMS]; + +static void print_sym_table(void) +{ + int i, printed; + int counter; + float events_per_sec = events/delay_secs; + float kevents_per_sec = (events-userspace_events)/delay_secs; + float sum_kevents = 0.0; + + events = userspace_events = 0; + memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); + qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); + + for (i = 0; i < sym_table_count && tmp[i].count[0]; i++) + sum_kevents += tmp[i].count[0]; + + write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); + + printf( +"------------------------------------------------------------------------------\n"); + printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ", + events_per_sec, + 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), + nmi ? "NMI" : "IRQ"); + + if (nr_counters == 1) + printf("%d ", event_count[0]); + + for (counter = 0; counter < nr_counters; counter++) { + if (counter) + printf("/"); + + printf("%s", event_name(counter)); + } + + printf( "], "); + + if (tid != -1) + printf(" (tid: %d", tid); + else + printf(" (all"); + + if (profile_cpu != -1) + printf(", cpu: %d)\n", profile_cpu); + else { + if (tid != -1) + printf(")\n"); + else + printf(", %d CPUs)\n", nr_cpus); + } + + printf("------------------------------------------------------------------------------\n\n"); + + if (nr_counters == 1) + printf(" events pcnt"); + else + printf(" weight events pcnt"); + + printf(" RIP kernel function\n" + " ______ ______ _____ ________________ _______________\n\n" + ); + + for (i = 0, printed = 0; i < sym_table_count; i++) { + float pcnt; + int count; + + if (printed <= 18 && tmp[i].count[0] >= count_filter) { + pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); + + if (nr_counters == 1) + printf("%19.2f - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + pcnt, tmp[i].addr, tmp[i].sym); + else + printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + tmp[i].count[0], + pcnt, tmp[i].addr, tmp[i].sym); + printed++; + } + /* + * Add decay to the counts: + */ + for (count = 0; count < nr_counters; count++) + sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; + } + + if (sym_filter_entry) + show_details(sym_filter_entry); + + { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + + if (poll(&stdin_poll, 1, 0) == 1) { + printf("key pressed - exiting.\n"); + exit(0); + } + } +} + +static void *display_thread(void *arg) +{ + printf("KernelTop refresh period: %d seconds\n", delay_secs); + + while (!sleep(delay_secs)) + print_sym_table(); + + return NULL; +} + +static int read_symbol(FILE *in, struct sym_entry *s) +{ + static int filter_match = 0; + char *sym, stype; + char str[500]; + int rc, pos; + + rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str); + if (rc == EOF) + return -1; + + assert(rc == 3); + + /* skip until end of line: */ + pos = strlen(str); + do { + rc = fgetc(in); + if (rc == '\n' || rc == EOF || pos >= 499) + break; + str[pos] = rc; + pos++; + } while (1); + str[pos] = 0; + + sym = str; + + /* Filter out known duplicates and non-text symbols. */ + if (!strcmp(sym, "_text")) + return 1; + if (!min_ip && !strcmp(sym, "_stext")) + return 1; + if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext")) + return 1; + if (stype != 'T' && stype != 't') + return 1; + if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14)) + return 1; + if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) + return 1; + + s->sym = malloc(strlen(str)); + assert(s->sym); + + strcpy((char *)s->sym, str); + s->skip = 0; + + /* Tag events to be skipped. */ + if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) + s->skip = 1; + else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) + s->skip = 1; + else if (!strcmp("mwait_idle", s->sym)) + s->skip = 1; + + if (filter_match == 1) { + filter_end = s->addr; + filter_match = -1; + if (filter_end - filter_start > 10000) { + printf("hm, too large filter symbol <%s> - skipping.\n", + sym_filter); + printf("symbol filter start: %016lx\n", filter_start); + printf(" end: %016lx\n", filter_end); + filter_end = filter_start = 0; + sym_filter = NULL; + sleep(1); + } + } + if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) { + filter_match = 1; + filter_start = s->addr; + } + + return 0; +} + +int compare_addr(const void *__sym1, const void *__sym2) +{ + const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; + + return sym1->addr > sym2->addr; +} + +static void sort_symbol_table(void) +{ + int i, dups; + + do { + qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr); + for (i = 0, dups = 0; i < sym_table_count; i++) { + if (sym_table[i].addr == sym_table[i+1].addr) { + sym_table[i+1].addr = -1ll; + dups++; + } + } + sym_table_count -= dups; + } while(dups); +} + +static void parse_symbols(void) +{ + struct sym_entry *last; + + FILE *kallsyms = fopen("/proc/kallsyms", "r"); + + if (!kallsyms) { + printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n"); + exit(-1); + } + + while (!feof(kallsyms)) { + if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) { + sym_table_count++; + assert(sym_table_count <= MAX_SYMS); + } + } + + sort_symbol_table(); + min_ip = sym_table[0].addr; + max_ip = sym_table[sym_table_count-1].addr; + last = sym_table + sym_table_count++; + + last->addr = -1ll; + last->sym = ""; + + if (filter_end) { + int count; + for (count=0; count < sym_table_count; count ++) { + if (!strcmp(sym_table[count].sym, sym_filter)) { + sym_filter_entry = &sym_table[count]; + break; + } + } + } + if (dump_symtab) { + int i; + + for (i = 0; i < sym_table_count; i++) + fprintf(stderr, "%llx %s\n", + sym_table[i].addr, sym_table[i].sym); + } +} + +/* + * Source lines + */ + +static void parse_vmlinux(char *filename) +{ + FILE *file; + char command[PATH_MAX*2]; + if (!filename) + return; + + sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename); + + file = popen(command, "r"); + if (!file) + return; + + lines_tail = &lines; + while (!feof(file)) { + struct source_line *src; + size_t dummy = 0; + char *c; + + src = malloc(sizeof(struct source_line)); + assert(src != NULL); + memset(src, 0, sizeof(struct source_line)); + + if (getline(&src->line, &dummy, file) < 0) + break; + if (!src->line) + break; + + c = strchr(src->line, '\n'); + if (c) + *c = 0; + + src->next = NULL; + *lines_tail = src; + lines_tail = &src->next; + + if (strlen(src->line)>8 && src->line[8] == ':') + src->EIP = strtoull(src->line, NULL, 16); + if (strlen(src->line)>8 && src->line[16] == ':') + src->EIP = strtoull(src->line, NULL, 16); + } + pclose(file); +} + +static void record_precise_ip(uint64_t ip) +{ + struct source_line *line; + + for (line = lines; line; line = line->next) { + if (line->EIP == ip) + line->count++; + if (line->EIP > ip) + break; + } +} + +static void lookup_sym_in_vmlinux(struct sym_entry *sym) +{ + struct source_line *line; + char pattern[PATH_MAX]; + sprintf(pattern, "<%s>:", sym->sym); + + for (line = lines; line; line = line->next) { + if (strstr(line->line, pattern)) { + sym->source = line; + break; + } + } +} + +static void show_lines(struct source_line *line_queue, int line_queue_count) +{ + int i; + struct source_line *line; + + line = line_queue; + for (i = 0; i < line_queue_count; i++) { + printf("%8li\t%s\n", line->count, line->line); + line = line->next; + } +} + +#define TRACE_COUNT 3 + +static void show_details(struct sym_entry *sym) +{ + struct source_line *line; + struct source_line *line_queue = NULL; + int displayed = 0; + int line_queue_count = 0; + + if (!sym->source) + lookup_sym_in_vmlinux(sym); + if (!sym->source) + return; + + printf("Showing details for %s\n", sym->sym); + + line = sym->source; + while (line) { + if (displayed && strstr(line->line, ">:")) + break; + + if (!line_queue_count) + line_queue = line; + line_queue_count ++; + + if (line->count >= count_filter) { + show_lines(line_queue, line_queue_count); + line_queue_count = 0; + line_queue = NULL; + } else if (line_queue_count > TRACE_COUNT) { + line_queue = line_queue->next; + line_queue_count --; + } + + line->count = 0; + displayed++; + if (displayed > 300) + break; + line = line->next; + } +} + +/* + * Binary search in the histogram table and record the hit: + */ +static void record_ip(uint64_t ip, int counter) +{ + int left_idx, middle_idx, right_idx, idx; + unsigned long left, middle, right; + + record_precise_ip(ip); + + left_idx = 0; + right_idx = sym_table_count-1; + assert(ip <= max_ip && ip >= min_ip); + + while (left_idx + 1 < right_idx) { + middle_idx = (left_idx + right_idx) / 2; + + left = sym_table[ left_idx].addr; + middle = sym_table[middle_idx].addr; + right = sym_table[ right_idx].addr; + + if (!(left <= middle && middle <= right)) { + printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right); + printf("%d %d %d\n", left_idx, middle_idx, right_idx); + } + assert(left <= middle && middle <= right); + if (!(left <= ip && ip <= right)) { + printf(" left: %016lx\n", left); + printf(" ip: %016lx\n", (unsigned long)ip); + printf("right: %016lx\n", right); + } + assert(left <= ip && ip <= right); + /* + * [ left .... target .... middle .... right ] + * => right := middle + */ + if (ip < middle) { + right_idx = middle_idx; + continue; + } + /* + * [ left .... middle ... target ... right ] + * => left := middle + */ + left_idx = middle_idx; + } + + idx = left_idx; + + if (!sym_table[idx].skip) + sym_table[idx].count[counter]++; + else events--; +} + +static void process_event(uint64_t ip, int counter) +{ + events++; + + if (ip < min_ip || ip > max_ip) { + userspace_events++; + return; + } + + record_ip(ip, counter); +} + +static void process_options(int argc, char *argv[]) +{ + int error = 0, counter; + + if (strstr(argv[0], "perfstat")) + run_perfstat = 1; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"count", required_argument, NULL, 'c'}, + {"cpu", required_argument, NULL, 'C'}, + {"delay", required_argument, NULL, 'd'}, + {"dump_symtab", no_argument, NULL, 'D'}, + {"event", required_argument, NULL, 'e'}, + {"filter", required_argument, NULL, 'f'}, + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + {"nmi", required_argument, NULL, 'n'}, + {"mmap_info", no_argument, NULL, 'M'}, + {"mmap_pages", required_argument, NULL, 'm'}, + {"munmap_info", no_argument, NULL, 'U'}, + {"pid", required_argument, NULL, 'p'}, + {"realtime", required_argument, NULL, 'r'}, + {"scale", no_argument, NULL, 'l'}, + {"symbol", required_argument, NULL, 's'}, + {"stat", no_argument, NULL, 'S'}, + {"vmlinux", required_argument, NULL, 'x'}, + {"zero", no_argument, NULL, 'z'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'a': system_wide = 1; break; + case 'c': default_interval = atoi(optarg); break; + case 'C': + /* CPU and PID are mutually exclusive */ + if (tid != -1) { + printf("WARNING: CPU switch overriding PID\n"); + sleep(1); + tid = -1; + } + profile_cpu = atoi(optarg); break; + case 'd': delay_secs = atoi(optarg); break; + case 'D': dump_symtab = 1; break; + + case 'e': error = parse_events(optarg); break; + + case 'f': count_filter = atoi(optarg); break; + case 'g': group = atoi(optarg); break; + case 'h': display_help(); break; + case 'l': scale = 1; break; + case 'n': nmi = atoi(optarg); break; + case 'p': + /* CPU and PID are mutually exclusive */ + if (profile_cpu != -1) { + printf("WARNING: PID switch overriding CPU\n"); + sleep(1); + profile_cpu = -1; + } + tid = atoi(optarg); break; + case 'r': realtime_prio = atoi(optarg); break; + case 's': sym_filter = strdup(optarg); break; + case 'S': run_perfstat = 1; break; + case 'x': vmlinux = strdup(optarg); break; + case 'z': zero = 1; break; + case 'm': mmap_pages = atoi(optarg); break; + case 'M': use_mmap = 1; break; + case 'U': use_munmap = 1; break; + default: error = 1; break; + } + } + if (error) + display_help(); + + if (!nr_counters) { + if (run_perfstat) + nr_counters = 8; + else { + nr_counters = 1; + event_id[0] = 0; + } + } + + for (counter = 0; counter < nr_counters; counter++) { + if (event_count[counter]) + continue; + + event_count[counter] = default_interval; + } +} + +struct mmap_data { + int counter; + void *base; + unsigned int mask; + unsigned int prev; +}; + +static unsigned int mmap_read_head(struct mmap_data *md) +{ + struct perf_counter_mmap_page *pc = md->base; + int head; + + head = pc->data_head; + rmb(); + + return head; +} + +struct timeval last_read, this_read; + +static void mmap_read(struct mmap_data *md) +{ + unsigned int head = mmap_read_head(md); + unsigned int old = md->prev; + unsigned char *data = md->base + page_size; + int diff; + + gettimeofday(&this_read, NULL); + + /* + * If we're further behind than half the buffer, there's a chance + * the writer will bite our tail and screw up the events under us. + * + * If we somehow ended up ahead of the head, we got messed up. + * + * In either case, truncate and restart at head. + */ + diff = head - old; + if (diff > md->mask / 2 || diff < 0) { + struct timeval iv; + unsigned long msecs; + + timersub(&this_read, &last_read, &iv); + msecs = iv.tv_sec*1000 + iv.tv_usec/1000; + + fprintf(stderr, "WARNING: failed to keep up with mmap data." + " Last read %lu msecs ago.\n", msecs); + + /* + * head points to a known good entry, start there. + */ + old = head; + } + + last_read = this_read; + + for (; old != head;) { + struct ip_event { + struct perf_event_header header; + __u64 ip; + __u32 pid, tid; + }; + struct mmap_event { + struct perf_event_header header; + __u32 pid, tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; + }; + + typedef union event_union { + struct perf_event_header header; + struct ip_event ip; + struct mmap_event mmap; + } event_t; + + event_t *event = (event_t *)&data[old & md->mask]; + + event_t event_copy; + + unsigned int size = event->header.size; + + /* + * Event straddles the mmap boundary -- header should always + * be inside due to u64 alignment of output. + */ + if ((old & md->mask) + size != ((old + size) & md->mask)) { + unsigned int offset = old; + unsigned int len = min(sizeof(*event), size), cpy; + void *dst = &event_copy; + + do { + cpy = min(md->mask + 1 - (offset & md->mask), len); + memcpy(dst, &data[offset & md->mask], cpy); + offset += cpy; + dst += cpy; + len -= cpy; + } while (len); + + event = &event_copy; + } + + old += size; + + if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { + if (event->header.type & PERF_RECORD_IP) + process_event(event->ip.ip, md->counter); + } else { + switch (event->header.type) { + case PERF_EVENT_MMAP: + case PERF_EVENT_MUNMAP: + printf("%s: %Lu %Lu %Lu %s\n", + event->header.type == PERF_EVENT_MMAP + ? "mmap" : "munmap", + event->mmap.start, + event->mmap.len, + event->mmap.pgoff, + event->mmap.filename); + break; + } + } + } + + md->prev = old; +} + +int cmd_top(int argc, const char **argv, const char *prefix) +{ + struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; + struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + struct perf_counter_hw_event hw_event; + pthread_t thread; + int i, counter, group_fd, nr_poll = 0; + unsigned int cpu; + int ret; + + page_size = sysconf(_SC_PAGE_SIZE); + + process_options(argc, argv); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + assert(nr_cpus <= MAX_NR_CPUS); + assert(nr_cpus >= 0); + + if (run_perfstat) + return do_perfstat(argc, argv); + + if (tid != -1 || profile_cpu != -1) + nr_cpus = 1; + + parse_symbols(); + if (vmlinux && sym_filter_entry) + parse_vmlinux(vmlinux); + + for (i = 0; i < nr_cpus; i++) { + group_fd = -1; + for (counter = 0; counter < nr_counters; counter++) { + + cpu = profile_cpu; + if (tid == -1 && profile_cpu == -1) + cpu = i; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.config = event_id[counter]; + hw_event.irq_period = event_count[counter]; + hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; + hw_event.nmi = nmi; + hw_event.mmap = use_mmap; + hw_event.munmap = use_munmap; + + fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); + if (fd[i][counter] < 0) { + int err = errno; + printf("kerneltop error: syscall returned with %d (%s)\n", + fd[i][counter], strerror(err)); + if (err == EPERM) + printf("Are you root?\n"); + exit(-1); + } + assert(fd[i][counter] >= 0); + fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); + + /* + * First counter acts as the group leader: + */ + if (group && group_fd == -1) + group_fd = fd[i][counter]; + + event_array[nr_poll].fd = fd[i][counter]; + event_array[nr_poll].events = POLLIN; + nr_poll++; + + mmap_array[i][counter].counter = counter; + mmap_array[i][counter].prev = 0; + mmap_array[i][counter].mask = mmap_pages*page_size - 1; + mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ, MAP_SHARED, fd[i][counter], 0); + if (mmap_array[i][counter].base == MAP_FAILED) { + printf("kerneltop error: failed to mmap with %d (%s)\n", + errno, strerror(errno)); + exit(-1); + } + } + } + + if (pthread_create(&thread, NULL, display_thread, NULL)) { + printf("Could not create display thread.\n"); + exit(-1); + } + + if (realtime_prio) { + struct sched_param param; + + param.sched_priority = realtime_prio; + if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { + printf("Could not set realtime priority.\n"); + exit(-1); + } + } + + while (1) { + int hits = events; + + for (i = 0; i < nr_cpus; i++) { + for (counter = 0; counter < nr_counters; counter++) + mmap_read(&mmap_array[i][counter]); + } + + if (hits == events) + ret = poll(event_array, nr_poll, 100); + } + + return 0; +} diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h new file mode 100644 index 00000000000..41637444ce2 --- /dev/null +++ b/Documentation/perf_counter/builtin.h @@ -0,0 +1,18 @@ +#ifndef BUILTIN_H +#define BUILTIN_H + +#include "util.h" +#include "strbuf.h" + +extern const char perf_version_string[]; +extern const char perf_usage_string[]; +extern const char perf_more_info_string[]; + +extern void list_common_cmds_help(void); +extern const char *help_unknown_cmd(const char *cmd); +extern void prune_packed_objects(int); +extern int read_line_with_nul(char *buf, int size, FILE *file); +extern int check_pager_config(const char *cmd); + +extern int cmd_top(int argc, const char **argv, const char *prefix); +#endif diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h new file mode 100644 index 00000000000..dc085640a57 --- /dev/null +++ b/Documentation/perf_counter/cache.h @@ -0,0 +1,97 @@ +#ifndef CACHE_H +#define CACHE_H + +#include "util.h" +#include "strbuf.h" + +#define PERF_DIR_ENVIRONMENT "PERF_DIR" +#define PERF_WORK_TREE_ENVIRONMENT "PERF_WORK_TREE" +#define DEFAULT_PERF_DIR_ENVIRONMENT ".perf" +#define DB_ENVIRONMENT "PERF_OBJECT_DIRECTORY" +#define INDEX_ENVIRONMENT "PERF_INDEX_FILE" +#define GRAFT_ENVIRONMENT "PERF_GRAFT_FILE" +#define TEMPLATE_DIR_ENVIRONMENT "PERF_TEMPLATE_DIR" +#define CONFIG_ENVIRONMENT "PERF_CONFIG" +#define EXEC_PATH_ENVIRONMENT "PERF_EXEC_PATH" +#define CEILING_DIRECTORIES_ENVIRONMENT "PERF_CEILING_DIRECTORIES" +#define PERFATTRIBUTES_FILE ".perfattributes" +#define INFOATTRIBUTES_FILE "info/attributes" +#define ATTRIBUTE_MACRO_PREFIX "[attr]" + +typedef int (*config_fn_t)(const char *, const char *, void *); +extern int perf_default_config(const char *, const char *, void *); +extern int perf_config_from_file(config_fn_t fn, const char *, void *); +extern int perf_config(config_fn_t fn, void *); +extern int perf_parse_ulong(const char *, unsigned long *); +extern int perf_config_int(const char *, const char *); +extern unsigned long perf_config_ulong(const char *, const char *); +extern int perf_config_bool_or_int(const char *, const char *, int *); +extern int perf_config_bool(const char *, const char *); +extern int perf_config_string(const char **, const char *, const char *); +extern int perf_config_set(const char *, const char *); +extern int perf_config_set_multivar(const char *, const char *, const char *, int); +extern int perf_config_rename_section(const char *, const char *); +extern const char *perf_etc_perfconfig(void); +extern int check_repository_format_version(const char *var, const char *value, void *cb); +extern int perf_config_system(void); +extern int perf_config_global(void); +extern int config_error_nonbool(const char *); +extern const char *config_exclusive_filename; + +#define MAX_PERFNAME (1000) +extern char perf_default_email[MAX_PERFNAME]; +extern char perf_default_name[MAX_PERFNAME]; +extern int user_ident_explicitly_given; + +extern const char *perf_log_output_encoding; +extern const char *perf_mailmap_file; + +/* IO helper functions */ +extern void maybe_flush_or_die(FILE *, const char *); +extern int copy_fd(int ifd, int ofd); +extern int copy_file(const char *dst, const char *src, int mode); +extern ssize_t read_in_full(int fd, void *buf, size_t count); +extern ssize_t write_in_full(int fd, const void *buf, size_t count); +extern void write_or_die(int fd, const void *buf, size_t count); +extern int write_or_whine(int fd, const void *buf, size_t count, const char *msg); +extern int write_or_whine_pipe(int fd, const void *buf, size_t count, const char *msg); +extern void fsync_or_die(int fd, const char *); + +/* pager.c */ +extern void setup_pager(void); +extern const char *pager_program; +extern int pager_in_use(void); +extern int pager_use_color; + +extern const char *editor_program; +extern const char *excludes_file; + +char *alias_lookup(const char *alias); +int split_cmdline(char *cmdline, const char ***argv); + +#define alloc_nr(x) (((x)+16)*3/2) + +/* + * Realloc the buffer pointed at by variable 'x' so that it can hold + * at least 'nr' entries; the number of entries currently allocated + * is 'alloc', using the standard growing factor alloc_nr() macro. + * + * DO NOT USE any expression with side-effect for 'x' or 'alloc'. + */ +#define ALLOC_GROW(x, nr, alloc) \ + do { \ + if ((nr) > alloc) { \ + if (alloc_nr(alloc) < (nr)) \ + alloc = (nr); \ + else \ + alloc = alloc_nr(alloc); \ + x = xrealloc((x), alloc * sizeof(*(x))); \ + } \ + } while(0) + + +static inline int is_absolute_path(const char *path) +{ + return path[0] == '/'; +} +#endif /* CACHE_H */ diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt new file mode 100644 index 00000000000..1eab3659b20 --- /dev/null +++ b/Documentation/perf_counter/command-list.txt @@ -0,0 +1,4 @@ +# List of known perf commands. +# command name category [deprecated] [common] +perf-top mainporcelain common + diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c new file mode 100644 index 00000000000..672d5395933 --- /dev/null +++ b/Documentation/perf_counter/config.c @@ -0,0 +1,966 @@ +/* + * GIT - The information manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + * Copyright (C) Johannes Schindelin, 2005 + * + */ +#include "util.h" +#include "cache.h" +#include "exec_cmd.h" + +#define MAXNAME (256) + +static FILE *config_file; +static const char *config_file_name; +static int config_linenr; +static int config_file_eof; +static int zlib_compression_seen; + +const char *config_exclusive_filename = NULL; + +static int get_next_char(void) +{ + int c; + FILE *f; + + c = '\n'; + if ((f = config_file) != NULL) { + c = fgetc(f); + if (c == '\r') { + /* DOS like systems */ + c = fgetc(f); + if (c != '\n') { + ungetc(c, f); + c = '\r'; + } + } + if (c == '\n') + config_linenr++; + if (c == EOF) { + config_file_eof = 1; + c = '\n'; + } + } + return c; +} + +static char *parse_value(void) +{ + static char value[1024]; + int quote = 0, comment = 0, len = 0, space = 0; + + for (;;) { + int c = get_next_char(); + if (len >= sizeof(value) - 1) + return NULL; + if (c == '\n') { + if (quote) + return NULL; + value[len] = 0; + return value; + } + if (comment) + continue; + if (isspace(c) && !quote) { + space = 1; + continue; + } + if (!quote) { + if (c == ';' || c == '#') { + comment = 1; + continue; + } + } + if (space) { + if (len) + value[len++] = ' '; + space = 0; + } + if (c == '\\') { + c = get_next_char(); + switch (c) { + case '\n': + continue; + case 't': + c = '\t'; + break; + case 'b': + c = '\b'; + break; + case 'n': + c = '\n'; + break; + /* Some characters escape as themselves */ + case '\\': case '"': + break; + /* Reject unknown escape sequences */ + default: + return NULL; + } + value[len++] = c; + continue; + } + if (c == '"') { + quote = 1-quote; + continue; + } + value[len++] = c; + } +} + +static inline int iskeychar(int c) +{ + return isalnum(c) || c == '-'; +} + +static int get_value(config_fn_t fn, void *data, char *name, unsigned int len) +{ + int c; + char *value; + + /* Get the full name */ + for (;;) { + c = get_next_char(); + if (config_file_eof) + break; + if (!iskeychar(c)) + break; + name[len++] = tolower(c); + if (len >= MAXNAME) + return -1; + } + name[len] = 0; + while (c == ' ' || c == '\t') + c = get_next_char(); + + value = NULL; + if (c != '\n') { + if (c != '=') + return -1; + value = parse_value(); + if (!value) + return -1; + } + return fn(name, value, data); +} + +static int get_extended_base_var(char *name, int baselen, int c) +{ + do { + if (c == '\n') + return -1; + c = get_next_char(); + } while (isspace(c)); + + /* We require the format to be '[base "extension"]' */ + if (c != '"') + return -1; + name[baselen++] = '.'; + + for (;;) { + int c = get_next_char(); + if (c == '\n') + return -1; + if (c == '"') + break; + if (c == '\\') { + c = get_next_char(); + if (c == '\n') + return -1; + } + name[baselen++] = c; + if (baselen > MAXNAME / 2) + return -1; + } + + /* Final ']' */ + if (get_next_char() != ']') + return -1; + return baselen; +} + +static int get_base_var(char *name) +{ + int baselen = 0; + + for (;;) { + int c = get_next_char(); + if (config_file_eof) + return -1; + if (c == ']') + return baselen; + if (isspace(c)) + return get_extended_base_var(name, baselen, c); + if (!iskeychar(c) && c != '.') + return -1; + if (baselen > MAXNAME / 2) + return -1; + name[baselen++] = tolower(c); + } +} + +static int perf_parse_file(config_fn_t fn, void *data) +{ + int comment = 0; + int baselen = 0; + static char var[MAXNAME]; + + /* U+FEFF Byte Order Mark in UTF8 */ + static const unsigned char *utf8_bom = (unsigned char *) "\xef\xbb\xbf"; + const unsigned char *bomptr = utf8_bom; + + for (;;) { + int c = get_next_char(); + if (bomptr && *bomptr) { + /* We are at the file beginning; skip UTF8-encoded BOM + * if present. Sane editors won't put this in on their + * own, but e.g. Windows Notepad will do it happily. */ + if ((unsigned char) c == *bomptr) { + bomptr++; + continue; + } else { + /* Do not tolerate partial BOM. */ + if (bomptr != utf8_bom) + break; + /* No BOM at file beginning. Cool. */ + bomptr = NULL; + } + } + if (c == '\n') { + if (config_file_eof) + return 0; + comment = 0; + continue; + } + if (comment || isspace(c)) + continue; + if (c == '#' || c == ';') { + comment = 1; + continue; + } + if (c == '[') { + baselen = get_base_var(var); + if (baselen <= 0) + break; + var[baselen++] = '.'; + var[baselen] = 0; + continue; + } + if (!isalpha(c)) + break; + var[baselen] = tolower(c); + if (get_value(fn, data, var, baselen+1) < 0) + break; + } + die("bad config file line %d in %s", config_linenr, config_file_name); +} + +static int parse_unit_factor(const char *end, unsigned long *val) +{ + if (!*end) + return 1; + else if (!strcasecmp(end, "k")) { + *val *= 1024; + return 1; + } + else if (!strcasecmp(end, "m")) { + *val *= 1024 * 1024; + return 1; + } + else if (!strcasecmp(end, "g")) { + *val *= 1024 * 1024 * 1024; + return 1; + } + return 0; +} + +static int perf_parse_long(const char *value, long *ret) +{ + if (value && *value) { + char *end; + long val = strtol(value, &end, 0); + unsigned long factor = 1; + if (!parse_unit_factor(end, &factor)) + return 0; + *ret = val * factor; + return 1; + } + return 0; +} + +int perf_parse_ulong(const char *value, unsigned long *ret) +{ + if (value && *value) { + char *end; + unsigned long val = strtoul(value, &end, 0); + if (!parse_unit_factor(end, &val)) + return 0; + *ret = val; + return 1; + } + return 0; +} + +static void die_bad_config(const char *name) +{ + if (config_file_name) + die("bad config value for '%s' in %s", name, config_file_name); + die("bad config value for '%s'", name); +} + +int perf_config_int(const char *name, const char *value) +{ + long ret = 0; + if (!perf_parse_long(value, &ret)) + die_bad_config(name); + return ret; +} + +unsigned long perf_config_ulong(const char *name, const char *value) +{ + unsigned long ret; + if (!perf_parse_ulong(value, &ret)) + die_bad_config(name); + return ret; +} + +int perf_config_bool_or_int(const char *name, const char *value, int *is_bool) +{ + *is_bool = 1; + if (!value) + return 1; + if (!*value) + return 0; + if (!strcasecmp(value, "true") || !strcasecmp(value, "yes") || !strcasecmp(value, "on")) + return 1; + if (!strcasecmp(value, "false") || !strcasecmp(value, "no") || !strcasecmp(value, "off")) + return 0; + *is_bool = 0; + return perf_config_int(name, value); +} + +int perf_config_bool(const char *name, const char *value) +{ + int discard; + return !!perf_config_bool_or_int(name, value, &discard); +} + +int perf_config_string(const char **dest, const char *var, const char *value) +{ + if (!value) + return config_error_nonbool(var); + *dest = strdup(value); + return 0; +} + +static int perf_default_core_config(const char *var, const char *value) +{ + /* Add other config variables here and to Documentation/config.txt. */ + return 0; +} + +int perf_default_config(const char *var, const char *value, void *dummy) +{ + if (!prefixcmp(var, "core.")) + return perf_default_core_config(var, value); + + /* Add other config variables here and to Documentation/config.txt. */ + return 0; +} + +int perf_config_from_file(config_fn_t fn, const char *filename, void *data) +{ + int ret; + FILE *f = fopen(filename, "r"); + + ret = -1; + if (f) { + config_file = f; + config_file_name = filename; + config_linenr = 1; + config_file_eof = 0; + ret = perf_parse_file(fn, data); + fclose(f); + config_file_name = NULL; + } + return ret; +} + +const char *perf_etc_perfconfig(void) +{ + static const char *system_wide; + if (!system_wide) + system_wide = system_path(ETC_PERFCONFIG); + return system_wide; +} + +static int perf_env_bool(const char *k, int def) +{ + const char *v = getenv(k); + return v ? perf_config_bool(k, v) : def; +} + +int perf_config_system(void) +{ + return !perf_env_bool("PERF_CONFIG_NOSYSTEM", 0); +} + +int perf_config_global(void) +{ + return !perf_env_bool("PERF_CONFIG_NOGLOBAL", 0); +} + +int perf_config(config_fn_t fn, void *data) +{ + int ret = 0, found = 0; + char *repo_config = NULL; + const char *home = NULL; + + /* Setting $PERF_CONFIG makes perf read _only_ the given config file. */ + if (config_exclusive_filename) + return perf_config_from_file(fn, config_exclusive_filename, data); + if (perf_config_system() && !access(perf_etc_perfconfig(), R_OK)) { + ret += perf_config_from_file(fn, perf_etc_perfconfig(), + data); + found += 1; + } + + home = getenv("HOME"); + if (perf_config_global() && home) { + char *user_config = strdup(mkpath("%s/.perfconfig", home)); + if (!access(user_config, R_OK)) { + ret += perf_config_from_file(fn, user_config, data); + found += 1; + } + free(user_config); + } + + repo_config = perf_pathdup("config"); + if (!access(repo_config, R_OK)) { + ret += perf_config_from_file(fn, repo_config, data); + found += 1; + } + free(repo_config); + if (found == 0) + return -1; + return ret; +} + +/* + * Find all the stuff for perf_config_set() below. + */ + +#define MAX_MATCHES 512 + +static struct { + int baselen; + char* key; + int do_not_match; + regex_t* value_regex; + int multi_replace; + size_t offset[MAX_MATCHES]; + enum { START, SECTION_SEEN, SECTION_END_SEEN, KEY_SEEN } state; + int seen; +} store; + +static int matches(const char* key, const char* value) +{ + return !strcmp(key, store.key) && + (store.value_regex == NULL || + (store.do_not_match ^ + !regexec(store.value_regex, value, 0, NULL, 0))); +} + +static int store_aux(const char* key, const char* value, void *cb) +{ + const char *ep; + size_t section_len; + + switch (store.state) { + case KEY_SEEN: + if (matches(key, value)) { + if (store.seen == 1 && store.multi_replace == 0) { + warning("%s has multiple values", key); + } else if (store.seen >= MAX_MATCHES) { + error("too many matches for %s", key); + return 1; + } + + store.offset[store.seen] = ftell(config_file); + store.seen++; + } + break; + case SECTION_SEEN: + /* + * What we are looking for is in store.key (both + * section and var), and its section part is baselen + * long. We found key (again, both section and var). + * We would want to know if this key is in the same + * section as what we are looking for. We already + * know we are in the same section as what should + * hold store.key. + */ + ep = strrchr(key, '.'); + section_len = ep - key; + + if ((section_len != store.baselen) || + memcmp(key, store.key, section_len+1)) { + store.state = SECTION_END_SEEN; + break; + } + + /* + * Do not increment matches: this is no match, but we + * just made sure we are in the desired section. + */ + store.offset[store.seen] = ftell(config_file); + /* fallthru */ + case SECTION_END_SEEN: + case START: + if (matches(key, value)) { + store.offset[store.seen] = ftell(config_file); + store.state = KEY_SEEN; + store.seen++; + } else { + if (strrchr(key, '.') - key == store.baselen && + !strncmp(key, store.key, store.baselen)) { + store.state = SECTION_SEEN; + store.offset[store.seen] = ftell(config_file); + } + } + } + return 0; +} + +static int write_error(const char *filename) +{ + error("failed to write new configuration file %s", filename); + + /* Same error code as "failed to rename". */ + return 4; +} + +static int store_write_section(int fd, const char* key) +{ + const char *dot; + int i, success; + struct strbuf sb = STRBUF_INIT; + + dot = memchr(key, '.', store.baselen); + if (dot) { + strbuf_addf(&sb, "[%.*s \"", (int)(dot - key), key); + for (i = dot - key + 1; i < store.baselen; i++) { + if (key[i] == '"' || key[i] == '\\') + strbuf_addch(&sb, '\\'); + strbuf_addch(&sb, key[i]); + } + strbuf_addstr(&sb, "\"]\n"); + } else { + strbuf_addf(&sb, "[%.*s]\n", store.baselen, key); + } + + success = write_in_full(fd, sb.buf, sb.len) == sb.len; + strbuf_release(&sb); + + return success; +} + +static int store_write_pair(int fd, const char* key, const char* value) +{ + int i, success; + int length = strlen(key + store.baselen + 1); + const char *quote = ""; + struct strbuf sb = STRBUF_INIT; + + /* + * Check to see if the value needs to be surrounded with a dq pair. + * Note that problematic characters are always backslash-quoted; this + * check is about not losing leading or trailing SP and strings that + * follow beginning-of-comment characters (i.e. ';' and '#') by the + * configuration parser. + */ + if (value[0] == ' ') + quote = "\""; + for (i = 0; value[i]; i++) + if (value[i] == ';' || value[i] == '#') + quote = "\""; + if (i && value[i - 1] == ' ') + quote = "\""; + + strbuf_addf(&sb, "\t%.*s = %s", + length, key + store.baselen + 1, quote); + + for (i = 0; value[i]; i++) + switch (value[i]) { + case '\n': + strbuf_addstr(&sb, "\\n"); + break; + case '\t': + strbuf_addstr(&sb, "\\t"); + break; + case '"': + case '\\': + strbuf_addch(&sb, '\\'); + default: + strbuf_addch(&sb, value[i]); + break; + } + strbuf_addf(&sb, "%s\n", quote); + + success = write_in_full(fd, sb.buf, sb.len) == sb.len; + strbuf_release(&sb); + + return success; +} + +static ssize_t find_beginning_of_line(const char* contents, size_t size, + size_t offset_, int* found_bracket) +{ + size_t equal_offset = size, bracket_offset = size; + ssize_t offset; + +contline: + for (offset = offset_-2; offset > 0 + && contents[offset] != '\n'; offset--) + switch (contents[offset]) { + case '=': equal_offset = offset; break; + case ']': bracket_offset = offset; break; + } + if (offset > 0 && contents[offset-1] == '\\') { + offset_ = offset; + goto contline; + } + if (bracket_offset < equal_offset) { + *found_bracket = 1; + offset = bracket_offset+1; + } else + offset++; + + return offset; +} + +int perf_config_set(const char* key, const char* value) +{ + return perf_config_set_multivar(key, value, NULL, 0); +} + +/* + * If value==NULL, unset in (remove from) config, + * if value_regex!=NULL, disregard key/value pairs where value does not match. + * if multi_replace==0, nothing, or only one matching key/value is replaced, + * else all matching key/values (regardless how many) are removed, + * before the new pair is written. + * + * Returns 0 on success. + * + * This function does this: + * + * - it locks the config file by creating ".perf/config.lock" + * + * - it then parses the config using store_aux() as validator to find + * the position on the key/value pair to replace. If it is to be unset, + * it must be found exactly once. + * + * - the config file is mmap()ed and the part before the match (if any) is + * written to the lock file, then the changed part and the rest. + * + * - the config file is removed and the lock file rename()d to it. + * + */ +int perf_config_set_multivar(const char* key, const char* value, + const char* value_regex, int multi_replace) +{ + int i, dot; + int fd = -1, in_fd; + int ret; + char* config_filename; + const char* last_dot = strrchr(key, '.'); + + if (config_exclusive_filename) + config_filename = strdup(config_exclusive_filename); + else + config_filename = perf_pathdup("config"); + + /* + * Since "key" actually contains the section name and the real + * key name separated by a dot, we have to know where the dot is. + */ + + if (last_dot == NULL) { + error("key does not contain a section: %s", key); + ret = 2; + goto out_free; + } + store.baselen = last_dot - key; + + store.multi_replace = multi_replace; + + /* + * Validate the key and while at it, lower case it for matching. + */ + store.key = malloc(strlen(key) + 1); + dot = 0; + for (i = 0; key[i]; i++) { + unsigned char c = key[i]; + if (c == '.') + dot = 1; + /* Leave the extended basename untouched.. */ + if (!dot || i > store.baselen) { + if (!iskeychar(c) || (i == store.baselen+1 && !isalpha(c))) { + error("invalid key: %s", key); + free(store.key); + ret = 1; + goto out_free; + } + c = tolower(c); + } else if (c == '\n') { + error("invalid key (newline): %s", key); + free(store.key); + ret = 1; + goto out_free; + } + store.key[i] = c; + } + store.key[i] = 0; + + /* + * If .perf/config does not exist yet, write a minimal version. + */ + in_fd = open(config_filename, O_RDONLY); + if ( in_fd < 0 ) { + free(store.key); + + if ( ENOENT != errno ) { + error("opening %s: %s", config_filename, + strerror(errno)); + ret = 3; /* same as "invalid config file" */ + goto out_free; + } + /* if nothing to unset, error out */ + if (value == NULL) { + ret = 5; + goto out_free; + } + + store.key = (char*)key; + if (!store_write_section(fd, key) || + !store_write_pair(fd, key, value)) + goto write_err_out; + } else { + struct stat st; + char* contents; + size_t contents_sz, copy_begin, copy_end; + int i, new_line = 0; + + if (value_regex == NULL) + store.value_regex = NULL; + else { + if (value_regex[0] == '!') { + store.do_not_match = 1; + value_regex++; + } else + store.do_not_match = 0; + + store.value_regex = (regex_t*)malloc(sizeof(regex_t)); + if (regcomp(store.value_regex, value_regex, + REG_EXTENDED)) { + error("invalid pattern: %s", value_regex); + free(store.value_regex); + ret = 6; + goto out_free; + } + } + + store.offset[0] = 0; + store.state = START; + store.seen = 0; + + /* + * After this, store.offset will contain the *end* offset + * of the last match, or remain at 0 if no match was found. + * As a side effect, we make sure to transform only a valid + * existing config file. + */ + if (perf_config_from_file(store_aux, config_filename, NULL)) { + error("invalid config file %s", config_filename); + free(store.key); + if (store.value_regex != NULL) { + regfree(store.value_regex); + free(store.value_regex); + } + ret = 3; + goto out_free; + } + + free(store.key); + if (store.value_regex != NULL) { + regfree(store.value_regex); + free(store.value_regex); + } + + /* if nothing to unset, or too many matches, error out */ + if ((store.seen == 0 && value == NULL) || + (store.seen > 1 && multi_replace == 0)) { + ret = 5; + goto out_free; + } + + fstat(in_fd, &st); + contents_sz = xsize_t(st.st_size); + contents = mmap(NULL, contents_sz, PROT_READ, + MAP_PRIVATE, in_fd, 0); + close(in_fd); + + if (store.seen == 0) + store.seen = 1; + + for (i = 0, copy_begin = 0; i < store.seen; i++) { + if (store.offset[i] == 0) { + store.offset[i] = copy_end = contents_sz; + } else if (store.state != KEY_SEEN) { + copy_end = store.offset[i]; + } else + copy_end = find_beginning_of_line( + contents, contents_sz, + store.offset[i]-2, &new_line); + + if (copy_end > 0 && contents[copy_end-1] != '\n') + new_line = 1; + + /* write the first part of the config */ + if (copy_end > copy_begin) { + if (write_in_full(fd, contents + copy_begin, + copy_end - copy_begin) < + copy_end - copy_begin) + goto write_err_out; + if (new_line && + write_in_full(fd, "\n", 1) != 1) + goto write_err_out; + } + copy_begin = store.offset[i]; + } + + /* write the pair (value == NULL means unset) */ + if (value != NULL) { + if (store.state == START) { + if (!store_write_section(fd, key)) + goto write_err_out; + } + if (!store_write_pair(fd, key, value)) + goto write_err_out; + } + + /* write the rest of the config */ + if (copy_begin < contents_sz) + if (write_in_full(fd, contents + copy_begin, + contents_sz - copy_begin) < + contents_sz - copy_begin) + goto write_err_out; + + munmap(contents, contents_sz); + } + + ret = 0; + +out_free: + free(config_filename); + return ret; + +write_err_out: + goto out_free; + +} + +static int section_name_match (const char *buf, const char *name) +{ + int i = 0, j = 0, dot = 0; + for (; buf[i] && buf[i] != ']'; i++) { + if (!dot && isspace(buf[i])) { + dot = 1; + if (name[j++] != '.') + break; + for (i++; isspace(buf[i]); i++) + ; /* do nothing */ + if (buf[i] != '"') + break; + continue; + } + if (buf[i] == '\\' && dot) + i++; + else if (buf[i] == '"' && dot) { + for (i++; isspace(buf[i]); i++) + ; /* do_nothing */ + break; + } + if (buf[i] != name[j++]) + break; + } + return (buf[i] == ']' && name[j] == 0); +} + +/* if new_name == NULL, the section is removed instead */ +int perf_config_rename_section(const char *old_name, const char *new_name) +{ + int ret = 0, remove = 0; + char *config_filename; + int out_fd; + char buf[1024]; + + if (config_exclusive_filename) + config_filename = strdup(config_exclusive_filename); + else + config_filename = perf_pathdup("config"); + if (out_fd < 0) { + ret = error("could not lock config file %s", config_filename); + goto out; + } + + if (!(config_file = fopen(config_filename, "rb"))) { + /* no config file means nothing to rename, no error */ + goto unlock_and_out; + } + + while (fgets(buf, sizeof(buf), config_file)) { + int i; + int length; + for (i = 0; buf[i] && isspace(buf[i]); i++) + ; /* do nothing */ + if (buf[i] == '[') { + /* it's a section */ + if (section_name_match (&buf[i+1], old_name)) { + ret++; + if (new_name == NULL) { + remove = 1; + continue; + } + store.baselen = strlen(new_name); + if (!store_write_section(out_fd, new_name)) { + goto out; + } + continue; + } + remove = 0; + } + if (remove) + continue; + length = strlen(buf); + if (write_in_full(out_fd, buf, length) != length) { + goto out; + } + } + fclose(config_file); + unlock_and_out: + out: + free(config_filename); + return ret; +} + +/* + * Call this to report error for your variable that should not + * get a boolean value (i.e. "[my] var" means "true"). + */ +int config_error_nonbool(const char *var) +{ + return error("Missing value for '%s'", var); +} diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/ctype.c new file mode 100644 index 00000000000..b90ec004f29 --- /dev/null +++ b/Documentation/perf_counter/ctype.c @@ -0,0 +1,26 @@ +/* + * Sane locale-independent, ASCII ctype. + * + * No surprises, and works with signed and unsigned chars. + */ +#include "cache.h" + +enum { + S = GIT_SPACE, + A = GIT_ALPHA, + D = GIT_DIGIT, + G = GIT_GLOB_SPECIAL, /* *, ?, [, \\ */ + R = GIT_REGEX_SPECIAL, /* $, (, ), +, ., ^, {, | * */ +}; + +unsigned char sane_ctype[256] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, S, S, 0, 0, S, 0, 0, /* 0.. 15 */ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 16.. 31 */ + S, 0, 0, 0, R, 0, 0, 0, R, R, G, R, 0, 0, R, 0, /* 32.. 47 */ + D, D, D, D, D, D, D, D, D, D, 0, 0, 0, 0, 0, G, /* 48.. 63 */ + 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 64.. 79 */ + A, A, A, A, A, A, A, A, A, A, A, G, G, 0, R, 0, /* 80.. 95 */ + 0, A, A, A, A, A, A, A, A, A, A, A, A, A, A, A, /* 96..111 */ + A, A, A, A, A, A, A, A, A, A, A, R, R, 0, 0, 0, /* 112..127 */ + /* Nothing in the 128.. range */ +}; diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/exec_cmd.c new file mode 100644 index 00000000000..d3929226315 --- /dev/null +++ b/Documentation/perf_counter/exec_cmd.c @@ -0,0 +1,165 @@ +#include "cache.h" +#include "exec_cmd.h" +#include "quote.h" +#define MAX_ARGS 32 + +extern char **environ; +static const char *argv_exec_path; +static const char *argv0_path; + +const char *system_path(const char *path) +{ +#ifdef RUNTIME_PREFIX + static const char *prefix; +#else + static const char *prefix = PREFIX; +#endif + struct strbuf d = STRBUF_INIT; + + if (is_absolute_path(path)) + return path; + +#ifdef RUNTIME_PREFIX + assert(argv0_path); + assert(is_absolute_path(argv0_path)); + + if (!prefix && + !(prefix = strip_path_suffix(argv0_path, PERF_EXEC_PATH)) && + !(prefix = strip_path_suffix(argv0_path, BINDIR)) && + !(prefix = strip_path_suffix(argv0_path, "perf"))) { + prefix = PREFIX; + fprintf(stderr, "RUNTIME_PREFIX requested, " + "but prefix computation failed. " + "Using static fallback '%s'.\n", prefix); + } +#endif + + strbuf_addf(&d, "%s/%s", prefix, path); + path = strbuf_detach(&d, NULL); + return path; +} + +const char *perf_extract_argv0_path(const char *argv0) +{ + const char *slash; + + if (!argv0 || !*argv0) + return NULL; + slash = argv0 + strlen(argv0); + + while (argv0 <= slash && !is_dir_sep(*slash)) + slash--; + + if (slash >= argv0) { + argv0_path = strndup(argv0, slash - argv0); + return slash + 1; + } + + return argv0; +} + +void perf_set_argv_exec_path(const char *exec_path) +{ + argv_exec_path = exec_path; + /* + * Propagate this setting to external programs. + */ + setenv(EXEC_PATH_ENVIRONMENT, exec_path, 1); +} + + +/* Returns the highest-priority, location to look for perf programs. */ +const char *perf_exec_path(void) +{ + const char *env; + + if (argv_exec_path) + return argv_exec_path; + + env = getenv(EXEC_PATH_ENVIRONMENT); + if (env && *env) { + return env; + } + + return system_path(PERF_EXEC_PATH); +} + +static void add_path(struct strbuf *out, const char *path) +{ + if (path && *path) { + if (is_absolute_path(path)) + strbuf_addstr(out, path); + else + strbuf_addstr(out, make_nonrelative_path(path)); + + strbuf_addch(out, PATH_SEP); + } +} + +void setup_path(void) +{ + const char *old_path = getenv("PATH"); + struct strbuf new_path = STRBUF_INIT; + + add_path(&new_path, perf_exec_path()); + add_path(&new_path, argv0_path); + + if (old_path) + strbuf_addstr(&new_path, old_path); + else + strbuf_addstr(&new_path, "/usr/local/bin:/usr/bin:/bin"); + + setenv("PATH", new_path.buf, 1); + + strbuf_release(&new_path); +} + +const char **prepare_perf_cmd(const char **argv) +{ + int argc; + const char **nargv; + + for (argc = 0; argv[argc]; argc++) + ; /* just counting */ + nargv = malloc(sizeof(*nargv) * (argc + 2)); + + nargv[0] = "perf"; + for (argc = 0; argv[argc]; argc++) + nargv[argc + 1] = argv[argc]; + nargv[argc + 1] = NULL; + return nargv; +} + +int execv_perf_cmd(const char **argv) { + const char **nargv = prepare_perf_cmd(argv); + + /* execvp() can only ever return if it fails */ + execvp("perf", (char **)nargv); + + free(nargv); + return -1; +} + + +int execl_perf_cmd(const char *cmd,...) +{ + int argc; + const char *argv[MAX_ARGS + 1]; + const char *arg; + va_list param; + + va_start(param, cmd); + argv[0] = cmd; + argc = 1; + while (argc < MAX_ARGS) { + arg = argv[argc++] = va_arg(param, char *); + if (!arg) + break; + } + va_end(param); + if (MAX_ARGS <= argc) + return error("too many args to run %s", cmd); + + argv[argc] = NULL; + return execv_perf_cmd(argv); +} diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/exec_cmd.h new file mode 100644 index 00000000000..effe25eb154 --- /dev/null +++ b/Documentation/perf_counter/exec_cmd.h @@ -0,0 +1,13 @@ +#ifndef PERF_EXEC_CMD_H +#define PERF_EXEC_CMD_H + +extern void perf_set_argv_exec_path(const char *exec_path); +extern const char *perf_extract_argv0_path(const char *path); +extern const char *perf_exec_path(void); +extern void setup_path(void); +extern const char **prepare_perf_cmd(const char **argv); +extern int execv_perf_cmd(const char **argv); /* NULL terminated */ +extern int execl_perf_cmd(const char *cmd, ...); +extern const char *system_path(const char *path); + +#endif /* PERF_EXEC_CMD_H */ diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh new file mode 100755 index 00000000000..75c68d948fd --- /dev/null +++ b/Documentation/perf_counter/generate-cmdlist.sh @@ -0,0 +1,24 @@ +#!/bin/sh + +echo "/* Automatically generated by $0 */ +struct cmdname_help +{ + char name[16]; + char help[80]; +}; + +static struct cmdname_help common_cmds[] = {" + +sed -n -e 's/^git-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt | +sort | +while read cmd +do + sed -n ' + /^NAME/,/git-'"$cmd"'/H + ${ + x + s/.*git-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + p + }' "Documentation/git-$cmd.txt" +done +echo "};" diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/help.c new file mode 100644 index 00000000000..ec011672166 --- /dev/null +++ b/Documentation/perf_counter/help.c @@ -0,0 +1,366 @@ +#include "cache.h" +#include "builtin.h" +#include "exec_cmd.h" +#include "levenshtein.h" +#include "help.h" + +/* most GUI terminals set COLUMNS (although some don't export it) */ +static int term_columns(void) +{ + char *col_string = getenv("COLUMNS"); + int n_cols; + + if (col_string && (n_cols = atoi(col_string)) > 0) + return n_cols; + +#ifdef TIOCGWINSZ + { + struct winsize ws; + if (!ioctl(1, TIOCGWINSZ, &ws)) { + if (ws.ws_col) + return ws.ws_col; + } + } +#endif + + return 80; +} + +void add_cmdname(struct cmdnames *cmds, const char *name, int len) +{ + struct cmdname *ent = malloc(sizeof(*ent) + len + 1); + + ent->len = len; + memcpy(ent->name, name, len); + ent->name[len] = 0; + + ALLOC_GROW(cmds->names, cmds->cnt + 1, cmds->alloc); + cmds->names[cmds->cnt++] = ent; +} + +static void clean_cmdnames(struct cmdnames *cmds) +{ + int i; + for (i = 0; i < cmds->cnt; ++i) + free(cmds->names[i]); + free(cmds->names); + cmds->cnt = 0; + cmds->alloc = 0; +} + +static int cmdname_compare(const void *a_, const void *b_) +{ + struct cmdname *a = *(struct cmdname **)a_; + struct cmdname *b = *(struct cmdname **)b_; + return strcmp(a->name, b->name); +} + +static void uniq(struct cmdnames *cmds) +{ + int i, j; + + if (!cmds->cnt) + return; + + for (i = j = 1; i < cmds->cnt; i++) + if (strcmp(cmds->names[i]->name, cmds->names[i-1]->name)) + cmds->names[j++] = cmds->names[i]; + + cmds->cnt = j; +} + +void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes) +{ + int ci, cj, ei; + int cmp; + + ci = cj = ei = 0; + while (ci < cmds->cnt && ei < excludes->cnt) { + cmp = strcmp(cmds->names[ci]->name, excludes->names[ei]->name); + if (cmp < 0) + cmds->names[cj++] = cmds->names[ci++]; + else if (cmp == 0) + ci++, ei++; + else if (cmp > 0) + ei++; + } + + while (ci < cmds->cnt) + cmds->names[cj++] = cmds->names[ci++]; + + cmds->cnt = cj; +} + +static void pretty_print_string_list(struct cmdnames *cmds, int longest) +{ + int cols = 1, rows; + int space = longest + 1; /* min 1 SP between words */ + int max_cols = term_columns() - 1; /* don't print *on* the edge */ + int i, j; + + if (space < max_cols) + cols = max_cols / space; + rows = (cmds->cnt + cols - 1) / cols; + + for (i = 0; i < rows; i++) { + printf(" "); + + for (j = 0; j < cols; j++) { + int n = j * rows + i; + int size = space; + if (n >= cmds->cnt) + break; + if (j == cols-1 || n + rows >= cmds->cnt) + size = 1; + printf("%-*s", size, cmds->names[n]->name); + } + putchar('\n'); + } +} + +static int is_executable(const char *name) +{ + struct stat st; + + if (stat(name, &st) || /* stat, not lstat */ + !S_ISREG(st.st_mode)) + return 0; + +#ifdef __MINGW32__ + /* cannot trust the executable bit, peek into the file instead */ + char buf[3] = { 0 }; + int n; + int fd = open(name, O_RDONLY); + st.st_mode &= ~S_IXUSR; + if (fd >= 0) { + n = read(fd, buf, 2); + if (n == 2) + /* DOS executables start with "MZ" */ + if (!strcmp(buf, "#!") || !strcmp(buf, "MZ")) + st.st_mode |= S_IXUSR; + close(fd); + } +#endif + return st.st_mode & S_IXUSR; +} + +static void list_commands_in_dir(struct cmdnames *cmds, + const char *path, + const char *prefix) +{ + int prefix_len; + DIR *dir = opendir(path); + struct dirent *de; + struct strbuf buf = STRBUF_INIT; + int len; + + if (!dir) + return; + if (!prefix) + prefix = "perf-"; + prefix_len = strlen(prefix); + + strbuf_addf(&buf, "%s/", path); + len = buf.len; + + while ((de = readdir(dir)) != NULL) { + int entlen; + + if (prefixcmp(de->d_name, prefix)) + continue; + + strbuf_setlen(&buf, len); + strbuf_addstr(&buf, de->d_name); + if (!is_executable(buf.buf)) + continue; + + entlen = strlen(de->d_name) - prefix_len; + if (has_extension(de->d_name, ".exe")) + entlen -= 4; + + add_cmdname(cmds, de->d_name + prefix_len, entlen); + } + closedir(dir); + strbuf_release(&buf); +} + +void load_command_list(const char *prefix, + struct cmdnames *main_cmds, + struct cmdnames *other_cmds) +{ + const char *env_path = getenv("PATH"); + const char *exec_path = perf_exec_path(); + + if (exec_path) { + list_commands_in_dir(main_cmds, exec_path, prefix); + qsort(main_cmds->names, main_cmds->cnt, + sizeof(*main_cmds->names), cmdname_compare); + uniq(main_cmds); + } + + if (env_path) { + char *paths, *path, *colon; + path = paths = strdup(env_path); + while (1) { + if ((colon = strchr(path, PATH_SEP))) + *colon = 0; + if (!exec_path || strcmp(path, exec_path)) + list_commands_in_dir(other_cmds, path, prefix); + + if (!colon) + break; + path = colon + 1; + } + free(paths); + + qsort(other_cmds->names, other_cmds->cnt, + sizeof(*other_cmds->names), cmdname_compare); + uniq(other_cmds); + } + exclude_cmds(other_cmds, main_cmds); +} + +void list_commands(const char *title, struct cmdnames *main_cmds, + struct cmdnames *other_cmds) +{ + int i, longest = 0; + + for (i = 0; i < main_cmds->cnt; i++) + if (longest < main_cmds->names[i]->len) + longest = main_cmds->names[i]->len; + for (i = 0; i < other_cmds->cnt; i++) + if (longest < other_cmds->names[i]->len) + longest = other_cmds->names[i]->len; + + if (main_cmds->cnt) { + const char *exec_path = perf_exec_path(); + printf("available %s in '%s'\n", title, exec_path); + printf("----------------"); + mput_char('-', strlen(title) + strlen(exec_path)); + putchar('\n'); + pretty_print_string_list(main_cmds, longest); + putchar('\n'); + } + + if (other_cmds->cnt) { + printf("%s available from elsewhere on your $PATH\n", title); + printf("---------------------------------------"); + mput_char('-', strlen(title)); + putchar('\n'); + pretty_print_string_list(other_cmds, longest); + putchar('\n'); + } +} + +int is_in_cmdlist(struct cmdnames *c, const char *s) +{ + int i; + for (i = 0; i < c->cnt; i++) + if (!strcmp(s, c->names[i]->name)) + return 1; + return 0; +} + +static int autocorrect; +static struct cmdnames aliases; + +static int perf_unknown_cmd_config(const char *var, const char *value, void *cb) +{ + if (!strcmp(var, "help.autocorrect")) + autocorrect = perf_config_int(var,value); + /* Also use aliases for command lookup */ + if (!prefixcmp(var, "alias.")) + add_cmdname(&aliases, var + 6, strlen(var + 6)); + + return perf_default_config(var, value, cb); +} + +static int levenshtein_compare(const void *p1, const void *p2) +{ + const struct cmdname *const *c1 = p1, *const *c2 = p2; + const char *s1 = (*c1)->name, *s2 = (*c2)->name; + int l1 = (*c1)->len; + int l2 = (*c2)->len; + return l1 != l2 ? l1 - l2 : strcmp(s1, s2); +} + +static void add_cmd_list(struct cmdnames *cmds, struct cmdnames *old) +{ + int i; + ALLOC_GROW(cmds->names, cmds->cnt + old->cnt, cmds->alloc); + + for (i = 0; i < old->cnt; i++) + cmds->names[cmds->cnt++] = old->names[i]; + free(old->names); + old->cnt = 0; + old->names = NULL; +} + +const char *help_unknown_cmd(const char *cmd) +{ + int i, n, best_similarity = 0; + struct cmdnames main_cmds, other_cmds; + + memset(&main_cmds, 0, sizeof(main_cmds)); + memset(&other_cmds, 0, sizeof(main_cmds)); + memset(&aliases, 0, sizeof(aliases)); + + perf_config(perf_unknown_cmd_config, NULL); + + load_command_list("perf-", &main_cmds, &other_cmds); + + add_cmd_list(&main_cmds, &aliases); + add_cmd_list(&main_cmds, &other_cmds); + qsort(main_cmds.names, main_cmds.cnt, + sizeof(main_cmds.names), cmdname_compare); + uniq(&main_cmds); + + /* This reuses cmdname->len for similarity index */ + for (i = 0; i < main_cmds.cnt; ++i) + main_cmds.names[i]->len = + levenshtein(cmd, main_cmds.names[i]->name, 0, 2, 1, 4); + + qsort(main_cmds.names, main_cmds.cnt, + sizeof(*main_cmds.names), levenshtein_compare); + + if (!main_cmds.cnt) + die ("Uh oh. Your system reports no Git commands at all."); + + best_similarity = main_cmds.names[0]->len; + n = 1; + while (n < main_cmds.cnt && best_similarity == main_cmds.names[n]->len) + ++n; + if (autocorrect && n == 1) { + const char *assumed = main_cmds.names[0]->name; + main_cmds.names[0] = NULL; + clean_cmdnames(&main_cmds); + fprintf(stderr, "WARNING: You called a Git program named '%s', " + "which does not exist.\n" + "Continuing under the assumption that you meant '%s'\n", + cmd, assumed); + if (autocorrect > 0) { + fprintf(stderr, "in %0.1f seconds automatically...\n", + (float)autocorrect/10.0); + poll(NULL, 0, autocorrect * 100); + } + return assumed; + } + + fprintf(stderr, "perf: '%s' is not a perf-command. See 'perf --help'.\n", cmd); + + if (best_similarity < 6) { + fprintf(stderr, "\nDid you mean %s?\n", + n < 2 ? "this": "one of these"); + + for (i = 0; i < n; i++) + fprintf(stderr, "\t%s\n", main_cmds.names[i]->name); + } + + exit(1); +} + +int cmd_version(int argc, const char **argv, const char *prefix) +{ + printf("perf version %s\n", perf_version_string); + return 0; +} diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/help.h new file mode 100644 index 00000000000..56bc15406ff --- /dev/null +++ b/Documentation/perf_counter/help.h @@ -0,0 +1,29 @@ +#ifndef HELP_H +#define HELP_H + +struct cmdnames { + int alloc; + int cnt; + struct cmdname { + size_t len; /* also used for similarity index in help.c */ + char name[FLEX_ARRAY]; + } **names; +}; + +static inline void mput_char(char c, unsigned int num) +{ + while(num--) + putchar(c); +} + +void load_command_list(const char *prefix, + struct cmdnames *main_cmds, + struct cmdnames *other_cmds); +void add_cmdname(struct cmdnames *cmds, const char *name, int len); +/* Here we require that excludes is a sorted list. */ +void exclude_cmds(struct cmdnames *cmds, struct cmdnames *excludes); +int is_in_cmdlist(struct cmdnames *c, const char *s); +void list_commands(const char *title, struct cmdnames *main_cmds, + struct cmdnames *other_cmds); + +#endif /* HELP_H */ diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/levenshtein.c new file mode 100644 index 00000000000..e521d1516df --- /dev/null +++ b/Documentation/perf_counter/levenshtein.c @@ -0,0 +1,84 @@ +#include "cache.h" +#include "levenshtein.h" + +/* + * This function implements the Damerau-Levenshtein algorithm to + * calculate a distance between strings. + * + * Basically, it says how many letters need to be swapped, substituted, + * deleted from, or added to string1, at least, to get string2. + * + * The idea is to build a distance matrix for the substrings of both + * strings. To avoid a large space complexity, only the last three rows + * are kept in memory (if swaps had the same or higher cost as one deletion + * plus one insertion, only two rows would be needed). + * + * At any stage, "i + 1" denotes the length of the current substring of + * string1 that the distance is calculated for. + * + * row2 holds the current row, row1 the previous row (i.e. for the substring + * of string1 of length "i"), and row0 the row before that. + * + * In other words, at the start of the big loop, row2[j + 1] contains the + * Damerau-Levenshtein distance between the substring of string1 of length + * "i" and the substring of string2 of length "j + 1". + * + * All the big loop does is determine the partial minimum-cost paths. + * + * It does so by calculating the costs of the path ending in characters + * i (in string1) and j (in string2), respectively, given that the last + * operation is a substition, a swap, a deletion, or an insertion. + * + * This implementation allows the costs to be weighted: + * + * - w (as in "sWap") + * - s (as in "Substitution") + * - a (for insertion, AKA "Add") + * - d (as in "Deletion") + * + * Note that this algorithm calculates a distance _iff_ d == a. + */ +int levenshtein(const char *string1, const char *string2, + int w, int s, int a, int d) +{ + int len1 = strlen(string1), len2 = strlen(string2); + int *row0 = malloc(sizeof(int) * (len2 + 1)); + int *row1 = malloc(sizeof(int) * (len2 + 1)); + int *row2 = malloc(sizeof(int) * (len2 + 1)); + int i, j; + + for (j = 0; j <= len2; j++) + row1[j] = j * a; + for (i = 0; i < len1; i++) { + int *dummy; + + row2[0] = (i + 1) * d; + for (j = 0; j < len2; j++) { + /* substitution */ + row2[j + 1] = row1[j] + s * (string1[i] != string2[j]); + /* swap */ + if (i > 0 && j > 0 && string1[i - 1] == string2[j] && + string1[i] == string2[j - 1] && + row2[j + 1] > row0[j - 1] + w) + row2[j + 1] = row0[j - 1] + w; + /* deletion */ + if (row2[j + 1] > row1[j + 1] + d) + row2[j + 1] = row1[j + 1] + d; + /* insertion */ + if (row2[j + 1] > row2[j] + a) + row2[j + 1] = row2[j] + a; + } + + dummy = row0; + row0 = row1; + row1 = row2; + row2 = dummy; + } + + i = row1[len2]; + free(row0); + free(row1); + free(row2); + + return i; +} diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/levenshtein.h new file mode 100644 index 00000000000..0173abeef52 --- /dev/null +++ b/Documentation/perf_counter/levenshtein.h @@ -0,0 +1,8 @@ +#ifndef LEVENSHTEIN_H +#define LEVENSHTEIN_H + +int levenshtein(const char *string1, const char *string2, + int swap_penalty, int substition_penalty, + int insertion_penalty, int deletion_penalty); + +#endif diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/parse-options.c new file mode 100644 index 00000000000..7464f34e540 --- /dev/null +++ b/Documentation/perf_counter/parse-options.c @@ -0,0 +1,495 @@ +#include "util.h" +#include "parse-options.h" +#include "cache.h" + +#define OPT_SHORT 1 +#define OPT_UNSET 2 + +static int opterror(const struct option *opt, const char *reason, int flags) +{ + if (flags & OPT_SHORT) + return error("switch `%c' %s", opt->short_name, reason); + if (flags & OPT_UNSET) + return error("option `no-%s' %s", opt->long_name, reason); + return error("option `%s' %s", opt->long_name, reason); +} + +static int get_arg(struct parse_opt_ctx_t *p, const struct option *opt, + int flags, const char **arg) +{ + if (p->opt) { + *arg = p->opt; + p->opt = NULL; + } else if (p->argc == 1 && (opt->flags & PARSE_OPT_LASTARG_DEFAULT)) { + *arg = (const char *)opt->defval; + } else if (p->argc > 1) { + p->argc--; + *arg = *++p->argv; + } else + return opterror(opt, "requires a value", flags); + return 0; +} + +static int get_value(struct parse_opt_ctx_t *p, + const struct option *opt, int flags) +{ + const char *s, *arg; + const int unset = flags & OPT_UNSET; + + if (unset && p->opt) + return opterror(opt, "takes no value", flags); + if (unset && (opt->flags & PARSE_OPT_NONEG)) + return opterror(opt, "isn't available", flags); + + if (!(flags & OPT_SHORT) && p->opt) { + switch (opt->type) { + case OPTION_CALLBACK: + if (!(opt->flags & PARSE_OPT_NOARG)) + break; + /* FALLTHROUGH */ + case OPTION_BOOLEAN: + case OPTION_BIT: + case OPTION_SET_INT: + case OPTION_SET_PTR: + return opterror(opt, "takes no value", flags); + default: + break; + } + } + + switch (opt->type) { + case OPTION_BIT: + if (unset) + *(int *)opt->value &= ~opt->defval; + else + *(int *)opt->value |= opt->defval; + return 0; + + case OPTION_BOOLEAN: + *(int *)opt->value = unset ? 0 : *(int *)opt->value + 1; + return 0; + + case OPTION_SET_INT: + *(int *)opt->value = unset ? 0 : opt->defval; + return 0; + + case OPTION_SET_PTR: + *(void **)opt->value = unset ? NULL : (void *)opt->defval; + return 0; + + case OPTION_STRING: + if (unset) + *(const char **)opt->value = NULL; + else if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + *(const char **)opt->value = (const char *)opt->defval; + else + return get_arg(p, opt, flags, (const char **)opt->value); + return 0; + + case OPTION_CALLBACK: + if (unset) + return (*opt->callback)(opt, NULL, 1) ? (-1) : 0; + if (opt->flags & PARSE_OPT_NOARG) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) + return (*opt->callback)(opt, NULL, 0) ? (-1) : 0; + if (get_arg(p, opt, flags, &arg)) + return -1; + return (*opt->callback)(opt, arg, 0) ? (-1) : 0; + + case OPTION_INTEGER: + if (unset) { + *(int *)opt->value = 0; + return 0; + } + if (opt->flags & PARSE_OPT_OPTARG && !p->opt) { + *(int *)opt->value = opt->defval; + return 0; + } + if (get_arg(p, opt, flags, &arg)) + return -1; + *(int *)opt->value = strtol(arg, (char **)&s, 10); + if (*s) + return opterror(opt, "expects a numerical value", flags); + return 0; + + default: + die("should not happen, someone must be hit on the forehead"); + } +} + +static int parse_short_opt(struct parse_opt_ctx_t *p, const struct option *options) +{ + for (; options->type != OPTION_END; options++) { + if (options->short_name == *p->opt) { + p->opt = p->opt[1] ? p->opt + 1 : NULL; + return get_value(p, options, OPT_SHORT); + } + } + return -2; +} + +static int parse_long_opt(struct parse_opt_ctx_t *p, const char *arg, + const struct option *options) +{ + const char *arg_end = strchr(arg, '='); + const struct option *abbrev_option = NULL, *ambiguous_option = NULL; + int abbrev_flags = 0, ambiguous_flags = 0; + + if (!arg_end) + arg_end = arg + strlen(arg); + + for (; options->type != OPTION_END; options++) { + const char *rest; + int flags = 0; + + if (!options->long_name) + continue; + + rest = skip_prefix(arg, options->long_name); + if (options->type == OPTION_ARGUMENT) { + if (!rest) + continue; + if (*rest == '=') + return opterror(options, "takes no value", flags); + if (*rest) + continue; + p->out[p->cpidx++] = arg - 2; + return 0; + } + if (!rest) { + /* abbreviated? */ + if (!strncmp(options->long_name, arg, arg_end - arg)) { +is_abbreviated: + if (abbrev_option) { + /* + * If this is abbreviated, it is + * ambiguous. So when there is no + * exact match later, we need to + * error out. + */ + ambiguous_option = abbrev_option; + ambiguous_flags = abbrev_flags; + } + if (!(flags & OPT_UNSET) && *arg_end) + p->opt = arg_end + 1; + abbrev_option = options; + abbrev_flags = flags; + continue; + } + /* negated and abbreviated very much? */ + if (!prefixcmp("no-", arg)) { + flags |= OPT_UNSET; + goto is_abbreviated; + } + /* negated? */ + if (strncmp(arg, "no-", 3)) + continue; + flags |= OPT_UNSET; + rest = skip_prefix(arg + 3, options->long_name); + /* abbreviated and negated? */ + if (!rest && !prefixcmp(options->long_name, arg + 3)) + goto is_abbreviated; + if (!rest) + continue; + } + if (*rest) { + if (*rest != '=') + continue; + p->opt = rest + 1; + } + return get_value(p, options, flags); + } + + if (ambiguous_option) + return error("Ambiguous option: %s " + "(could be --%s%s or --%s%s)", + arg, + (ambiguous_flags & OPT_UNSET) ? "no-" : "", + ambiguous_option->long_name, + (abbrev_flags & OPT_UNSET) ? "no-" : "", + abbrev_option->long_name); + if (abbrev_option) + return get_value(p, abbrev_option, abbrev_flags); + return -2; +} + +static void check_typos(const char *arg, const struct option *options) +{ + if (strlen(arg) < 3) + return; + + if (!prefixcmp(arg, "no-")) { + error ("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + + for (; options->type != OPTION_END; options++) { + if (!options->long_name) + continue; + if (!prefixcmp(options->long_name, arg)) { + error ("did you mean `--%s` (with two dashes ?)", arg); + exit(129); + } + } +} + +void parse_options_start(struct parse_opt_ctx_t *ctx, + int argc, const char **argv, int flags) +{ + memset(ctx, 0, sizeof(*ctx)); + ctx->argc = argc - 1; + ctx->argv = argv + 1; + ctx->out = argv; + ctx->cpidx = ((flags & PARSE_OPT_KEEP_ARGV0) != 0); + ctx->flags = flags; + if ((flags & PARSE_OPT_KEEP_UNKNOWN) && + (flags & PARSE_OPT_STOP_AT_NON_OPTION)) + die("STOP_AT_NON_OPTION and KEEP_UNKNOWN don't go together"); +} + +static int usage_with_options_internal(const char * const *, + const struct option *, int); + +int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, + const char * const usagestr[]) +{ + int internal_help = !(ctx->flags & PARSE_OPT_NO_INTERNAL_HELP); + + /* we must reset ->opt, unknown short option leave it dangling */ + ctx->opt = NULL; + + for (; ctx->argc; ctx->argc--, ctx->argv++) { + const char *arg = ctx->argv[0]; + + if (*arg != '-' || !arg[1]) { + if (ctx->flags & PARSE_OPT_STOP_AT_NON_OPTION) + break; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + continue; + } + + if (arg[1] != '-') { + ctx->opt = arg + 1; + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + } + if (ctx->opt) + check_typos(arg + 1, options); + while (ctx->opt) { + if (internal_help && *ctx->opt == 'h') + return parse_options_usage(usagestr, options); + switch (parse_short_opt(ctx, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + /* fake a short option thing to hide the fact that we may have + * started to parse aggregated stuff + * + * This is leaky, too bad. + */ + ctx->argv[0] = strdup(ctx->opt - 1); + *(char *)ctx->argv[0] = '-'; + goto unknown; + } + } + continue; + } + + if (!arg[2]) { /* "--" */ + if (!(ctx->flags & PARSE_OPT_KEEP_DASHDASH)) { + ctx->argc--; + ctx->argv++; + } + break; + } + + if (internal_help && !strcmp(arg + 2, "help-all")) + return usage_with_options_internal(usagestr, options, 1); + if (internal_help && !strcmp(arg + 2, "help")) + return parse_options_usage(usagestr, options); + switch (parse_long_opt(ctx, arg + 2, options)) { + case -1: + return parse_options_usage(usagestr, options); + case -2: + goto unknown; + } + continue; +unknown: + if (!(ctx->flags & PARSE_OPT_KEEP_UNKNOWN)) + return PARSE_OPT_UNKNOWN; + ctx->out[ctx->cpidx++] = ctx->argv[0]; + ctx->opt = NULL; + } + return PARSE_OPT_DONE; +} + +int parse_options_end(struct parse_opt_ctx_t *ctx) +{ + memmove(ctx->out + ctx->cpidx, ctx->argv, ctx->argc * sizeof(*ctx->out)); + ctx->out[ctx->cpidx + ctx->argc] = NULL; + return ctx->cpidx + ctx->argc; +} + +int parse_options(int argc, const char **argv, const struct option *options, + const char * const usagestr[], int flags) +{ + struct parse_opt_ctx_t ctx; + + parse_options_start(&ctx, argc, argv, flags); + switch (parse_options_step(&ctx, options, usagestr)) { + case PARSE_OPT_HELP: + exit(129); + case PARSE_OPT_DONE: + break; + default: /* PARSE_OPT_UNKNOWN */ + if (ctx.argv[0][1] == '-') { + error("unknown option `%s'", ctx.argv[0] + 2); + } else { + error("unknown switch `%c'", *ctx.opt); + } + usage_with_options(usagestr, options); + } + + return parse_options_end(&ctx); +} + +#define USAGE_OPTS_WIDTH 24 +#define USAGE_GAP 2 + +int usage_with_options_internal(const char * const *usagestr, + const struct option *opts, int full) +{ + if (!usagestr) + return PARSE_OPT_HELP; + + fprintf(stderr, "usage: %s\n", *usagestr++); + while (*usagestr && **usagestr) + fprintf(stderr, " or: %s\n", *usagestr++); + while (*usagestr) { + fprintf(stderr, "%s%s\n", + **usagestr ? " " : "", + *usagestr); + usagestr++; + } + + if (opts->type != OPTION_GROUP) + fputc('\n', stderr); + + for (; opts->type != OPTION_END; opts++) { + size_t pos; + int pad; + + if (opts->type == OPTION_GROUP) { + fputc('\n', stderr); + if (*opts->help) + fprintf(stderr, "%s\n", opts->help); + continue; + } + if (!full && (opts->flags & PARSE_OPT_HIDDEN)) + continue; + + pos = fprintf(stderr, " "); + if (opts->short_name) + pos += fprintf(stderr, "-%c", opts->short_name); + if (opts->long_name && opts->short_name) + pos += fprintf(stderr, ", "); + if (opts->long_name) + pos += fprintf(stderr, "--%s", opts->long_name); + + switch (opts->type) { + case OPTION_ARGUMENT: + break; + case OPTION_INTEGER: + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=]"); + else + pos += fprintf(stderr, "[]"); + else + pos += fprintf(stderr, " "); + break; + case OPTION_CALLBACK: + if (opts->flags & PARSE_OPT_NOARG) + break; + /* FALLTHROUGH */ + case OPTION_STRING: + if (opts->argh) { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=<%s>]", opts->argh); + else + pos += fprintf(stderr, "[<%s>]", opts->argh); + else + pos += fprintf(stderr, " <%s>", opts->argh); + } else { + if (opts->flags & PARSE_OPT_OPTARG) + if (opts->long_name) + pos += fprintf(stderr, "[=...]"); + else + pos += fprintf(stderr, "[...]"); + else + pos += fprintf(stderr, " ..."); + } + break; + default: /* OPTION_{BIT,BOOLEAN,SET_INT,SET_PTR} */ + break; + } + + if (pos <= USAGE_OPTS_WIDTH) + pad = USAGE_OPTS_WIDTH - pos; + else { + fputc('\n', stderr); + pad = USAGE_OPTS_WIDTH; + } + fprintf(stderr, "%*s%s\n", pad + USAGE_GAP, "", opts->help); + } + fputc('\n', stderr); + + return PARSE_OPT_HELP; +} + +void usage_with_options(const char * const *usagestr, + const struct option *opts) +{ + usage_with_options_internal(usagestr, opts, 0); + exit(129); +} + +int parse_options_usage(const char * const *usagestr, + const struct option *opts) +{ + return usage_with_options_internal(usagestr, opts, 0); +} + + +/*----- some often used options -----*/ +#include "cache.h" + +int parse_opt_verbosity_cb(const struct option *opt, const char *arg, + int unset) +{ + int *target = opt->value; + + if (unset) + /* --no-quiet, --no-verbose */ + *target = 0; + else if (opt->short_name == 'v') { + if (*target >= 0) + (*target)++; + else + *target = 1; + } else { + if (*target <= 0) + (*target)--; + else + *target = -1; + } + return 0; +} diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/parse-options.h new file mode 100644 index 00000000000..a81c7faff68 --- /dev/null +++ b/Documentation/perf_counter/parse-options.h @@ -0,0 +1,172 @@ +#ifndef PARSE_OPTIONS_H +#define PARSE_OPTIONS_H + +enum parse_opt_type { + /* special types */ + OPTION_END, + OPTION_ARGUMENT, + OPTION_GROUP, + /* options with no arguments */ + OPTION_BIT, + OPTION_BOOLEAN, /* _INCR would have been a better name */ + OPTION_SET_INT, + OPTION_SET_PTR, + /* options with arguments (usually) */ + OPTION_STRING, + OPTION_INTEGER, + OPTION_CALLBACK, +}; + +enum parse_opt_flags { + PARSE_OPT_KEEP_DASHDASH = 1, + PARSE_OPT_STOP_AT_NON_OPTION = 2, + PARSE_OPT_KEEP_ARGV0 = 4, + PARSE_OPT_KEEP_UNKNOWN = 8, + PARSE_OPT_NO_INTERNAL_HELP = 16, +}; + +enum parse_opt_option_flags { + PARSE_OPT_OPTARG = 1, + PARSE_OPT_NOARG = 2, + PARSE_OPT_NONEG = 4, + PARSE_OPT_HIDDEN = 8, + PARSE_OPT_LASTARG_DEFAULT = 16, +}; + +struct option; +typedef int parse_opt_cb(const struct option *, const char *arg, int unset); + +/* + * `type`:: + * holds the type of the option, you must have an OPTION_END last in your + * array. + * + * `short_name`:: + * the character to use as a short option name, '\0' if none. + * + * `long_name`:: + * the long option name, without the leading dashes, NULL if none. + * + * `value`:: + * stores pointers to the values to be filled. + * + * `argh`:: + * token to explain the kind of argument this option wants. Keep it + * homogenous across the repository. + * + * `help`:: + * the short help associated to what the option does. + * Must never be NULL (except for OPTION_END). + * OPTION_GROUP uses this pointer to store the group header. + * + * `flags`:: + * mask of parse_opt_option_flags. + * PARSE_OPT_OPTARG: says that the argument is optionnal (not for BOOLEANs) + * PARSE_OPT_NOARG: says that this option takes no argument, for CALLBACKs + * PARSE_OPT_NONEG: says that this option cannot be negated + * PARSE_OPT_HIDDEN this option is skipped in the default usage, showed in + * the long one. + * + * `callback`:: + * pointer to the callback to use for OPTION_CALLBACK. + * + * `defval`:: + * default value to fill (*->value) with for PARSE_OPT_OPTARG. + * OPTION_{BIT,SET_INT,SET_PTR} store the {mask,integer,pointer} to put in + * the value when met. + * CALLBACKS can use it like they want. + */ +struct option { + enum parse_opt_type type; + int short_name; + const char *long_name; + void *value; + const char *argh; + const char *help; + + int flags; + parse_opt_cb *callback; + intptr_t defval; +}; + +#define OPT_END() { OPTION_END } +#define OPT_ARGUMENT(l, h) { OPTION_ARGUMENT, 0, (l), NULL, NULL, (h) } +#define OPT_GROUP(h) { OPTION_GROUP, 0, NULL, NULL, NULL, (h) } +#define OPT_BIT(s, l, v, h, b) { OPTION_BIT, (s), (l), (v), NULL, (h), 0, NULL, (b) } +#define OPT_BOOLEAN(s, l, v, h) { OPTION_BOOLEAN, (s), (l), (v), NULL, (h) } +#define OPT_SET_INT(s, l, v, h, i) { OPTION_SET_INT, (s), (l), (v), NULL, (h), 0, NULL, (i) } +#define OPT_SET_PTR(s, l, v, h, p) { OPTION_SET_PTR, (s), (l), (v), NULL, (h), 0, NULL, (p) } +#define OPT_INTEGER(s, l, v, h) { OPTION_INTEGER, (s), (l), (v), NULL, (h) } +#define OPT_STRING(s, l, v, a, h) { OPTION_STRING, (s), (l), (v), (a), (h) } +#define OPT_DATE(s, l, v, h) \ + { OPTION_CALLBACK, (s), (l), (v), "time",(h), 0, \ + parse_opt_approxidate_cb } +#define OPT_CALLBACK(s, l, v, a, h, f) \ + { OPTION_CALLBACK, (s), (l), (v), (a), (h), 0, (f) } + +/* parse_options() will filter out the processed options and leave the + * non-option argments in argv[]. + * Returns the number of arguments left in argv[]. + */ +extern int parse_options(int argc, const char **argv, + const struct option *options, + const char * const usagestr[], int flags); + +extern NORETURN void usage_with_options(const char * const *usagestr, + const struct option *options); + +/*----- incremantal advanced APIs -----*/ + +enum { + PARSE_OPT_HELP = -1, + PARSE_OPT_DONE, + PARSE_OPT_UNKNOWN, +}; + +/* + * It's okay for the caller to consume argv/argc in the usual way. + * Other fields of that structure are private to parse-options and should not + * be modified in any way. + */ +struct parse_opt_ctx_t { + const char **argv; + const char **out; + int argc, cpidx; + const char *opt; + int flags; +}; + +extern int parse_options_usage(const char * const *usagestr, + const struct option *opts); + +extern void parse_options_start(struct parse_opt_ctx_t *ctx, + int argc, const char **argv, int flags); + +extern int parse_options_step(struct parse_opt_ctx_t *ctx, + const struct option *options, + const char * const usagestr[]); + +extern int parse_options_end(struct parse_opt_ctx_t *ctx); + + +/*----- some often used options -----*/ +extern int parse_opt_abbrev_cb(const struct option *, const char *, int); +extern int parse_opt_approxidate_cb(const struct option *, const char *, int); +extern int parse_opt_verbosity_cb(const struct option *, const char *, int); + +#define OPT__VERBOSE(var) OPT_BOOLEAN('v', "verbose", (var), "be verbose") +#define OPT__QUIET(var) OPT_BOOLEAN('q', "quiet", (var), "be quiet") +#define OPT__VERBOSITY(var) \ + { OPTION_CALLBACK, 'v', "verbose", (var), NULL, "be more verbose", \ + PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 }, \ + { OPTION_CALLBACK, 'q', "quiet", (var), NULL, "be more quiet", \ + PARSE_OPT_NOARG, &parse_opt_verbosity_cb, 0 } +#define OPT__DRY_RUN(var) OPT_BOOLEAN('n', "dry-run", (var), "dry run") +#define OPT__ABBREV(var) \ + { OPTION_CALLBACK, 0, "abbrev", (var), "n", \ + "use digits to display SHA-1s", \ + PARSE_OPT_OPTARG, &parse_opt_abbrev_cb, 0 } + +extern const char *parse_options_fix_filename(const char *prefix, const char *file); + +#endif diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c new file mode 100644 index 00000000000..891b612ec1a --- /dev/null +++ b/Documentation/perf_counter/path.c @@ -0,0 +1,392 @@ +/* + * I'm tired of doing "vsnprintf()" etc just to open a + * file, so here's a "return static buffer with printf" + * interface for paths. + * + * It's obviously not thread-safe. Sue me. But it's quite + * useful for doing things like + * + * f = open(mkpath("%s/%s.perf", base, name), O_RDONLY); + * + * which is what it's designed for. + */ +#include "cache.h" + +static char bad_path[] = "/bad-path/"; +/* + * Two hacks: + */ + +static char *get_perf_dir(void) +{ + return "."; +} + +size_t strlcpy(char *dest, const char *src, size_t size) +{ + size_t ret = strlen(src); + + if (size) { + size_t len = (ret >= size) ? size - 1 : ret; + memcpy(dest, src, len); + dest[len] = '\0'; + } + return ret; +} + + +static char *get_pathname(void) +{ + static char pathname_array[4][PATH_MAX]; + static int index; + return pathname_array[3 & ++index]; +} + +static char *cleanup_path(char *path) +{ + /* Clean it up */ + if (!memcmp(path, "./", 2)) { + path += 2; + while (*path == '/') + path++; + } + return path; +} + +char *mksnpath(char *buf, size_t n, const char *fmt, ...) +{ + va_list args; + unsigned len; + + va_start(args, fmt); + len = vsnprintf(buf, n, fmt, args); + va_end(args); + if (len >= n) { + strlcpy(buf, bad_path, n); + return buf; + } + return cleanup_path(buf); +} + +static char *perf_vsnpath(char *buf, size_t n, const char *fmt, va_list args) +{ + const char *perf_dir = get_perf_dir(); + size_t len; + + len = strlen(perf_dir); + if (n < len + 1) + goto bad; + memcpy(buf, perf_dir, len); + if (len && !is_dir_sep(perf_dir[len-1])) + buf[len++] = '/'; + len += vsnprintf(buf + len, n - len, fmt, args); + if (len >= n) + goto bad; + return cleanup_path(buf); +bad: + strlcpy(buf, bad_path, n); + return buf; +} + +char *perf_snpath(char *buf, size_t n, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + (void)perf_vsnpath(buf, n, fmt, args); + va_end(args); + return buf; +} + +char *perf_pathdup(const char *fmt, ...) +{ + char path[PATH_MAX]; + va_list args; + va_start(args, fmt); + (void)perf_vsnpath(path, sizeof(path), fmt, args); + va_end(args); + return xstrdup(path); +} + +char *mkpath(const char *fmt, ...) +{ + va_list args; + unsigned len; + char *pathname = get_pathname(); + + va_start(args, fmt); + len = vsnprintf(pathname, PATH_MAX, fmt, args); + va_end(args); + if (len >= PATH_MAX) + return bad_path; + return cleanup_path(pathname); +} + +char *perf_path(const char *fmt, ...) +{ + const char *perf_dir = get_perf_dir(); + char *pathname = get_pathname(); + va_list args; + unsigned len; + + len = strlen(perf_dir); + if (len > PATH_MAX-100) + return bad_path; + memcpy(pathname, perf_dir, len); + if (len && perf_dir[len-1] != '/') + pathname[len++] = '/'; + va_start(args, fmt); + len += vsnprintf(pathname + len, PATH_MAX - len, fmt, args); + va_end(args); + if (len >= PATH_MAX) + return bad_path; + return cleanup_path(pathname); +} + + +/* perf_mkstemp() - create tmp file honoring TMPDIR variable */ +int perf_mkstemp(char *path, size_t len, const char *template) +{ + const char *tmp; + size_t n; + + tmp = getenv("TMPDIR"); + if (!tmp) + tmp = "/tmp"; + n = snprintf(path, len, "%s/%s", tmp, template); + if (len <= n) { + errno = ENAMETOOLONG; + return -1; + } + return mkstemp(path); +} + + +static char *user_path(char *buf, char *path, int sz) +{ + struct passwd *pw; + char *slash; + int len, baselen; + + if (!path || path[0] != '~') + return NULL; + path++; + slash = strchr(path, '/'); + if (path[0] == '/' || !path[0]) { + pw = getpwuid(getuid()); + } + else { + if (slash) { + *slash = 0; + pw = getpwnam(path); + *slash = '/'; + } + else + pw = getpwnam(path); + } + if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir)) + return NULL; + baselen = strlen(pw->pw_dir); + memcpy(buf, pw->pw_dir, baselen); + while ((1 < baselen) && (buf[baselen-1] == '/')) { + buf[baselen-1] = 0; + baselen--; + } + if (slash && slash[1]) { + len = strlen(slash); + if (sz <= baselen + len) + return NULL; + memcpy(buf + baselen, slash, len + 1); + } + return buf; +} + +const char *make_relative_path(const char *abs, const char *base) +{ + static char buf[PATH_MAX + 1]; + int baselen; + if (!base) + return abs; + baselen = strlen(base); + if (prefixcmp(abs, base)) + return abs; + if (abs[baselen] == '/') + baselen++; + else if (base[baselen - 1] != '/') + return abs; + strcpy(buf, abs + baselen); + return buf; +} + +/* + * It is okay if dst == src, but they should not overlap otherwise. + * + * Performs the following normalizations on src, storing the result in dst: + * - Ensures that components are separated by '/' (Windows only) + * - Squashes sequences of '/'. + * - Removes "." components. + * - Removes ".." components, and the components the precede them. + * Returns failure (non-zero) if a ".." component appears as first path + * component anytime during the normalization. Otherwise, returns success (0). + * + * Note that this function is purely textual. It does not follow symlinks, + * verify the existence of the path, or make any system calls. + */ +int normalize_path_copy(char *dst, const char *src) +{ + char *dst0; + + if (has_dos_drive_prefix(src)) { + *dst++ = *src++; + *dst++ = *src++; + } + dst0 = dst; + + if (is_dir_sep(*src)) { + *dst++ = '/'; + while (is_dir_sep(*src)) + src++; + } + + for (;;) { + char c = *src; + + /* + * A path component that begins with . could be + * special: + * (1) "." and ends -- ignore and terminate. + * (2) "./" -- ignore them, eat slash and continue. + * (3) ".." and ends -- strip one and terminate. + * (4) "../" -- strip one, eat slash and continue. + */ + if (c == '.') { + if (!src[1]) { + /* (1) */ + src++; + } else if (is_dir_sep(src[1])) { + /* (2) */ + src += 2; + while (is_dir_sep(*src)) + src++; + continue; + } else if (src[1] == '.') { + if (!src[2]) { + /* (3) */ + src += 2; + goto up_one; + } else if (is_dir_sep(src[2])) { + /* (4) */ + src += 3; + while (is_dir_sep(*src)) + src++; + goto up_one; + } + } + } + + /* copy up to the next '/', and eat all '/' */ + while ((c = *src++) != '\0' && !is_dir_sep(c)) + *dst++ = c; + if (is_dir_sep(c)) { + *dst++ = '/'; + while (is_dir_sep(c)) + c = *src++; + src--; + } else if (!c) + break; + continue; + + up_one: + /* + * dst0..dst is prefix portion, and dst[-1] is '/'; + * go up one level. + */ + dst--; /* go to trailing '/' */ + if (dst <= dst0) + return -1; + /* Windows: dst[-1] cannot be backslash anymore */ + while (dst0 < dst && dst[-1] != '/') + dst--; + } + *dst = '\0'; + return 0; +} + +/* + * path = Canonical absolute path + * prefix_list = Colon-separated list of absolute paths + * + * Determines, for each path in prefix_list, whether the "prefix" really + * is an ancestor directory of path. Returns the length of the longest + * ancestor directory, excluding any trailing slashes, or -1 if no prefix + * is an ancestor. (Note that this means 0 is returned if prefix_list is + * "/".) "/foo" is not considered an ancestor of "/foobar". Directories + * are not considered to be their own ancestors. path must be in a + * canonical form: empty components, or "." or ".." components are not + * allowed. prefix_list may be null, which is like "". + */ +int longest_ancestor_length(const char *path, const char *prefix_list) +{ + char buf[PATH_MAX+1]; + const char *ceil, *colon; + int len, max_len = -1; + + if (prefix_list == NULL || !strcmp(path, "/")) + return -1; + + for (colon = ceil = prefix_list; *colon; ceil = colon+1) { + for (colon = ceil; *colon && *colon != PATH_SEP; colon++); + len = colon - ceil; + if (len == 0 || len > PATH_MAX || !is_absolute_path(ceil)) + continue; + strlcpy(buf, ceil, len+1); + if (normalize_path_copy(buf, buf) < 0) + continue; + len = strlen(buf); + if (len > 0 && buf[len-1] == '/') + buf[--len] = '\0'; + + if (!strncmp(path, buf, len) && + path[len] == '/' && + len > max_len) { + max_len = len; + } + } + + return max_len; +} + +/* strip arbitrary amount of directory separators at end of path */ +static inline int chomp_trailing_dir_sep(const char *path, int len) +{ + while (len && is_dir_sep(path[len - 1])) + len--; + return len; +} + +/* + * If path ends with suffix (complete path components), returns the + * part before suffix (sans trailing directory separators). + * Otherwise returns NULL. + */ +char *strip_path_suffix(const char *path, const char *suffix) +{ + int path_len = strlen(path), suffix_len = strlen(suffix); + + while (suffix_len) { + if (!path_len) + return NULL; + + if (is_dir_sep(path[path_len - 1])) { + if (!is_dir_sep(suffix[suffix_len - 1])) + return NULL; + path_len = chomp_trailing_dir_sep(path, path_len); + suffix_len = chomp_trailing_dir_sep(suffix, suffix_len); + } + else if (path[--path_len] != suffix[--suffix_len]) + return NULL; + } + + if (path_len && !is_dir_sep(path[path_len - 1])) + return NULL; + return xstrndup(path, chomp_trailing_dir_sep(path, path_len)); +} diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c new file mode 100644 index 00000000000..9256f6a1644 --- /dev/null +++ b/Documentation/perf_counter/perf.c @@ -0,0 +1,411 @@ +#include "builtin.h" +#include "exec_cmd.h" +#include "cache.h" +//#include "quote.h" +#include "run-command.h" + +const char perf_usage_string[] = + "perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]"; + +const char perf_more_info_string[] = + "See 'perf help COMMAND' for more information on a specific command."; + +static int use_pager = -1; +struct pager_config { + const char *cmd; + int val; +}; + +static int pager_command_config(const char *var, const char *value, void *data) +{ + struct pager_config *c = data; + if (!prefixcmp(var, "pager.") && !strcmp(var + 6, c->cmd)) + c->val = perf_config_bool(var, value); + return 0; +} + +/* returns 0 for "no pager", 1 for "use pager", and -1 for "not specified" */ +int check_pager_config(const char *cmd) +{ + struct pager_config c; + c.cmd = cmd; + c.val = -1; + perf_config(pager_command_config, &c); + return c.val; +} + +static void commit_pager_choice(void) { + switch (use_pager) { + case 0: + setenv("PERF_PAGER", "cat", 1); + break; + case 1: + /* setup_pager(); */ + break; + default: + break; + } +} + +static int handle_options(const char*** argv, int* argc, int* envchanged) +{ + int handled = 0; + + while (*argc > 0) { + const char *cmd = (*argv)[0]; + if (cmd[0] != '-') + break; + + /* + * For legacy reasons, the "version" and "help" + * commands can be written with "--" prepended + * to make them look like flags. + */ + if (!strcmp(cmd, "--help") || !strcmp(cmd, "--version")) + break; + + /* + * Check remaining flags. + */ + if (!prefixcmp(cmd, "--exec-path")) { + cmd += 11; + if (*cmd == '=') + perf_set_argv_exec_path(cmd + 1); + else { + puts(perf_exec_path()); + exit(0); + } + } else if (!strcmp(cmd, "--html-path")) { + puts(system_path(PERF_HTML_PATH)); + exit(0); + } else if (!strcmp(cmd, "-p") || !strcmp(cmd, "--paginate")) { + use_pager = 1; + } else if (!strcmp(cmd, "--no-pager")) { + use_pager = 0; + if (envchanged) + *envchanged = 1; + } else if (!strcmp(cmd, "--perf-dir")) { + if (*argc < 2) { + fprintf(stderr, "No directory given for --perf-dir.\n" ); + usage(perf_usage_string); + } + setenv(PERF_DIR_ENVIRONMENT, (*argv)[1], 1); + if (envchanged) + *envchanged = 1; + (*argv)++; + (*argc)--; + handled++; + } else if (!prefixcmp(cmd, "--perf-dir=")) { + setenv(PERF_DIR_ENVIRONMENT, cmd + 10, 1); + if (envchanged) + *envchanged = 1; + } else if (!strcmp(cmd, "--work-tree")) { + if (*argc < 2) { + fprintf(stderr, "No directory given for --work-tree.\n" ); + usage(perf_usage_string); + } + setenv(PERF_WORK_TREE_ENVIRONMENT, (*argv)[1], 1); + if (envchanged) + *envchanged = 1; + (*argv)++; + (*argc)--; + } else if (!prefixcmp(cmd, "--work-tree=")) { + setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1); + if (envchanged) + *envchanged = 1; + } else { + fprintf(stderr, "Unknown option: %s\n", cmd); + usage(perf_usage_string); + } + + (*argv)++; + (*argc)--; + handled++; + } + return handled; +} + +static int handle_alias(int *argcp, const char ***argv) +{ + int envchanged = 0, ret = 0, saved_errno = errno; + int count, option_count; + const char** new_argv; + const char *alias_command; + char *alias_string; + int unused_nonperf; + + alias_command = (*argv)[0]; + alias_string = alias_lookup(alias_command); + if (alias_string) { + if (alias_string[0] == '!') { + if (*argcp > 1) { + struct strbuf buf; + + strbuf_init(&buf, PATH_MAX); + strbuf_addstr(&buf, alias_string); + sq_quote_argv(&buf, (*argv) + 1, PATH_MAX); + free(alias_string); + alias_string = buf.buf; + } + ret = system(alias_string + 1); + if (ret >= 0 && WIFEXITED(ret) && + WEXITSTATUS(ret) != 127) + exit(WEXITSTATUS(ret)); + die("Failed to run '%s' when expanding alias '%s'", + alias_string + 1, alias_command); + } + count = split_cmdline(alias_string, &new_argv); + if (count < 0) + die("Bad alias.%s string", alias_command); + option_count = handle_options(&new_argv, &count, &envchanged); + if (envchanged) + die("alias '%s' changes environment variables\n" + "You can use '!perf' in the alias to do this.", + alias_command); + memmove(new_argv - option_count, new_argv, + count * sizeof(char *)); + new_argv -= option_count; + + if (count < 1) + die("empty alias for %s", alias_command); + + if (!strcmp(alias_command, new_argv[0])) + die("recursive alias: %s", alias_command); + + new_argv = realloc(new_argv, sizeof(char*) * + (count + *argcp + 1)); + /* insert after command name */ + memcpy(new_argv + count, *argv + 1, sizeof(char*) * *argcp); + new_argv[count+*argcp] = NULL; + + *argv = new_argv; + *argcp += count - 1; + + ret = 1; + } + + errno = saved_errno; + + return ret; +} + +const char perf_version_string[] = PERF_VERSION; + +#define RUN_SETUP (1<<0) +#define USE_PAGER (1<<1) +/* + * require working tree to be present -- anything uses this needs + * RUN_SETUP for reading from the configuration file. + */ +#define NEED_WORK_TREE (1<<2) + +struct cmd_struct { + const char *cmd; + int (*fn)(int, const char **, const char *); + int option; +}; + +static int run_builtin(struct cmd_struct *p, int argc, const char **argv) +{ + int status; + struct stat st; + const char *prefix; + + prefix = NULL; + if (p->option & RUN_SETUP) + prefix = NULL; /* setup_perf_directory(); */ + + if (use_pager == -1 && p->option & RUN_SETUP) + use_pager = check_pager_config(p->cmd); + if (use_pager == -1 && p->option & USE_PAGER) + use_pager = 1; + commit_pager_choice(); + + if (p->option & NEED_WORK_TREE) + /* setup_work_tree() */; + + status = p->fn(argc, argv, prefix); + if (status) + return status & 0xff; + + /* Somebody closed stdout? */ + if (fstat(fileno(stdout), &st)) + return 0; + /* Ignore write errors for pipes and sockets.. */ + if (S_ISFIFO(st.st_mode) || S_ISSOCK(st.st_mode)) + return 0; + + /* Check for ENOSPC and EIO errors.. */ + if (fflush(stdout)) + die("write failure on standard output: %s", strerror(errno)); + if (ferror(stdout)) + die("unknown write failure on standard output"); + if (fclose(stdout)) + die("close failed on standard output: %s", strerror(errno)); + return 0; +} + +static void handle_internal_command(int argc, const char **argv) +{ + const char *cmd = argv[0]; + static struct cmd_struct commands[] = { + { "top", cmd_top, 0 }, + }; + int i; + static const char ext[] = STRIP_EXTENSION; + + if (sizeof(ext) > 1) { + i = strlen(argv[0]) - strlen(ext); + if (i > 0 && !strcmp(argv[0] + i, ext)) { + char *argv0 = strdup(argv[0]); + argv[0] = cmd = argv0; + argv0[i] = '\0'; + } + } + + /* Turn "perf cmd --help" into "perf help cmd" */ + if (argc > 1 && !strcmp(argv[1], "--help")) { + argv[1] = argv[0]; + argv[0] = cmd = "help"; + } + + for (i = 0; i < ARRAY_SIZE(commands); i++) { + struct cmd_struct *p = commands+i; + if (strcmp(p->cmd, cmd)) + continue; + exit(run_builtin(p, argc, argv)); + } +} + +static void execv_dashed_external(const char **argv) +{ + struct strbuf cmd = STRBUF_INIT; + const char *tmp; + int status; + + strbuf_addf(&cmd, "perf-%s", argv[0]); + + /* + * argv[0] must be the perf command, but the argv array + * belongs to the caller, and may be reused in + * subsequent loop iterations. Save argv[0] and + * restore it on error. + */ + tmp = argv[0]; + argv[0] = cmd.buf; + + /* + * if we fail because the command is not found, it is + * OK to return. Otherwise, we just pass along the status code. + */ + status = run_command_v_opt(argv, 0); + if (status != -ERR_RUN_COMMAND_EXEC) { + if (IS_RUN_COMMAND_ERR(status)) + die("unable to run '%s'", argv[0]); + exit(-status); + } + errno = ENOENT; /* as if we called execvp */ + + argv[0] = tmp; + + strbuf_release(&cmd); +} + +static int run_argv(int *argcp, const char ***argv) +{ + int done_alias = 0; + + while (1) { + /* See if it's an internal command */ + handle_internal_command(*argcp, *argv); + + /* .. then try the external ones */ + execv_dashed_external(*argv); + + /* It could be an alias -- this works around the insanity + * of overriding "perf log" with "perf show" by having + * alias.log = show + */ + if (done_alias || !handle_alias(argcp, argv)) + break; + done_alias = 1; + } + + return done_alias; +} + + +int main(int argc, const char **argv) +{ + const char *cmd; + + cmd = perf_extract_argv0_path(argv[0]); + if (!cmd) + cmd = "perf-help"; + + /* + * "perf-xxxx" is the same as "perf xxxx", but we obviously: + * + * - cannot take flags in between the "perf" and the "xxxx". + * - cannot execute it externally (since it would just do + * the same thing over again) + * + * So we just directly call the internal command handler, and + * die if that one cannot handle it. + */ + if (!prefixcmp(cmd, "perf-")) { + cmd += 4; + argv[0] = cmd; + handle_internal_command(argc, argv); + die("cannot handle %s internally", cmd); + } + + /* Look for flags.. */ + argv++; + argc--; + handle_options(&argv, &argc, NULL); + commit_pager_choice(); + if (argc > 0) { + if (!prefixcmp(argv[0], "--")) + argv[0] += 2; + } else { + /* The user didn't specify a command; give them help */ + printf("usage: %s\n\n", perf_usage_string); + list_common_cmds_help(); + printf("\n%s\n", perf_more_info_string); + exit(1); + } + cmd = argv[0]; + + /* + * We use PATH to find perf commands, but we prepend some higher + * precidence paths: the "--exec-path" option, the PERF_EXEC_PATH + * environment, and the $(perfexecdir) from the Makefile at build + * time. + */ + setup_path(); + + while (1) { + static int done_help = 0; + static int was_alias = 0; + was_alias = run_argv(&argc, &argv); + if (errno != ENOENT) + break; + if (was_alias) { + fprintf(stderr, "Expansion of alias '%s' failed; " + "'%s' is not a perf-command\n", + cmd, argv[0]); + exit(1); + } + if (!done_help) { + cmd = argv[0] = help_unknown_cmd(cmd); + done_help = 1; + } else + break; + } + + fprintf(stderr, "Failed to run command '%s': %s\n", + cmd, strerror(errno)); + + return 1; +} diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/quote.c new file mode 100644 index 00000000000..7a49fcf6967 --- /dev/null +++ b/Documentation/perf_counter/quote.c @@ -0,0 +1,478 @@ +#include "cache.h" +#include "quote.h" + +int quote_path_fully = 1; + +/* Help to copy the thing properly quoted for the shell safety. + * any single quote is replaced with '\'', any exclamation point + * is replaced with '\!', and the whole thing is enclosed in a + * + * E.g. + * original sq_quote result + * name ==> name ==> 'name' + * a b ==> a b ==> 'a b' + * a'b ==> a'\''b ==> 'a'\''b' + * a!b ==> a'\!'b ==> 'a'\!'b' + */ +static inline int need_bs_quote(char c) +{ + return (c == '\'' || c == '!'); +} + +void sq_quote_buf(struct strbuf *dst, const char *src) +{ + char *to_free = NULL; + + if (dst->buf == src) + to_free = strbuf_detach(dst, NULL); + + strbuf_addch(dst, '\''); + while (*src) { + size_t len = strcspn(src, "'!"); + strbuf_add(dst, src, len); + src += len; + while (need_bs_quote(*src)) { + strbuf_addstr(dst, "'\\"); + strbuf_addch(dst, *src++); + strbuf_addch(dst, '\''); + } + } + strbuf_addch(dst, '\''); + free(to_free); +} + +void sq_quote_print(FILE *stream, const char *src) +{ + char c; + + fputc('\'', stream); + while ((c = *src++)) { + if (need_bs_quote(c)) { + fputs("'\\", stream); + fputc(c, stream); + fputc('\'', stream); + } else { + fputc(c, stream); + } + } + fputc('\'', stream); +} + +void sq_quote_argv(struct strbuf *dst, const char** argv, size_t maxlen) +{ + int i; + + /* Copy into destination buffer. */ + strbuf_grow(dst, 255); + for (i = 0; argv[i]; ++i) { + strbuf_addch(dst, ' '); + sq_quote_buf(dst, argv[i]); + if (maxlen && dst->len > maxlen) + die("Too many or long arguments"); + } +} + +char *sq_dequote_step(char *arg, char **next) +{ + char *dst = arg; + char *src = arg; + char c; + + if (*src != '\'') + return NULL; + for (;;) { + c = *++src; + if (!c) + return NULL; + if (c != '\'') { + *dst++ = c; + continue; + } + /* We stepped out of sq */ + switch (*++src) { + case '\0': + *dst = 0; + if (next) + *next = NULL; + return arg; + case '\\': + c = *++src; + if (need_bs_quote(c) && *++src == '\'') { + *dst++ = c; + continue; + } + /* Fallthrough */ + default: + if (!next || !isspace(*src)) + return NULL; + do { + c = *++src; + } while (isspace(c)); + *dst = 0; + *next = src; + return arg; + } + } +} + +char *sq_dequote(char *arg) +{ + return sq_dequote_step(arg, NULL); +} + +int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc) +{ + char *next = arg; + + if (!*arg) + return 0; + do { + char *dequoted = sq_dequote_step(next, &next); + if (!dequoted) + return -1; + ALLOC_GROW(*argv, *nr + 1, *alloc); + (*argv)[(*nr)++] = dequoted; + } while (next); + + return 0; +} + +/* 1 means: quote as octal + * 0 means: quote as octal if (quote_path_fully) + * -1 means: never quote + * c: quote as "\\c" + */ +#define X8(x) x, x, x, x, x, x, x, x +#define X16(x) X8(x), X8(x) +static signed char const sq_lookup[256] = { + /* 0 1 2 3 4 5 6 7 */ + /* 0x00 */ 1, 1, 1, 1, 1, 1, 1, 'a', + /* 0x08 */ 'b', 't', 'n', 'v', 'f', 'r', 1, 1, + /* 0x10 */ X16(1), + /* 0x20 */ -1, -1, '"', -1, -1, -1, -1, -1, + /* 0x28 */ X16(-1), X16(-1), X16(-1), + /* 0x58 */ -1, -1, -1, -1,'\\', -1, -1, -1, + /* 0x60 */ X16(-1), X8(-1), + /* 0x78 */ -1, -1, -1, -1, -1, -1, -1, 1, + /* 0x80 */ /* set to 0 */ +}; + +static inline int sq_must_quote(char c) +{ + return sq_lookup[(unsigned char)c] + quote_path_fully > 0; +} + +/* returns the longest prefix not needing a quote up to maxlen if positive. + This stops at the first \0 because it's marked as a character needing an + escape */ +static size_t next_quote_pos(const char *s, ssize_t maxlen) +{ + size_t len; + if (maxlen < 0) { + for (len = 0; !sq_must_quote(s[len]); len++); + } else { + for (len = 0; len < maxlen && !sq_must_quote(s[len]); len++); + } + return len; +} + +/* + * C-style name quoting. + * + * (1) if sb and fp are both NULL, inspect the input name and counts the + * number of bytes that are needed to hold c_style quoted version of name, + * counting the double quotes around it but not terminating NUL, and + * returns it. + * However, if name does not need c_style quoting, it returns 0. + * + * (2) if sb or fp are not NULL, it emits the c_style quoted version + * of name, enclosed with double quotes if asked and needed only. + * Return value is the same as in (1). + */ +static size_t quote_c_style_counted(const char *name, ssize_t maxlen, + struct strbuf *sb, FILE *fp, int no_dq) +{ +#undef EMIT +#define EMIT(c) \ + do { \ + if (sb) strbuf_addch(sb, (c)); \ + if (fp) fputc((c), fp); \ + count++; \ + } while (0) +#define EMITBUF(s, l) \ + do { \ + if (sb) strbuf_add(sb, (s), (l)); \ + if (fp) fwrite((s), (l), 1, fp); \ + count += (l); \ + } while (0) + + size_t len, count = 0; + const char *p = name; + + for (;;) { + int ch; + + len = next_quote_pos(p, maxlen); + if (len == maxlen || !p[len]) + break; + + if (!no_dq && p == name) + EMIT('"'); + + EMITBUF(p, len); + EMIT('\\'); + p += len; + ch = (unsigned char)*p++; + if (sq_lookup[ch] >= ' ') { + EMIT(sq_lookup[ch]); + } else { + EMIT(((ch >> 6) & 03) + '0'); + EMIT(((ch >> 3) & 07) + '0'); + EMIT(((ch >> 0) & 07) + '0'); + } + } + + EMITBUF(p, len); + if (p == name) /* no ending quote needed */ + return 0; + + if (!no_dq) + EMIT('"'); + return count; +} + +size_t quote_c_style(const char *name, struct strbuf *sb, FILE *fp, int nodq) +{ + return quote_c_style_counted(name, -1, sb, fp, nodq); +} + +void quote_two_c_style(struct strbuf *sb, const char *prefix, const char *path, int nodq) +{ + if (quote_c_style(prefix, NULL, NULL, 0) || + quote_c_style(path, NULL, NULL, 0)) { + if (!nodq) + strbuf_addch(sb, '"'); + quote_c_style(prefix, sb, NULL, 1); + quote_c_style(path, sb, NULL, 1); + if (!nodq) + strbuf_addch(sb, '"'); + } else { + strbuf_addstr(sb, prefix); + strbuf_addstr(sb, path); + } +} + +void write_name_quoted(const char *name, FILE *fp, int terminator) +{ + if (terminator) { + quote_c_style(name, NULL, fp, 0); + } else { + fputs(name, fp); + } + fputc(terminator, fp); +} + +extern void write_name_quotedpfx(const char *pfx, size_t pfxlen, + const char *name, FILE *fp, int terminator) +{ + int needquote = 0; + + if (terminator) { + needquote = next_quote_pos(pfx, pfxlen) < pfxlen + || name[next_quote_pos(name, -1)]; + } + if (needquote) { + fputc('"', fp); + quote_c_style_counted(pfx, pfxlen, NULL, fp, 1); + quote_c_style(name, NULL, fp, 1); + fputc('"', fp); + } else { + fwrite(pfx, pfxlen, 1, fp); + fputs(name, fp); + } + fputc(terminator, fp); +} + +/* quote path as relative to the given prefix */ +char *quote_path_relative(const char *in, int len, + struct strbuf *out, const char *prefix) +{ + int needquote; + + if (len < 0) + len = strlen(in); + + /* "../" prefix itself does not need quoting, but "in" might. */ + needquote = next_quote_pos(in, len) < len; + strbuf_setlen(out, 0); + strbuf_grow(out, len); + + if (needquote) + strbuf_addch(out, '"'); + if (prefix) { + int off = 0; + while (prefix[off] && off < len && prefix[off] == in[off]) + if (prefix[off] == '/') { + prefix += off + 1; + in += off + 1; + len -= off + 1; + off = 0; + } else + off++; + + for (; *prefix; prefix++) + if (*prefix == '/') + strbuf_addstr(out, "../"); + } + + quote_c_style_counted (in, len, out, NULL, 1); + + if (needquote) + strbuf_addch(out, '"'); + if (!out->len) + strbuf_addstr(out, "./"); + + return out->buf; +} + +/* + * C-style name unquoting. + * + * Quoted should point at the opening double quote. + * + Returns 0 if it was able to unquote the string properly, and appends the + * result in the strbuf `sb'. + * + Returns -1 in case of error, and doesn't touch the strbuf. Though note + * that this function will allocate memory in the strbuf, so calling + * strbuf_release is mandatory whichever result unquote_c_style returns. + * + * Updates endp pointer to point at one past the ending double quote if given. + */ +int unquote_c_style(struct strbuf *sb, const char *quoted, const char **endp) +{ + size_t oldlen = sb->len, len; + int ch, ac; + + if (*quoted++ != '"') + return -1; + + for (;;) { + len = strcspn(quoted, "\"\\"); + strbuf_add(sb, quoted, len); + quoted += len; + + switch (*quoted++) { + case '"': + if (endp) + *endp = quoted; + return 0; + case '\\': + break; + default: + goto error; + } + + switch ((ch = *quoted++)) { + case 'a': ch = '\a'; break; + case 'b': ch = '\b'; break; + case 'f': ch = '\f'; break; + case 'n': ch = '\n'; break; + case 'r': ch = '\r'; break; + case 't': ch = '\t'; break; + case 'v': ch = '\v'; break; + + case '\\': case '"': + break; /* verbatim */ + + /* octal values with first digit over 4 overflow */ + case '0': case '1': case '2': case '3': + ac = ((ch - '0') << 6); + if ((ch = *quoted++) < '0' || '7' < ch) + goto error; + ac |= ((ch - '0') << 3); + if ((ch = *quoted++) < '0' || '7' < ch) + goto error; + ac |= (ch - '0'); + ch = ac; + break; + default: + goto error; + } + strbuf_addch(sb, ch); + } + + error: + strbuf_setlen(sb, oldlen); + return -1; +} + +/* quoting as a string literal for other languages */ + +void perl_quote_print(FILE *stream, const char *src) +{ + const char sq = '\''; + const char bq = '\\'; + char c; + + fputc(sq, stream); + while ((c = *src++)) { + if (c == sq || c == bq) + fputc(bq, stream); + fputc(c, stream); + } + fputc(sq, stream); +} + +void python_quote_print(FILE *stream, const char *src) +{ + const char sq = '\''; + const char bq = '\\'; + const char nl = '\n'; + char c; + + fputc(sq, stream); + while ((c = *src++)) { + if (c == nl) { + fputc(bq, stream); + fputc('n', stream); + continue; + } + if (c == sq || c == bq) + fputc(bq, stream); + fputc(c, stream); + } + fputc(sq, stream); +} + +void tcl_quote_print(FILE *stream, const char *src) +{ + char c; + + fputc('"', stream); + while ((c = *src++)) { + switch (c) { + case '[': case ']': + case '{': case '}': + case '$': case '\\': case '"': + fputc('\\', stream); + default: + fputc(c, stream); + break; + case '\f': + fputs("\\f", stream); + break; + case '\r': + fputs("\\r", stream); + break; + case '\n': + fputs("\\n", stream); + break; + case '\t': + fputs("\\t", stream); + break; + case '\v': + fputs("\\v", stream); + break; + } + } + fputc('"', stream); +} diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/quote.h new file mode 100644 index 00000000000..66730f2bff3 --- /dev/null +++ b/Documentation/perf_counter/quote.h @@ -0,0 +1,68 @@ +#ifndef QUOTE_H +#define QUOTE_H + +#include +#include + +/* Help to copy the thing properly quoted for the shell safety. + * any single quote is replaced with '\'', any exclamation point + * is replaced with '\!', and the whole thing is enclosed in a + * single quote pair. + * + * For example, if you are passing the result to system() as an + * argument: + * + * sprintf(cmd, "foobar %s %s", sq_quote(arg0), sq_quote(arg1)) + * + * would be appropriate. If the system() is going to call ssh to + * run the command on the other side: + * + * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1)); + * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd)); + * + * Note that the above examples leak memory! Remember to free result from + * sq_quote() in a real application. + * + * sq_quote_buf() writes to an existing buffer of specified size; it + * will return the number of characters that would have been written + * excluding the final null regardless of the buffer size. + */ + +extern void sq_quote_print(FILE *stream, const char *src); + +extern void sq_quote_buf(struct strbuf *, const char *src); +extern void sq_quote_argv(struct strbuf *, const char **argv, size_t maxlen); + +/* This unwraps what sq_quote() produces in place, but returns + * NULL if the input does not look like what sq_quote would have + * produced. + */ +extern char *sq_dequote(char *); + +/* + * Same as the above, but can be used to unwrap many arguments in the + * same string separated by space. "next" is changed to point to the + * next argument that should be passed as first parameter. When there + * is no more argument to be dequoted, "next" is updated to point to NULL. + */ +extern char *sq_dequote_step(char *arg, char **next); +extern int sq_dequote_to_argv(char *arg, const char ***argv, int *nr, int *alloc); + +extern int unquote_c_style(struct strbuf *, const char *quoted, const char **endp); +extern size_t quote_c_style(const char *name, struct strbuf *, FILE *, int no_dq); +extern void quote_two_c_style(struct strbuf *, const char *, const char *, int); + +extern void write_name_quoted(const char *name, FILE *, int terminator); +extern void write_name_quotedpfx(const char *pfx, size_t pfxlen, + const char *name, FILE *, int terminator); + +/* quote path as relative to the given prefix */ +char *quote_path_relative(const char *in, int len, + struct strbuf *out, const char *prefix); + +/* quoting as a string literal for other languages */ +extern void perl_quote_print(FILE *stream, const char *src); +extern void python_quote_print(FILE *stream, const char *src); +extern void tcl_quote_print(FILE *stream, const char *src); + +#endif diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/run-command.c new file mode 100644 index 00000000000..b2f5e854f40 --- /dev/null +++ b/Documentation/perf_counter/run-command.c @@ -0,0 +1,395 @@ +#include "cache.h" +#include "run-command.h" +#include "exec_cmd.h" + +static inline void close_pair(int fd[2]) +{ + close(fd[0]); + close(fd[1]); +} + +static inline void dup_devnull(int to) +{ + int fd = open("/dev/null", O_RDWR); + dup2(fd, to); + close(fd); +} + +int start_command(struct child_process *cmd) +{ + int need_in, need_out, need_err; + int fdin[2], fdout[2], fderr[2]; + + /* + * In case of errors we must keep the promise to close FDs + * that have been passed in via ->in and ->out. + */ + + need_in = !cmd->no_stdin && cmd->in < 0; + if (need_in) { + if (pipe(fdin) < 0) { + if (cmd->out > 0) + close(cmd->out); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->in = fdin[1]; + } + + need_out = !cmd->no_stdout + && !cmd->stdout_to_stderr + && cmd->out < 0; + if (need_out) { + if (pipe(fdout) < 0) { + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->out = fdout[0]; + } + + need_err = !cmd->no_stderr && cmd->err < 0; + if (need_err) { + if (pipe(fderr) < 0) { + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + if (need_out) + close_pair(fdout); + else if (cmd->out) + close(cmd->out); + return -ERR_RUN_COMMAND_PIPE; + } + cmd->err = fderr[0]; + } + +#ifndef __MINGW32__ + fflush(NULL); + cmd->pid = fork(); + if (!cmd->pid) { + if (cmd->no_stdin) + dup_devnull(0); + else if (need_in) { + dup2(fdin[0], 0); + close_pair(fdin); + } else if (cmd->in) { + dup2(cmd->in, 0); + close(cmd->in); + } + + if (cmd->no_stderr) + dup_devnull(2); + else if (need_err) { + dup2(fderr[1], 2); + close_pair(fderr); + } + + if (cmd->no_stdout) + dup_devnull(1); + else if (cmd->stdout_to_stderr) + dup2(2, 1); + else if (need_out) { + dup2(fdout[1], 1); + close_pair(fdout); + } else if (cmd->out > 1) { + dup2(cmd->out, 1); + close(cmd->out); + } + + if (cmd->dir && chdir(cmd->dir)) + die("exec %s: cd to %s failed (%s)", cmd->argv[0], + cmd->dir, strerror(errno)); + if (cmd->env) { + for (; *cmd->env; cmd->env++) { + if (strchr(*cmd->env, '=')) + putenv((char*)*cmd->env); + else + unsetenv(*cmd->env); + } + } + if (cmd->preexec_cb) + cmd->preexec_cb(); + if (cmd->perf_cmd) { + execv_perf_cmd(cmd->argv); + } else { + execvp(cmd->argv[0], (char *const*) cmd->argv); + } + exit(127); + } +#else + int s0 = -1, s1 = -1, s2 = -1; /* backups of stdin, stdout, stderr */ + const char **sargv = cmd->argv; + char **env = environ; + + if (cmd->no_stdin) { + s0 = dup(0); + dup_devnull(0); + } else if (need_in) { + s0 = dup(0); + dup2(fdin[0], 0); + } else if (cmd->in) { + s0 = dup(0); + dup2(cmd->in, 0); + } + + if (cmd->no_stderr) { + s2 = dup(2); + dup_devnull(2); + } else if (need_err) { + s2 = dup(2); + dup2(fderr[1], 2); + } + + if (cmd->no_stdout) { + s1 = dup(1); + dup_devnull(1); + } else if (cmd->stdout_to_stderr) { + s1 = dup(1); + dup2(2, 1); + } else if (need_out) { + s1 = dup(1); + dup2(fdout[1], 1); + } else if (cmd->out > 1) { + s1 = dup(1); + dup2(cmd->out, 1); + } + + if (cmd->dir) + die("chdir in start_command() not implemented"); + if (cmd->env) { + env = copy_environ(); + for (; *cmd->env; cmd->env++) + env = env_setenv(env, *cmd->env); + } + + if (cmd->perf_cmd) { + cmd->argv = prepare_perf_cmd(cmd->argv); + } + + cmd->pid = mingw_spawnvpe(cmd->argv[0], cmd->argv, env); + + if (cmd->env) + free_environ(env); + if (cmd->perf_cmd) + free(cmd->argv); + + cmd->argv = sargv; + if (s0 >= 0) + dup2(s0, 0), close(s0); + if (s1 >= 0) + dup2(s1, 1), close(s1); + if (s2 >= 0) + dup2(s2, 2), close(s2); +#endif + + if (cmd->pid < 0) { + int err = errno; + if (need_in) + close_pair(fdin); + else if (cmd->in) + close(cmd->in); + if (need_out) + close_pair(fdout); + else if (cmd->out) + close(cmd->out); + if (need_err) + close_pair(fderr); + return err == ENOENT ? + -ERR_RUN_COMMAND_EXEC : + -ERR_RUN_COMMAND_FORK; + } + + if (need_in) + close(fdin[0]); + else if (cmd->in) + close(cmd->in); + + if (need_out) + close(fdout[1]); + else if (cmd->out) + close(cmd->out); + + if (need_err) + close(fderr[1]); + + return 0; +} + +static int wait_or_whine(pid_t pid) +{ + for (;;) { + int status, code; + pid_t waiting = waitpid(pid, &status, 0); + + if (waiting < 0) { + if (errno == EINTR) + continue; + error("waitpid failed (%s)", strerror(errno)); + return -ERR_RUN_COMMAND_WAITPID; + } + if (waiting != pid) + return -ERR_RUN_COMMAND_WAITPID_WRONG_PID; + if (WIFSIGNALED(status)) + return -ERR_RUN_COMMAND_WAITPID_SIGNAL; + + if (!WIFEXITED(status)) + return -ERR_RUN_COMMAND_WAITPID_NOEXIT; + code = WEXITSTATUS(status); + switch (code) { + case 127: + return -ERR_RUN_COMMAND_EXEC; + case 0: + return 0; + default: + return -code; + } + } +} + +int finish_command(struct child_process *cmd) +{ + return wait_or_whine(cmd->pid); +} + +int run_command(struct child_process *cmd) +{ + int code = start_command(cmd); + if (code) + return code; + return finish_command(cmd); +} + +static void prepare_run_command_v_opt(struct child_process *cmd, + const char **argv, + int opt) +{ + memset(cmd, 0, sizeof(*cmd)); + cmd->argv = argv; + cmd->no_stdin = opt & RUN_COMMAND_NO_STDIN ? 1 : 0; + cmd->perf_cmd = opt & RUN_PERF_CMD ? 1 : 0; + cmd->stdout_to_stderr = opt & RUN_COMMAND_STDOUT_TO_STDERR ? 1 : 0; +} + +int run_command_v_opt(const char **argv, int opt) +{ + struct child_process cmd; + prepare_run_command_v_opt(&cmd, argv, opt); + return run_command(&cmd); +} + +int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env) +{ + struct child_process cmd; + prepare_run_command_v_opt(&cmd, argv, opt); + cmd.dir = dir; + cmd.env = env; + return run_command(&cmd); +} + +#ifdef __MINGW32__ +static __stdcall unsigned run_thread(void *data) +{ + struct async *async = data; + return async->proc(async->fd_for_proc, async->data); +} +#endif + +int start_async(struct async *async) +{ + int pipe_out[2]; + + if (pipe(pipe_out) < 0) + return error("cannot create pipe: %s", strerror(errno)); + async->out = pipe_out[0]; + +#ifndef __MINGW32__ + /* Flush stdio before fork() to avoid cloning buffers */ + fflush(NULL); + + async->pid = fork(); + if (async->pid < 0) { + error("fork (async) failed: %s", strerror(errno)); + close_pair(pipe_out); + return -1; + } + if (!async->pid) { + close(pipe_out[0]); + exit(!!async->proc(pipe_out[1], async->data)); + } + close(pipe_out[1]); +#else + async->fd_for_proc = pipe_out[1]; + async->tid = (HANDLE) _beginthreadex(NULL, 0, run_thread, async, 0, NULL); + if (!async->tid) { + error("cannot create thread: %s", strerror(errno)); + close_pair(pipe_out); + return -1; + } +#endif + return 0; +} + +int finish_async(struct async *async) +{ +#ifndef __MINGW32__ + int ret = 0; + + if (wait_or_whine(async->pid)) + ret = error("waitpid (async) failed"); +#else + DWORD ret = 0; + if (WaitForSingleObject(async->tid, INFINITE) != WAIT_OBJECT_0) + ret = error("waiting for thread failed: %lu", GetLastError()); + else if (!GetExitCodeThread(async->tid, &ret)) + ret = error("cannot get thread exit code: %lu", GetLastError()); + CloseHandle(async->tid); +#endif + return ret; +} + +int run_hook(const char *index_file, const char *name, ...) +{ + struct child_process hook; + const char **argv = NULL, *env[2]; + char index[PATH_MAX]; + va_list args; + int ret; + size_t i = 0, alloc = 0; + + if (access(perf_path("hooks/%s", name), X_OK) < 0) + return 0; + + va_start(args, name); + ALLOC_GROW(argv, i + 1, alloc); + argv[i++] = perf_path("hooks/%s", name); + while (argv[i-1]) { + ALLOC_GROW(argv, i + 1, alloc); + argv[i++] = va_arg(args, const char *); + } + va_end(args); + + memset(&hook, 0, sizeof(hook)); + hook.argv = argv; + hook.no_stdin = 1; + hook.stdout_to_stderr = 1; + if (index_file) { + snprintf(index, sizeof(index), "PERF_INDEX_FILE=%s", index_file); + env[0] = index; + env[1] = NULL; + hook.env = env; + } + + ret = start_command(&hook); + free(argv); + if (ret) { + warning("Could not spawn %s", argv[0]); + return ret; + } + ret = finish_command(&hook); + if (ret == -ERR_RUN_COMMAND_WAITPID_SIGNAL) + warning("%s exited due to uncaught signal", argv[0]); + + return ret; +} diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/run-command.h new file mode 100644 index 00000000000..328289f2366 --- /dev/null +++ b/Documentation/perf_counter/run-command.h @@ -0,0 +1,93 @@ +#ifndef RUN_COMMAND_H +#define RUN_COMMAND_H + +enum { + ERR_RUN_COMMAND_FORK = 10000, + ERR_RUN_COMMAND_EXEC, + ERR_RUN_COMMAND_PIPE, + ERR_RUN_COMMAND_WAITPID, + ERR_RUN_COMMAND_WAITPID_WRONG_PID, + ERR_RUN_COMMAND_WAITPID_SIGNAL, + ERR_RUN_COMMAND_WAITPID_NOEXIT, +}; +#define IS_RUN_COMMAND_ERR(x) (-(x) >= ERR_RUN_COMMAND_FORK) + +struct child_process { + const char **argv; + pid_t pid; + /* + * Using .in, .out, .err: + * - Specify 0 for no redirections (child inherits stdin, stdout, + * stderr from parent). + * - Specify -1 to have a pipe allocated as follows: + * .in: returns the writable pipe end; parent writes to it, + * the readable pipe end becomes child's stdin + * .out, .err: returns the readable pipe end; parent reads from + * it, the writable pipe end becomes child's stdout/stderr + * The caller of start_command() must close the returned FDs + * after it has completed reading from/writing to it! + * - Specify > 0 to set a channel to a particular FD as follows: + * .in: a readable FD, becomes child's stdin + * .out: a writable FD, becomes child's stdout/stderr + * .err > 0 not supported + * The specified FD is closed by start_command(), even in case + * of errors! + */ + int in; + int out; + int err; + const char *dir; + const char *const *env; + unsigned no_stdin:1; + unsigned no_stdout:1; + unsigned no_stderr:1; + unsigned perf_cmd:1; /* if this is to be perf sub-command */ + unsigned stdout_to_stderr:1; + void (*preexec_cb)(void); +}; + +int start_command(struct child_process *); +int finish_command(struct child_process *); +int run_command(struct child_process *); + +extern int run_hook(const char *index_file, const char *name, ...); + +#define RUN_COMMAND_NO_STDIN 1 +#define RUN_PERF_CMD 2 /*If this is to be perf sub-command */ +#define RUN_COMMAND_STDOUT_TO_STDERR 4 +int run_command_v_opt(const char **argv, int opt); + +/* + * env (the environment) is to be formatted like environ: "VAR=VALUE". + * To unset an environment variable use just "VAR". + */ +int run_command_v_opt_cd_env(const char **argv, int opt, const char *dir, const char *const *env); + +/* + * The purpose of the following functions is to feed a pipe by running + * a function asynchronously and providing output that the caller reads. + * + * It is expected that no synchronization and mutual exclusion between + * the caller and the feed function is necessary so that the function + * can run in a thread without interfering with the caller. + */ +struct async { + /* + * proc writes to fd and closes it; + * returns 0 on success, non-zero on failure + */ + int (*proc)(int fd, void *data); + void *data; + int out; /* caller reads from here and closes it */ +#ifndef __MINGW32__ + pid_t pid; +#else + HANDLE tid; + int fd_for_proc; +#endif +}; + +int start_async(struct async *async); +int finish_async(struct async *async); + +#endif diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/strbuf.c new file mode 100644 index 00000000000..eaba0930680 --- /dev/null +++ b/Documentation/perf_counter/strbuf.c @@ -0,0 +1,359 @@ +#include "cache.h" + +int prefixcmp(const char *str, const char *prefix) +{ + for (; ; str++, prefix++) + if (!*prefix) + return 0; + else if (*str != *prefix) + return (unsigned char)*prefix - (unsigned char)*str; +} + +/* + * Used as the default ->buf value, so that people can always assume + * buf is non NULL and ->buf is NUL terminated even for a freshly + * initialized strbuf. + */ +char strbuf_slopbuf[1]; + +void strbuf_init(struct strbuf *sb, size_t hint) +{ + sb->alloc = sb->len = 0; + sb->buf = strbuf_slopbuf; + if (hint) + strbuf_grow(sb, hint); +} + +void strbuf_release(struct strbuf *sb) +{ + if (sb->alloc) { + free(sb->buf); + strbuf_init(sb, 0); + } +} + +char *strbuf_detach(struct strbuf *sb, size_t *sz) +{ + char *res = sb->alloc ? sb->buf : NULL; + if (sz) + *sz = sb->len; + strbuf_init(sb, 0); + return res; +} + +void strbuf_attach(struct strbuf *sb, void *buf, size_t len, size_t alloc) +{ + strbuf_release(sb); + sb->buf = buf; + sb->len = len; + sb->alloc = alloc; + strbuf_grow(sb, 0); + sb->buf[sb->len] = '\0'; +} + +void strbuf_grow(struct strbuf *sb, size_t extra) +{ + if (sb->len + extra + 1 <= sb->len) + die("you want to use way too much memory"); + if (!sb->alloc) + sb->buf = NULL; + ALLOC_GROW(sb->buf, sb->len + extra + 1, sb->alloc); +} + +void strbuf_trim(struct strbuf *sb) +{ + char *b = sb->buf; + while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) + sb->len--; + while (sb->len > 0 && isspace(*b)) { + b++; + sb->len--; + } + memmove(sb->buf, b, sb->len); + sb->buf[sb->len] = '\0'; +} +void strbuf_rtrim(struct strbuf *sb) +{ + while (sb->len > 0 && isspace((unsigned char)sb->buf[sb->len - 1])) + sb->len--; + sb->buf[sb->len] = '\0'; +} + +void strbuf_ltrim(struct strbuf *sb) +{ + char *b = sb->buf; + while (sb->len > 0 && isspace(*b)) { + b++; + sb->len--; + } + memmove(sb->buf, b, sb->len); + sb->buf[sb->len] = '\0'; +} + +void strbuf_tolower(struct strbuf *sb) +{ + int i; + for (i = 0; i < sb->len; i++) + sb->buf[i] = tolower(sb->buf[i]); +} + +struct strbuf **strbuf_split(const struct strbuf *sb, int delim) +{ + int alloc = 2, pos = 0; + char *n, *p; + struct strbuf **ret; + struct strbuf *t; + + ret = calloc(alloc, sizeof(struct strbuf *)); + p = n = sb->buf; + while (n < sb->buf + sb->len) { + int len; + n = memchr(n, delim, sb->len - (n - sb->buf)); + if (pos + 1 >= alloc) { + alloc = alloc * 2; + ret = realloc(ret, sizeof(struct strbuf *) * alloc); + } + if (!n) + n = sb->buf + sb->len - 1; + len = n - p + 1; + t = malloc(sizeof(struct strbuf)); + strbuf_init(t, len); + strbuf_add(t, p, len); + ret[pos] = t; + ret[++pos] = NULL; + p = ++n; + } + return ret; +} + +void strbuf_list_free(struct strbuf **sbs) +{ + struct strbuf **s = sbs; + + while (*s) { + strbuf_release(*s); + free(*s++); + } + free(sbs); +} + +int strbuf_cmp(const struct strbuf *a, const struct strbuf *b) +{ + int len = a->len < b->len ? a->len: b->len; + int cmp = memcmp(a->buf, b->buf, len); + if (cmp) + return cmp; + return a->len < b->len ? -1: a->len != b->len; +} + +void strbuf_splice(struct strbuf *sb, size_t pos, size_t len, + const void *data, size_t dlen) +{ + if (pos + len < pos) + die("you want to use way too much memory"); + if (pos > sb->len) + die("`pos' is too far after the end of the buffer"); + if (pos + len > sb->len) + die("`pos + len' is too far after the end of the buffer"); + + if (dlen >= len) + strbuf_grow(sb, dlen - len); + memmove(sb->buf + pos + dlen, + sb->buf + pos + len, + sb->len - pos - len); + memcpy(sb->buf + pos, data, dlen); + strbuf_setlen(sb, sb->len + dlen - len); +} + +void strbuf_insert(struct strbuf *sb, size_t pos, const void *data, size_t len) +{ + strbuf_splice(sb, pos, 0, data, len); +} + +void strbuf_remove(struct strbuf *sb, size_t pos, size_t len) +{ + strbuf_splice(sb, pos, len, NULL, 0); +} + +void strbuf_add(struct strbuf *sb, const void *data, size_t len) +{ + strbuf_grow(sb, len); + memcpy(sb->buf + sb->len, data, len); + strbuf_setlen(sb, sb->len + len); +} + +void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len) +{ + strbuf_grow(sb, len); + memcpy(sb->buf + sb->len, sb->buf + pos, len); + strbuf_setlen(sb, sb->len + len); +} + +void strbuf_addf(struct strbuf *sb, const char *fmt, ...) +{ + int len; + va_list ap; + + if (!strbuf_avail(sb)) + strbuf_grow(sb, 64); + va_start(ap, fmt); + len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); + va_end(ap); + if (len < 0) + die("your vsnprintf is broken"); + if (len > strbuf_avail(sb)) { + strbuf_grow(sb, len); + va_start(ap, fmt); + len = vsnprintf(sb->buf + sb->len, sb->alloc - sb->len, fmt, ap); + va_end(ap); + if (len > strbuf_avail(sb)) { + die("this should not happen, your snprintf is broken"); + } + } + strbuf_setlen(sb, sb->len + len); +} + +void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, + void *context) +{ + for (;;) { + const char *percent; + size_t consumed; + + percent = strchrnul(format, '%'); + strbuf_add(sb, format, percent - format); + if (!*percent) + break; + format = percent + 1; + + consumed = fn(sb, format, context); + if (consumed) + format += consumed; + else + strbuf_addch(sb, '%'); + } +} + +size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, + void *context) +{ + struct strbuf_expand_dict_entry *e = context; + size_t len; + + for (; e->placeholder && (len = strlen(e->placeholder)); e++) { + if (!strncmp(placeholder, e->placeholder, len)) { + if (e->value) + strbuf_addstr(sb, e->value); + return len; + } + } + return 0; +} + +size_t strbuf_fread(struct strbuf *sb, size_t size, FILE *f) +{ + size_t res; + size_t oldalloc = sb->alloc; + + strbuf_grow(sb, size); + res = fread(sb->buf + sb->len, 1, size, f); + if (res > 0) + strbuf_setlen(sb, sb->len + res); + else if (res < 0 && oldalloc == 0) + strbuf_release(sb); + return res; +} + +ssize_t strbuf_read(struct strbuf *sb, int fd, size_t hint) +{ + size_t oldlen = sb->len; + size_t oldalloc = sb->alloc; + + strbuf_grow(sb, hint ? hint : 8192); + for (;;) { + ssize_t cnt; + + cnt = read(fd, sb->buf + sb->len, sb->alloc - sb->len - 1); + if (cnt < 0) { + if (oldalloc == 0) + strbuf_release(sb); + else + strbuf_setlen(sb, oldlen); + return -1; + } + if (!cnt) + break; + sb->len += cnt; + strbuf_grow(sb, 8192); + } + + sb->buf[sb->len] = '\0'; + return sb->len - oldlen; +} + +#define STRBUF_MAXLINK (2*PATH_MAX) + +int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint) +{ + size_t oldalloc = sb->alloc; + + if (hint < 32) + hint = 32; + + while (hint < STRBUF_MAXLINK) { + int len; + + strbuf_grow(sb, hint); + len = readlink(path, sb->buf, hint); + if (len < 0) { + if (errno != ERANGE) + break; + } else if (len < hint) { + strbuf_setlen(sb, len); + return 0; + } + + /* .. the buffer was too small - try again */ + hint *= 2; + } + if (oldalloc == 0) + strbuf_release(sb); + return -1; +} + +int strbuf_getline(struct strbuf *sb, FILE *fp, int term) +{ + int ch; + + strbuf_grow(sb, 0); + if (feof(fp)) + return EOF; + + strbuf_reset(sb); + while ((ch = fgetc(fp)) != EOF) { + if (ch == term) + break; + strbuf_grow(sb, 1); + sb->buf[sb->len++] = ch; + } + if (ch == EOF && sb->len == 0) + return EOF; + + sb->buf[sb->len] = '\0'; + return 0; +} + +int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint) +{ + int fd, len; + + fd = open(path, O_RDONLY); + if (fd < 0) + return -1; + len = strbuf_read(sb, fd, hint); + close(fd); + if (len < 0) + return -1; + + return len; +} diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/strbuf.h new file mode 100644 index 00000000000..9ee908a3ec5 --- /dev/null +++ b/Documentation/perf_counter/strbuf.h @@ -0,0 +1,137 @@ +#ifndef STRBUF_H +#define STRBUF_H + +/* + * Strbuf's can be use in many ways: as a byte array, or to store arbitrary + * long, overflow safe strings. + * + * Strbufs has some invariants that are very important to keep in mind: + * + * 1. the ->buf member is always malloc-ed, hence strbuf's can be used to + * build complex strings/buffers whose final size isn't easily known. + * + * It is NOT legal to copy the ->buf pointer away. + * `strbuf_detach' is the operation that detachs a buffer from its shell + * while keeping the shell valid wrt its invariants. + * + * 2. the ->buf member is a byte array that has at least ->len + 1 bytes + * allocated. The extra byte is used to store a '\0', allowing the ->buf + * member to be a valid C-string. Every strbuf function ensure this + * invariant is preserved. + * + * Note that it is OK to "play" with the buffer directly if you work it + * that way: + * + * strbuf_grow(sb, SOME_SIZE); + * ... Here, the memory array starting at sb->buf, and of length + * ... strbuf_avail(sb) is all yours, and you are sure that + * ... strbuf_avail(sb) is at least SOME_SIZE. + * strbuf_setlen(sb, sb->len + SOME_OTHER_SIZE); + * + * Of course, SOME_OTHER_SIZE must be smaller or equal to strbuf_avail(sb). + * + * Doing so is safe, though if it has to be done in many places, adding the + * missing API to the strbuf module is the way to go. + * + * XXX: do _not_ assume that the area that is yours is of size ->alloc - 1 + * even if it's true in the current implementation. Alloc is somehow a + * "private" member that should not be messed with. + */ + +#include + +extern char strbuf_slopbuf[]; +struct strbuf { + size_t alloc; + size_t len; + char *buf; +}; + +#define STRBUF_INIT { 0, 0, strbuf_slopbuf } + +/*----- strbuf life cycle -----*/ +extern void strbuf_init(struct strbuf *, size_t); +extern void strbuf_release(struct strbuf *); +extern char *strbuf_detach(struct strbuf *, size_t *); +extern void strbuf_attach(struct strbuf *, void *, size_t, size_t); +static inline void strbuf_swap(struct strbuf *a, struct strbuf *b) { + struct strbuf tmp = *a; + *a = *b; + *b = tmp; +} + +/*----- strbuf size related -----*/ +static inline size_t strbuf_avail(const struct strbuf *sb) { + return sb->alloc ? sb->alloc - sb->len - 1 : 0; +} + +extern void strbuf_grow(struct strbuf *, size_t); + +static inline void strbuf_setlen(struct strbuf *sb, size_t len) { + if (!sb->alloc) + strbuf_grow(sb, 0); + assert(len < sb->alloc); + sb->len = len; + sb->buf[len] = '\0'; +} +#define strbuf_reset(sb) strbuf_setlen(sb, 0) + +/*----- content related -----*/ +extern void strbuf_trim(struct strbuf *); +extern void strbuf_rtrim(struct strbuf *); +extern void strbuf_ltrim(struct strbuf *); +extern int strbuf_cmp(const struct strbuf *, const struct strbuf *); +extern void strbuf_tolower(struct strbuf *); + +extern struct strbuf **strbuf_split(const struct strbuf *, int delim); +extern void strbuf_list_free(struct strbuf **); + +/*----- add data in your buffer -----*/ +static inline void strbuf_addch(struct strbuf *sb, int c) { + strbuf_grow(sb, 1); + sb->buf[sb->len++] = c; + sb->buf[sb->len] = '\0'; +} + +extern void strbuf_insert(struct strbuf *, size_t pos, const void *, size_t); +extern void strbuf_remove(struct strbuf *, size_t pos, size_t len); + +/* splice pos..pos+len with given data */ +extern void strbuf_splice(struct strbuf *, size_t pos, size_t len, + const void *, size_t); + +extern void strbuf_add(struct strbuf *, const void *, size_t); +static inline void strbuf_addstr(struct strbuf *sb, const char *s) { + strbuf_add(sb, s, strlen(s)); +} +static inline void strbuf_addbuf(struct strbuf *sb, const struct strbuf *sb2) { + strbuf_add(sb, sb2->buf, sb2->len); +} +extern void strbuf_adddup(struct strbuf *sb, size_t pos, size_t len); + +typedef size_t (*expand_fn_t) (struct strbuf *sb, const char *placeholder, void *context); +extern void strbuf_expand(struct strbuf *sb, const char *format, expand_fn_t fn, void *context); +struct strbuf_expand_dict_entry { + const char *placeholder; + const char *value; +}; +extern size_t strbuf_expand_dict_cb(struct strbuf *sb, const char *placeholder, void *context); + +__attribute__((format(printf,2,3))) +extern void strbuf_addf(struct strbuf *sb, const char *fmt, ...); + +extern size_t strbuf_fread(struct strbuf *, size_t, FILE *); +/* XXX: if read fails, any partial read is undone */ +extern ssize_t strbuf_read(struct strbuf *, int fd, size_t hint); +extern int strbuf_read_file(struct strbuf *sb, const char *path, size_t hint); +extern int strbuf_readlink(struct strbuf *sb, const char *path, size_t hint); + +extern int strbuf_getline(struct strbuf *, FILE *, int); + +extern void stripspace(struct strbuf *buf, int skip_comments); +extern int launch_editor(const char *path, struct strbuf *buffer, const char *const *env); + +extern int strbuf_branchname(struct strbuf *sb, const char *name); +extern int strbuf_check_branch_ref(struct strbuf *sb, const char *name); + +#endif /* STRBUF_H */ diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/usage.c new file mode 100644 index 00000000000..7a10421fe6b --- /dev/null +++ b/Documentation/perf_counter/usage.c @@ -0,0 +1,80 @@ +/* + * GIT - The information manager from hell + * + * Copyright (C) Linus Torvalds, 2005 + */ +#include "util.h" + +static void report(const char *prefix, const char *err, va_list params) +{ + char msg[1024]; + vsnprintf(msg, sizeof(msg), err, params); + fprintf(stderr, "%s%s\n", prefix, msg); +} + +static NORETURN void usage_builtin(const char *err) +{ + fprintf(stderr, "usage: %s\n", err); + exit(129); +} + +static NORETURN void die_builtin(const char *err, va_list params) +{ + report("fatal: ", err, params); + exit(128); +} + +static void error_builtin(const char *err, va_list params) +{ + report("error: ", err, params); +} + +static void warn_builtin(const char *warn, va_list params) +{ + report("warning: ", warn, params); +} + +/* If we are in a dlopen()ed .so write to a global variable would segfault + * (ugh), so keep things static. */ +static void (*usage_routine)(const char *err) NORETURN = usage_builtin; +static void (*die_routine)(const char *err, va_list params) NORETURN = die_builtin; +static void (*error_routine)(const char *err, va_list params) = error_builtin; +static void (*warn_routine)(const char *err, va_list params) = warn_builtin; + +void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN) +{ + die_routine = routine; +} + +void usage(const char *err) +{ + usage_routine(err); +} + +void die(const char *err, ...) +{ + va_list params; + + va_start(params, err); + die_routine(err, params); + va_end(params); +} + +int error(const char *err, ...) +{ + va_list params; + + va_start(params, err); + error_routine(err, params); + va_end(params); + return -1; +} + +void warning(const char *warn, ...) +{ + va_list params; + + va_start(params, warn); + warn_routine(warn, params); + va_end(params); +} diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h new file mode 100644 index 00000000000..13f8bdce760 --- /dev/null +++ b/Documentation/perf_counter/util.h @@ -0,0 +1,394 @@ +#ifndef GIT_COMPAT_UTIL_H +#define GIT_COMPAT_UTIL_H + +#define _FILE_OFFSET_BITS 64 + +#ifndef FLEX_ARRAY +/* + * See if our compiler is known to support flexible array members. + */ +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +# define FLEX_ARRAY /* empty */ +#elif defined(__GNUC__) +# if (__GNUC__ >= 3) +# define FLEX_ARRAY /* empty */ +# else +# define FLEX_ARRAY 0 /* older GNU extension */ +# endif +#endif + +/* + * Otherwise, default to safer but a bit wasteful traditional style + */ +#ifndef FLEX_ARRAY +# define FLEX_ARRAY 1 +#endif +#endif + +#define ARRAY_SIZE(x) (sizeof(x)/sizeof(x[0])) + +#ifdef __GNUC__ +#define TYPEOF(x) (__typeof__(x)) +#else +#define TYPEOF(x) +#endif + +#define MSB(x, bits) ((x) & TYPEOF(x)(~0ULL << (sizeof(x) * 8 - (bits)))) +#define HAS_MULTI_BITS(i) ((i) & ((i) - 1)) /* checks if an integer has more than 1 bit set */ + +/* Approximation of the length of the decimal representation of this type. */ +#define decimal_length(x) ((int)(sizeof(x) * 2.56 + 0.5) + 1) + +#if !defined(__APPLE__) && !defined(__FreeBSD__) && !defined(__USLC__) && !defined(_M_UNIX) +#define _XOPEN_SOURCE 600 /* glibc2 and AIX 5.3L need 500, OpenBSD needs 600 for S_ISLNK() */ +#define _XOPEN_SOURCE_EXTENDED 1 /* AIX 5.3L needs this */ +#endif +#define _ALL_SOURCE 1 +#define _GNU_SOURCE 1 +#define _BSD_SOURCE 1 + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __MINGW32__ +#include +#include +#include +#include +#ifndef NO_SYS_SELECT_H +#include +#endif +#include +#include +#include +#include +#include +#include +#if defined(__CYGWIN__) +#undef _XOPEN_SOURCE +#include +#define _XOPEN_SOURCE 600 +#include "compat/cygwin.h" +#else +#undef _ALL_SOURCE /* AIX 5.3L defines a struct list with _ALL_SOURCE. */ +#include +#define _ALL_SOURCE 1 +#endif +#else /* __MINGW32__ */ +/* pull in Windows compatibility stuff */ +#include "compat/mingw.h" +#endif /* __MINGW32__ */ + +#ifndef NO_ICONV +#include +#endif + +#ifndef NO_OPENSSL +#include +#include +#endif + +/* On most systems would have given us this, but + * not on some systems (e.g. GNU/Hurd). + */ +#ifndef PATH_MAX +#define PATH_MAX 4096 +#endif + +#ifndef PRIuMAX +#define PRIuMAX "llu" +#endif + +#ifndef PRIu32 +#define PRIu32 "u" +#endif + +#ifndef PRIx32 +#define PRIx32 "x" +#endif + +#ifndef PATH_SEP +#define PATH_SEP ':' +#endif + +#ifndef STRIP_EXTENSION +#define STRIP_EXTENSION "" +#endif + +#ifndef has_dos_drive_prefix +#define has_dos_drive_prefix(path) 0 +#endif + +#ifndef is_dir_sep +#define is_dir_sep(c) ((c) == '/') +#endif + +#ifdef __GNUC__ +#define NORETURN __attribute__((__noreturn__)) +#else +#define NORETURN +#ifndef __attribute__ +#define __attribute__(x) +#endif +#endif + +/* General helper functions */ +extern void usage(const char *err) NORETURN; +extern void die(const char *err, ...) NORETURN __attribute__((format (printf, 1, 2))); +extern int error(const char *err, ...) __attribute__((format (printf, 1, 2))); +extern void warning(const char *err, ...) __attribute__((format (printf, 1, 2))); + +extern void set_die_routine(void (*routine)(const char *err, va_list params) NORETURN); + +extern int prefixcmp(const char *str, const char *prefix); +extern time_t tm_to_time_t(const struct tm *tm); + +static inline const char *skip_prefix(const char *str, const char *prefix) +{ + size_t len = strlen(prefix); + return strncmp(str, prefix, len) ? NULL : str + len; +} + +#if defined(NO_MMAP) || defined(USE_WIN32_MMAP) + +#ifndef PROT_READ +#define PROT_READ 1 +#define PROT_WRITE 2 +#define MAP_PRIVATE 1 +#define MAP_FAILED ((void*)-1) +#endif + +#define mmap git_mmap +#define munmap git_munmap +extern void *git_mmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); +extern int git_munmap(void *start, size_t length); + +#else /* NO_MMAP || USE_WIN32_MMAP */ + +#include + +#endif /* NO_MMAP || USE_WIN32_MMAP */ + +#ifdef NO_MMAP + +/* This value must be multiple of (pagesize * 2) */ +#define DEFAULT_PACKED_GIT_WINDOW_SIZE (1 * 1024 * 1024) + +#else /* NO_MMAP */ + +/* This value must be multiple of (pagesize * 2) */ +#define DEFAULT_PACKED_GIT_WINDOW_SIZE \ + (sizeof(void*) >= 8 \ + ? 1 * 1024 * 1024 * 1024 \ + : 32 * 1024 * 1024) + +#endif /* NO_MMAP */ + +#ifdef NO_ST_BLOCKS_IN_STRUCT_STAT +#define on_disk_bytes(st) ((st).st_size) +#else +#define on_disk_bytes(st) ((st).st_blocks * 512) +#endif + +#define DEFAULT_PACKED_GIT_LIMIT \ + ((1024L * 1024L) * (sizeof(void*) >= 8 ? 8192 : 256)) + +#ifdef NO_PREAD +#define pread git_pread +extern ssize_t git_pread(int fd, void *buf, size_t count, off_t offset); +#endif +/* + * Forward decl that will remind us if its twin in cache.h changes. + * This function is used in compat/pread.c. But we can't include + * cache.h there. + */ +extern ssize_t read_in_full(int fd, void *buf, size_t count); + +#ifdef NO_SETENV +#define setenv gitsetenv +extern int gitsetenv(const char *, const char *, int); +#endif + +#ifdef NO_MKDTEMP +#define mkdtemp gitmkdtemp +extern char *gitmkdtemp(char *); +#endif + +#ifdef NO_UNSETENV +#define unsetenv gitunsetenv +extern void gitunsetenv(const char *); +#endif + +#ifdef NO_STRCASESTR +#define strcasestr gitstrcasestr +extern char *gitstrcasestr(const char *haystack, const char *needle); +#endif + +#ifdef NO_STRLCPY +#define strlcpy gitstrlcpy +extern size_t gitstrlcpy(char *, const char *, size_t); +#endif + +#ifdef NO_STRTOUMAX +#define strtoumax gitstrtoumax +extern uintmax_t gitstrtoumax(const char *, char **, int); +#endif + +#ifdef NO_HSTRERROR +#define hstrerror githstrerror +extern const char *githstrerror(int herror); +#endif + +#ifdef NO_MEMMEM +#define memmem gitmemmem +void *gitmemmem(const void *haystack, size_t haystacklen, + const void *needle, size_t needlelen); +#endif + +#ifdef FREAD_READS_DIRECTORIES +#ifdef fopen +#undef fopen +#endif +#define fopen(a,b) git_fopen(a,b) +extern FILE *git_fopen(const char*, const char*); +#endif + +#ifdef SNPRINTF_RETURNS_BOGUS +#define snprintf git_snprintf +extern int git_snprintf(char *str, size_t maxsize, + const char *format, ...); +#define vsnprintf git_vsnprintf +extern int git_vsnprintf(char *str, size_t maxsize, + const char *format, va_list ap); +#endif + +#ifdef __GLIBC_PREREQ +#if __GLIBC_PREREQ(2, 1) +#define HAVE_STRCHRNUL +#endif +#endif + +#ifndef HAVE_STRCHRNUL +#define strchrnul gitstrchrnul +static inline char *gitstrchrnul(const char *s, int c) +{ + while (*s && *s != c) + s++; + return (char *)s; +} +#endif + +static inline size_t xsize_t(off_t len) +{ + return (size_t)len; +} + +static inline int has_extension(const char *filename, const char *ext) +{ + size_t len = strlen(filename); + size_t extlen = strlen(ext); + return len > extlen && !memcmp(filename + len - extlen, ext, extlen); +} + +/* Sane ctype - no locale, and works with signed chars */ +#undef isascii +#undef isspace +#undef isdigit +#undef isalpha +#undef isalnum +#undef tolower +#undef toupper +extern unsigned char sane_ctype[256]; +#define GIT_SPACE 0x01 +#define GIT_DIGIT 0x02 +#define GIT_ALPHA 0x04 +#define GIT_GLOB_SPECIAL 0x08 +#define GIT_REGEX_SPECIAL 0x10 +#define sane_istest(x,mask) ((sane_ctype[(unsigned char)(x)] & (mask)) != 0) +#define isascii(x) (((x) & ~0x7f) == 0) +#define isspace(x) sane_istest(x,GIT_SPACE) +#define isdigit(x) sane_istest(x,GIT_DIGIT) +#define isalpha(x) sane_istest(x,GIT_ALPHA) +#define isalnum(x) sane_istest(x,GIT_ALPHA | GIT_DIGIT) +#define is_glob_special(x) sane_istest(x,GIT_GLOB_SPECIAL) +#define is_regex_special(x) sane_istest(x,GIT_GLOB_SPECIAL | GIT_REGEX_SPECIAL) +#define tolower(x) sane_case((unsigned char)(x), 0x20) +#define toupper(x) sane_case((unsigned char)(x), 0) + +static inline int sane_case(int x, int high) +{ + if (sane_istest(x, GIT_ALPHA)) + x = (x & ~0x20) | high; + return x; +} + +static inline int strtoul_ui(char const *s, int base, unsigned int *result) +{ + unsigned long ul; + char *p; + + errno = 0; + ul = strtoul(s, &p, base); + if (errno || *p || p == s || (unsigned int) ul != ul) + return -1; + *result = ul; + return 0; +} + +static inline int strtol_i(char const *s, int base, int *result) +{ + long ul; + char *p; + + errno = 0; + ul = strtol(s, &p, base); + if (errno || *p || p == s || (int) ul != ul) + return -1; + *result = ul; + return 0; +} + +#ifdef INTERNAL_QSORT +void git_qsort(void *base, size_t nmemb, size_t size, + int(*compar)(const void *, const void *)); +#define qsort git_qsort +#endif + +#ifndef DIR_HAS_BSD_GROUP_SEMANTICS +# define FORCE_DIR_SET_GID S_ISGID +#else +# define FORCE_DIR_SET_GID 0 +#endif + +#ifdef NO_NSEC +#undef USE_NSEC +#define ST_CTIME_NSEC(st) 0 +#define ST_MTIME_NSEC(st) 0 +#else +#ifdef USE_ST_TIMESPEC +#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctimespec.tv_nsec)) +#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtimespec.tv_nsec)) +#else +#define ST_CTIME_NSEC(st) ((unsigned int)((st).st_ctim.tv_nsec)) +#define ST_MTIME_NSEC(st) ((unsigned int)((st).st_mtim.tv_nsec)) +#endif +#endif + +#endif diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/wrapper.c new file mode 100644 index 00000000000..6350d65f6d9 --- /dev/null +++ b/Documentation/perf_counter/wrapper.c @@ -0,0 +1,206 @@ +/* + * Various trivial helper wrappers around standard functions + */ +#include "cache.h" + +/* + * There's no pack memory to release - but stay close to the Git + * version so wrap this away: + */ +static inline void release_pack_memory(size_t size, int flag) +{ +} + +char *xstrdup(const char *str) +{ + char *ret = strdup(str); + if (!ret) { + release_pack_memory(strlen(str) + 1, -1); + ret = strdup(str); + if (!ret) + die("Out of memory, strdup failed"); + } + return ret; +} + +void *xmalloc(size_t size) +{ + void *ret = malloc(size); + if (!ret && !size) + ret = malloc(1); + if (!ret) { + release_pack_memory(size, -1); + ret = malloc(size); + if (!ret && !size) + ret = malloc(1); + if (!ret) + die("Out of memory, malloc failed"); + } +#ifdef XMALLOC_POISON + memset(ret, 0xA5, size); +#endif + return ret; +} + +/* + * xmemdupz() allocates (len + 1) bytes of memory, duplicates "len" bytes of + * "data" to the allocated memory, zero terminates the allocated memory, + * and returns a pointer to the allocated memory. If the allocation fails, + * the program dies. + */ +void *xmemdupz(const void *data, size_t len) +{ + char *p = xmalloc(len + 1); + memcpy(p, data, len); + p[len] = '\0'; + return p; +} + +char *xstrndup(const char *str, size_t len) +{ + char *p = memchr(str, '\0', len); + return xmemdupz(str, p ? p - str : len); +} + +void *xrealloc(void *ptr, size_t size) +{ + void *ret = realloc(ptr, size); + if (!ret && !size) + ret = realloc(ptr, 1); + if (!ret) { + release_pack_memory(size, -1); + ret = realloc(ptr, size); + if (!ret && !size) + ret = realloc(ptr, 1); + if (!ret) + die("Out of memory, realloc failed"); + } + return ret; +} + +void *xcalloc(size_t nmemb, size_t size) +{ + void *ret = calloc(nmemb, size); + if (!ret && (!nmemb || !size)) + ret = calloc(1, 1); + if (!ret) { + release_pack_memory(nmemb * size, -1); + ret = calloc(nmemb, size); + if (!ret && (!nmemb || !size)) + ret = calloc(1, 1); + if (!ret) + die("Out of memory, calloc failed"); + } + return ret; +} + +void *xmmap(void *start, size_t length, + int prot, int flags, int fd, off_t offset) +{ + void *ret = mmap(start, length, prot, flags, fd, offset); + if (ret == MAP_FAILED) { + if (!length) + return NULL; + release_pack_memory(length, fd); + ret = mmap(start, length, prot, flags, fd, offset); + if (ret == MAP_FAILED) + die("Out of memory? mmap failed: %s", strerror(errno)); + } + return ret; +} + +/* + * xread() is the same a read(), but it automatically restarts read() + * operations with a recoverable error (EAGAIN and EINTR). xread() + * DOES NOT GUARANTEE that "len" bytes is read even if the data is available. + */ +ssize_t xread(int fd, void *buf, size_t len) +{ + ssize_t nr; + while (1) { + nr = read(fd, buf, len); + if ((nr < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + return nr; + } +} + +/* + * xwrite() is the same a write(), but it automatically restarts write() + * operations with a recoverable error (EAGAIN and EINTR). xwrite() DOES NOT + * GUARANTEE that "len" bytes is written even if the operation is successful. + */ +ssize_t xwrite(int fd, const void *buf, size_t len) +{ + ssize_t nr; + while (1) { + nr = write(fd, buf, len); + if ((nr < 0) && (errno == EAGAIN || errno == EINTR)) + continue; + return nr; + } +} + +ssize_t read_in_full(int fd, void *buf, size_t count) +{ + char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t loaded = xread(fd, p, count); + if (loaded <= 0) + return total ? total : loaded; + count -= loaded; + p += loaded; + total += loaded; + } + + return total; +} + +ssize_t write_in_full(int fd, const void *buf, size_t count) +{ + const char *p = buf; + ssize_t total = 0; + + while (count > 0) { + ssize_t written = xwrite(fd, p, count); + if (written < 0) + return -1; + if (!written) { + errno = ENOSPC; + return -1; + } + count -= written; + p += written; + total += written; + } + + return total; +} + +int xdup(int fd) +{ + int ret = dup(fd); + if (ret < 0) + die("dup failed: %s", strerror(errno)); + return ret; +} + +FILE *xfdopen(int fd, const char *mode) +{ + FILE *stream = fdopen(fd, mode); + if (stream == NULL) + die("Out of memory? fdopen failed: %s", strerror(errno)); + return stream; +} + +int xmkstemp(char *template) +{ + int fd; + + fd = mkstemp(template); + if (fd < 0) + die("Unable to create temporary file: %s", strerror(errno)); + return fd; +} From 6f06ccbc86f8a02aa32271263249657ce484eb25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 15:22:22 +0200 Subject: [PATCH 0511/2061] perf_counter tools: clean up after introduction of the Git command framework Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 207 +--------------------- Documentation/perf_counter/builtin-help.c | 2 - Documentation/perf_counter/builtin-top.c | 17 +- Documentation/perf_counter/cache.h | 20 +++ Documentation/perf_counter/config.c | 95 +--------- Documentation/perf_counter/path.c | 39 ---- Documentation/perf_counter/perf.c | 3 +- Documentation/perf_counter/util.h | 14 ++ 8 files changed, 45 insertions(+), 352 deletions(-) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 11809b943fc..1b602655554 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -138,16 +138,6 @@ all:: # # Define NO_PERL if you do not want Perl scripts or libraries at all. # -# Define NO_TCLTK if you do not want Tcl/Tk GUI. -# -# The TCL_PATH variable governs the location of the Tcl interpreter -# used to optimize perf-gui for your system. Only used if NO_TCLTK -# is not set. Defaults to the bare 'tclsh'. -# -# The TCLTK_PATH variable governs the location of the Tcl/Tk interpreter. -# If not set it defaults to the bare 'wish'. If it is set to the empty -# string then NO_TCLTK will be forced (this is used by configure script). -# # Define INTERNAL_QSORT to use Git's implementation of qsort(), which # is a simplified version of the merge sort used in glibc. This is # recommended if Git triggers O(n^2) behavior in your platform's qsort(). @@ -215,12 +205,8 @@ TAR = tar FIND = find INSTALL = install RPMBUILD = rpmbuild -TCL_PATH = tclsh -TCLTK_PATH = wish PTHREAD_LIBS = -lpthread -export TCL_PATH TCLTK_PATH - # sparse is architecture-neutral, which means that we need to tell it # explicitly what architecture to check for. Fix this up for yours.. SPARSE_FLAGS = -D__BIG_ENDIAN__ -D__powerpc__ @@ -529,10 +515,6 @@ ifdef NO_EXTERNAL_GREP BASIC_CFLAGS += -DNO_EXTERNAL_GREP endif -ifeq ($(TCLTK_PATH),) -NO_TCLTK=NoThanks -endif - ifeq ($(PERL_PATH),) NO_PERL=NoThanks endif @@ -583,7 +565,6 @@ prefix_SQ = $(subst ','\'',$(prefix)) SHELL_PATH_SQ = $(subst ','\'',$(SHELL_PATH)) PERL_PATH_SQ = $(subst ','\'',$(PERL_PATH)) -TCLTK_PATH_SQ = $(subst ','\'',$(TCLTK_PATH)) LIBS = $(PERFLIBS) $(EXTLIBS) @@ -607,14 +588,6 @@ ifneq (,$X) endif all:: -ifndef NO_TCLTK - $(QUIET_SUBDIR0)perf-gui $(QUIET_SUBDIR1) perfexecdir='$(perfexec_instdir_SQ)' all - $(QUIET_SUBDIR0)perfk-perf $(QUIET_SUBDIR1) all -endif -ifndef NO_PERL - $(QUIET_SUBDIR0)perl $(QUIET_SUBDIR1) PERL_PATH='$(PERL_PATH_SQ)' prefix='$(prefix_SQ)' all -endif - $(QUIET_SUBDIR0)templates $(QUIET_SUBDIR1) please_set_SHELL_PATH_to_a_more_modern_shell: @$$(:) @@ -704,21 +677,6 @@ builtin-revert.o wt-status.o: wt-status.h $(LIB_FILE): $(LIB_OBJS) $(QUIET_AR)$(RM) $@ && $(AR) rcs $@ $(LIB_OBJS) -doc: - $(MAKE) -C Documentation all - -man: - $(MAKE) -C Documentation man - -html: - $(MAKE) -C Documentation html - -info: - $(MAKE) -C Documentation info - -pdf: - $(MAKE) -C Documentation pdf - TAGS: $(RM) TAGS $(FIND) . -name '*.[hcS]' -print | xargs etags -a @@ -751,33 +709,12 @@ PERF-BUILD-OPTIONS: .FORCE-PERF-BUILD-OPTIONS @echo NO_CURL=\''$(subst ','\'',$(subst ','\'',$(NO_CURL)))'\' >>$@ @echo NO_PERL=\''$(subst ','\'',$(subst ','\'',$(NO_PERL)))'\' >>$@ -### Detect Tck/Tk interpreter path changes -ifndef NO_TCLTK -TRACK_VARS = $(subst ','\'',-DTCLTK_PATH='$(TCLTK_PATH_SQ)') - -PERF-GUI-VARS: .FORCE-PERF-GUI-VARS - @VARS='$(TRACK_VARS)'; \ - if test x"$$VARS" != x"`cat $@ 2>/dev/null`" ; then \ - echo 1>&2 " * new Tcl/Tk interpreter location"; \ - echo "$$VARS" >$@; \ - fi - -.PHONY: .FORCE-PERF-GUI-VARS -endif - ### Testing rules -TEST_PROGRAMS += test-chmtime$X -TEST_PROGRAMS += test-ctype$X -TEST_PROGRAMS += test-date$X -TEST_PROGRAMS += test-delta$X -TEST_PROGRAMS += test-dump-cache-tree$X -TEST_PROGRAMS += test-genrandom$X -TEST_PROGRAMS += test-match-trees$X -TEST_PROGRAMS += test-parse-options$X -TEST_PROGRAMS += test-path-utils$X -TEST_PROGRAMS += test-sha1$X -TEST_PROGRAMS += test-sigchain$X +# +# None right now: +# +# TEST_PROGRAMS += test-something$X all:: $(TEST_PROGRAMS) @@ -787,25 +724,6 @@ all:: $(TEST_PROGRAMS) export NO_SVN_TESTS -test: all - $(MAKE) -C t/ all - -test-ctype$X: ctype.o - -test-date$X: date.o ctype.o - -test-delta$X: diff-delta.o patch-delta.o - -test-parse-options$X: parse-options.o - -.PRECIOUS: $(patsubst test-%$X,test-%.o,$(TEST_PROGRAMS)) - -test-%$X: test-%.o $(PERFLIBS) - $(QUIET_LINK)$(CC) $(ALL_CFLAGS) -o $@ $(ALL_LDFLAGS) $(filter %.o,$^) $(LIBS) - -check-sha1:: test-sha1$X - ./test-sha1.sh - check: common-cmds.h if sparse; \ then \ @@ -845,10 +763,6 @@ install: all $(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)' $(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install $(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install -ifndef NO_TCLTK - $(MAKE) -C perfk-perf install - $(MAKE) -C perf-gui perfexecdir='$(perfexec_instdir_SQ)' install -endif ifneq (,$X) $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';) endif @@ -865,32 +779,6 @@ endif done } && \ ./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X" -install-doc: - $(MAKE) -C Documentation install - -install-man: - $(MAKE) -C Documentation install-man - -install-html: - $(MAKE) -C Documentation install-html - -install-info: - $(MAKE) -C Documentation install-info - -install-pdf: - $(MAKE) -C Documentation install-pdf - -quick-install-doc: - $(MAKE) -C Documentation quick-install - -quick-install-man: - $(MAKE) -C Documentation quick-install-man - -quick-install-html: - $(MAKE) -C Documentation quick-install-html - - - ### Maintainer's dist rules perf.spec: perf.spec.in @@ -904,38 +792,16 @@ dist: perf.spec perf-archive$(X) configure @mkdir -p $(PERF_TARNAME) @cp perf.spec configure $(PERF_TARNAME) @echo $(PERF_VERSION) > $(PERF_TARNAME)/version - @$(MAKE) -C perf-gui TARDIR=../$(PERF_TARNAME)/perf-gui dist-version $(TAR) rf $(PERF_TARNAME).tar \ $(PERF_TARNAME)/perf.spec \ $(PERF_TARNAME)/configure \ - $(PERF_TARNAME)/version \ - $(PERF_TARNAME)/perf-gui/version + $(PERF_TARNAME)/version @$(RM) -r $(PERF_TARNAME) gzip -f -9 $(PERF_TARNAME).tar rpm: dist $(RPMBUILD) -ta $(PERF_TARNAME).tar.gz -htmldocs = perf-htmldocs-$(PERF_VERSION) -manpages = perf-manpages-$(PERF_VERSION) -dist-doc: - $(RM) -r .doc-tmp-dir - mkdir .doc-tmp-dir - $(MAKE) -C Documentation WEBDOC_DEST=../.doc-tmp-dir install-webdoc - cd .doc-tmp-dir && $(TAR) cf ../$(htmldocs).tar . - gzip -n -9 -f $(htmldocs).tar - : - $(RM) -r .doc-tmp-dir - mkdir -p .doc-tmp-dir/man1 .doc-tmp-dir/man5 .doc-tmp-dir/man7 - $(MAKE) -C Documentation DESTDIR=./ \ - man1dir=../.doc-tmp-dir/man1 \ - man5dir=../.doc-tmp-dir/man5 \ - man7dir=../.doc-tmp-dir/man7 \ - install - cd .doc-tmp-dir && $(TAR) cf ../$(manpages).tar . - gzip -n -9 -f $(manpages).tar - $(RM) -r .doc-tmp-dir - ### Cleaning rules distclean: clean @@ -951,74 +817,13 @@ clean: $(RM) -r $(PERF_TARNAME) .doc-tmp-dir $(RM) $(PERF_TARNAME).tar.gz perf-core_$(PERF_VERSION)-*.tar.gz $(RM) $(htmldocs).tar.gz $(manpages).tar.gz - $(MAKE) -C Documentation/ clean - $(MAKE) -C templates/ clean - $(MAKE) -C t/ clean -ifndef NO_TCLTK - $(MAKE) -C perfk-perf clean - $(MAKE) -C perf-gui clean -endif - $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-GUI-VARS PERF-BUILD-OPTIONS + $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS .PHONY: all install clean strip .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS .PHONY: .FORCE-PERF-BUILD-OPTIONS -### Check documentation -# -check-docs:: - @(for v in $(ALL_PROGRAMS) $(BUILT_INS) perf perfk; \ - do \ - case "$$v" in \ - perf-merge-octopus | perf-merge-ours | perf-merge-recursive | \ - perf-merge-resolve | perf-merge-subtree | \ - perf-fsck-objects | perf-init-db | \ - perf-?*--?* ) continue ;; \ - esac ; \ - test -f "Documentation/$$v.txt" || \ - echo "no doc: $$v"; \ - sed -e '/^#/d' command-list.txt | \ - grep -q "^$$v[ ]" || \ - case "$$v" in \ - perf) ;; \ - *) echo "no link: $$v";; \ - esac ; \ - done; \ - ( \ - sed -e '/^#/d' \ - -e 's/[ ].*//' \ - -e 's/^/listed /' command-list.txt; \ - ls -1 Documentation/perf*txt | \ - sed -e 's|Documentation/|documented |' \ - -e 's/\.txt//'; \ - ) | while read how cmd; \ - do \ - case "$$how,$$cmd" in \ - *,perf-citool | \ - *,perf-gui | \ - *,perf-help | \ - documented,perfattributes | \ - documented,perfignore | \ - documented,perfmodules | \ - documented,perfcli | \ - documented,perf-tools | \ - documented,perfcore-tutorial | \ - documented,perfcvs-migration | \ - documented,perfdiffcore | \ - documented,perfglossary | \ - documented,perfhooks | \ - documented,perfrepository-layout | \ - documented,perftutorial | \ - documented,perftutorial-2 | \ - sentinel,not,matching,is,ok ) continue ;; \ - esac; \ - case " $(ALL_PROGRAMS) $(BUILT_INS) perf perfk " in \ - *" $$cmd "*) ;; \ - *) echo "removed but $$how: $$cmd" ;; \ - esac; \ - done ) | sort - ### Make sure built-ins do not have dups and listed in perf.c # check-builtins:: diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c index 125fcc2f490..a136d619db3 100644 --- a/Documentation/perf_counter/builtin-help.c +++ b/Documentation/perf_counter/builtin-help.c @@ -417,11 +417,9 @@ static void show_html_page(const char *perf_cmd) int cmd_help(int argc, const char **argv, const char *prefix) { - int nonperf; const char *alias; load_command_list("perf-", &main_cmds, &other_cmds); - /* setup_perf_directory_gently(&nonperf); */ perf_config(perf_help_config, NULL); argc = parse_options(argc, argv, builtin_help_options, diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 9d2c769e5f8..601bddbc30d 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -63,15 +63,6 @@ #include "util.h" -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include @@ -103,8 +94,6 @@ #define PR_TASK_PERF_COUNTERS_DISABLE 31 #define PR_TASK_PERF_COUNTERS_ENABLE 32 -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - #define rdclock() \ ({ \ struct timespec ts; \ @@ -1077,7 +1066,7 @@ static void process_event(uint64_t ip, int counter) record_ip(ip, counter); } -static void process_options(int argc, char *argv[]) +static void process_options(int argc, char **argv) { int error = 0, counter; @@ -1255,7 +1244,7 @@ static void mmap_read(struct mmap_data *md) event_t event_copy; - unsigned int size = event->header.size; + size_t size = event->header.size; /* * Event straddles the mmap boundary -- header should always @@ -1301,7 +1290,7 @@ static void mmap_read(struct mmap_data *md) md->prev = old; } -int cmd_top(int argc, const char **argv, const char *prefix) +int cmd_top(int argc, char **argv, const char *prefix) { struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/cache.h index dc085640a57..71080512fa8 100644 --- a/Documentation/perf_counter/cache.h +++ b/Documentation/perf_counter/cache.h @@ -94,4 +94,24 @@ static inline int is_absolute_path(const char *path) { return path[0] == '/'; } + +const char *make_absolute_path(const char *path); +const char *make_nonrelative_path(const char *path); +const char *make_relative_path(const char *abs, const char *base); +int normalize_path_copy(char *dst, const char *src); +int longest_ancestor_length(const char *path, const char *prefix_list); +char *strip_path_suffix(const char *path, const char *suffix); + +extern char *mkpath(const char *fmt, ...) __attribute__((format (printf, 1, 2))); +extern char *perf_path(const char *fmt, ...) __attribute__((format (printf, 1, 2))); + +extern char *mksnpath(char *buf, size_t n, const char *fmt, ...) + __attribute__((format (printf, 3, 4))); +extern char *perf_snpath(char *buf, size_t n, const char *fmt, ...) + __attribute__((format (printf, 3, 4))); +extern char *perf_pathdup(const char *fmt, ...) + __attribute__((format (printf, 1, 2))); + +extern size_t strlcpy(char *dest, const char *src, size_t size); + #endif /* CACHE_H */ diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/config.c index 672d5395933..3dd13faa6a2 100644 --- a/Documentation/perf_counter/config.c +++ b/Documentation/perf_counter/config.c @@ -15,7 +15,6 @@ static FILE *config_file; static const char *config_file_name; static int config_linenr; static int config_file_eof; -static int zlib_compression_seen; const char *config_exclusive_filename = NULL; @@ -533,14 +532,6 @@ static int store_aux(const char* key, const char* value, void *cb) return 0; } -static int write_error(const char *filename) -{ - error("failed to write new configuration file %s", filename); - - /* Same error code as "failed to rename". */ - return 4; -} - static int store_write_section(int fd, const char* key) { const char *dot; @@ -673,7 +664,7 @@ int perf_config_set_multivar(const char* key, const char* value, { int i, dot; int fd = -1, in_fd; - int ret; + int ret = 0; char* config_filename; const char* last_dot = strrchr(key, '.'); @@ -872,90 +863,6 @@ write_err_out: } -static int section_name_match (const char *buf, const char *name) -{ - int i = 0, j = 0, dot = 0; - for (; buf[i] && buf[i] != ']'; i++) { - if (!dot && isspace(buf[i])) { - dot = 1; - if (name[j++] != '.') - break; - for (i++; isspace(buf[i]); i++) - ; /* do nothing */ - if (buf[i] != '"') - break; - continue; - } - if (buf[i] == '\\' && dot) - i++; - else if (buf[i] == '"' && dot) { - for (i++; isspace(buf[i]); i++) - ; /* do_nothing */ - break; - } - if (buf[i] != name[j++]) - break; - } - return (buf[i] == ']' && name[j] == 0); -} - -/* if new_name == NULL, the section is removed instead */ -int perf_config_rename_section(const char *old_name, const char *new_name) -{ - int ret = 0, remove = 0; - char *config_filename; - int out_fd; - char buf[1024]; - - if (config_exclusive_filename) - config_filename = strdup(config_exclusive_filename); - else - config_filename = perf_pathdup("config"); - if (out_fd < 0) { - ret = error("could not lock config file %s", config_filename); - goto out; - } - - if (!(config_file = fopen(config_filename, "rb"))) { - /* no config file means nothing to rename, no error */ - goto unlock_and_out; - } - - while (fgets(buf, sizeof(buf), config_file)) { - int i; - int length; - for (i = 0; buf[i] && isspace(buf[i]); i++) - ; /* do nothing */ - if (buf[i] == '[') { - /* it's a section */ - if (section_name_match (&buf[i+1], old_name)) { - ret++; - if (new_name == NULL) { - remove = 1; - continue; - } - store.baselen = strlen(new_name); - if (!store_write_section(out_fd, new_name)) { - goto out; - } - continue; - } - remove = 0; - } - if (remove) - continue; - length = strlen(buf); - if (write_in_full(out_fd, buf, length) != length) { - goto out; - } - } - fclose(config_file); - unlock_and_out: - out: - free(config_filename); - return ret; -} - /* * Call this to report error for your variable that should not * get a boolean value (i.e. "[my] var" means "true"). diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/path.c index 891b612ec1a..a501a40dd2c 100644 --- a/Documentation/perf_counter/path.c +++ b/Documentation/perf_counter/path.c @@ -161,45 +161,6 @@ int perf_mkstemp(char *path, size_t len, const char *template) } -static char *user_path(char *buf, char *path, int sz) -{ - struct passwd *pw; - char *slash; - int len, baselen; - - if (!path || path[0] != '~') - return NULL; - path++; - slash = strchr(path, '/'); - if (path[0] == '/' || !path[0]) { - pw = getpwuid(getuid()); - } - else { - if (slash) { - *slash = 0; - pw = getpwnam(path); - *slash = '/'; - } - else - pw = getpwnam(path); - } - if (!pw || !pw->pw_dir || sz <= strlen(pw->pw_dir)) - return NULL; - baselen = strlen(pw->pw_dir); - memcpy(buf, pw->pw_dir, baselen); - while ((1 < baselen) && (buf[baselen-1] == '/')) { - buf[baselen-1] = 0; - baselen--; - } - if (slash && slash[1]) { - len = strlen(slash); - if (sz <= baselen + len) - return NULL; - memcpy(buf + baselen, slash, len + 1); - } - return buf; -} - const char *make_relative_path(const char *abs, const char *base) { static char buf[PATH_MAX + 1]; diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index 9256f6a1644..63f8a892c0d 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -1,7 +1,7 @@ #include "builtin.h" #include "exec_cmd.h" #include "cache.h" -//#include "quote.h" +#include "quote.h" #include "run-command.h" const char perf_usage_string[] = @@ -132,7 +132,6 @@ static int handle_alias(int *argcp, const char ***argv) const char** new_argv; const char *alias_command; char *alias_string; - int unused_nonperf; alias_command = (*argv)[0]; alias_string = alias_lookup(alias_command); diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util.h index 13f8bdce760..36e40c38e09 100644 --- a/Documentation/perf_counter/util.h +++ b/Documentation/perf_counter/util.h @@ -295,6 +295,20 @@ static inline char *gitstrchrnul(const char *s, int c) } #endif +/* + * Wrappers: + */ +extern char *xstrdup(const char *str); +extern void *xmalloc(size_t size); +extern void *xmemdupz(const void *data, size_t len); +extern char *xstrndup(const char *str, size_t len); +extern void *xrealloc(void *ptr, size_t size); +extern void *xcalloc(size_t nmemb, size_t size); +extern void *xmmap(void *start, size_t length, int prot, int flags, int fd, off_t offset); +extern ssize_t xread(int fd, void *buf, size_t len); +extern ssize_t xwrite(int fd, const void *buf, size_t len); +extern int xdup(int fd); +extern FILE *xfdopen(int fd, const char *mode); static inline size_t xsize_t(off_t len) { return (size_t)len; From ddcacfa0febff6454dba6cea1931f3020a9f6c24 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 15:37:32 +0200 Subject: [PATCH 0512/2061] perf_counter tools: separate kerneltop into 'perf top' and 'perf stat' Lets use the Git framework of built-in commands. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 1 + Documentation/perf_counter/builtin-stat.c | 592 ++++++++ Documentation/perf_counter/builtin-top.c | 204 +-- Documentation/perf_counter/builtin.h | 3 +- Documentation/perf_counter/command-list.txt | 1 + Documentation/perf_counter/kerneltop.c | 1409 ------------------- Documentation/perf_counter/perf.c | 1 + 7 files changed, 601 insertions(+), 1610 deletions(-) create mode 100644 Documentation/perf_counter/builtin-stat.c delete mode 100644 Documentation/perf_counter/kerneltop.c diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 1b602655554..fb8b71744e5 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -309,6 +309,7 @@ LIB_OBJS += usage.o LIB_OBJS += wrapper.o BUILTIN_OBJS += builtin-help.o +BUILTIN_OBJS += builtin-stat.o BUILTIN_OBJS += builtin-top.o PERFLIBS = $(LIB_FILE) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c new file mode 100644 index 00000000000..169a2d1783f --- /dev/null +++ b/Documentation/perf_counter/builtin-stat.c @@ -0,0 +1,592 @@ +/* + * kerneltop.c: show top kernel functions - performance counters showcase + + Build with: + + cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt + + Sample output: + +------------------------------------------------------------------------------ + KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) +------------------------------------------------------------------------------ + + weight RIP kernel function + ______ ________________ _______________ + + 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev + 33.00 - ffffffff804cb740 : sock_alloc_send_skb + 31.26 - ffffffff804ce808 : skb_push + 22.43 - ffffffff80510004 : tcp_established_options + 19.00 - ffffffff8027d250 : find_get_page + 15.76 - ffffffff804e4fc9 : eth_type_trans + 15.20 - ffffffff804d8baa : dst_release + 14.86 - ffffffff804cf5d8 : skb_release_head_state + 14.00 - ffffffff802217d5 : read_hpet + 12.00 - ffffffff804ffb7f : __ip_local_out + 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish + 8.54 - ffffffff805001a3 : ip_queue_xmit + */ + +/* + * perfstat: /usr/bin/time -alike performance counter statistics utility + + It summarizes the counter events of all tasks (and child tasks), + covering all CPUs that the command (or workload) executes on. + It only counts the per-task events of the workload started, + independent of how many other tasks run on those CPUs. + + Sample output: + + $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null + + Performance counter stats for 'ls': + + 163516953 instructions + 2295 cache-misses + 2855182 branch-misses + */ + + /* + * Copyright (C) 2008, Red Hat Inc, Ingo Molnar + * + * Improvements and fixes by: + * + * Arjan van de Ven + * Yanmin Zhang + * Wu Fengguang + * Mike Galbraith + * Paul Mackerras + * + * Released under the GPL v2. (and only v2, not any later version) + */ + +#include "util.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include "../../include/linux/perf_counter.h" + + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +#ifdef __x86_64__ +#define __NR_perf_counter_open 295 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __i386__ +#define __NR_perf_counter_open 333 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +extern asmlinkage int sys_perf_counter_open( + struct perf_counter_hw_event *hw_event_uptr __user, + pid_t pid, + int cpu, + int group_fd, + unsigned long flags); + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) + +static int system_wide = 0; + +static int nr_counters = 0; +static __u64 event_id[MAX_COUNTERS] = { + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), + EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), + + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), + EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), +}; +static int default_interval = 100000; +static int event_count[MAX_COUNTERS]; +static int fd[MAX_NR_CPUS][MAX_COUNTERS]; + +static int tid = -1; +static int profile_cpu = -1; +static int nr_cpus = 0; +static int nmi = 1; +static int group = 0; +static unsigned int page_size; + +static int zero; + +static int scale; + +static const unsigned int default_count[] = { + 1000000, + 1000000, + 10000, + 10000, + 1000000, + 10000, +}; + +static char *hw_event_names[] = { + "CPU cycles", + "instructions", + "cache references", + "cache misses", + "branches", + "branch misses", + "bus cycles", +}; + +static char *sw_event_names[] = { + "cpu clock ticks", + "task clock ticks", + "pagefaults", + "context switches", + "CPU migrations", + "minor faults", + "major faults", +}; + +struct event_symbol { + __u64 event; + char *symbol; +}; + +static struct event_symbol event_symbols[] = { + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, + {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, + + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, + {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, +}; + +#define __PERF_COUNTER_FIELD(config, name) \ + ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) + +#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) +#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) +#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) +#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) + +static void display_events_help(void) +{ + unsigned int i; + __u64 e; + + printf( + " -e EVENT --event=EVENT # symbolic-name abbreviations"); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + int type, id; + + e = event_symbols[i].event; + type = PERF_COUNTER_TYPE(e); + id = PERF_COUNTER_ID(e); + + printf("\n %d:%d: %-20s", + type, id, event_symbols[i].symbol); + } + + printf("\n" + " rNNN: raw PMU events (eventsel+umask)\n\n"); +} + +static void display_help(void) +{ + printf( + "Usage: perfstat [] \n\n" + "PerfStat Options (up to %d event types can be specified):\n\n", + MAX_COUNTERS); + + display_events_help(); + + printf( + " -l # scale counter values\n" + " -a # system-wide collection\n"); + exit(0); +} + +static char *event_name(int ctr) +{ + __u64 config = event_id[ctr]; + int type = PERF_COUNTER_TYPE(config); + int id = PERF_COUNTER_ID(config); + static char buf[32]; + + if (PERF_COUNTER_RAW(config)) { + sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); + return buf; + } + + switch (type) { + case PERF_TYPE_HARDWARE: + if (id < PERF_HW_EVENTS_MAX) + return hw_event_names[id]; + return "unknown-hardware"; + + case PERF_TYPE_SOFTWARE: + if (id < PERF_SW_EVENTS_MAX) + return sw_event_names[id]; + return "unknown-software"; + + default: + break; + } + + return "unknown"; +} + +/* + * Each event can have multiple symbolic names. + * Symbolic names are (almost) exactly matched. + */ +static __u64 match_event_symbols(char *str) +{ + __u64 config, id; + int type; + unsigned int i; + + if (sscanf(str, "r%llx", &config) == 1) + return config | PERF_COUNTER_RAW_MASK; + + if (sscanf(str, "%d:%llu", &type, &id) == 2) + return EID(type, id); + + for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { + if (!strncmp(str, event_symbols[i].symbol, + strlen(event_symbols[i].symbol))) + return event_symbols[i].event; + } + + return ~0ULL; +} + +static int parse_events(char *str) +{ + __u64 config; + +again: + if (nr_counters == MAX_COUNTERS) + return -1; + + config = match_event_symbols(str); + if (config == ~0ULL) + return -1; + + event_id[nr_counters] = config; + nr_counters++; + + str = strstr(str, ","); + if (str) { + str++; + goto again; + } + + return 0; +} + + +/* + * perfstat + */ + +char fault_here[1000000]; + +static void create_perfstat_counter(int counter) +{ + struct perf_counter_hw_event hw_event; + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.config = event_id[counter]; + hw_event.record_type = 0; + hw_event.nmi = 0; + if (scale) + hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | + PERF_FORMAT_TOTAL_TIME_RUNNING; + + if (system_wide) { + int cpu; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); + if (fd[cpu][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[cpu][counter], strerror(errno)); + exit(-1); + } + } + } else { + hw_event.inherit = 1; + hw_event.disabled = 1; + + fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); + if (fd[0][counter] < 0) { + printf("perfstat error: syscall returned with %d (%s)\n", + fd[0][counter], strerror(errno)); + exit(-1); + } + } +} + +int do_perfstat(int argc, char *argv[]) +{ + unsigned long long t0, t1; + int counter; + ssize_t res; + int status; + int pid; + + if (!system_wide) + nr_cpus = 1; + + for (counter = 0; counter < nr_counters; counter++) + create_perfstat_counter(counter); + + argc -= optind; + argv += optind; + + if (!argc) + display_help(); + + /* + * Enable counters and exec the command: + */ + t0 = rdclock(); + prctl(PR_TASK_PERF_COUNTERS_ENABLE); + + if ((pid = fork()) < 0) + perror("failed to fork"); + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } + } + while (wait(&status) >= 0) + ; + prctl(PR_TASK_PERF_COUNTERS_DISABLE); + t1 = rdclock(); + + fflush(stdout); + + fprintf(stderr, "\n"); + fprintf(stderr, " Performance counter stats for \'%s\':\n", + argv[0]); + fprintf(stderr, "\n"); + + for (counter = 0; counter < nr_counters; counter++) { + int cpu, nv; + __u64 count[3], single_count[3]; + int scaled; + + count[0] = count[1] = count[2] = 0; + nv = scale ? 3 : 1; + for (cpu = 0; cpu < nr_cpus; cpu ++) { + res = read(fd[cpu][counter], + single_count, nv * sizeof(__u64)); + assert(res == nv * sizeof(__u64)); + + count[0] += single_count[0]; + if (scale) { + count[1] += single_count[1]; + count[2] += single_count[2]; + } + } + + scaled = 0; + if (scale) { + if (count[2] == 0) { + fprintf(stderr, " %14s %-20s\n", + "", event_name(counter)); + continue; + } + if (count[2] < count[1]) { + scaled = 1; + count[0] = (unsigned long long) + ((double)count[0] * count[1] / count[2] + 0.5); + } + } + + if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || + event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { + + double msecs = (double)count[0] / 1000000; + + fprintf(stderr, " %14.6f %-20s (msecs)", + msecs, event_name(counter)); + } else { + fprintf(stderr, " %14Ld %-20s (events)", + count[0], event_name(counter)); + } + if (scaled) + fprintf(stderr, " (scaled from %.2f%%)", + (double) count[2] / count[1] * 100); + fprintf(stderr, "\n"); + } + fprintf(stderr, "\n"); + fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", + (double)(t1-t0)/1e6); + fprintf(stderr, "\n"); + + return 0; +} + +static void process_options(int argc, char **argv) +{ + int error = 0, counter; + + for (;;) { + int option_index = 0; + /** Options for getopt */ + static struct option long_options[] = { + {"count", required_argument, NULL, 'c'}, + {"cpu", required_argument, NULL, 'C'}, + {"delay", required_argument, NULL, 'd'}, + {"dump_symtab", no_argument, NULL, 'D'}, + {"event", required_argument, NULL, 'e'}, + {"filter", required_argument, NULL, 'f'}, + {"group", required_argument, NULL, 'g'}, + {"help", no_argument, NULL, 'h'}, + {"nmi", required_argument, NULL, 'n'}, + {"munmap_info", no_argument, NULL, 'U'}, + {"pid", required_argument, NULL, 'p'}, + {"realtime", required_argument, NULL, 'r'}, + {"scale", no_argument, NULL, 'l'}, + {"symbol", required_argument, NULL, 's'}, + {"stat", no_argument, NULL, 'S'}, + {"vmlinux", required_argument, NULL, 'x'}, + {"zero", no_argument, NULL, 'z'}, + {NULL, 0, NULL, 0 } + }; + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", + long_options, &option_index); + if (c == -1) + break; + + switch (c) { + case 'a': system_wide = 1; break; + case 'c': default_interval = atoi(optarg); break; + case 'C': + /* CPU and PID are mutually exclusive */ + if (tid != -1) { + printf("WARNING: CPU switch overriding PID\n"); + sleep(1); + tid = -1; + } + profile_cpu = atoi(optarg); break; + + case 'e': error = parse_events(optarg); break; + + case 'g': group = atoi(optarg); break; + case 'h': display_help(); break; + case 'l': scale = 1; break; + case 'n': nmi = atoi(optarg); break; + case 'p': + /* CPU and PID are mutually exclusive */ + if (profile_cpu != -1) { + printf("WARNING: PID switch overriding CPU\n"); + sleep(1); + profile_cpu = -1; + } + tid = atoi(optarg); break; + case 'z': zero = 1; break; + default: error = 1; break; + } + } + if (error) + display_help(); + + if (!nr_counters) { + nr_counters = 8; + } + + for (counter = 0; counter < nr_counters; counter++) { + if (event_count[counter]) + continue; + + event_count[counter] = default_interval; + } +} + +int cmd_stat(int argc, char **argv, const char *prefix) +{ + page_size = sysconf(_SC_PAGE_SIZE); + + process_options(argc, argv); + + nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); + assert(nr_cpus <= MAX_NR_CPUS); + assert(nr_cpus >= 0); + + return do_perfstat(argc, argv); +} diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 601bddbc30d..98e8690b6bc 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -28,25 +28,6 @@ 8.54 - ffffffff805001a3 : ip_queue_xmit */ -/* - * perfstat: /usr/bin/time -alike performance counter statistics utility - - It summarizes the counter events of all tasks (and child tasks), - covering all CPUs that the command (or workload) executes on. - It only counts the per-task events of the workload started, - independent of how many other tasks run on those CPUs. - - Sample output: - - $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null - - Performance counter stats for 'ls': - - 163516953 instructions - 2295 cache-misses - 2855182 branch-misses - */ - /* * Copyright (C) 2008, Red Hat Inc, Ingo Molnar * @@ -149,7 +130,6 @@ asmlinkage int sys_perf_counter_open( #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) -static int run_perfstat = 0; static int system_wide = 0; static int nr_counters = 0; @@ -203,7 +183,7 @@ struct source_line { static struct source_line *lines; static struct source_line **lines_tail; -const unsigned int default_count[] = { +static const unsigned int default_count[] = { 1000000, 1000000, 10000, @@ -291,26 +271,8 @@ static void display_events_help(void) " rNNN: raw PMU events (eventsel+umask)\n\n"); } -static void display_perfstat_help(void) -{ - printf( - "Usage: perfstat [] \n\n" - "PerfStat Options (up to %d event types can be specified):\n\n", - MAX_COUNTERS); - - display_events_help(); - - printf( - " -l # scale counter values\n" - " -a # system-wide collection\n"); - exit(0); -} - static void display_help(void) { - if (run_perfstat) - return display_perfstat_help(); - printf( "Usage: kerneltop []\n" " Or: kerneltop -S [] COMMAND [ARGS]\n\n" @@ -320,8 +282,6 @@ static void display_help(void) display_events_help(); printf( - " -S --stat # perfstat COMMAND\n" - " -a # system-wide collection (for perfstat)\n\n" " -c CNT --count=CNT # event period to sample\n\n" " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" @@ -420,151 +380,6 @@ again: return 0; } - -/* - * perfstat - */ - -char fault_here[1000000]; - -static void create_perfstat_counter(int counter) -{ - struct perf_counter_hw_event hw_event; - - memset(&hw_event, 0, sizeof(hw_event)); - hw_event.config = event_id[counter]; - hw_event.record_type = 0; - hw_event.nmi = 0; - if (scale) - hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; - - if (system_wide) { - int cpu; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); - if (fd[cpu][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[cpu][counter], strerror(errno)); - exit(-1); - } - } - } else { - hw_event.inherit = 1; - hw_event.disabled = 1; - - fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); - if (fd[0][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[0][counter], strerror(errno)); - exit(-1); - } - } -} - -int do_perfstat(int argc, char *argv[]) -{ - unsigned long long t0, t1; - int counter; - ssize_t res; - int status; - int pid; - - if (!system_wide) - nr_cpus = 1; - - for (counter = 0; counter < nr_counters; counter++) - create_perfstat_counter(counter); - - argc -= optind; - argv += optind; - - if (!argc) - display_help(); - - /* - * Enable counters and exec the command: - */ - t0 = rdclock(); - prctl(PR_TASK_PERF_COUNTERS_ENABLE); - - if ((pid = fork()) < 0) - perror("failed to fork"); - if (!pid) { - if (execvp(argv[0], argv)) { - perror(argv[0]); - exit(-1); - } - } - while (wait(&status) >= 0) - ; - prctl(PR_TASK_PERF_COUNTERS_DISABLE); - t1 = rdclock(); - - fflush(stdout); - - fprintf(stderr, "\n"); - fprintf(stderr, " Performance counter stats for \'%s\':\n", - argv[0]); - fprintf(stderr, "\n"); - - for (counter = 0; counter < nr_counters; counter++) { - int cpu, nv; - __u64 count[3], single_count[3]; - int scaled; - - count[0] = count[1] = count[2] = 0; - nv = scale ? 3 : 1; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - res = read(fd[cpu][counter], - single_count, nv * sizeof(__u64)); - assert(res == nv * sizeof(__u64)); - - count[0] += single_count[0]; - if (scale) { - count[1] += single_count[1]; - count[2] += single_count[2]; - } - } - - scaled = 0; - if (scale) { - if (count[2] == 0) { - fprintf(stderr, " %14s %-20s\n", - "", event_name(counter)); - continue; - } - if (count[2] < count[1]) { - scaled = 1; - count[0] = (unsigned long long) - ((double)count[0] * count[1] / count[2] + 0.5); - } - } - - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || - event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { - - double msecs = (double)count[0] / 1000000; - - fprintf(stderr, " %14.6f %-20s (msecs)", - msecs, event_name(counter)); - } else { - fprintf(stderr, " %14Ld %-20s (events)", - count[0], event_name(counter)); - } - if (scaled) - fprintf(stderr, " (scaled from %.2f%%)", - (double) count[2] / count[1] * 100); - fprintf(stderr, "\n"); - } - fprintf(stderr, "\n"); - fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", - (double)(t1-t0)/1e6); - fprintf(stderr, "\n"); - - return 0; -} - /* * Symbols */ @@ -805,7 +620,7 @@ static int read_symbol(FILE *in, struct sym_entry *s) return 0; } -int compare_addr(const void *__sym1, const void *__sym2) +static int compare_addr(const void *__sym1, const void *__sym2) { const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; @@ -1070,9 +885,6 @@ static void process_options(int argc, char **argv) { int error = 0, counter; - if (strstr(argv[0], "perfstat")) - run_perfstat = 1; - for (;;) { int option_index = 0; /** Options for getopt */ @@ -1134,7 +946,6 @@ static void process_options(int argc, char **argv) tid = atoi(optarg); break; case 'r': realtime_prio = atoi(optarg); break; case 's': sym_filter = strdup(optarg); break; - case 'S': run_perfstat = 1; break; case 'x': vmlinux = strdup(optarg); break; case 'z': zero = 1; break; case 'm': mmap_pages = atoi(optarg); break; @@ -1147,12 +958,8 @@ static void process_options(int argc, char **argv) display_help(); if (!nr_counters) { - if (run_perfstat) - nr_counters = 8; - else { - nr_counters = 1; - event_id[0] = 0; - } + nr_counters = 1; + event_id[0] = 0; } for (counter = 0; counter < nr_counters; counter++) { @@ -1308,9 +1115,6 @@ int cmd_top(int argc, char **argv, const char *prefix) assert(nr_cpus <= MAX_NR_CPUS); assert(nr_cpus >= 0); - if (run_perfstat) - return do_perfstat(argc, argv); - if (tid != -1 || profile_cpu != -1) nr_cpus = 1; diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index 41637444ce2..a3bb6cd6bed 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -14,5 +14,6 @@ extern void prune_packed_objects(int); extern int read_line_with_nul(char *buf, int size, FILE *file); extern int check_pager_config(const char *cmd); -extern int cmd_top(int argc, const char **argv, const char *prefix); +extern int cmd_top(int argc, char **argv, const char *prefix); +extern int cmd_stat(int argc, char **argv, const char *prefix); #endif diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt index 1eab3659b20..52455d46bfb 100644 --- a/Documentation/perf_counter/command-list.txt +++ b/Documentation/perf_counter/command-list.txt @@ -1,4 +1,5 @@ # List of known perf commands. # command name category [deprecated] [common] perf-top mainporcelain common +perf-stat mainporcelain common diff --git a/Documentation/perf_counter/kerneltop.c b/Documentation/perf_counter/kerneltop.c deleted file mode 100644 index 042c1b83a87..00000000000 --- a/Documentation/perf_counter/kerneltop.c +++ /dev/null @@ -1,1409 +0,0 @@ -/* - * kerneltop.c: show top kernel functions - performance counters showcase - - Build with: - - cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt - - Sample output: - ------------------------------------------------------------------------------- - KernelTop: 2669 irqs/sec [NMI, cache-misses/cache-refs], (all, cpu: 2) ------------------------------------------------------------------------------- - - weight RIP kernel function - ______ ________________ _______________ - - 35.20 - ffffffff804ce74b : skb_copy_and_csum_dev - 33.00 - ffffffff804cb740 : sock_alloc_send_skb - 31.26 - ffffffff804ce808 : skb_push - 22.43 - ffffffff80510004 : tcp_established_options - 19.00 - ffffffff8027d250 : find_get_page - 15.76 - ffffffff804e4fc9 : eth_type_trans - 15.20 - ffffffff804d8baa : dst_release - 14.86 - ffffffff804cf5d8 : skb_release_head_state - 14.00 - ffffffff802217d5 : read_hpet - 12.00 - ffffffff804ffb7f : __ip_local_out - 11.97 - ffffffff804fc0c8 : ip_local_deliver_finish - 8.54 - ffffffff805001a3 : ip_queue_xmit - */ - -/* - * perfstat: /usr/bin/time -alike performance counter statistics utility - - It summarizes the counter events of all tasks (and child tasks), - covering all CPUs that the command (or workload) executes on. - It only counts the per-task events of the workload started, - independent of how many other tasks run on those CPUs. - - Sample output: - - $ ./perfstat -e 1 -e 3 -e 5 ls -lR /usr/include/ >/dev/null - - Performance counter stats for 'ls': - - 163516953 instructions - 2295 cache-misses - 2855182 branch-misses - */ - - /* - * Copyright (C) 2008, Red Hat Inc, Ingo Molnar - * - * Improvements and fixes by: - * - * Arjan van de Ven - * Yanmin Zhang - * Wu Fengguang - * Mike Galbraith - * Paul Mackerras - * - * Released under the GPL v2. (and only v2, not any later version) - */ - -#define _GNU_SOURCE -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -#include "../../include/linux/perf_counter.h" - - -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -#ifdef __x86_64__ -#define __NR_perf_counter_open 295 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __i386__ -#define __NR_perf_counter_open 333 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#define rmb() asm volatile ("sync" ::: "memory") -#define cpu_relax() asm volatile ("" ::: "memory"); -#endif - -#define unlikely(x) __builtin_expect(!!(x), 0) -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 == &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -asmlinkage int sys_perf_counter_open( - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) -{ - return syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -} - -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) - -static int run_perfstat = 0; -static int system_wide = 0; - -static int nr_counters = 0; -static __u64 event_id[MAX_COUNTERS] = { - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), - EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), - - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), - EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), -}; -static int default_interval = 100000; -static int event_count[MAX_COUNTERS]; -static int fd[MAX_NR_CPUS][MAX_COUNTERS]; - -static __u64 count_filter = 100; - -static int tid = -1; -static int profile_cpu = -1; -static int nr_cpus = 0; -static int nmi = 1; -static unsigned int realtime_prio = 0; -static int group = 0; -static unsigned int page_size; -static unsigned int mmap_pages = 16; -static int use_mmap = 0; -static int use_munmap = 0; - -static char *vmlinux; - -static char *sym_filter; -static unsigned long filter_start; -static unsigned long filter_end; - -static int delay_secs = 2; -static int zero; -static int dump_symtab; - -static int scale; - -struct source_line { - uint64_t EIP; - unsigned long count; - char *line; - struct source_line *next; -}; - -static struct source_line *lines; -static struct source_line **lines_tail; - -const unsigned int default_count[] = { - 1000000, - 1000000, - 10000, - 10000, - 1000000, - 10000, -}; - -static char *hw_event_names[] = { - "CPU cycles", - "instructions", - "cache references", - "cache misses", - "branches", - "branch misses", - "bus cycles", -}; - -static char *sw_event_names[] = { - "cpu clock ticks", - "task clock ticks", - "pagefaults", - "context switches", - "CPU migrations", - "minor faults", - "major faults", -}; - -struct event_symbol { - __u64 event; - char *symbol; -}; - -static struct event_symbol event_symbols[] = { - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cpu-cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CPU_CYCLES), "cycles", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_INSTRUCTIONS), "instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_REFERENCES), "cache-references", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_CACHE_MISSES), "cache-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branch-instructions", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_INSTRUCTIONS), "branches", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BRANCH_MISSES), "branch-misses", }, - {EID(PERF_TYPE_HARDWARE, PERF_COUNT_BUS_CYCLES), "bus-cycles", }, - - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK), "cpu-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK), "task-clock", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "page-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS), "faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MIN), "minor-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_PAGE_FAULTS_MAJ), "major-faults", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "context-switches", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CONTEXT_SWITCHES), "cs", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "cpu-migrations", }, - {EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_MIGRATIONS), "migrations", }, -}; - -#define __PERF_COUNTER_FIELD(config, name) \ - ((config & PERF_COUNTER_##name##_MASK) >> PERF_COUNTER_##name##_SHIFT) - -#define PERF_COUNTER_RAW(config) __PERF_COUNTER_FIELD(config, RAW) -#define PERF_COUNTER_CONFIG(config) __PERF_COUNTER_FIELD(config, CONFIG) -#define PERF_COUNTER_TYPE(config) __PERF_COUNTER_FIELD(config, TYPE) -#define PERF_COUNTER_ID(config) __PERF_COUNTER_FIELD(config, EVENT) - -static void display_events_help(void) -{ - unsigned int i; - __u64 e; - - printf( - " -e EVENT --event=EVENT # symbolic-name abbreviations"); - - for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { - int type, id; - - e = event_symbols[i].event; - type = PERF_COUNTER_TYPE(e); - id = PERF_COUNTER_ID(e); - - printf("\n %d:%d: %-20s", - type, id, event_symbols[i].symbol); - } - - printf("\n" - " rNNN: raw PMU events (eventsel+umask)\n\n"); -} - -static void display_perfstat_help(void) -{ - printf( - "Usage: perfstat [] \n\n" - "PerfStat Options (up to %d event types can be specified):\n\n", - MAX_COUNTERS); - - display_events_help(); - - printf( - " -l # scale counter values\n" - " -a # system-wide collection\n"); - exit(0); -} - -static void display_help(void) -{ - if (run_perfstat) - return display_perfstat_help(); - - printf( - "Usage: kerneltop []\n" - " Or: kerneltop -S [] COMMAND [ARGS]\n\n" - "KernelTop Options (up to %d event types can be specified at once):\n\n", - MAX_COUNTERS); - - display_events_help(); - - printf( - " -S --stat # perfstat COMMAND\n" - " -a # system-wide collection (for perfstat)\n\n" - " -c CNT --count=CNT # event period to sample\n\n" - " -C CPU --cpu=CPU # CPU (-1 for all) [default: -1]\n" - " -p PID --pid=PID # PID of sampled task (-1 for all) [default: -1]\n\n" - " -l # show scale factor for RR events\n" - " -d delay --delay= # sampling/display delay [default: 2]\n" - " -f CNT --filter=CNT # min-event-count filter [default: 100]\n\n" - " -r prio --realtime= # event acquisition runs with SCHED_FIFO policy\n" - " -s symbol --symbol= # function to be showed annotated one-shot\n" - " -x path --vmlinux= # the vmlinux binary, required for -s use\n" - " -z --zero # zero counts after display\n" - " -D --dump_symtab # dump symbol table to stderr on startup\n" - " -m pages --mmap_pages= # number of mmap data pages\n" - " -M --mmap_info # print mmap info stream\n" - " -U --munmap_info # print munmap info stream\n" - ); - - exit(0); -} - -static char *event_name(int ctr) -{ - __u64 config = event_id[ctr]; - int type = PERF_COUNTER_TYPE(config); - int id = PERF_COUNTER_ID(config); - static char buf[32]; - - if (PERF_COUNTER_RAW(config)) { - sprintf(buf, "raw 0x%llx", PERF_COUNTER_CONFIG(config)); - return buf; - } - - switch (type) { - case PERF_TYPE_HARDWARE: - if (id < PERF_HW_EVENTS_MAX) - return hw_event_names[id]; - return "unknown-hardware"; - - case PERF_TYPE_SOFTWARE: - if (id < PERF_SW_EVENTS_MAX) - return sw_event_names[id]; - return "unknown-software"; - - default: - break; - } - - return "unknown"; -} - -/* - * Each event can have multiple symbolic names. - * Symbolic names are (almost) exactly matched. - */ -static __u64 match_event_symbols(char *str) -{ - __u64 config, id; - int type; - unsigned int i; - - if (sscanf(str, "r%llx", &config) == 1) - return config | PERF_COUNTER_RAW_MASK; - - if (sscanf(str, "%d:%llu", &type, &id) == 2) - return EID(type, id); - - for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { - if (!strncmp(str, event_symbols[i].symbol, - strlen(event_symbols[i].symbol))) - return event_symbols[i].event; - } - - return ~0ULL; -} - -static int parse_events(char *str) -{ - __u64 config; - -again: - if (nr_counters == MAX_COUNTERS) - return -1; - - config = match_event_symbols(str); - if (config == ~0ULL) - return -1; - - event_id[nr_counters] = config; - nr_counters++; - - str = strstr(str, ","); - if (str) { - str++; - goto again; - } - - return 0; -} - - -/* - * perfstat - */ - -char fault_here[1000000]; - -static void create_perfstat_counter(int counter) -{ - struct perf_counter_hw_event hw_event; - - memset(&hw_event, 0, sizeof(hw_event)); - hw_event.config = event_id[counter]; - hw_event.record_type = 0; - hw_event.nmi = 0; - if (scale) - hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | - PERF_FORMAT_TOTAL_TIME_RUNNING; - - if (system_wide) { - int cpu; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - fd[cpu][counter] = sys_perf_counter_open(&hw_event, -1, cpu, -1, 0); - if (fd[cpu][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[cpu][counter], strerror(errno)); - exit(-1); - } - } - } else { - hw_event.inherit = 1; - hw_event.disabled = 1; - - fd[0][counter] = sys_perf_counter_open(&hw_event, 0, -1, -1, 0); - if (fd[0][counter] < 0) { - printf("perfstat error: syscall returned with %d (%s)\n", - fd[0][counter], strerror(errno)); - exit(-1); - } - } -} - -int do_perfstat(int argc, char *argv[]) -{ - unsigned long long t0, t1; - int counter; - ssize_t res; - int status; - int pid; - - if (!system_wide) - nr_cpus = 1; - - for (counter = 0; counter < nr_counters; counter++) - create_perfstat_counter(counter); - - argc -= optind; - argv += optind; - - if (!argc) - display_help(); - - /* - * Enable counters and exec the command: - */ - t0 = rdclock(); - prctl(PR_TASK_PERF_COUNTERS_ENABLE); - - if ((pid = fork()) < 0) - perror("failed to fork"); - if (!pid) { - if (execvp(argv[0], argv)) { - perror(argv[0]); - exit(-1); - } - } - while (wait(&status) >= 0) - ; - prctl(PR_TASK_PERF_COUNTERS_DISABLE); - t1 = rdclock(); - - fflush(stdout); - - fprintf(stderr, "\n"); - fprintf(stderr, " Performance counter stats for \'%s\':\n", - argv[0]); - fprintf(stderr, "\n"); - - for (counter = 0; counter < nr_counters; counter++) { - int cpu, nv; - __u64 count[3], single_count[3]; - int scaled; - - count[0] = count[1] = count[2] = 0; - nv = scale ? 3 : 1; - for (cpu = 0; cpu < nr_cpus; cpu ++) { - res = read(fd[cpu][counter], - single_count, nv * sizeof(__u64)); - assert(res == nv * sizeof(__u64)); - - count[0] += single_count[0]; - if (scale) { - count[1] += single_count[1]; - count[2] += single_count[2]; - } - } - - scaled = 0; - if (scale) { - if (count[2] == 0) { - fprintf(stderr, " %14s %-20s\n", - "", event_name(counter)); - continue; - } - if (count[2] < count[1]) { - scaled = 1; - count[0] = (unsigned long long) - ((double)count[0] * count[1] / count[2] + 0.5); - } - } - - if (event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_CPU_CLOCK) || - event_id[counter] == EID(PERF_TYPE_SOFTWARE, PERF_COUNT_TASK_CLOCK)) { - - double msecs = (double)count[0] / 1000000; - - fprintf(stderr, " %14.6f %-20s (msecs)", - msecs, event_name(counter)); - } else { - fprintf(stderr, " %14Ld %-20s (events)", - count[0], event_name(counter)); - } - if (scaled) - fprintf(stderr, " (scaled from %.2f%%)", - (double) count[2] / count[1] * 100); - fprintf(stderr, "\n"); - } - fprintf(stderr, "\n"); - fprintf(stderr, " Wall-clock time elapsed: %12.6f msecs\n", - (double)(t1-t0)/1e6); - fprintf(stderr, "\n"); - - return 0; -} - -/* - * Symbols - */ - -static uint64_t min_ip; -static uint64_t max_ip = -1ll; - -struct sym_entry { - unsigned long long addr; - char *sym; - unsigned long count[MAX_COUNTERS]; - int skip; - struct source_line *source; -}; - -#define MAX_SYMS 100000 - -static int sym_table_count; - -struct sym_entry *sym_filter_entry; - -static struct sym_entry sym_table[MAX_SYMS]; - -static void show_details(struct sym_entry *sym); - -/* - * Ordering weight: count-1 * count-2 * ... / count-n - */ -static double sym_weight(const struct sym_entry *sym) -{ - double weight; - int counter; - - weight = sym->count[0]; - - for (counter = 1; counter < nr_counters-1; counter++) - weight *= sym->count[counter]; - - weight /= (sym->count[counter] + 1); - - return weight; -} - -static int compare(const void *__sym1, const void *__sym2) -{ - const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; - - return sym_weight(sym1) < sym_weight(sym2); -} - -static long events; -static long userspace_events; -static const char CONSOLE_CLEAR[] = ""; - -static struct sym_entry tmp[MAX_SYMS]; - -static void print_sym_table(void) -{ - int i, printed; - int counter; - float events_per_sec = events/delay_secs; - float kevents_per_sec = (events-userspace_events)/delay_secs; - float sum_kevents = 0.0; - - events = userspace_events = 0; - memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); - qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); - - for (i = 0; i < sym_table_count && tmp[i].count[0]; i++) - sum_kevents += tmp[i].count[0]; - - write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); - - printf( -"------------------------------------------------------------------------------\n"); - printf( " KernelTop:%8.0f irqs/sec kernel:%4.1f%% [%s, ", - events_per_sec, - 100.0 - (100.0*((events_per_sec-kevents_per_sec)/events_per_sec)), - nmi ? "NMI" : "IRQ"); - - if (nr_counters == 1) - printf("%d ", event_count[0]); - - for (counter = 0; counter < nr_counters; counter++) { - if (counter) - printf("/"); - - printf("%s", event_name(counter)); - } - - printf( "], "); - - if (tid != -1) - printf(" (tid: %d", tid); - else - printf(" (all"); - - if (profile_cpu != -1) - printf(", cpu: %d)\n", profile_cpu); - else { - if (tid != -1) - printf(")\n"); - else - printf(", %d CPUs)\n", nr_cpus); - } - - printf("------------------------------------------------------------------------------\n\n"); - - if (nr_counters == 1) - printf(" events pcnt"); - else - printf(" weight events pcnt"); - - printf(" RIP kernel function\n" - " ______ ______ _____ ________________ _______________\n\n" - ); - - for (i = 0, printed = 0; i < sym_table_count; i++) { - float pcnt; - int count; - - if (printed <= 18 && tmp[i].count[0] >= count_filter) { - pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); - - if (nr_counters == 1) - printf("%19.2f - %4.1f%% - %016llx : %s\n", - sym_weight(tmp + i), - pcnt, tmp[i].addr, tmp[i].sym); - else - printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", - sym_weight(tmp + i), - tmp[i].count[0], - pcnt, tmp[i].addr, tmp[i].sym); - printed++; - } - /* - * Add decay to the counts: - */ - for (count = 0; count < nr_counters; count++) - sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; - } - - if (sym_filter_entry) - show_details(sym_filter_entry); - - { - struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; - - if (poll(&stdin_poll, 1, 0) == 1) { - printf("key pressed - exiting.\n"); - exit(0); - } - } -} - -static void *display_thread(void *arg) -{ - printf("KernelTop refresh period: %d seconds\n", delay_secs); - - while (!sleep(delay_secs)) - print_sym_table(); - - return NULL; -} - -static int read_symbol(FILE *in, struct sym_entry *s) -{ - static int filter_match = 0; - char *sym, stype; - char str[500]; - int rc, pos; - - rc = fscanf(in, "%llx %c %499s", &s->addr, &stype, str); - if (rc == EOF) - return -1; - - assert(rc == 3); - - /* skip until end of line: */ - pos = strlen(str); - do { - rc = fgetc(in); - if (rc == '\n' || rc == EOF || pos >= 499) - break; - str[pos] = rc; - pos++; - } while (1); - str[pos] = 0; - - sym = str; - - /* Filter out known duplicates and non-text symbols. */ - if (!strcmp(sym, "_text")) - return 1; - if (!min_ip && !strcmp(sym, "_stext")) - return 1; - if (!strcmp(sym, "_etext") || !strcmp(sym, "_sinittext")) - return 1; - if (stype != 'T' && stype != 't') - return 1; - if (!strncmp("init_module", sym, 11) || !strncmp("cleanup_module", sym, 14)) - return 1; - if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) - return 1; - - s->sym = malloc(strlen(str)); - assert(s->sym); - - strcpy((char *)s->sym, str); - s->skip = 0; - - /* Tag events to be skipped. */ - if (!strcmp("default_idle", s->sym) || !strcmp("cpu_idle", s->sym)) - s->skip = 1; - else if (!strcmp("enter_idle", s->sym) || !strcmp("exit_idle", s->sym)) - s->skip = 1; - else if (!strcmp("mwait_idle", s->sym)) - s->skip = 1; - - if (filter_match == 1) { - filter_end = s->addr; - filter_match = -1; - if (filter_end - filter_start > 10000) { - printf("hm, too large filter symbol <%s> - skipping.\n", - sym_filter); - printf("symbol filter start: %016lx\n", filter_start); - printf(" end: %016lx\n", filter_end); - filter_end = filter_start = 0; - sym_filter = NULL; - sleep(1); - } - } - if (filter_match == 0 && sym_filter && !strcmp(s->sym, sym_filter)) { - filter_match = 1; - filter_start = s->addr; - } - - return 0; -} - -int compare_addr(const void *__sym1, const void *__sym2) -{ - const struct sym_entry *sym1 = __sym1, *sym2 = __sym2; - - return sym1->addr > sym2->addr; -} - -static void sort_symbol_table(void) -{ - int i, dups; - - do { - qsort(sym_table, sym_table_count, sizeof(sym_table[0]), compare_addr); - for (i = 0, dups = 0; i < sym_table_count; i++) { - if (sym_table[i].addr == sym_table[i+1].addr) { - sym_table[i+1].addr = -1ll; - dups++; - } - } - sym_table_count -= dups; - } while(dups); -} - -static void parse_symbols(void) -{ - struct sym_entry *last; - - FILE *kallsyms = fopen("/proc/kallsyms", "r"); - - if (!kallsyms) { - printf("Could not open /proc/kallsyms - no CONFIG_KALLSYMS_ALL=y?\n"); - exit(-1); - } - - while (!feof(kallsyms)) { - if (read_symbol(kallsyms, &sym_table[sym_table_count]) == 0) { - sym_table_count++; - assert(sym_table_count <= MAX_SYMS); - } - } - - sort_symbol_table(); - min_ip = sym_table[0].addr; - max_ip = sym_table[sym_table_count-1].addr; - last = sym_table + sym_table_count++; - - last->addr = -1ll; - last->sym = ""; - - if (filter_end) { - int count; - for (count=0; count < sym_table_count; count ++) { - if (!strcmp(sym_table[count].sym, sym_filter)) { - sym_filter_entry = &sym_table[count]; - break; - } - } - } - if (dump_symtab) { - int i; - - for (i = 0; i < sym_table_count; i++) - fprintf(stderr, "%llx %s\n", - sym_table[i].addr, sym_table[i].sym); - } -} - -/* - * Source lines - */ - -static void parse_vmlinux(char *filename) -{ - FILE *file; - char command[PATH_MAX*2]; - if (!filename) - return; - - sprintf(command, "objdump --start-address=0x%016lx --stop-address=0x%016lx -dS %s", filter_start, filter_end, filename); - - file = popen(command, "r"); - if (!file) - return; - - lines_tail = &lines; - while (!feof(file)) { - struct source_line *src; - size_t dummy = 0; - char *c; - - src = malloc(sizeof(struct source_line)); - assert(src != NULL); - memset(src, 0, sizeof(struct source_line)); - - if (getline(&src->line, &dummy, file) < 0) - break; - if (!src->line) - break; - - c = strchr(src->line, '\n'); - if (c) - *c = 0; - - src->next = NULL; - *lines_tail = src; - lines_tail = &src->next; - - if (strlen(src->line)>8 && src->line[8] == ':') - src->EIP = strtoull(src->line, NULL, 16); - if (strlen(src->line)>8 && src->line[16] == ':') - src->EIP = strtoull(src->line, NULL, 16); - } - pclose(file); -} - -static void record_precise_ip(uint64_t ip) -{ - struct source_line *line; - - for (line = lines; line; line = line->next) { - if (line->EIP == ip) - line->count++; - if (line->EIP > ip) - break; - } -} - -static void lookup_sym_in_vmlinux(struct sym_entry *sym) -{ - struct source_line *line; - char pattern[PATH_MAX]; - sprintf(pattern, "<%s>:", sym->sym); - - for (line = lines; line; line = line->next) { - if (strstr(line->line, pattern)) { - sym->source = line; - break; - } - } -} - -static void show_lines(struct source_line *line_queue, int line_queue_count) -{ - int i; - struct source_line *line; - - line = line_queue; - for (i = 0; i < line_queue_count; i++) { - printf("%8li\t%s\n", line->count, line->line); - line = line->next; - } -} - -#define TRACE_COUNT 3 - -static void show_details(struct sym_entry *sym) -{ - struct source_line *line; - struct source_line *line_queue = NULL; - int displayed = 0; - int line_queue_count = 0; - - if (!sym->source) - lookup_sym_in_vmlinux(sym); - if (!sym->source) - return; - - printf("Showing details for %s\n", sym->sym); - - line = sym->source; - while (line) { - if (displayed && strstr(line->line, ">:")) - break; - - if (!line_queue_count) - line_queue = line; - line_queue_count ++; - - if (line->count >= count_filter) { - show_lines(line_queue, line_queue_count); - line_queue_count = 0; - line_queue = NULL; - } else if (line_queue_count > TRACE_COUNT) { - line_queue = line_queue->next; - line_queue_count --; - } - - line->count = 0; - displayed++; - if (displayed > 300) - break; - line = line->next; - } -} - -/* - * Binary search in the histogram table and record the hit: - */ -static void record_ip(uint64_t ip, int counter) -{ - int left_idx, middle_idx, right_idx, idx; - unsigned long left, middle, right; - - record_precise_ip(ip); - - left_idx = 0; - right_idx = sym_table_count-1; - assert(ip <= max_ip && ip >= min_ip); - - while (left_idx + 1 < right_idx) { - middle_idx = (left_idx + right_idx) / 2; - - left = sym_table[ left_idx].addr; - middle = sym_table[middle_idx].addr; - right = sym_table[ right_idx].addr; - - if (!(left <= middle && middle <= right)) { - printf("%016lx...\n%016lx...\n%016lx\n", left, middle, right); - printf("%d %d %d\n", left_idx, middle_idx, right_idx); - } - assert(left <= middle && middle <= right); - if (!(left <= ip && ip <= right)) { - printf(" left: %016lx\n", left); - printf(" ip: %016lx\n", (unsigned long)ip); - printf("right: %016lx\n", right); - } - assert(left <= ip && ip <= right); - /* - * [ left .... target .... middle .... right ] - * => right := middle - */ - if (ip < middle) { - right_idx = middle_idx; - continue; - } - /* - * [ left .... middle ... target ... right ] - * => left := middle - */ - left_idx = middle_idx; - } - - idx = left_idx; - - if (!sym_table[idx].skip) - sym_table[idx].count[counter]++; - else events--; -} - -static void process_event(uint64_t ip, int counter) -{ - events++; - - if (ip < min_ip || ip > max_ip) { - userspace_events++; - return; - } - - record_ip(ip, counter); -} - -static void process_options(int argc, char *argv[]) -{ - int error = 0, counter; - - if (strstr(argv[0], "perfstat")) - run_perfstat = 1; - - for (;;) { - int option_index = 0; - /** Options for getopt */ - static struct option long_options[] = { - {"count", required_argument, NULL, 'c'}, - {"cpu", required_argument, NULL, 'C'}, - {"delay", required_argument, NULL, 'd'}, - {"dump_symtab", no_argument, NULL, 'D'}, - {"event", required_argument, NULL, 'e'}, - {"filter", required_argument, NULL, 'f'}, - {"group", required_argument, NULL, 'g'}, - {"help", no_argument, NULL, 'h'}, - {"nmi", required_argument, NULL, 'n'}, - {"mmap_info", no_argument, NULL, 'M'}, - {"mmap_pages", required_argument, NULL, 'm'}, - {"munmap_info", no_argument, NULL, 'U'}, - {"pid", required_argument, NULL, 'p'}, - {"realtime", required_argument, NULL, 'r'}, - {"scale", no_argument, NULL, 'l'}, - {"symbol", required_argument, NULL, 's'}, - {"stat", no_argument, NULL, 'S'}, - {"vmlinux", required_argument, NULL, 'x'}, - {"zero", no_argument, NULL, 'z'}, - {NULL, 0, NULL, 0 } - }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", - long_options, &option_index); - if (c == -1) - break; - - switch (c) { - case 'a': system_wide = 1; break; - case 'c': default_interval = atoi(optarg); break; - case 'C': - /* CPU and PID are mutually exclusive */ - if (tid != -1) { - printf("WARNING: CPU switch overriding PID\n"); - sleep(1); - tid = -1; - } - profile_cpu = atoi(optarg); break; - case 'd': delay_secs = atoi(optarg); break; - case 'D': dump_symtab = 1; break; - - case 'e': error = parse_events(optarg); break; - - case 'f': count_filter = atoi(optarg); break; - case 'g': group = atoi(optarg); break; - case 'h': display_help(); break; - case 'l': scale = 1; break; - case 'n': nmi = atoi(optarg); break; - case 'p': - /* CPU and PID are mutually exclusive */ - if (profile_cpu != -1) { - printf("WARNING: PID switch overriding CPU\n"); - sleep(1); - profile_cpu = -1; - } - tid = atoi(optarg); break; - case 'r': realtime_prio = atoi(optarg); break; - case 's': sym_filter = strdup(optarg); break; - case 'S': run_perfstat = 1; break; - case 'x': vmlinux = strdup(optarg); break; - case 'z': zero = 1; break; - case 'm': mmap_pages = atoi(optarg); break; - case 'M': use_mmap = 1; break; - case 'U': use_munmap = 1; break; - default: error = 1; break; - } - } - if (error) - display_help(); - - if (!nr_counters) { - if (run_perfstat) - nr_counters = 8; - else { - nr_counters = 1; - event_id[0] = 0; - } - } - - for (counter = 0; counter < nr_counters; counter++) { - if (event_count[counter]) - continue; - - event_count[counter] = default_interval; - } -} - -struct mmap_data { - int counter; - void *base; - unsigned int mask; - unsigned int prev; -}; - -static unsigned int mmap_read_head(struct mmap_data *md) -{ - struct perf_counter_mmap_page *pc = md->base; - int head; - - head = pc->data_head; - rmb(); - - return head; -} - -struct timeval last_read, this_read; - -static void mmap_read(struct mmap_data *md) -{ - unsigned int head = mmap_read_head(md); - unsigned int old = md->prev; - unsigned char *data = md->base + page_size; - int diff; - - gettimeofday(&this_read, NULL); - - /* - * If we're further behind than half the buffer, there's a chance - * the writer will bite our tail and screw up the events under us. - * - * If we somehow ended up ahead of the head, we got messed up. - * - * In either case, truncate and restart at head. - */ - diff = head - old; - if (diff > md->mask / 2 || diff < 0) { - struct timeval iv; - unsigned long msecs; - - timersub(&this_read, &last_read, &iv); - msecs = iv.tv_sec*1000 + iv.tv_usec/1000; - - fprintf(stderr, "WARNING: failed to keep up with mmap data." - " Last read %lu msecs ago.\n", msecs); - - /* - * head points to a known good entry, start there. - */ - old = head; - } - - last_read = this_read; - - for (; old != head;) { - struct ip_event { - struct perf_event_header header; - __u64 ip; - __u32 pid, tid; - }; - struct mmap_event { - struct perf_event_header header; - __u32 pid, tid; - __u64 start; - __u64 len; - __u64 pgoff; - char filename[PATH_MAX]; - }; - - typedef union event_union { - struct perf_event_header header; - struct ip_event ip; - struct mmap_event mmap; - } event_t; - - event_t *event = (event_t *)&data[old & md->mask]; - - event_t event_copy; - - unsigned int size = event->header.size; - - /* - * Event straddles the mmap boundary -- header should always - * be inside due to u64 alignment of output. - */ - if ((old & md->mask) + size != ((old + size) & md->mask)) { - unsigned int offset = old; - unsigned int len = min(sizeof(*event), size), cpy; - void *dst = &event_copy; - - do { - cpy = min(md->mask + 1 - (offset & md->mask), len); - memcpy(dst, &data[offset & md->mask], cpy); - offset += cpy; - dst += cpy; - len -= cpy; - } while (len); - - event = &event_copy; - } - - old += size; - - if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { - if (event->header.type & PERF_RECORD_IP) - process_event(event->ip.ip, md->counter); - } else { - switch (event->header.type) { - case PERF_EVENT_MMAP: - case PERF_EVENT_MUNMAP: - printf("%s: %Lu %Lu %Lu %s\n", - event->header.type == PERF_EVENT_MMAP - ? "mmap" : "munmap", - event->mmap.start, - event->mmap.len, - event->mmap.pgoff, - event->mmap.filename); - break; - } - } - } - - md->prev = old; -} - -int main(int argc, char *argv[]) -{ - struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; - struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; - struct perf_counter_hw_event hw_event; - pthread_t thread; - int i, counter, group_fd, nr_poll = 0; - unsigned int cpu; - int ret; - - page_size = sysconf(_SC_PAGE_SIZE); - - process_options(argc, argv); - - nr_cpus = sysconf(_SC_NPROCESSORS_ONLN); - assert(nr_cpus <= MAX_NR_CPUS); - assert(nr_cpus >= 0); - - if (run_perfstat) - return do_perfstat(argc, argv); - - if (tid != -1 || profile_cpu != -1) - nr_cpus = 1; - - parse_symbols(); - if (vmlinux && sym_filter_entry) - parse_vmlinux(vmlinux); - - for (i = 0; i < nr_cpus; i++) { - group_fd = -1; - for (counter = 0; counter < nr_counters; counter++) { - - cpu = profile_cpu; - if (tid == -1 && profile_cpu == -1) - cpu = i; - - memset(&hw_event, 0, sizeof(hw_event)); - hw_event.config = event_id[counter]; - hw_event.irq_period = event_count[counter]; - hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; - hw_event.nmi = nmi; - hw_event.mmap = use_mmap; - hw_event.munmap = use_munmap; - - fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); - if (fd[i][counter] < 0) { - int err = errno; - printf("kerneltop error: syscall returned with %d (%s)\n", - fd[i][counter], strerror(err)); - if (err == EPERM) - printf("Are you root?\n"); - exit(-1); - } - assert(fd[i][counter] >= 0); - fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); - - /* - * First counter acts as the group leader: - */ - if (group && group_fd == -1) - group_fd = fd[i][counter]; - - event_array[nr_poll].fd = fd[i][counter]; - event_array[nr_poll].events = POLLIN; - nr_poll++; - - mmap_array[i][counter].counter = counter; - mmap_array[i][counter].prev = 0; - mmap_array[i][counter].mask = mmap_pages*page_size - 1; - mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ, MAP_SHARED, fd[i][counter], 0); - if (mmap_array[i][counter].base == MAP_FAILED) { - printf("kerneltop error: failed to mmap with %d (%s)\n", - errno, strerror(errno)); - exit(-1); - } - } - } - - if (pthread_create(&thread, NULL, display_thread, NULL)) { - printf("Could not create display thread.\n"); - exit(-1); - } - - if (realtime_prio) { - struct sched_param param; - - param.sched_priority = realtime_prio; - if (sched_setscheduler(0, SCHED_FIFO, ¶m)) { - printf("Could not set realtime priority.\n"); - exit(-1); - } - } - - while (1) { - int hits = events; - - for (i = 0; i < nr_cpus; i++) { - for (counter = 0; counter < nr_counters; counter++) - mmap_read(&mmap_array[i][counter]); - } - - if (hits == events) - ret = poll(event_array, nr_poll, 100); - } - - return 0; -} diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index 63f8a892c0d..ff8658f2a2f 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -249,6 +249,7 @@ static void handle_internal_command(int argc, const char **argv) const char *cmd = argv[0]; static struct cmd_struct commands[] = { { "top", cmd_top, 0 }, + { "stat", cmd_stat, 0 }, }; int i; static const char ext[] = STRIP_EXTENSION; From 1d8c8b209e9351a7de1307d7b9b6df4222b8d742 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 15:52:29 +0200 Subject: [PATCH 0513/2061] perf_counter tools: add help texts Add Documentation/perf-stat.txt and Documentation/perf-top.txt. The template that was used for it: Documentation/git-add.txt from Git. Fix up small bugs to make these help texts show up both in the 'perf' common-command summary output screen, and on the individual help screens. Signed-off-by: Ingo Molnar --- .../perf_counter/Documentation/perf-stat.txt | 76 +++++++++++++++++++ .../perf_counter/Documentation/perf-top.txt | 61 +++++++++++++++ Documentation/perf_counter/builtin.h | 4 +- .../perf_counter/generate-cmdlist.sh | 8 +- 4 files changed, 143 insertions(+), 6 deletions(-) create mode 100644 Documentation/perf_counter/Documentation/perf-stat.txt create mode 100644 Documentation/perf_counter/Documentation/perf-top.txt diff --git a/Documentation/perf_counter/Documentation/perf-stat.txt b/Documentation/perf_counter/Documentation/perf-stat.txt new file mode 100644 index 00000000000..7fcab271e57 --- /dev/null +++ b/Documentation/perf_counter/Documentation/perf-stat.txt @@ -0,0 +1,76 @@ +perf-stat(1) +========== + +NAME +---- +perf-stat - Run a command and gather performance counter statistics + +SYNOPSIS +-------- +[verse] +'perf stat' [-e | --event=EVENT] [-l] [-a] + +DESCRIPTION +----------- +This command runs a command and gathers performance counter statistics +from it. + + +OPTIONS +------- +...:: + Any command you can specify in a shell. + +-e:: +--event=:: + 0:0: cpu-cycles + 0:0: cycles + 0:1: instructions + 0:2: cache-references + 0:3: cache-misses + 0:4: branch-instructions + 0:4: branches + 0:5: branch-misses + 0:6: bus-cycles + 1:0: cpu-clock + 1:1: task-clock + 1:2: page-faults + 1:2: faults + 1:5: minor-faults + 1:6: major-faults + 1:3: context-switches + 1:3: cs + 1:4: cpu-migrations + 1:4: migrations + rNNN: raw PMU events (eventsel+umask) + +-a:: + system-wide collection + +-l:: + scale counter values + +Configuration +------------- + +EXAMPLES +-------- + +$ perf stat sleep 1 + + Performance counter stats for 'sleep': + + 0.678356 task clock ticks (msecs) + 7 context switches (events) + 4 CPU migrations (events) + 232 pagefaults (events) + 1810403 CPU cycles (events) + 946759 instructions (events) + 18952 cache references (events) + 4885 cache misses (events) + + Wall-clock time elapsed: 1001.252894 msecs + +SEE ALSO +-------- +linkperf:git-tops[1] diff --git a/Documentation/perf_counter/Documentation/perf-top.txt b/Documentation/perf_counter/Documentation/perf-top.txt new file mode 100644 index 00000000000..057333b7253 --- /dev/null +++ b/Documentation/perf_counter/Documentation/perf-top.txt @@ -0,0 +1,61 @@ +perf-top(1) +========== + +NAME +---- +perf-top - Run a command and profile it + +SYNOPSIS +-------- +[verse] +'perf top' [-e | --event=EVENT] [-l] [-a] + +DESCRIPTION +----------- +This command runs a command and gathers a performance counter profile +from it. + + +OPTIONS +------- +...:: + Any command you can specify in a shell. + +-e:: +--event=:: + 0:0: cpu-cycles + 0:0: cycles + 0:1: instructions + 0:2: cache-references + 0:3: cache-misses + 0:4: branch-instructions + 0:4: branches + 0:5: branch-misses + 0:6: bus-cycles + 1:0: cpu-clock + 1:1: task-clock + 1:2: page-faults + 1:2: faults + 1:5: minor-faults + 1:6: major-faults + 1:3: context-switches + 1:3: cs + 1:4: cpu-migrations + 1:4: migrations + rNNN: raw PMU events (eventsel+umask) + +-a:: + system-wide collection + +-l:: + scale counter values + +Configuration +------------- + +EXAMPLES +-------- + +SEE ALSO +-------- +linkperf:git-stat[1] diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index a3bb6cd6bed..605323c691f 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -14,6 +14,6 @@ extern void prune_packed_objects(int); extern int read_line_with_nul(char *buf, int size, FILE *file); extern int check_pager_config(const char *cmd); -extern int cmd_top(int argc, char **argv, const char *prefix); -extern int cmd_stat(int argc, char **argv, const char *prefix); +extern int cmd_top(int argc, const char **argv, const char *prefix); +extern int cmd_stat(int argc, const char **argv, const char *prefix); #endif diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/generate-cmdlist.sh index 75c68d948fd..f06f6fd148f 100755 --- a/Documentation/perf_counter/generate-cmdlist.sh +++ b/Documentation/perf_counter/generate-cmdlist.sh @@ -9,16 +9,16 @@ struct cmdname_help static struct cmdname_help common_cmds[] = {" -sed -n -e 's/^git-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt | +sed -n -e 's/^perf-\([^ ]*\)[ ].* common.*/\1/p' command-list.txt | sort | while read cmd do sed -n ' - /^NAME/,/git-'"$cmd"'/H + /^NAME/,/perf-'"$cmd"'/H ${ x - s/.*git-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ + s/.*perf-'"$cmd"' - \(.*\)/ {"'"$cmd"'", "\1"},/ p - }' "Documentation/git-$cmd.txt" + }' "Documentation/perf-$cmd.txt" done echo "};" From e33e0a43736307512422e41aee6e24d5a8c39181 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 15:58:01 +0200 Subject: [PATCH 0514/2061] perf_counter tools: add 'perf record' command Move perf-record.c into the perf suite of commands. Signed-off-by: Ingo Molnar --- .../Documentation/perf-record.txt | 63 +++++++++++++++++++ Documentation/perf_counter/Makefile | 1 + .../{perf-record.c => builtin-record.c} | 30 +-------- Documentation/perf_counter/builtin.h | 3 +- Documentation/perf_counter/command-list.txt | 3 +- Documentation/perf_counter/perf.c | 3 +- 6 files changed, 73 insertions(+), 30 deletions(-) create mode 100644 Documentation/perf_counter/Documentation/perf-record.txt rename Documentation/perf_counter/{perf-record.c => builtin-record.c} (95%) diff --git a/Documentation/perf_counter/Documentation/perf-record.txt b/Documentation/perf_counter/Documentation/perf-record.txt new file mode 100644 index 00000000000..d07700e35eb --- /dev/null +++ b/Documentation/perf_counter/Documentation/perf-record.txt @@ -0,0 +1,63 @@ +perf-record(1) +========== + +NAME +---- +perf-record - Run a command and record its profile into output.perf + +SYNOPSIS +-------- +[verse] +'perf record' [-e | --event=EVENT] [-l] [-a] + +DESCRIPTION +----------- +This command runs a command and gathers a performance counter profile +from it, into output.perf - without displaying anything. + +This file can then be inspected later on, using 'perf report'. + + +OPTIONS +------- +...:: + Any command you can specify in a shell. + +-e:: +--event=:: + 0:0: cpu-cycles + 0:0: cycles + 0:1: instructions + 0:2: cache-references + 0:3: cache-misses + 0:4: branch-instructions + 0:4: branches + 0:5: branch-misses + 0:6: bus-cycles + 1:0: cpu-clock + 1:1: task-clock + 1:2: page-faults + 1:2: faults + 1:5: minor-faults + 1:6: major-faults + 1:3: context-switches + 1:3: cs + 1:4: cpu-migrations + 1:4: migrations + rNNN: raw PMU events (eventsel+umask) + +-a:: + system-wide collection + +-l:: + scale counter values + +Configuration +------------- + +EXAMPLES +-------- + +SEE ALSO +-------- +linkperf:git-stat[1] diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index fb8b71744e5..b6c665eb22e 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -309,6 +309,7 @@ LIB_OBJS += usage.o LIB_OBJS += wrapper.o BUILTIN_OBJS += builtin-help.o +BUILTIN_OBJS += builtin-record.o BUILTIN_OBJS += builtin-stat.o BUILTIN_OBJS += builtin-top.o diff --git a/Documentation/perf_counter/perf-record.c b/Documentation/perf_counter/builtin-record.c similarity index 95% rename from Documentation/perf_counter/perf-record.c rename to Documentation/perf_counter/builtin-record.c index 614de7c468b..4a50abf843e 100644 --- a/Documentation/perf_counter/perf-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -81,16 +81,12 @@ (void) (&_min1 == &_min2); \ _min1 < _min2 ? _min1 : _min2; }) -asmlinkage int sys_perf_counter_open( +extern asmlinkage int sys_perf_counter_open( struct perf_counter_hw_event *hw_event_uptr __user, pid_t pid, int cpu, int group_fd, - unsigned long flags) -{ - return syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -} + unsigned long flags); #define MAX_COUNTERS 64 #define MAX_NR_CPUS 256 @@ -119,26 +115,6 @@ const unsigned int default_count[] = { 10000, }; -static char *hw_event_names[] = { - "CPU cycles", - "instructions", - "cache references", - "cache misses", - "branches", - "branch misses", - "bus cycles", -}; - -static char *sw_event_names[] = { - "cpu clock ticks", - "task clock ticks", - "pagefaults", - "context switches", - "CPU migrations", - "minor faults", - "major faults", -}; - struct event_symbol { __u64 event; char *symbol; @@ -414,7 +390,7 @@ static void sigchld_handler(int sig) done = 1; } -int main(int argc, char *argv[]) +int cmd_record(int argc, const char **argv) { struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index 605323c691f..5854b1715f5 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -14,6 +14,7 @@ extern void prune_packed_objects(int); extern int read_line_with_nul(char *buf, int size, FILE *file); extern int check_pager_config(const char *cmd); -extern int cmd_top(int argc, const char **argv, const char *prefix); +extern int cmd_record(int argc, const char **argv, const char *prefix); extern int cmd_stat(int argc, const char **argv, const char *prefix); +extern int cmd_top(int argc, const char **argv, const char *prefix); #endif diff --git a/Documentation/perf_counter/command-list.txt b/Documentation/perf_counter/command-list.txt index 52455d46bfb..d15210aa0ca 100644 --- a/Documentation/perf_counter/command-list.txt +++ b/Documentation/perf_counter/command-list.txt @@ -1,5 +1,6 @@ # List of known perf commands. # command name category [deprecated] [common] -perf-top mainporcelain common +perf-record mainporcelain common perf-stat mainporcelain common +perf-top mainporcelain common diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index ff8658f2a2f..e849dd66b5e 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -248,8 +248,9 @@ static void handle_internal_command(int argc, const char **argv) { const char *cmd = argv[0]; static struct cmd_struct commands[] = { - { "top", cmd_top, 0 }, + { "record", cmd_record, 0 }, { "stat", cmd_stat, 0 }, + { "top", cmd_top, 0 }, }; int i; static const char ext[] = STRIP_EXTENSION; From cc13a5913797365b683212f5fca5fb04bb3582eb Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 16:01:30 +0200 Subject: [PATCH 0515/2061] perf_counter tools: fix --version Hook up the 'perf version' built-in command. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin.h | 1 + Documentation/perf_counter/perf.c | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index 5854b1715f5..aec5ae388e6 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -17,4 +17,5 @@ extern int check_pager_config(const char *cmd); extern int cmd_record(int argc, const char **argv, const char *prefix); extern int cmd_stat(int argc, const char **argv, const char *prefix); extern int cmd_top(int argc, const char **argv, const char *prefix); +extern int cmd_version(int argc, const char **argv, const char *prefix); #endif diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index e849dd66b5e..20d508c7729 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -5,7 +5,7 @@ #include "run-command.h" const char perf_usage_string[] = - "perf [--version] [--exec-path[=PERF_EXEC_PATH]] [--html-path] [-p|--paginate|--no-pager] [--bare] [--perf-dir=PERF_DIR] [--work-tree=PERF_WORK_TREE] [--help] COMMAND [ARGS]"; + "perf [--version] [--help] COMMAND [ARGS]"; const char perf_more_info_string[] = "See 'perf help COMMAND' for more information on a specific command."; @@ -251,6 +251,7 @@ static void handle_internal_command(int argc, const char **argv) { "record", cmd_record, 0 }, { "stat", cmd_stat, 0 }, { "top", cmd_top, 0 }, + { "version", cmd_version, 0 }, }; int i; static const char ext[] = STRIP_EXTENSION; From 6142fdd968c76a0f2ee753c39bd5be8d1bb4ef04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 16:05:55 +0200 Subject: [PATCH 0516/2061] perf_counter tools: add 'perf help' Signed-off-by: Ingo Molnar --- .../perf_counter/Documentation/perf-help.txt | 38 +++++++++++++++++++ Documentation/perf_counter/builtin.h | 1 + Documentation/perf_counter/perf.c | 1 + 3 files changed, 40 insertions(+) create mode 100644 Documentation/perf_counter/Documentation/perf-help.txt diff --git a/Documentation/perf_counter/Documentation/perf-help.txt b/Documentation/perf_counter/Documentation/perf-help.txt new file mode 100644 index 00000000000..f85fed5a7ed --- /dev/null +++ b/Documentation/perf_counter/Documentation/perf-help.txt @@ -0,0 +1,38 @@ +perf-help(1) +=========== + +NAME +---- +perf-help - display help information about perf + +SYNOPSIS +-------- +'perf help' [-a|--all] [COMMAND] + +DESCRIPTION +----------- + +With no options and no COMMAND given, the synopsis of the 'perf' +command and a list of the most commonly used perf commands are printed +on the standard output. + +If the option '--all' or '-a' is given, then all available commands are +printed on the standard output. + +If a perf command is named, a manual page for that command is brought +up. The 'man' program is used by default for this purpose, but this +can be overridden by other options or configuration variables. + +Note that `perf --help ...` is identical to `perf help ...` because the +former is internally converted into the latter. + +OPTIONS +------- +-a:: +--all:: + Prints all the available commands on the standard output. This + option supersedes any other option. + +PERF +---- +Part of the linkperf:perf[1] suite diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index aec5ae388e6..800f86c1d44 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -14,6 +14,7 @@ extern void prune_packed_objects(int); extern int read_line_with_nul(char *buf, int size, FILE *file); extern int check_pager_config(const char *cmd); +extern int cmd_help(int argc, const char **argv, const char *prefix); extern int cmd_record(int argc, const char **argv, const char *prefix); extern int cmd_stat(int argc, const char **argv, const char *prefix); extern int cmd_top(int argc, const char **argv, const char *prefix); diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index 20d508c7729..8d6faecdc15 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -248,6 +248,7 @@ static void handle_internal_command(int argc, const char **argv) { const char *cmd = argv[0]; static struct cmd_struct commands[] = { + { "help", cmd_help, 0 }, { "record", cmd_record, 0 }, { "stat", cmd_stat, 0 }, { "top", cmd_top, 0 }, From 125e702b09a28a502e145fb434678ee27720fc48 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 16:13:46 +0200 Subject: [PATCH 0517/2061] perf_counter tools: fix 'make install' Remove Git leftovers from this area. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index b6c665eb22e..690045e4969 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -255,9 +255,9 @@ EXTRA_PROGRAMS = PROGRAMS += $(EXTRA_PROGRAMS) # -# None right now: +# Single 'perf' binary right now: # -# PROGRAMS += perf-fast-import$X +PROGRAMS += perf # List built-in command $C whose implementation cmd_$C() is not in # builtin-$C.o but is linked in as part of some other command. @@ -762,24 +762,9 @@ install: all $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(bindir_SQ)' $(INSTALL) -d -m 755 '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' $(INSTALL) $(ALL_PROGRAMS) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' - $(INSTALL) perf$X perf-upload-pack$X perf-receive-pack$X perf-upload-archive$X perf-shell$X perf-cvsserver '$(DESTDIR_SQ)$(bindir_SQ)' - $(MAKE) -C templates DESTDIR='$(DESTDIR_SQ)' install - $(MAKE) -C perl prefix='$(prefix_SQ)' DESTDIR='$(DESTDIR_SQ)' install ifneq (,$X) $(foreach p,$(patsubst %$X,%,$(filter %$X,$(ALL_PROGRAMS) $(BUILT_INS) perf$X)), $(RM) '$(DESTDIR_SQ)$(perfexec_instdir_SQ)/$p';) endif - bindir=$$(cd '$(DESTDIR_SQ)$(bindir_SQ)' && pwd) && \ - execdir=$$(cd '$(DESTDIR_SQ)$(perfexec_instdir_SQ)' && pwd) && \ - { $(RM) "$$execdir/perf-add$X" && \ - ln "$$bindir/perf$X" "$$execdir/perf-add$X" 2>/dev/null || \ - cp "$$bindir/perf$X" "$$execdir/perf-add$X"; } && \ - { for p in $(filter-out perf-add$X,$(BUILT_INS)); do \ - $(RM) "$$execdir/$$p" && \ - ln "$$execdir/perf-add$X" "$$execdir/$$p" 2>/dev/null || \ - ln -s "perf-add$X" "$$execdir/$$p" 2>/dev/null || \ - cp "$$execdir/perf-add$X" "$$execdir/$$p" || exit; \ - done } && \ - ./check_bindir "z$$bindir" "z$$execdir" "$$bindir/perf-add$X" ### Maintainer's dist rules From a7abe97fd8e7a6ccabba5a04a9f17be9211d418c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 10:59:34 -0400 Subject: [PATCH 0518/2061] tracing: rename EVENT_TRACER config to ENABLE_EVENT_TRACING Currently we have two configs: EVENT_TRACING and EVENT_TRACER. All tracers enable EVENT_TRACING. The EVENT_TRACER is only a convenience to enable the EVENT_TRACING when no other tracers are enabled. The names EVENT_TRACER and EVENT_TRACING are too similar and confusing. This patch renames EVENT_TRACER to ENABLE_EVENT_TRACING to be more appropriate to what it actually does, as well as add a comment in the help menu to explain the option's purpose. [ Impact: rename config option to reduce confusion ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 3fa36d2bc29..450d3c2cfbd 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -183,7 +183,7 @@ config CONTEXT_SWITCH_TRACER This tracer gets called from the context switch and records all switching of tasks. -config EVENT_TRACER +config ENABLE_EVENT_TRACING bool "Trace various events in the kernel" select TRACING help @@ -191,6 +191,10 @@ config EVENT_TRACER allowing the user to pick and choose which trace point they want to trace. + Note, all tracers enable event tracing. This option is + only a convenience to enable event tracing when no other + tracers are selected. + config FTRACE_SYSCALLS bool "Trace syscalls" depends on HAVE_FTRACE_SYSCALLS From 28d20e2d6e94434827e11c310788b87204b84559 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:12:44 -0400 Subject: [PATCH 0519/2061] tracing/events: call the correct event trace selftest init function The late_initcall calls a helper function instead of the proper init event selftest function. This update may have been lost due to conflicting merges. [ Impact: fix compiler warning and call extended event trace self tests ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 64f9d6d2735..98daf5dc74a 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1199,6 +1199,6 @@ static __init int event_trace_self_tests_init(void) return 0; } -late_initcall(event_trace_self_tests); +late_initcall(event_trace_self_tests_init); #endif From 23de29de2d8b227943be191d59fb6d983996d55e Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 12:59:29 -0400 Subject: [PATCH 0520/2061] tracing: remove dangling semicolon Due to a cut and paste error, the trace_seq_putc had a semicolon after the prototype but before the stub function when tracing is disabled. [Impact: fix compile error ] Signed-off-by: Steven Rostedt --- include/linux/trace_seq.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/trace_seq.h b/include/linux/trace_seq.h index 37db9bdfbc1..ba9627f00d3 100644 --- a/include/linux/trace_seq.h +++ b/include/linux/trace_seq.h @@ -63,7 +63,7 @@ static inline int trace_seq_puts(struct trace_seq *s, const char *str) { return 0; } -static inline int trace_seq_putc(struct trace_seq *s, unsigned char c); +static inline int trace_seq_putc(struct trace_seq *s, unsigned char c) { return 0; } From 17487bfeb6cfb05920e6a9d5a54f345f2917b4e7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:24:21 -0400 Subject: [PATCH 0521/2061] tracing: fix recursive test level calculation The recursive tests to detect same level recursion in the ring buffers did not account for the hard/softirq_counts to be shifted. Thus the numbers could be larger than then mask to be tested. This patch includes the shift for the calculation of the irq depth. [ Impact: stop false positives in trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index e145969a8ed..aa40ae92233 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1483,7 +1483,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, static int trace_irq_level(void) { - return hardirq_count() + softirq_count() + in_nmi(); + return (hardirq_count() >> HARDIRQ_SHIFT) + + (softirq_count() >> + SOFTIRQ_SHIFT) + + !!in_nmi(); } static int trace_recursive_lock(void) From e395898e98119085f666febbc7b631dd69bc637f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 13:32:44 -0400 Subject: [PATCH 0522/2061] tracing: remove recursive test from ring_buffer_event_discard The ring_buffer_event_discard is not tied to ring_buffer_lock_reserve. It can be called inside or outside the reserve/commit. Even if it is called inside the reserve/commit the commit part must also be called. Only ring_buffer_discard_commit can be used as a replacement for ring_buffer_unlock_commit. This patch removes the trace_recursive_unlock from ring_buffer_event_discard since it would be the wrong place to do so. [Impact: prevent breakage in trace recursive testing ] Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index aa40ae92233..a6997670cc4 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1667,7 +1667,6 @@ static inline void rb_event_discard(struct ring_buffer_event *event) void ring_buffer_event_discard(struct ring_buffer_event *event) { rb_event_discard(event); - trace_recursive_unlock(); } EXPORT_SYMBOL_GPL(ring_buffer_event_discard); From f1f9b3b1795da8625e0e6096813c9d18d4a344ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 20 Apr 2009 20:38:21 +0200 Subject: [PATCH 0523/2061] perfcounters, sched: remove __task_delta_exec() This function was left orphan by the latest round of sw-counter cleanups. [ Impact: remove unused kernel function ] Signed-off-by: Ingo Molnar --- include/linux/kernel_stat.h | 1 - kernel/sched.c | 23 ----------------------- 2 files changed, 24 deletions(-) diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 080d1fd461d..a77c6007dc9 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -85,7 +85,6 @@ static inline unsigned int kstat_irqs(unsigned int irq) /* * Lock/unlock the current runqueue - to extract task statistics: */ -extern unsigned long long __task_delta_exec(struct task_struct *tsk, int update); extern unsigned long long task_delta_exec(struct task_struct *); extern void account_user_time(struct task_struct *, cputime_t, cputime_t); diff --git a/kernel/sched.c b/kernel/sched.c index b66a08c2480..a69278eef42 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4546,29 +4546,6 @@ DEFINE_PER_CPU(struct kernel_stat, kstat); EXPORT_PER_CPU_SYMBOL(kstat); -/* - * Return any ns on the sched_clock that have not yet been banked in - * @p in case that task is currently running. - */ -unsigned long long __task_delta_exec(struct task_struct *p, int update) -{ - s64 delta_exec; - struct rq *rq; - - rq = task_rq(p); - WARN_ON_ONCE(!runqueue_is_locked()); - WARN_ON_ONCE(!task_current(rq, p)); - - if (update) - update_rq_clock(rq); - - delta_exec = rq->clock - p->se.exec_start; - - WARN_ON_ONCE(delta_exec < 0); - - return delta_exec; -} - /* * Return any ns on the sched_clock that have not yet been banked in * @p in case that task is currently running. From ff743345bf7685a207868048a70e23164c4785e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 13 Mar 2009 12:21:26 +0100 Subject: [PATCH 0524/2061] sched: remove extra call overhead for schedule() Lai Jiangshan's patch reminded me that I promised Nick to remove that extra call overhead in schedule(). Signed-off-by: Peter Zijlstra LKML-Reference: <20090313112300.927414207@chello.nl> Signed-off-by: Ingo Molnar --- kernel/mutex.c | 4 +++- kernel/sched.c | 12 ++++-------- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/kernel/mutex.c b/kernel/mutex.c index 5d79781394a..e1fb7351040 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -248,7 +248,9 @@ __mutex_lock_common(struct mutex *lock, long state, unsigned int subclass, /* didnt get the lock, go to sleep: */ spin_unlock_mutex(&lock->wait_lock, flags); - __schedule(); + preempt_enable_no_resched(); + schedule(); + preempt_disable(); spin_lock_mutex(&lock->wait_lock, flags); } diff --git a/kernel/sched.c b/kernel/sched.c index 7601ceebf7c..797f6fdabad 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5131,13 +5131,15 @@ pick_next_task(struct rq *rq) /* * schedule() is the main scheduler function. */ -asmlinkage void __sched __schedule(void) +asmlinkage void __sched schedule(void) { struct task_struct *prev, *next; unsigned long *switch_count; struct rq *rq; int cpu; +need_resched: + preempt_disable(); cpu = smp_processor_id(); rq = cpu_rq(cpu); rcu_qsctr_inc(cpu); @@ -5194,15 +5196,9 @@ need_resched_nonpreemptible: if (unlikely(reacquire_kernel_lock(current) < 0)) goto need_resched_nonpreemptible; -} -asmlinkage void __sched schedule(void) -{ -need_resched: - preempt_disable(); - __schedule(); preempt_enable_no_resched(); - if (unlikely(test_thread_flag(TIF_NEED_RESCHED))) + if (need_resched()) goto need_resched; } EXPORT_SYMBOL(schedule); From aa18efb2a2f07e1cf062039848e9d369bb358724 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 16:16:11 -0400 Subject: [PATCH 0525/2061] tracing: use recursive counter over irq level Althought using the irq level (hardirq_count, softirq_count and in_nmi) was nice to detect bad recursion right away, but since the counters are not atomically updated with respect to the interrupts, the function tracer might trigger the test from an interrupt handler before the hardirq_count is updated. This will trigger a false warning. This patch converts the recursive detection to a simple counter. If the depth is greater than 16 then the recursive detection will trigger. 16 is more than enough for any nested interrupts. [ Impact: fix false positive trace recursion detection ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 45 ++++++++++++++------------------------ 1 file changed, 16 insertions(+), 29 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a6997670cc4..7bcfd3e6053 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1481,47 +1481,34 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, return event; } -static int trace_irq_level(void) -{ - return (hardirq_count() >> HARDIRQ_SHIFT) + - (softirq_count() >> + SOFTIRQ_SHIFT) + - !!in_nmi(); -} +#define TRACE_RECURSIVE_DEPTH 16 static int trace_recursive_lock(void) { - int level; + current->trace_recursion++; - level = trace_irq_level(); + if (likely(current->trace_recursion < TRACE_RECURSIVE_DEPTH)) + return 0; - if (unlikely(current->trace_recursion & (1 << level))) { - /* Disable all tracing before we do anything else */ - tracing_off_permanent(); + /* Disable all tracing before we do anything else */ + tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: " - "HC[%lu]:SC[%lu]:NMI[%lu]\n", - hardirq_count() >> HARDIRQ_SHIFT, - softirq_count() >> SOFTIRQ_SHIFT, - in_nmi()); + printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + "HC[%lu]:SC[%lu]:NMI[%lu]\n", + current->trace_recursion, + hardirq_count() >> HARDIRQ_SHIFT, + softirq_count() >> SOFTIRQ_SHIFT, + in_nmi()); - WARN_ON_ONCE(1); - return -1; - } - - current->trace_recursion |= 1 << level; - - return 0; + WARN_ON_ONCE(1); + return -1; } static void trace_recursive_unlock(void) { - int level; + WARN_ON_ONCE(!current->trace_recursion); - level = trace_irq_level(); - - WARN_ON_ONCE(!current->trace_recursion & (1 << level)); - - current->trace_recursion &= ~(1 << level); + current->trace_recursion--; } static DEFINE_PER_CPU(int, rb_need_resched); From cb4764a6dbffd9bb3cf759421ae82384071a933d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 20 Apr 2009 18:16:44 -0400 Subject: [PATCH 0526/2061] tracing: use nowakeup version of commit for function event trace tests The startup tests for the event tracer also runs with the function tracer enabled. The "wakeup" version of the trace commit was used which can grab spinlocks. If a task was preempted by an NMI that called a function being traced, it could deadlock due to the function tracer trying to grab the same lock. Thanks to Frederic Weisbecker for pointing out where the bug was. Reported-by: Ingo Molnar Reported-by: Frederic Weisbecker Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 98daf5dc74a..672b195f86c 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1164,7 +1164,7 @@ function_test_events_call(unsigned long ip, unsigned long parent_ip) entry->ip = ip; entry->parent_ip = parent_ip; - trace_current_buffer_unlock_commit(event, flags, pc); + trace_nowake_buffer_unlock_commit(event, flags, pc); out: atomic_dec(&per_cpu(test_event_disable, cpu)); From 2ecf0a57c60dcb588f310d94412118e15c510532 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Apr 2009 12:16:56 +0900 Subject: [PATCH 0527/2061] ide-dma: don't reset request fields on dma_timeout_retry() Impact: drop unnecessary code Now that everything uses bio and block operations, there is no need to reset request fields manually when retrying a request. Every field is guaranteed to be always valid. Drop unnecessary request field resetting from ide_dma_timeout_retry(). Signed-off-by: Tejun Heo --- drivers/ide/ide-dma.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index a0b8cab1d9a..d9123ecae4a 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) /* * un-busy drive etc and make sure request is sane */ - rq = hwif->rq; - if (!rq) - goto out; - - hwif->rq = NULL; - - rq->errors = 0; - - if (!rq->bio) - goto out; - - rq->sector = rq->bio->bi_sector; - rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9; - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->buffer = bio_data(rq->bio); -out: + if (rq) { + hwif->rq = NULL; + rq->errors = 0; + } return ret; } From 6e29ec5701e9d44fa02b96c1c5c45f7516182b65 Mon Sep 17 00:00:00 2001 From: Gautham R Shenoy Date: Tue, 21 Apr 2009 08:40:49 +0530 Subject: [PATCH 0528/2061] sched: Replace first_cpu() with cpumask_first() in ILB nomination code Stephen Rothwell reported this build warning: > kernel/sched.c: In function 'find_new_ilb': > kernel/sched.c:4355: warning: passing argument 1 of '__first_cpu' from incompatible pointer type > > Possibly caused by commit f711f6090a81cbd396b63de90f415d33f563af9b > ("sched: Nominate idle load balancer from a semi-idle package") from > the sched tree. Should this call to first_cpu be cpumask_first? For !(CONFIG_SCHED_MC || CONFIG_SCHED_SMT), find_new_ilb() nominates the Idle load balancer as the first cpu from the nohz.cpu_mask. This code uses the older API first_cpu(). Replace it with cpumask_first(), which is the correct API here. [ Impact: cleanup, address build warning ] Reported-by: Stephen Rothwell Signed-off-by: Gautham R Shenoy Cc: Rusty Russell LKML-Reference: <20090421031049.GA4140@in.ibm.com> Signed-off-by: Ingo Molnar --- kernel/sched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched.c b/kernel/sched.c index 797f6fdabad..54d67b94f1a 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4356,7 +4356,7 @@ out_done: #else /* (CONFIG_SCHED_MC || CONFIG_SCHED_SMT) */ static inline int find_new_ilb(int call_cpu) { - return first_cpu(nohz.cpu_mask); + return cpumask_first(nohz.cpu_mask); } #endif From 89388913f2c88a2cd15d24abab571b17a2596127 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 21 Apr 2009 11:39:27 +0300 Subject: [PATCH 0529/2061] x86: unify noexec handling This patch unifies noexec handling on 32-bit and 64-bit. [ Impact: cleanup ] Signed-off-by: Pekka Enberg [ mingo@elte.hu: build fix ] LKML-Reference: <1240303167.771.69.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/pgtable_types.h | 1 - arch/x86/mm/init.c | 67 ++++++++++++++++++++++++++-- arch/x86/mm/init_32.c | 52 --------------------- arch/x86/mm/init_64.c | 33 -------------- 4 files changed, 63 insertions(+), 90 deletions(-) diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index b8238dc8786..4d258ad76a0 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -273,7 +273,6 @@ typedef struct page *pgtable_t; extern pteval_t __supported_pte_mask; extern int nx_enabled; -extern void set_nx(void); #define pgprot_writecombine pgprot_writecombine extern pgprot_t pgprot_writecombine(pgprot_t prot); diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fd3da1dda1c..fedde5359a0 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -22,6 +22,69 @@ int direct_gbpages #endif ; +int nx_enabled; + +#if defined(CONFIG_X86_64) || defined(CONFIG_X86_PAE) +static int disable_nx __cpuinitdata; + +/* + * noexec = on|off + * + * Control non-executable mappings for processes. + * + * on Enable + * off Disable + */ +static int __init noexec_setup(char *str) +{ + if (!str) + return -EINVAL; + if (!strncmp(str, "on", 2)) { + __supported_pte_mask |= _PAGE_NX; + disable_nx = 0; + } else if (!strncmp(str, "off", 3)) { + disable_nx = 1; + __supported_pte_mask &= ~_PAGE_NX; + } + return 0; +} +early_param("noexec", noexec_setup); +#endif + +#ifdef CONFIG_X86_PAE +static void __init set_nx(void) +{ + unsigned int v[4], l, h; + + if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { + cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); + + if ((v[3] & (1 << 20)) && !disable_nx) { + rdmsr(MSR_EFER, l, h); + l |= EFER_NX; + wrmsr(MSR_EFER, l, h); + nx_enabled = 1; + __supported_pte_mask |= _PAGE_NX; + } + } +} +#else +static inline void set_nx(void) +{ +} +#endif + +#ifdef CONFIG_X86_64 +void __cpuinit check_efer(void) +{ + unsigned long efer; + + rdmsrl(MSR_EFER, efer); + if (!(efer & EFER_NX) || disable_nx) + __supported_pte_mask &= ~_PAGE_NX; +} +#endif + static void __init find_early_table_space(unsigned long end, int use_pse, int use_gbpages) { @@ -158,12 +221,9 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, use_gbpages = direct_gbpages; #endif -#ifdef CONFIG_X86_32 -#ifdef CONFIG_X86_PAE set_nx(); if (nx_enabled) printk(KERN_INFO "NX (Execute Disable) protection: active\n"); -#endif /* Enable PSE if available */ if (cpu_has_pse) @@ -174,7 +234,6 @@ unsigned long __init_refok init_memory_mapping(unsigned long start, set_in_cr4(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; } -#endif if (use_gbpages) page_size_mask |= 1 << PG_LEVEL_1G; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 749559ed80f..2b27120665b 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -587,61 +587,9 @@ void zap_low_mappings(void) flush_tlb_all(); } -int nx_enabled; - pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP); EXPORT_SYMBOL_GPL(__supported_pte_mask); -#ifdef CONFIG_X86_PAE - -static int disable_nx __initdata; - -/* - * noexec = on|off - * - * Control non executable mappings. - * - * on Enable - * off Disable - */ -static int __init noexec_setup(char *str) -{ - if (!str || !strcmp(str, "on")) { - if (cpu_has_nx) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } - } else { - if (!strcmp(str, "off")) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } else { - return -EINVAL; - } - } - - return 0; -} -early_param("noexec", noexec_setup); - -void __init set_nx(void) -{ - unsigned int v[4], l, h; - - if (cpu_has_pae && (cpuid_eax(0x80000000) > 0x80000001)) { - cpuid(0x80000001, &v[0], &v[1], &v[2], &v[3]); - - if ((v[3] & (1 << 20)) && !disable_nx) { - rdmsr(MSR_EFER, l, h); - l |= EFER_NX; - wrmsr(MSR_EFER, l, h); - nx_enabled = 1; - __supported_pte_mask |= _PAGE_NX; - } - } -} -#endif - /* user-defined highmem size */ static unsigned int highmem_pages = -1; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1753e8020df..a4e7846efb1 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -85,39 +85,6 @@ early_param("gbpages", parse_direct_gbpages_on); pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP; EXPORT_SYMBOL_GPL(__supported_pte_mask); -static int disable_nx __cpuinitdata; - -/* - * noexec=on|off - * Control non-executable mappings for 64-bit processes. - * - * on Enable (default) - * off Disable - */ -static int __init nonx_setup(char *str) -{ - if (!str) - return -EINVAL; - if (!strncmp(str, "on", 2)) { - __supported_pte_mask |= _PAGE_NX; - disable_nx = 0; - } else if (!strncmp(str, "off", 3)) { - disable_nx = 1; - __supported_pte_mask &= ~_PAGE_NX; - } - return 0; -} -early_param("noexec", nonx_setup); - -void __cpuinit check_efer(void) -{ - unsigned long efer; - - rdmsrl(MSR_EFER, efer); - if (!(efer & EFER_NX) || disable_nx) - __supported_pte_mask &= ~_PAGE_NX; -} - int force_personality32; /* From e8082f3f5a17d7a7bfc7dd1050a3f958dc034e9a Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:11:46 +0800 Subject: [PATCH 0530/2061] tracing/filters: don't remove old filters when failed to write subsys->filter If writing subsys->filter returns EINVAL or ENOSPC, the original filters in subsys/ and subsys/events/ will be removed. This is definitely wrong. [ Impact: fix filter setting semantics on error condition ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DD2.2070700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 672b195f86c..9ea55a7dfde 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -600,7 +600,6 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, err = filter_add_subsystem_pred(system, pred); if (err < 0) { - filter_free_subsystem_preds(system); filter_free_pred(pred); return err; } From f66578a7637b87810cbb9041c4e3a77fd2fa4706 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 21 Apr 2009 17:12:11 +0800 Subject: [PATCH 0531/2061] tracing/filters: allow user-input to be integer-like string Suppose we would like to trace all tasks named '123', but this will fail: # echo 'parent_comm == 123' > events/sched/sched_process_fork/filter bash: echo: write error: Invalid argument Don't guess the type of the filter pred in filter_parse(), but instead we check it in __filter_add_pred(). [ Impact: extend allowed filter field string values ] Signed-off-by: Li Zefan Cc: Tom Zanussi Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <49ED8DEB.6000700@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 26 ++++++++++++-------------- 1 file changed, 12 insertions(+), 14 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index e0fcfd2a16d..65418288f95 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -313,6 +313,7 @@ static int __filter_add_pred(struct ftrace_event_call *call, { struct ftrace_event_field *field; filter_pred_fn_t fn; + unsigned long long val; field = find_event_field(call, pred->field_name); if (!field) @@ -322,14 +323,13 @@ static int __filter_add_pred(struct ftrace_event_call *call, pred->offset = field->offset; if (is_string_field(field->type)) { - if (!pred->str_len) - return -EINVAL; fn = filter_pred_string; pred->str_len = field->size; return filter_add_pred_fn(call, pred, fn); } else { - if (pred->str_len) + if (strict_strtoull(pred->str_val, 0, &val)) return -EINVAL; + pred->val = val; } switch (field->size) { @@ -413,12 +413,16 @@ int filter_add_subsystem_pred(struct event_subsystem *system, return 0; } +/* + * The filter format can be + * - 0, which means remove all filter preds + * - [||/&&] ==/!= + */ int filter_parse(char **pbuf, struct filter_pred *pred) { - char *tmp, *tok, *val_str = NULL; + char *tok, *val_str = NULL; int tok_n = 0; - /* field ==/!= number, or/and field ==/!= number, number */ while ((tok = strsep(pbuf, " \n"))) { if (tok_n == 0) { if (!strcmp(tok, "0")) { @@ -478,19 +482,13 @@ int filter_parse(char **pbuf, struct filter_pred *pred) return -EINVAL; } + strcpy(pred->str_val, val_str); + pred->str_len = strlen(val_str); + pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); if (!pred->field_name) return -ENOMEM; - pred->str_len = 0; - pred->val = simple_strtoull(val_str, &tmp, 0); - if (tmp == val_str) { - strncpy(pred->str_val, val_str, MAX_FILTER_STR_VAL); - pred->str_len = strlen(val_str); - pred->str_val[pred->str_len] = '\0'; - } else if (*tmp != '\0') - return -EINVAL; - return 0; } From 3554228d4289098a8fe5cfd87512ec32a19bbe5a Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 21 Apr 2009 09:41:26 -0400 Subject: [PATCH 0532/2061] ring-buffer: only warn on wrap if buffer is bigger than two pages On boot up, to save memory, ftrace allocates the minimum buffer which is two pages. Ftrace also goes through a series of tests (when configured) on boot up. These tests can fill up a page within a single interrupt. The ring buffer also has a WARN_ON when it detects that the buffer was completely filled within a single commit (other commits are allowed to be nested). Combine the small buffer on start up, with the tests that can fill more than a single page within an interrupt, this can trigger the WARN_ON. This patch makes the WARN_ON only happen when the ring buffer consists of more than two pages. [ Impact: prevent false WARN_ON in ftrace startup tests ] Reported-by: Ingo Molnar LKML-Reference: <20090421094616.GA14561@elte.hu> Signed-off-by: Steven Rostedt Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7bcfd3e6053..61dbdf21cd3 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1241,7 +1241,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. */ if (unlikely(next_page == commit_page)) { - WARN_ON_ONCE(1); + /* This can easily happen on small ring buffers */ + WARN_ON_ONCE(buffer->pages > 2); goto out_reset; } From 19bdc9d061bcb71efd2b53083d96b59bbe1a1751 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 17 Apr 2009 05:26:31 +0000 Subject: [PATCH 0533/2061] clocksource: sh_cmt clocksource support Add clocksource support to the sh_cmt driver. With this in place we can do tickless with a single CMT channel. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 66 ++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 02bae3994ab..c2475648961 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -47,6 +47,7 @@ struct sh_cmt_priv { unsigned long rate; spinlock_t lock; struct clock_event_device ced; + struct clocksource cs; unsigned long total_cycles; }; @@ -376,6 +377,68 @@ static void sh_cmt_stop(struct sh_cmt_priv *p, unsigned long flag) spin_unlock_irqrestore(&p->lock, flags); } +static struct sh_cmt_priv *cs_to_sh_cmt(struct clocksource *cs) +{ + return container_of(cs, struct sh_cmt_priv, cs); +} + +static cycle_t sh_cmt_clocksource_read(struct clocksource *cs) +{ + struct sh_cmt_priv *p = cs_to_sh_cmt(cs); + unsigned long flags, raw; + unsigned long value; + int has_wrapped; + + spin_lock_irqsave(&p->lock, flags); + value = p->total_cycles; + raw = sh_cmt_get_counter(p, &has_wrapped); + + if (unlikely(has_wrapped)) + raw = p->match_value; + spin_unlock_irqrestore(&p->lock, flags); + + return value + raw; +} + +static int sh_cmt_clocksource_enable(struct clocksource *cs) +{ + struct sh_cmt_priv *p = cs_to_sh_cmt(cs); + int ret; + + p->total_cycles = 0; + + ret = sh_cmt_start(p, FLAG_CLOCKSOURCE); + if (ret) + return ret; + + /* TODO: calculate good shift from rate and counter bit width */ + cs->shift = 0; + cs->mult = clocksource_hz2mult(p->rate, cs->shift); + return 0; +} + +static void sh_cmt_clocksource_disable(struct clocksource *cs) +{ + sh_cmt_stop(cs_to_sh_cmt(cs), FLAG_CLOCKSOURCE); +} + +static int sh_cmt_register_clocksource(struct sh_cmt_priv *p, + char *name, unsigned long rating) +{ + struct clocksource *cs = &p->cs; + + cs->name = name; + cs->rating = rating; + cs->read = sh_cmt_clocksource_read; + cs->enable = sh_cmt_clocksource_enable; + cs->disable = sh_cmt_clocksource_disable; + cs->mask = CLOCKSOURCE_MASK(sizeof(unsigned long) * 8); + cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; + pr_info("sh_cmt: %s used as clock source\n", cs->name); + clocksource_register(cs); + return 0; +} + static struct sh_cmt_priv *ced_to_sh_cmt(struct clock_event_device *ced) { return container_of(ced, struct sh_cmt_priv, ced); @@ -483,6 +546,9 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name, if (clockevent_rating) sh_cmt_register_clockevent(p, name, clockevent_rating); + if (clocksource_rating) + sh_cmt_register_clocksource(p, name, clocksource_rating); + return 0; } From 7a4f453b6d7379a7c380825949977c5a838aa012 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 22 Apr 2009 16:53:34 +0800 Subject: [PATCH 0534/2061] tracing/events: make struct trace_entry->type to be int type struct trace_entry->type is unsigned char, while trace event's id is int type, thus for a event with id >= 256, it's entry->type is cast to (id % 256), and then we can't see the trace output of this event. # insmod trace-events-sample.ko # echo foo_bar > /mnt/tracing/set_event # cat /debug/tracing/events/trace-events-sample/foo_bar/id 256 # cat /mnt/tracing/trace_pipe <...>-3548 [001] 215.091142: Unknown type 0 <...>-3548 [001] 216.089207: Unknown type 0 <...>-3548 [001] 217.087271: Unknown type 0 <...>-3548 [001] 218.085332: Unknown type 0 [ Impact: fix output for trace events with id >= 256 ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <49EEDB0E.5070207@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 ++-- include/trace/ftrace.h | 2 +- kernel/trace/trace.c | 4 ++-- kernel/trace/trace.h | 2 +- kernel/trace/trace_events.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 75f3ac01a87..2a4a4074991 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,7 +16,7 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - unsigned char type; + int type; unsigned char flags; unsigned char preempt_count; int pid; @@ -73,7 +73,7 @@ enum print_line_t { struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc); diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 39a3351f2e7..15ef08d9add 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -198,7 +198,7 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(unsigned char, type); \ + __common_field(int, type); \ __common_field(unsigned char, flags); \ __common_field(unsigned char, preempt_count); \ __common_field(int, pid); \ diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b9a3adce922..b6183bc9eca 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -838,7 +838,7 @@ tracing_generic_entry_update(struct trace_entry *entry, unsigned long flags, } struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc) { @@ -881,7 +881,7 @@ void trace_buffer_unlock_commit(struct trace_array *tr, } struct ring_buffer_event * -trace_current_buffer_lock_reserve(unsigned char type, unsigned long len, +trace_current_buffer_lock_reserve(int type, unsigned long len, unsigned long flags, int pc) { return trace_buffer_lock_reserve(&global_trace, diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 247948e81b0..7d55bcf50e4 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -422,7 +422,7 @@ void init_tracer_sysprof_debugfs(struct dentry *d_tracer); struct ring_buffer_event; struct ring_buffer_event *trace_buffer_lock_reserve(struct trace_array *tr, - unsigned char type, + int type, unsigned long len, unsigned long flags, int pc); diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9ea55a7dfde..5d6e879cf87 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(unsigned char, type), + FIELD(int, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), From 9cbf117662e24c6d33245666804487f92c21b59d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:51:29 +0200 Subject: [PATCH 0535/2061] tracing/events: provide string with undefined size support This patch provides the support for dynamic size strings on event tracing. The key concept is to use a structure with an ending char array field of undefined size and use such ability to allocate the minimal size on the ring buffer to make one or more string entries fit inside, as opposite to a fixed length strings with upper bound. The strings themselves are represented using fields which have an offset value from the beginning of the entry. This patch provides three new macros: __string(item, src) This one declares a string to the structure inside TP_STRUCT__entry. You need to provide the name of the string field and the source that will be copied inside. This will also add the dynamic size of the string needed for the ring buffer entry allocation. A stack allocated structure is used to temporarily store the offset of each strings, avoiding double calls to strlen() on each event insertion. __get_str(field) This one will give you a pointer to the string you have created. This is an abstract helper to resolve the absolute address given the field name which is a relative address from the beginning of the trace_structure. __assign_str(dst, src) Use this macro to automatically perform the string copy from src to dst. src must be a variable to assign and dst is the name of a __string field. Example on how to use it: TRACE_EVENT(my_event, TP_PROTO(char *src1, char *src2), TP_ARGS(src1, src2), TP_STRUCT__entry( __string(str1, src1) __string(str2, src2) ), TP_fast_assign( __assign_str(str1, src1); __assign_str(str2, src2); ), TP_printk("%s %s", __get_str(src1), __get_str(src2)) ) Of course you can mix-up any __field or __array inside this TRACE_EVENT. The position of the __string or __assign_str doesn't matter. Changes in v2: Address the suggestion of Steven Rostedt: drop the opening_string() macro and redefine __ending_string() to get the size of the string to be copied instead of overwritting the whole ring buffer allocation. Changes in v3: Address other suggestions of Steven Rostedt and Peter Zijlstra with some changes: drop the __ending_string and the need to have only one string field. Use offsets instead of absolute addresses. [ Impact: allow more compact memory usage for string tracing ] Signed-off-by: Frederic Weisbecker Cc: Steven Rostedt Cc: Li Zefan Cc: Peter Zijlstra --- include/trace/ftrace.h | 88 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 85 insertions(+), 3 deletions(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 15ef08d9add..5a7d18c4363 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -27,6 +27,9 @@ #undef __field #define __field(type, item) type item; +#undef __string +#define __string(item, src) int __str_loc_##item; + #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args @@ -35,14 +38,53 @@ struct ftrace_raw_##name { \ struct trace_entry ent; \ tstruct \ + char __str_data[0]; \ }; \ static struct ftrace_event_call event_##name #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + /* * Stage 2 of the trace events. * + * Include the following: + * + * struct ftrace_str_offsets_ { + * int ; + * int ; + * [...] + * }; + * + * The __string() macro will create each int , this is to + * keep the offset of each string from the beggining of the event + * once we perform the strlen() of the src strings. + * + */ + +#undef TRACE_FORMAT +#define TRACE_FORMAT(call, proto, args, fmt) + +#undef __array +#define __array(type, item, len) + +#undef __field +#define __field(type, item); + +#undef __string +#define __string(item, src) int item; + +#undef TRACE_EVENT +#define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ + struct ftrace_str_offsets_##call { \ + tstruct; \ + }; + +#include TRACE_INCLUDE(TRACE_INCLUDE_FILE) + +/* + * Stage 3 of the trace events. + * * Override the macros in to include the following: * * enum print_line_t @@ -80,6 +122,9 @@ #undef TP_printk #define TP_printk(fmt, args...) fmt "\n", args +#undef __get_str +#define __get_str(field) (char *)__entry + __entry->__str_loc_##field + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ enum print_line_t \ @@ -146,6 +191,16 @@ ftrace_raw_output_##call(struct trace_iterator *iter, int flags) \ if (!ret) \ return 0; +#undef __string +#define __string(item, src) \ + ret = trace_seq_printf(s, "\tfield: __str_loc " #item ";\t" \ + "offset:%u;tsize:%u;\n", \ + (unsigned int)offsetof(typeof(field), \ + __str_loc_##item), \ + (unsigned int)sizeof(field.__str_loc_##item)); \ + if (!ret) \ + return 0; + #undef __entry #define __entry REC @@ -189,6 +244,12 @@ ftrace_format_##call(struct trace_seq *s) \ if (ret) \ return ret; +#undef __string +#define __string(item, src) \ + ret = trace_define_field(event_call, "__str_loc", #item, \ + offsetof(typeof(field), __str_loc_##item), \ + sizeof(field.__str_loc_##item)); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ int \ @@ -212,7 +273,7 @@ ftrace_define_fields_##call(void) \ #include TRACE_INCLUDE(TRACE_INCLUDE_FILE) /* - * Stage 3 of the trace events. + * Stage 4 of the trace events. * * Override the macros in to include the following: * @@ -409,6 +470,23 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #undef __entry #define __entry entry +#undef __field +#define __field(type, item) + +#undef __array +#define __array(type, item, len) + +#undef __string +#define __string(item, src) \ + __str_offsets.item = __str_size + \ + offsetof(typeof(*entry), __str_data); \ + __str_size += strlen(src) + 1; + +#undef __assign_str +#define __assign_str(dst, src) \ + __entry->__str_loc_##dst = __str_offsets.dst; \ + strcpy(__get_str(dst), src); + #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ _TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ @@ -417,18 +495,22 @@ static struct ftrace_event_call event_##call; \ \ static void ftrace_raw_event_##call(proto) \ { \ + struct ftrace_str_offsets_##call __maybe_unused __str_offsets; \ struct ftrace_event_call *call = &event_##call; \ struct ring_buffer_event *event; \ struct ftrace_raw_##call *entry; \ unsigned long irq_flags; \ + int __str_size = 0; \ int pc; \ \ local_save_flags(irq_flags); \ pc = preempt_count(); \ \ + tstruct; \ + \ event = trace_current_buffer_lock_reserve(event_##call.id, \ - sizeof(struct ftrace_raw_##call), \ - irq_flags, pc); \ + sizeof(struct ftrace_raw_##call) + __str_size,\ + irq_flags, pc); \ if (!event) \ return; \ entry = ring_buffer_event_data(event); \ From 7e7ca9a22dbbc5c91763cd16923c7509918709b6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 19 Apr 2009 04:54:49 +0200 Subject: [PATCH 0536/2061] tracing/lock: provide lock_acquired event support for dynamic size string Now that we can support the dynamic sized string, make the lock tracing able to use it, making it safe against modules removal and consuming the right amount of memory needed for each lock name Changes in v2: adapt to the __ending_string() updates and the opening_string() removal. [ Impact: protect lock tracer against module removal ] Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Steven Rostedt --- include/trace/events/lockdep.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 45e326b5c7f..3ca315c1429 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -38,16 +38,16 @@ TRACE_EVENT(lock_acquired, TP_ARGS(lock, ip, waittime), TP_STRUCT__entry( - __field(const char *, name) + __string(name, lock->name) __field(unsigned long, wait_usec) __field(unsigned long, wait_nsec_rem) ), TP_fast_assign( - __entry->name = lock->name; + __assign_str(name, lock->name); __entry->wait_nsec_rem = do_div(waittime, NSEC_PER_USEC); __entry->wait_usec = (unsigned long) waittime; ), - TP_printk("%s (%lu.%03lu us)", __entry->name, __entry->wait_usec, + TP_printk("%s (%lu.%03lu us)", __get_str(name), __entry->wait_usec, __entry->wait_nsec_rem) ); From 6a74aa40907757ec98d8710ff66cd4cfe064e7d8 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 22 Apr 2009 00:41:09 +0200 Subject: [PATCH 0537/2061] tracing/events: protect __get_str() The __get_str() macro is used in a code part then its content should be protected with parenthesis. [ Impact: make macro definition more robust ] Reported-by: Steven Rostedt Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 5a7d18c4363..a77f71a46db 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -123,7 +123,7 @@ #define TP_printk(fmt, args...) fmt "\n", args #undef __get_str -#define __get_str(field) (char *)__entry + __entry->__str_loc_##field +#define __get_str(field) ((char *)__entry + __entry->__str_loc_##field) #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, assign, print) \ From 9be24414aad047dcf9d8d2a9a929321536c7ebec Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 10:25:24 -0400 Subject: [PATCH 0538/2061] tracing/wakeup: move access to wakeup_cpu into spinlock The code had the following outside the lock: if (next != wakeup_task) return; pc = preempt_count(); /* The task we are waiting for is waking up */ data = wakeup_trace->data[wakeup_cpu]; On initialization, wakeup_task is NULL and wakeup_cpu -1. This code is not under a lock. If wakeup_task is set on another CPU as that task is waking up, we can see the wakeup_task before wakeup_cpu is set. If we read wakeup_cpu while it is still -1 then we will have a bad data pointer. This patch moves the reading of wakeup_cpu within the protection of the spinlock used to protect the writing of wakeup_cpu and wakeup_task. [ Impact: remove possible race causing invalid pointer dereference ] Reported-by: Maneesh Soni Signed-off-by: Steven Rostedt --- kernel/trace/trace_sched_wakeup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c index b8b13c5540f..eacb2722517 100644 --- a/kernel/trace/trace_sched_wakeup.c +++ b/kernel/trace/trace_sched_wakeup.c @@ -138,9 +138,6 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, pc = preempt_count(); - /* The task we are waiting for is waking up */ - data = wakeup_trace->data[wakeup_cpu]; - /* disable local data, not wakeup_cpu data */ cpu = raw_smp_processor_id(); disabled = atomic_inc_return(&wakeup_trace->data[cpu]->disabled); @@ -154,6 +151,9 @@ probe_wakeup_sched_switch(struct rq *rq, struct task_struct *prev, if (unlikely(!tracer_enabled || next != wakeup_task)) goto out_unlock; + /* The task we are waiting for is waking up */ + data = wakeup_trace->data[wakeup_cpu]; + trace_function(wakeup_trace, CALLER_ADDR0, CALLER_ADDR1, flags, pc); tracing_sched_switch_trace(wakeup_trace, prev, next, flags, pc); From 89ec0dee9eba6275d47be0b878cf5f6d5c2fb6eb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:03:29 -0400 Subject: [PATCH 0539/2061] tracing: increase size of number of possible events With the new event tracing registration, we must increase the number of events that can be registered. Currently the type field is only one byte, which leaves us only 256 possible events. Since we do not save the CPU number in the tracer anymore (it is determined by the per cpu ring buffer that is used) we have an extra byte to use. This patch increases the size of type from 1 byte (256 events) to 2 bytes (65,536 events). It also adds a WARN_ON_ONCE if we exceed that limit. [ Impact: allow more than 255 events ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 5 ++++- kernel/trace/trace_events.c | 2 +- kernel/trace/trace_output.c | 2 ++ 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 2a4a4074991..07e0a6d64a2 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -16,13 +16,16 @@ struct dentry; * bash-15816 [01] 235.197585: idle_cpu <- irq_enter */ struct trace_entry { - int type; + unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; int tgid; }; +#define FTRACE_MAX_EVENT \ + ((1 << (sizeof(((struct trace_entry *)0)->type) * 8)) - 1) + /* * Trace iterator - used by printout routines who present trace * results to users and which routines might sleep, etc: diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 5d6e879cf87..9887131afa0 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -398,7 +398,7 @@ static int trace_write_header(struct trace_seq *s) "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\tfield:%s %s;\toffset:%zu;\tsize:%zu;\n" "\n", - FIELD(int, type), + FIELD(unsigned short, type), FIELD(unsigned char, flags), FIELD(unsigned char, preempt_count), FIELD(int, pid), diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 83a8abb9640..06997e75114 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -537,6 +537,8 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); + WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); + return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); From 75db37d2f4c0ad9466ead57d467277d097b4105c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 26 Mar 2009 11:43:36 -0400 Subject: [PATCH 0540/2061] tracing: add size checks for exported ftrace internal structures The events exported by TRACE_EVENT are automated and are guaranteed to be correct when used. The internal ftrace structures on the other hand are more manually exported. These require the ftrace maintainer to make sure they are up to date. This patch adds a size check to help flag when a type changes in an internal ftrace data structure, and the update needs to be reflected in the export. If a export is incorrect, then the only harm is that the user space tools will not know how to correctly read the internal structures of ftrace. [ Impact: help prevent inconsistent ftrace format print outs ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 3 +++ kernel/trace/trace_export.c | 4 ++++ 2 files changed, 7 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9887131afa0..b9208158808 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -381,8 +381,11 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +extern char *__bad_type_size(void); + #undef FIELD #define FIELD(type, name) \ + sizeof(type) != sizeof(field.name) ? __bad_type_size() : \ #type, "common_" #name, offsetof(typeof(field), name), \ sizeof(field.name) diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 48fc02fe73a..0cb1a142c74 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -19,8 +19,12 @@ #undef TRACE_STRUCT #define TRACE_STRUCT(args...) args +extern void __bad_type_size(void); + #undef TRACE_FIELD #define TRACE_FIELD(type, item, assign) \ + if (sizeof(type) != sizeof(field.item)) \ + __bad_type_size(); \ ret = trace_seq_printf(s, "\tfield:" #type " " #item ";\t" \ "offset:%u;\tsize:%u;\n", \ (unsigned int)offsetof(typeof(field), item), \ From d7285c6b5c54397fdf112c2fb98ee43193173aa9 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Thu, 23 Apr 2009 10:21:38 -0700 Subject: [PATCH 0541/2061] x86: use native register access for native tlb flushing currently these are paravirtulaized, doesn't appear any callers rely on this (no pv_ops backends are using native_tlb and overriding cr3/4 access). [ Impact: fix lockdep warning with paravirt and function tracer ] Signed-off-by: Chris Wright LKML-Reference: <20090423172138.GR3036@sequoia.sous-sol.org> Signed-off-by: Steven Rostedt --- arch/x86/include/asm/tlbflush.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h index d3539f998f8..e2927c5f45b 100644 --- a/arch/x86/include/asm/tlbflush.h +++ b/arch/x86/include/asm/tlbflush.h @@ -17,7 +17,7 @@ static inline void __native_flush_tlb(void) { - write_cr3(read_cr3()); + native_write_cr3(native_read_cr3()); } static inline void __native_flush_tlb_global(void) @@ -32,11 +32,11 @@ static inline void __native_flush_tlb_global(void) */ raw_local_irq_save(flags); - cr4 = read_cr4(); + cr4 = native_read_cr4(); /* clear PGE */ - write_cr4(cr4 & ~X86_CR4_PGE); + native_write_cr4(cr4 & ~X86_CR4_PGE); /* write old PGE again and flush TLBs */ - write_cr4(cr4); + native_write_cr4(cr4); raw_local_irq_restore(flags); } From c2518c4366f087ebc10b3919cb2461bbe4f42d0c Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 23 Apr 2009 23:26:18 -0400 Subject: [PATCH 0542/2061] tracing: fix cut and paste macro error In case a module uses the TRACE_EVENT macro for creating automated events in ftrace, it may choose to use a different file name than the defined system name, or choose to use a different path than the default "include/trace/events" include path. If this is done, then before including trace/define_trace.h the header would define either "TRACE_INCLUDE_FILE" for the file name or "TRACE_INCLUDE_PATH" for the include path. If it does not define these, then the define_trace.h defines them instead. If define trace defines them, then define_trace.h should also undefine them before exiting. To do this a macro is used to note this: #ifndef TRACE_INCLUDE_FILE # define TRACE_INCLUDE_FILE TRACE_SYSTEM # define UNDEF_TRACE_INCLUDE_FILE #endif [...] #ifdef UNDEF_TRACE_INCLUDE_FILE # undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif The UNDEF_TRACE_INCLUDE_FILE acts as a CPP variable to know to undef the TRACE_INCLUDE_FILE before leaving define_trace.h. Unfortunately, due to cut and paste errors, the macros between FILE and PATH got mixed up. [ Impact: undef TRACE_INCLUDE_FILE and/or TRACE_INCLUDE_PATH when needed ] Signed-off-by: Steven Rostedt --- include/trace/define_trace.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index 7f1f23d601e..abc611feeb8 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -44,7 +44,7 @@ #ifndef TRACE_INCLUDE_PATH # define __TRACE_INCLUDE(system) -# define UNDEF_TRACE_INCLUDE_FILE +# define UNDEF_TRACE_INCLUDE_PATH #else # define __TRACE_INCLUDE(system) __stringify(TRACE_INCLUDE_PATH/system.h) #endif @@ -64,13 +64,13 @@ /* Only undef what we defined in this file */ #ifdef UNDEF_TRACE_INCLUDE_FILE -# undef TRACE_INCLUDE_PATH +# undef TRACE_INCLUDE_FILE # undef UNDEF_TRACE_INCLUDE_FILE #endif -#ifdef UNDEF_TRACE_INCLUDE_FILE +#ifdef UNDEF_TRACE_INCLUDE_PATH # undef TRACE_INCLUDE_PATH -# undef UNDEF_TRACE_INCLUDE_FILE +# undef UNDEF_TRACE_INCLUDE_PATH #endif /* We may be processing more files */ From 334d4169a6592d3fcd863bbe822a8f6985ffa9af Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Fri, 24 Apr 2009 11:27:05 +0800 Subject: [PATCH 0543/2061] ring_buffer: compressed event header RB_MAX_SMALL_DATA = 28bytes is too small for most tracers, it wastes an 'u32' to save the actually length for events which data size > 28. This fix uses compressed event header and enlarges RB_MAX_SMALL_DATA. [ Impact: saves about 0%-12.5%(depends on tracer) memory in ring_buffer ] Signed-off-by: Lai Jiangshan LKML-Reference: <49F13189.3090000@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 16 +++---- kernel/trace/ring_buffer.c | 83 ++++++++++++++++++------------------- 2 files changed, 50 insertions(+), 49 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index fac8f1ac6f4..1c2f80911fb 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -11,7 +11,7 @@ struct ring_buffer_iter; * Don't refer to this struct directly, use functions below. */ struct ring_buffer_event { - u32 type:2, len:3, time_delta:27; + u32 type_len:5, time_delta:27; u32 array[]; }; @@ -24,7 +24,8 @@ struct ring_buffer_event { * size is variable depending on how much * padding is needed * If time_delta is non zero: - * everything else same as RINGBUF_TYPE_DATA + * array[0] holds the actual length + * size = 4 + length (bytes) * * @RINGBUF_TYPE_TIME_EXTEND: Extend the time delta * array[0] = time delta (28 .. 59) @@ -35,22 +36,23 @@ struct ring_buffer_event { * array[1..2] = tv_sec * size = 16 bytes * - * @RINGBUF_TYPE_DATA: Data record - * If len is zero: + * <= @RINGBUF_TYPE_DATA_TYPE_LEN_MAX: + * Data record + * If type_len is zero: * array[0] holds the actual length * array[1..(length+3)/4] holds data - * size = 4 + 4 + length (bytes) + * size = 4 + length (bytes) * else - * length = len << 2 + * length = type_len << 2 * array[0..(length+3)/4-1] holds data * size = 4 + length (bytes) */ enum ring_buffer_type { + RINGBUF_TYPE_DATA_TYPE_LEN_MAX = 28, RINGBUF_TYPE_PADDING, RINGBUF_TYPE_TIME_EXTEND, /* FIXME: RINGBUF_TYPE_TIME_STAMP not implemented */ RINGBUF_TYPE_TIME_STAMP, - RINGBUF_TYPE_DATA, }; unsigned ring_buffer_event_length(struct ring_buffer_event *event); diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 61dbdf21cd3..9692f100ec1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -28,8 +28,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) { int ret; - ret = trace_seq_printf(s, "\ttype : 2 bits\n"); - ret = trace_seq_printf(s, "\tlen : 3 bits\n"); + ret = trace_seq_printf(s, "# compressed entry header\n"); + ret = trace_seq_printf(s, "\ttype_len : 5 bits\n"); ret = trace_seq_printf(s, "\ttime_delta : 27 bits\n"); ret = trace_seq_printf(s, "\tarray : 32 bits\n"); ret = trace_seq_printf(s, "\n"); @@ -37,8 +37,8 @@ int ring_buffer_print_entry_header(struct trace_seq *s) RINGBUF_TYPE_PADDING); ret = trace_seq_printf(s, "\ttime_extend : type == %d\n", RINGBUF_TYPE_TIME_EXTEND); - ret = trace_seq_printf(s, "\tdata : type == %d\n", - RINGBUF_TYPE_DATA); + ret = trace_seq_printf(s, "\tdata max type_len == %d\n", + RINGBUF_TYPE_DATA_TYPE_LEN_MAX); return ret; } @@ -204,7 +204,10 @@ EXPORT_SYMBOL_GPL(tracing_is_on); #define RB_EVNT_HDR_SIZE (offsetof(struct ring_buffer_event, array)) #define RB_ALIGNMENT 4U -#define RB_MAX_SMALL_DATA 28 +#define RB_MAX_SMALL_DATA (RB_ALIGNMENT * RINGBUF_TYPE_DATA_TYPE_LEN_MAX) + +/* define RINGBUF_TYPE_DATA for 'case RINGBUF_TYPE_DATA:' */ +#define RINGBUF_TYPE_DATA 0 ... RINGBUF_TYPE_DATA_TYPE_LEN_MAX enum { RB_LEN_TIME_EXTEND = 8, @@ -213,17 +216,18 @@ enum { static inline int rb_null_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta == 0; + return event->type_len == RINGBUF_TYPE_PADDING + && event->time_delta == 0; } static inline int rb_discarded_event(struct ring_buffer_event *event) { - return event->type == RINGBUF_TYPE_PADDING && event->time_delta; + return event->type_len == RINGBUF_TYPE_PADDING && event->time_delta; } static void rb_event_set_padding(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + event->type_len = RINGBUF_TYPE_PADDING; event->time_delta = 0; } @@ -232,8 +236,8 @@ rb_event_data_length(struct ring_buffer_event *event) { unsigned length; - if (event->len) - length = event->len * RB_ALIGNMENT; + if (event->type_len) + length = event->type_len * RB_ALIGNMENT; else length = event->array[0]; return length + RB_EVNT_HDR_SIZE; @@ -243,12 +247,12 @@ rb_event_data_length(struct ring_buffer_event *event) static unsigned rb_event_length(struct ring_buffer_event *event) { - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) /* undefined */ return -1; - return rb_event_data_length(event); + return event->array[0] + RB_EVNT_HDR_SIZE; case RINGBUF_TYPE_TIME_EXTEND: return RB_LEN_TIME_EXTEND; @@ -272,7 +276,7 @@ rb_event_length(struct ring_buffer_event *event) unsigned ring_buffer_event_length(struct ring_buffer_event *event) { unsigned length = rb_event_length(event); - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) return length; length -= RB_EVNT_HDR_SIZE; if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) @@ -285,9 +289,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_event_length); static void * rb_event_data(struct ring_buffer_event *event) { - BUG_ON(event->type != RINGBUF_TYPE_DATA); + BUG_ON(event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX); /* If length is in len field, then array[0] has the data */ - if (event->len) + if (event->type_len) return (void *)&event->array[0]; /* Otherwise length is in array[0] and array[1] has the data */ return (void *)&event->array[1]; @@ -988,7 +992,7 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; cpu_buffer->entries--; @@ -1133,28 +1137,21 @@ static void rb_update_event(struct ring_buffer_event *event, unsigned type, unsigned length) { - event->type = type; + event->type_len = type; switch (type) { case RINGBUF_TYPE_PADDING: - break; - case RINGBUF_TYPE_TIME_EXTEND: - event->len = DIV_ROUND_UP(RB_LEN_TIME_EXTEND, RB_ALIGNMENT); - break; - case RINGBUF_TYPE_TIME_STAMP: - event->len = DIV_ROUND_UP(RB_LEN_TIME_STAMP, RB_ALIGNMENT); break; - case RINGBUF_TYPE_DATA: + case 0: length -= RB_EVNT_HDR_SIZE; - if (length > RB_MAX_SMALL_DATA) { - event->len = 0; + if (length > RB_MAX_SMALL_DATA) event->array[0] = length; - } else - event->len = DIV_ROUND_UP(length, RB_ALIGNMENT); + else + event->type_len = DIV_ROUND_UP(length, RB_ALIGNMENT); break; default: BUG(); @@ -1562,7 +1559,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, RINGBUF_TYPE_DATA, length); + event = rb_reserve_next_event(cpu_buffer, 0, length); if (!event) goto out; @@ -1634,7 +1631,9 @@ EXPORT_SYMBOL_GPL(ring_buffer_unlock_commit); static inline void rb_event_discard(struct ring_buffer_event *event) { - event->type = RINGBUF_TYPE_PADDING; + /* array[0] holds the actual length for the discarded event */ + event->array[0] = rb_event_data_length(event) - RB_EVNT_HDR_SIZE; + event->type_len = RINGBUF_TYPE_PADDING; /* time delta must be non zero */ if (!event->time_delta) event->time_delta = 1; @@ -1786,8 +1785,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, - RINGBUF_TYPE_DATA, event_length); + event = rb_reserve_next_event(cpu_buffer, 0, event_length); if (!event) goto out; @@ -2035,7 +2033,7 @@ rb_update_read_stamp(struct ring_buffer_per_cpu *cpu_buffer, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2066,7 +2064,7 @@ rb_update_iter_read_stamp(struct ring_buffer_iter *iter, { u64 delta; - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: return; @@ -2181,7 +2179,8 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) event = rb_reader_event(cpu_buffer); - if (event->type == RINGBUF_TYPE_DATA || rb_discarded_event(event)) + if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX + || rb_discarded_event(event)) cpu_buffer->entries--; rb_update_read_stamp(cpu_buffer, event); @@ -2262,7 +2261,7 @@ rb_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_reader_event(cpu_buffer); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) RB_WARN_ON(cpu_buffer, 1); @@ -2334,7 +2333,7 @@ rb_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_head_event(iter); - switch (event->type) { + switch (event->type_len) { case RINGBUF_TYPE_PADDING: if (rb_null_event(event)) { rb_inc_iter(iter); @@ -2393,7 +2392,7 @@ ring_buffer_peek(struct ring_buffer *buffer, int cpu, u64 *ts) event = rb_buffer_peek(buffer, cpu, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2421,7 +2420,7 @@ ring_buffer_iter_peek(struct ring_buffer_iter *iter, u64 *ts) event = rb_iter_peek(iter, ts); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2466,7 +2465,7 @@ ring_buffer_consume(struct ring_buffer *buffer, int cpu, u64 *ts) out: preempt_enable(); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2559,7 +2558,7 @@ ring_buffer_read(struct ring_buffer_iter *iter, u64 *ts) out: spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); - if (event && event->type == RINGBUF_TYPE_PADDING) { + if (event && event->type_len == RINGBUF_TYPE_PADDING) { cpu_relax(); goto again; } @@ -2766,7 +2765,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) return; /* Only count data entries */ - if (event->type != RINGBUF_TYPE_DATA) + if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->entries--; } From 3e98f9f15e916c48dfc5231d7e6a59be7f122764 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 24 Apr 2009 15:39:39 +0900 Subject: [PATCH 0544/2061] sh: pci: Fix up the build for CONFIG_PCI=n. Signed-off-by: Paul Mundt --- arch/sh/include/asm/pci.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sh/include/asm/pci.h b/arch/sh/include/asm/pci.h index f910121559b..5b2e0fcdfc2 100644 --- a/arch/sh/include/asm/pci.h +++ b/arch/sh/include/asm/pci.h @@ -88,6 +88,7 @@ static inline void pcibios_penalize_isa_irq(int irq, int active) #define pci_unmap_len_set(PTR, LEN_NAME, VAL) do { } while (0) #endif +#ifdef CONFIG_PCI static inline void pci_dma_burst_advice(struct pci_dev *pdev, enum pci_dma_burst_strategy *strat, unsigned long *strategy_parameter) @@ -95,6 +96,7 @@ static inline void pci_dma_burst_advice(struct pci_dev *pdev, *strat = PCI_DMA_BURST_INFINITY; *strategy_parameter = ~0UL; } +#endif #ifdef CONFIG_SUPERH32 /* From 782cc5ae6331d63b4febaa312c9d14493aafa9b8 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:43:09 +0200 Subject: [PATCH 0545/2061] x86, ds: fix buffer alignment in debug store selftest The debug store selftest code uses a stack-allocated buffer, which is not necessarily correctly aligned. For tests using a buffer to hold a single entry, the buffer that is passed to ds_request must already be suitably aligned. Pass a suitably aligned portion of the bigger buffer. [ Impact: fix hw-branch-tracer self-test failure ] Signed-off-by: Markus Metzger Cc: markus.t.metzger@gmail.com LKML-Reference: <20090424094309.A30145@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ds_selftest.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/ds_selftest.c b/arch/x86/kernel/ds_selftest.c index 5f104a0ace6..6bc7c199ab9 100644 --- a/arch/x86/kernel/ds_selftest.c +++ b/arch/x86/kernel/ds_selftest.c @@ -323,13 +323,15 @@ static int ds_selftest_bts_bad_request_task(void *buffer) int ds_selftest_bts(void) { struct ds_selftest_bts_conf conf; - unsigned char buffer[BUFFER_SIZE]; + unsigned char buffer[BUFFER_SIZE], *small_buffer; unsigned long irq; int cpu; printk(KERN_INFO "[ds] bts selftest..."); conf.error = 0; + small_buffer = (unsigned char *)ALIGN((unsigned long)buffer, 8) + 8; + get_online_cpus(); for_each_online_cpu(cpu) { conf.suspend = ds_suspend_bts_wrap; @@ -381,7 +383,7 @@ int ds_selftest_bts(void) conf.suspend = ds_suspend_bts_noirq; conf.resume = ds_resume_bts_noirq; conf.tracer = - ds_request_bts_task(current, buffer, SMALL_BUFFER_SIZE, + ds_request_bts_task(current, small_buffer, SMALL_BUFFER_SIZE, NULL, (size_t)-1, BTS_KERNEL); local_irq_save(irq); ds_selftest_bts_cpu(&conf); From 7e0bfad24d85de7cf2202a7b0ce51de11a077b21 Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:44:48 +0200 Subject: [PATCH 0546/2061] x86, bts: reenable ptrace branch trace support The races found by Oleg Nesterov have been fixed. Reenable branch trace support. Signed-off-by: Markus Metzger Acked-by: Oleg Nesterov LKML-Reference: <20090424094448.A30216@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.cpu | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 8130334329c..924e156a85a 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -506,7 +506,6 @@ config X86_PTRACE_BTS bool "Branch Trace Store" default y depends on X86_DEBUGCTLMSR - depends on BROKEN ---help--- This adds a ptrace interface to the hardware's branch trace store. From 1cb81b143fa8f0e4629f10690862e2e52ca792ff Mon Sep 17 00:00:00 2001 From: Markus Metzger Date: Fri, 24 Apr 2009 09:51:43 +0200 Subject: [PATCH 0547/2061] x86, bts, mm: clean up buffer allocation The current mm interface is asymetric. One function allocates a locked buffer, another function only refunds the memory. Change this to have two functions for accounting and refunding locked memory, respectively; and do the actual buffer allocation in ptrace. [ Impact: refactor BTS buffer allocation code ] Signed-off-by: Markus Metzger Acked-by: Andrew Morton Cc: Peter Zijlstra LKML-Reference: <20090424095143.A30265@sedona.ch.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 39 ++++++++++++++++++++++++++------------- include/linux/mm.h | 6 ++++-- mm/mlock.c | 36 +++++++++++++++++------------------- 3 files changed, 47 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index d5252ae6c52..09ecbde91c1 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -617,17 +617,28 @@ struct bts_context { struct work_struct work; }; -static inline void alloc_bts_buffer(struct bts_context *context, - unsigned int size) +static int alloc_bts_buffer(struct bts_context *context, unsigned int size) { - void *buffer; + void *buffer = NULL; + int err = -ENOMEM; - buffer = alloc_locked_buffer(size); - if (buffer) { - context->buffer = buffer; - context->size = size; - context->mm = get_task_mm(current); - } + err = account_locked_memory(current->mm, current->signal->rlim, size); + if (err < 0) + return err; + + buffer = kzalloc(size, GFP_KERNEL); + if (!buffer) + goto out_refund; + + context->buffer = buffer; + context->size = size; + context->mm = get_task_mm(current); + + return 0; + + out_refund: + refund_locked_memory(current->mm, size); + return err; } static inline void free_bts_buffer(struct bts_context *context) @@ -638,7 +649,7 @@ static inline void free_bts_buffer(struct bts_context *context) kfree(context->buffer); context->buffer = NULL; - refund_locked_buffer_memory(context->mm, context->size); + refund_locked_memory(context->mm, context->size); context->size = 0; mmput(context->mm); @@ -786,13 +797,15 @@ static int ptrace_bts_config(struct task_struct *child, context->tracer = NULL; if ((cfg.flags & PTRACE_BTS_O_ALLOC) && (cfg.size != context->size)) { + int err; + free_bts_buffer(context); if (!cfg.size) return 0; - alloc_bts_buffer(context, cfg.size); - if (!context->buffer) - return -ENOMEM; + err = alloc_bts_buffer(context, cfg.size); + if (err < 0) + return err; } if (cfg.flags & PTRACE_BTS_O_TRACE) diff --git a/include/linux/mm.h b/include/linux/mm.h index a3963ba23a6..009eabd3c21 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -19,6 +19,7 @@ struct anon_vma; struct file_ra_state; struct user_struct; struct writeback_control; +struct rlimit; #ifndef CONFIG_DISCONTIGMEM /* Don't use mapnrs, do it properly */ extern unsigned long max_mapnr; @@ -1319,7 +1320,8 @@ int vmemmap_populate_basepages(struct page *start_page, int vmemmap_populate(struct page *start_page, unsigned long pages, int node); void vmemmap_populate_print_last(void); -extern void *alloc_locked_buffer(size_t size); -extern void refund_locked_buffer_memory(struct mm_struct *mm, size_t size); +extern int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size); +extern void refund_locked_memory(struct mm_struct *mm, size_t size); #endif /* __KERNEL__ */ #endif /* _LINUX_MM_H */ diff --git a/mm/mlock.c b/mm/mlock.c index 28be15ead9c..ac130433c7d 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -629,38 +629,36 @@ void user_shm_unlock(size_t size, struct user_struct *user) free_uid(user); } -void *alloc_locked_buffer(size_t size) +int account_locked_memory(struct mm_struct *mm, struct rlimit *rlim, + size_t size) { - unsigned long rlim, vm, pgsz; - void *buffer = NULL; + unsigned long lim, vm, pgsz; + int error = -ENOMEM; pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; - down_write(¤t->mm->mmap_sem); + down_write(&mm->mmap_sem); - rlim = current->signal->rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; - vm = current->mm->total_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_AS].rlim_cur >> PAGE_SHIFT; + vm = mm->total_vm + pgsz; + if (lim < vm) goto out; - rlim = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; - vm = current->mm->locked_vm + pgsz; - if (rlim < vm) + lim = rlim[RLIMIT_MEMLOCK].rlim_cur >> PAGE_SHIFT; + vm = mm->locked_vm + pgsz; + if (lim < vm) goto out; - buffer = kzalloc(size, GFP_KERNEL); - if (!buffer) - goto out; - - current->mm->total_vm += pgsz; - current->mm->locked_vm += pgsz; + mm->total_vm += pgsz; + mm->locked_vm += pgsz; + error = 0; out: - up_write(¤t->mm->mmap_sem); - return buffer; + up_write(&mm->mmap_sem); + return error; } -void refund_locked_buffer_memory(struct mm_struct *mm, size_t size) +void refund_locked_memory(struct mm_struct *mm, size_t size) { unsigned long pgsz = PAGE_ALIGN(size) >> PAGE_SHIFT; From 39517091f88fae32b52254b561ced78da1eaf0a7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:05:52 -0400 Subject: [PATCH 0548/2061] tracing/lockdep: convert lockdep to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. [ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/trace/events/lockdep.h | 58 +++++++++++++++++++++++++++------- 1 file changed, 47 insertions(+), 11 deletions(-) diff --git a/include/trace/events/lockdep.h b/include/trace/events/lockdep.h index 3ca315c1429..0e956c9dfd7 100644 --- a/include/trace/events/lockdep.h +++ b/include/trace/events/lockdep.h @@ -9,28 +9,64 @@ #ifdef CONFIG_LOCKDEP -TRACE_FORMAT(lock_acquire, +TRACE_EVENT(lock_acquire, + TP_PROTO(struct lockdep_map *lock, unsigned int subclass, int trylock, int read, int check, struct lockdep_map *next_lock, unsigned long ip), - TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), - TP_FMT("%s%s%s", trylock ? "try " : "", - read ? "read " : "", lock->name) - ); -TRACE_FORMAT(lock_release, + TP_ARGS(lock, subclass, trylock, read, check, next_lock, ip), + + TP_STRUCT__entry( + __field(unsigned int, flags) + __string(name, lock->name) + ), + + TP_fast_assign( + __entry->flags = (trylock ? 1 : 0) | (read ? 2 : 0); + __assign_str(name, lock->name); + ), + + TP_printk("%s%s%s", (__entry->flags & 1) ? "try " : "", + (__entry->flags & 2) ? "read " : "", + __get_str(name)) +); + +TRACE_EVENT(lock_release, + TP_PROTO(struct lockdep_map *lock, int nested, unsigned long ip), + TP_ARGS(lock, nested, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); #ifdef CONFIG_LOCK_STAT -TRACE_FORMAT(lock_contended, +TRACE_EVENT(lock_contended, + TP_PROTO(struct lockdep_map *lock, unsigned long ip), + TP_ARGS(lock, ip), - TP_FMT("%s", lock->name) - ); + + TP_STRUCT__entry( + __string(name, lock->name) + ), + + TP_fast_assign( + __assign_str(name, lock->name); + ), + + TP_printk("%s", __get_str(name)) +); TRACE_EVENT(lock_acquired, TP_PROTO(struct lockdep_map *lock, unsigned long ip, s64 waittime), From 160031b556e93590fa8635210d73d93c3d3853a9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:26:55 -0400 Subject: [PATCH 0549/2061] tracing/irq: convert irq traces to use TRACE_EVENT macro The TRACE_FORMAT will soon be deprecated. This patch converts it to the TRACE_EVENT macro. Note, this change should also speed up the tracing. [ Impact: remove a user of deprecated TRACE_FORMAT ] Cc: Jason Baron Signed-off-by: Steven Rostedt --- include/trace/events/irq.h | 61 +++++++++++++++++++++++++++++++------- 1 file changed, 50 insertions(+), 11 deletions(-) diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 75e3468e449..76868646751 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -10,11 +10,24 @@ /* * Tracepoint for entry of interrupt handler: */ -TRACE_FORMAT(irq_handler_entry, +TRACE_EVENT(irq_handler_entry, + TP_PROTO(int irq, struct irqaction *action), + TP_ARGS(irq, action), - TP_FMT("irq=%d handler=%s", irq, action->name) - ); + + TP_STRUCT__entry( + __field( int, irq ) + __string( name, action->name ) + ), + + TP_fast_assign( + __entry->irq = irq; + __assign_str(name, action->name); + ), + + TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) +); /* * Tracepoint for return of an interrupt handler: @@ -39,17 +52,43 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); -TRACE_FORMAT(softirq_entry, - TP_PROTO(struct softirq_action *h, struct softirq_action *vec), - TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); +TRACE_EVENT(softirq_entry, -TRACE_FORMAT(softirq_exit, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + TP_ARGS(h, vec), - TP_FMT("softirq=%d action=%s", (int)(h - vec), softirq_to_name[h-vec]) - ); + + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); + +TRACE_EVENT(softirq_exit, + + TP_PROTO(struct softirq_action *h, struct softirq_action *vec), + + TP_ARGS(h, vec), + + TP_STRUCT__entry( + __field( int, vec ) + __string( name, softirq_to_name[h-vec] ) + ), + + TP_fast_assign( + __entry->vec = (int)(h - vec); + __assign_str(name, softirq_to_name[h-vec]); + ), + + TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) +); #endif /* _TRACE_IRQ_H */ From b8e65554d80b4c560d201362d0e8fa02109d89fd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 11:50:39 -0400 Subject: [PATCH 0550/2061] tracing: remove deprecated TRACE_FORMAT The TRACE_FORMAT macro has been deprecated by the TRACE_EVENT macro. There are no more users. All new users must use the TRACE_EVENT macro. [ Impact: remove old functionality ] Cc: Peter Zijlstra Signed-off-by: Steven Rostedt --- include/linux/tracepoint.h | 5 --- include/trace/define_trace.h | 4 --- include/trace/ftrace.h | 66 ------------------------------------ 3 files changed, 75 deletions(-) diff --git a/include/linux/tracepoint.h b/include/linux/tracepoint.h index 4353f3f7e62..14df7e635d4 100644 --- a/include/linux/tracepoint.h +++ b/include/linux/tracepoint.h @@ -158,11 +158,6 @@ static inline void tracepoint_synchronize_unregister(void) #define PARAMS(args...) args -#ifndef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, fmt) \ - DECLARE_TRACE(name, PARAMS(proto), PARAMS(args)) -#endif - #ifndef TRACE_EVENT /* * For use with the TRACE_EVENT macro: diff --git a/include/trace/define_trace.h b/include/trace/define_trace.h index abc611feeb8..f7a7ae1e8f9 100644 --- a/include/trace/define_trace.h +++ b/include/trace/define_trace.h @@ -26,10 +26,6 @@ #define TRACE_EVENT(name, proto, args, tstruct, assign, print) \ DEFINE_TRACE(name) -#undef TRACE_FORMAT -#define TRACE_FORMAT(name, proto, args, print) \ - DEFINE_TRACE(name) - #undef DECLARE_TRACE #define DECLARE_TRACE(name, proto, args) \ DEFINE_TRACE(name) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a77f71a46db..1e681142f1d 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -18,9 +18,6 @@ #include -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) type item[len]; @@ -62,9 +59,6 @@ * */ -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) - #undef __array #define __array(type, item, len) @@ -298,16 +292,6 @@ ftrace_define_fields_##call(void) \ * unregister_trace_(ftrace_event_); * } * - * For those macros defined with TRACE_FORMAT: - * - * static struct ftrace_event_call __used - * __attribute__((__aligned__(4))) - * __attribute__((section("_ftrace_events"))) event_ = { - * .name = "", - * .regfunc = ftrace_reg_event_, - * .unregfunc = ftrace_unreg_event_, - * } - * * * For those macros defined with TRACE_EVENT: * @@ -417,56 +401,6 @@ static void ftrace_profile_disable_##call(struct ftrace_event_call *call) \ #define _TRACE_PROFILE_INIT(call) #endif -#define _TRACE_FORMAT(call, proto, args, fmt) \ -static void ftrace_event_##call(proto) \ -{ \ - event_trace_printk(_RET_IP_, #call ": " fmt); \ -} \ - \ -static int ftrace_reg_event_##call(void) \ -{ \ - int ret; \ - \ - ret = register_trace_##call(ftrace_event_##call); \ - if (ret) \ - pr_info("event trace: Could not activate trace point " \ - "probe to " #call "\n"); \ - return ret; \ -} \ - \ -static void ftrace_unreg_event_##call(void) \ -{ \ - unregister_trace_##call(ftrace_event_##call); \ -} \ - \ -static struct ftrace_event_call event_##call; \ - \ -static int ftrace_init_event_##call(void) \ -{ \ - int id; \ - \ - id = register_ftrace_event(NULL); \ - if (!id) \ - return -ENODEV; \ - event_##call.id = id; \ - return 0; \ -} - -#undef TRACE_FORMAT -#define TRACE_FORMAT(call, proto, args, fmt) \ -_TRACE_FORMAT(call, PARAMS(proto), PARAMS(args), PARAMS(fmt)) \ -_TRACE_PROFILE(call, PARAMS(proto), PARAMS(args)) \ -static struct ftrace_event_call __used \ -__attribute__((__aligned__(4))) \ -__attribute__((section("_ftrace_events"))) event_##call = { \ - .name = #call, \ - .system = __stringify(TRACE_SYSTEM), \ - .raw_init = ftrace_init_event_##call, \ - .regfunc = ftrace_reg_event_##call, \ - .unregfunc = ftrace_unreg_event_##call, \ - _TRACE_PROFILE_INIT(call) \ -} - #undef __entry #define __entry entry From 060fa5c83e67901ba47ab484cfcdb32737d630ba Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 12:20:52 -0400 Subject: [PATCH 0551/2061] tracing/events: reuse trace event ids after overflow With modules being able to add trace events, and the max trace event counter is 16 bits (65536) we can overflow the counter easily with a simple while loop adding and removing modules that contain trace events. This patch links together the registered trace events and on overflow searches for available trace event ids. It will still fail if over 65536 events are registered, but considering that a typical kernel only has 22000 functions, 65000 events should be sufficient. Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_output.c | 71 ++++++++++++++++++++++++++++++------ 2 files changed, 61 insertions(+), 11 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 07e0a6d64a2..78a9ba24cbf 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -56,6 +56,7 @@ typedef enum print_line_t (*trace_print_func)(struct trace_iterator *iter, int flags); struct trace_event { struct hlist_node node; + struct list_head list; int type; trace_print_func trace; trace_print_func raw; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 06997e75114..5fc51f0f75f 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -483,6 +483,36 @@ struct trace_event *ftrace_find_event(int type) return NULL; } +static LIST_HEAD(ftrace_event_list); + +static int trace_search_list(struct list_head **list) +{ + struct trace_event *e; + int last = __TRACE_LAST_TYPE; + + if (list_empty(&ftrace_event_list)) { + *list = &ftrace_event_list; + return last + 1; + } + + /* + * We used up all possible max events, + * lets see if somebody freed one. + */ + list_for_each_entry(e, &ftrace_event_list, list) { + if (e->type != last + 1) + break; + last++; + } + + /* Did we used up all 65 thousand events??? */ + if ((last + 1) > FTRACE_MAX_EVENT) + return 0; + + *list = &e->list; + return last + 1; +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -505,20 +535,40 @@ int register_ftrace_event(struct trace_event *event) mutex_lock(&trace_event_mutex); - if (!event) { - ret = next_event_type++; + if (WARN_ON(!event)) goto out; - } - if (!event->type) - event->type = next_event_type++; - else if (event->type > __TRACE_LAST_TYPE) { + INIT_LIST_HEAD(&event->list); + + if (!event->type) { + struct list_head *list; + + if (next_event_type > FTRACE_MAX_EVENT) { + + event->type = trace_search_list(&list); + if (!event->type) + goto out; + + } else { + + event->type = next_event_type++; + list = &ftrace_event_list; + } + + if (WARN_ON(ftrace_find_event(event->type))) + goto out; + + list_add_tail(&event->list, list); + + } else if (event->type > __TRACE_LAST_TYPE) { printk(KERN_WARNING "Need to add type to trace.h\n"); WARN_ON(1); - } - - if (ftrace_find_event(event->type)) goto out; + } else { + /* Is this event already used */ + if (ftrace_find_event(event->type)) + goto out; + } if (event->trace == NULL) event->trace = trace_nop_print; @@ -537,8 +587,6 @@ int register_ftrace_event(struct trace_event *event) out: mutex_unlock(&trace_event_mutex); - WARN_ON_ONCE(next_event_type > FTRACE_MAX_EVENT); - return ret; } EXPORT_SYMBOL_GPL(register_ftrace_event); @@ -551,6 +599,7 @@ int unregister_ftrace_event(struct trace_event *event) { mutex_lock(&trace_event_mutex); hlist_del(&event->node); + list_del(&event->list); mutex_unlock(&trace_event_mutex); return 0; From 79ffab34391933ee3b95dac7f25c0478fa2f8f1e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 15:13:42 -0400 Subject: [PATCH 0552/2061] ext4: Properly initialize the buffer_head state These struct buffer_heads are allocated on the stack (and hence are initialized with stack garbage). They are only used to call a get_blocks() function, so that's mostly OK, but b_state must be initialized to be 0 so we don't have any unexpected BH_* flags set by accident, such as BH_Unwritten or BH_Delay. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 1 + fs/ext4/inode.c | 15 ++++++++++++++- fs/mpage.c | 6 ++++-- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e3a55eb8b26..a953214f282 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3150,6 +3150,7 @@ retry: ret = PTR_ERR(handle); break; } + map_bh.b_state = 0; ret = ext4_get_blocks_wrap(handle, inode, block, max_blocks, &map_bh, EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 2a9ffd528dd..d7ad0bb73cd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2055,7 +2055,20 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if ((mpd->b_state & (1 << BH_Mapped)) && !(mpd->b_state & (1 << BH_Delay))) return 0; - new.b_state = mpd->b_state; + /* + * We need to make sure the BH_Delay flag is passed down to + * ext4_da_get_block_write(), since it calls + * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks_wrap() to call + * ext4_da_update_reserve_space() if the passed buffer head + * has the BH_Delay flag set. In the future, once we clean up + * the interfaces to ext4_get_blocks_wrap(), we should pass in + * a separate flag which requests that the delayed allocation + * statistics should be updated, instead of depending on the + * state information getting passed down via the map_bh's + * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + */ + new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; new.b_size = mpd->b_size; next = mpd->b_blocknr; diff --git a/fs/mpage.c b/fs/mpage.c index 680ba60863f..42381bd6543 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -379,7 +379,8 @@ mpage_readpages(struct address_space *mapping, struct list_head *pages, struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; for (page_idx = 0; page_idx < nr_pages; page_idx++) { struct page *page = list_entry(pages->prev, struct page, lru); @@ -412,7 +413,8 @@ int mpage_readpage(struct page *page, get_block_t get_block) struct buffer_head map_bh; unsigned long first_logical_block = 0; - clear_buffer_mapped(&map_bh); + map_bh.b_state = 0; + map_bh.b_size = 0; bio = do_mpage_readpage(bio, page, 1, &last_block_in_bio, &map_bh, &first_logical_block, get_block); if (bio) From 8fb0e342481c4d80040670fec915f0b9c7c6499a Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:22:37 -0400 Subject: [PATCH 0553/2061] vfs: Add BUG_ON for delayed and unwritten flags in submit_bh() The BH_Delay and BH_Unwritten flags should never leak out to submit_bh(). So add some BUG_ON() checks to submit_bh so we can get a stack trace and determine how and why this might have happened. (Note that only XFS and ext4 use these buffer head flags, and XFS does not use submit_bh(). So this patch should only modify behavior for ext4.) Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" Cc: linux-fsdevel@vger.kernel.org --- fs/buffer.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..ad011290022 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -2933,6 +2933,8 @@ int submit_bh(int rw, struct buffer_head * bh) BUG_ON(!buffer_locked(bh)); BUG_ON(!buffer_mapped(bh)); BUG_ON(!bh->b_end_io); + BUG_ON(buffer_delay(bh)); + BUG_ON(buffer_unwritten(bh)); /* * Mask in barrier bit for a write (could be either a WRITE or a From 29fa89d088941d79765d60f22d5ccdd6b8696e11 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 12 May 2009 16:30:27 -0400 Subject: [PATCH 0554/2061] ext4: Mark the unwritten buffer_head as mapped during write_begin Setting BH_Unwritten buffer_heads as BH_Mapped avoids multiple (unnecessary) calls to get_block() during the call to the write(2) system call. Setting BH_Unwritten buffer heads as BH_Mapped requires that the writepages() functions can handle BH_Unwritten buffer_heads. After this commit, things work as follows: ext4_ext_get_block() returns unmapped, unwritten, buffer head when called with create = 0 for prealloc space. This makes sure we handle the read path and non-delayed allocation case correctly. Even though the buffer head is marked unmapped we have valid b_blocknr and b_bdev values in the buffer_head. ext4_da_get_block_prep() called for block resrevation will now return mapped, unwritten, new buffer_head for prealloc space. This avoids multiple calls to get_block() for write to same offset. By making such buffers as BH_New, we also assure that sub-block zeroing of buffered writes happens correctly. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 +-- fs/ext4/inode.c | 82 ++++++++++++++++++++++++++++++----------------- 2 files changed, 54 insertions(+), 32 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a953214f282..ea5c47608ce 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2872,6 +2872,8 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, if (create == EXT4_CREATE_UNINITIALIZED_EXT) goto out; if (!create) { + if (allocated > max_blocks) + allocated = max_blocks; /* * We have blocks reserved already. We * return allocated blocks so that delalloc @@ -2879,8 +2881,6 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * the buffer head will be unmapped so that * a read from the block returns 0s. */ - if (allocated > max_blocks) - allocated = max_blocks; set_buffer_unwritten(bh_result); bh_result->b_bdev = inode->i_sb->s_bdev; bh_result->b_blocknr = newblock; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7ad0bb73cd..96f3366f59f 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1852,7 +1852,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd) * @logical - first logical block to start assignment with * * the function goes through all passed space and put actual disk - * block numbers into buffer heads, dropping BH_Delay + * block numbers into buffer heads, dropping BH_Delay and BH_Unwritten */ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, struct buffer_head *exbh) @@ -1902,16 +1902,24 @@ static void mpage_put_bnr_to_bhs(struct mpage_da_data *mpd, sector_t logical, do { if (cur_logical >= logical + blocks) break; - if (buffer_delay(bh)) { - bh->b_blocknr = pblock; - clear_buffer_delay(bh); - bh->b_bdev = inode->i_sb->s_bdev; - } else if (buffer_unwritten(bh)) { - bh->b_blocknr = pblock; - clear_buffer_unwritten(bh); - set_buffer_mapped(bh); - set_buffer_new(bh); - bh->b_bdev = inode->i_sb->s_bdev; + + if (buffer_delay(bh) || + buffer_unwritten(bh)) { + + BUG_ON(bh->b_bdev != inode->i_sb->s_bdev); + + if (buffer_delay(bh)) { + clear_buffer_delay(bh); + bh->b_blocknr = pblock; + } else { + /* + * unwritten already should have + * blocknr assigned. Verify that + */ + clear_buffer_unwritten(bh); + BUG_ON(bh->b_blocknr != pblock); + } + } else if (buffer_mapped(bh)) BUG_ON(bh->b_blocknr != pblock); @@ -2053,7 +2061,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * We consider only non-mapped and non-allocated blocks */ if ((mpd->b_state & (1 << BH_Mapped)) && - !(mpd->b_state & (1 << BH_Delay))) + !(mpd->b_state & (1 << BH_Delay)) && + !(mpd->b_state & (1 << BH_Unwritten))) return 0; /* * We need to make sure the BH_Delay flag is passed down to @@ -2205,6 +2214,17 @@ flush_it: return; } +static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) +{ + /* + * unmapped buffer is possible for holes. + * delay buffer is possible with delayed allocation. + * We also need to consider unwritten buffer as unmapped. + */ + return (!buffer_mapped(bh) || buffer_delay(bh) || + buffer_unwritten(bh)) && buffer_dirty(bh); +} + /* * __mpage_da_writepage - finds extent of pages and blocks * @@ -2289,8 +2309,7 @@ static int __mpage_da_writepage(struct page *page, * Otherwise we won't make progress * with the page in ext4_da_writepage */ - if (buffer_dirty(bh) && - (!buffer_mapped(bh) || buffer_delay(bh))) { + if (ext4_bh_unmapped_or_delay(NULL, bh)) { mpage_add_bh_to_extent(mpd, logical, bh->b_size, bh->b_state); @@ -2318,6 +2337,14 @@ static int __mpage_da_writepage(struct page *page, /* * this is a special callback for ->write_begin() only * it's intention is to return mapped block or reserve space + * + * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. + * We also have b_blocknr = -1 and b_bdev initialized properly + * + * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. + * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev + * initialized properly. + * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2353,28 +2380,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, set_buffer_delay(bh_result); } else if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); - /* - * With sub-block writes into unwritten extents - * we also need to mark the buffer as new so that - * the unwritten parts of the buffer gets correctly zeroed. - */ - if (buffer_unwritten(bh_result)) + if (buffer_unwritten(bh_result)) { + /* A delayed write to unwritten bh should + * be marked new and mapped. Mapped ensures + * that we don't do get_block multiple times + * when we write to the same offset and new + * ensures that we do proper zero out for + * partial write. + */ set_buffer_new(bh_result); + set_buffer_mapped(bh_result); + } ret = 0; } return ret; } -static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh) -{ - /* - * unmapped buffer is possible for holes. - * delay buffer is possible with delayed allocation - */ - return ((!buffer_mapped(bh) || buffer_delay(bh)) && buffer_dirty(bh)); -} - static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { @@ -2828,7 +2850,7 @@ static int ext4_da_should_update_i_disksize(struct page *page, for (i = 0; i < idx; i++) bh = bh->b_this_page; - if (!buffer_mapped(bh) || (buffer_delay(bh))) + if (!buffer_mapped(bh) || (buffer_delay(bh)) || buffer_unwritten(bh)) return 0; return 1; } From c5ca7c7636fa689a9746b6032f83aa7fffec31c6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 27 Apr 2009 22:48:48 -0400 Subject: [PATCH 0555/2061] ext4: Fallback to vmalloc if kmalloc can't allocate s_flex_groups array For very large filesystems, the s_flex_groups array can get quite big. For example, a filesystem that can be resized up to 16TB will have 8192 flex groups (assuming the default flex_bg size of 16), so the array is 96k, which is *very* marginal for kmalloc(). On the other hand, a 160GB filesystem without the resize_inode feature will only require 960 bytes. So we try to allocate the array first using kmalloc(), and if that fails, we'll try to use vmalloc() instead. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f22..3f4475daa66 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -586,7 +587,10 @@ static void ext4_put_super(struct super_block *sb) for (i = 0; i < sbi->s_gdb_count; i++) brelse(sbi->s_group_desc[i]); kfree(sbi->s_group_desc); - kfree(sbi->s_flex_groups); + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); @@ -1620,6 +1624,7 @@ static int ext4_fill_flex_info(struct super_block *sb) ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; + size_t size; int i; if (!sbi->s_es->s_log_groups_per_flex) { @@ -1634,8 +1639,13 @@ static int ext4_fill_flex_info(struct super_block *sb) flex_group_count = ((sbi->s_groups_count + groups_per_flex - 1) + ((le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) + 1) << EXT4_DESC_PER_BLOCK_BITS(sb))) / groups_per_flex; - sbi->s_flex_groups = kzalloc(flex_group_count * - sizeof(struct flex_groups), GFP_KERNEL); + size = flex_group_count * sizeof(struct flex_groups); + sbi->s_flex_groups = kzalloc(size, GFP_KERNEL); + if (sbi->s_flex_groups == NULL) { + sbi->s_flex_groups = vmalloc(size); + if (sbi->s_flex_groups) + memset(sbi->s_flex_groups, 0, size); + } if (sbi->s_flex_groups == NULL) { printk(KERN_ERR "EXT4-fs: not enough memory for " "%u flex groups\n", flex_group_count); @@ -2842,6 +2852,12 @@ failed_mount4: sbi->s_journal = NULL; } failed_mount3: + if (sbi->s_flex_groups) { + if (is_vmalloc_addr(sbi->s_flex_groups)) + vfree(sbi->s_flex_groups); + else + kfree(sbi->s_flex_groups); + } percpu_counter_destroy(&sbi->s_freeblocks_counter); percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); From f7c439504ccba0cca43271e651013ab97a221c62 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 24 Apr 2009 23:31:59 -0400 Subject: [PATCH 0556/2061] ext4: Use is_power_of_2() for clarity Signed-off-by: Robert P. J. Day Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3f4475daa66..3e509bc647e 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1483,7 +1483,7 @@ set_qf_format: return 0; if (option < 0 || option > (1 << 30)) return 0; - if (option & (option - 1)) { + if (!is_power_of_2(option)) { printk(KERN_ERR "EXT4-fs: inode_readahead_blks" " must be a power of 2\n"); return 0; @@ -2101,8 +2101,7 @@ static ssize_t inode_readahead_blks_store(struct ext4_attr *a, if (parse_strtoul(buf, 0x40000000, &t)) return -EINVAL; - /* inode_readahead_blks must be a power of 2 */ - if (t & (t-1)) + if (!is_power_of_2(t)) return -EINVAL; sbi->s_inode_readahead_blks = t; From e2d670523c6c4ccb0fca9f3ab1b8f066d9aa57d6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 00:33:44 -0400 Subject: [PATCH 0557/2061] ext4: Simplify ext4_commit_super()'s function signature The ext4_commit_super() function took both a struct super_block * and a struct ext4_super_block *, but the struct ext4_super_block can be derived from the struct super_block. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3e509bc647e..ad4c9be4abd 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -54,8 +54,7 @@ static struct kset *ext4_kset; static int ext4_load_journal(struct super_block *, struct ext4_super_block *, unsigned long journal_devnum); -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync); +static int ext4_commit_super(struct super_block *sb, int sync); static void ext4_mark_recovery_complete(struct super_block *sb, struct ext4_super_block *es); static void ext4_clear_journal_err(struct super_block *sb, @@ -306,7 +305,7 @@ static void ext4_handle_error(struct super_block *sb) printk(KERN_CRIT "Remounting filesystem read-only\n"); sb->s_flags |= MS_RDONLY; } - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, ERRORS_PANIC)) panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); @@ -448,7 +447,7 @@ __acquires(bitlock) if (test_opt(sb, ERRORS_CONT)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 0); + ext4_commit_super(sb, 0); return; } ext4_unlock_group(sb, grp); @@ -577,7 +576,7 @@ static void ext4_put_super(struct super_block *sb) if (!(sb->s_flags & MS_RDONLY)) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); es->s_state = cpu_to_le16(sbi->s_mount_state); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } if (sbi->s_proc) { remove_proc_entry(sb->s_id, ext4_proc_root); @@ -1596,7 +1595,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es, if (sbi->s_journal) EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); if (test_opt(sb, DEBUG)) printk(KERN_INFO "[EXT4 FS bs=%lu, gc=%u, " "bpg=%lu, ipg=%lu, mo=%04lx]\n", @@ -2655,7 +2654,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) if (test_opt(sb, ERRORS_PANIC)) { EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); goto failed_mount4; } } @@ -3132,15 +3131,15 @@ static int ext4_load_journal(struct super_block *sb, sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } return 0; } -static int ext4_commit_super(struct super_block *sb, - struct ext4_super_block *es, int sync) +static int ext4_commit_super(struct super_block *sb, int sync) { + struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; @@ -3212,7 +3211,7 @@ static void ext4_mark_recovery_complete(struct super_block *sb, sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); sb->s_dirt = 0; - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3253,7 +3252,7 @@ static void ext4_clear_journal_err(struct super_block *sb, EXT4_SB(sb)->s_mount_state |= EXT4_ERROR_FS; es->s_state |= cpu_to_le16(EXT4_ERROR_FS); - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); jbd2_journal_clear_err(journal); } @@ -3293,7 +3292,7 @@ static void ext4_write_super(struct super_block *sb) BUG(); sb->s_dirt = 0; } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); } } @@ -3312,7 +3311,7 @@ static int ext4_sync_fs(struct super_block *sb, int wait) target); } } else { - ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); + ext4_commit_super(sb, wait); } return ret; } @@ -3345,7 +3344,7 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + error = ext4_commit_super(sb, 1); if (error) goto out; } @@ -3365,7 +3364,7 @@ static int ext4_unfreeze(struct super_block *sb) lock_super(sb); /* Reser the needs_recovery flag before the fs is unlocked. */ EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); + ext4_commit_super(sb, 1); unlock_super(sb); jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); } @@ -3520,7 +3519,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) } } if (sbi->s_journal == NULL) - ext4_commit_super(sb, es, 1); + ext4_commit_super(sb, 1); #ifdef CONFIG_QUOTA /* Release old quota file names */ From 7234ab2a55e77784b44cf2d862136d9e41b8d98a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 30 Apr 2009 21:24:04 -0400 Subject: [PATCH 0558/2061] ext4: Fix and simplify s_dirt handling The s_dirt flag wasn't completely handled correctly, but it didn't really matter when journalling was enabled. It turns out that when ext4 runs without a journal, we don't clear s_dirt in places where we should have, with the result that the high-level write_super() function was writing the superblock when it wasn't necessary. So we fix this by making ext4_commit_super() clear the s_dirt flag, and removing many of the other places where s_dirt is manipulated. When journalling is enabled, the s_dirt flag might be left set more often, but s_dirt really doesn't matter when journalling is enabled. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index ad4c9be4abd..7c7a08af120 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3128,7 +3128,6 @@ static int ext4_load_journal(struct super_block *sb, if (journal_devnum && journal_devnum != le32_to_cpu(es->s_journal_dev)) { es->s_journal_dev = cpu_to_le32(journal_devnum); - sb->s_dirt = 1; /* Make sure we flush the recovery flag to disk. */ ext4_commit_super(sb, 1); @@ -3168,7 +3167,7 @@ static int ext4_commit_super(struct super_block *sb, int sync) &EXT4_SB(sb)->s_freeblocks_counter)); es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive( &EXT4_SB(sb)->s_freeinodes_counter)); - + sb->s_dirt = 0; BUFFER_TRACE(sbh, "marking dirty"); mark_buffer_dirty(sbh); if (sync) { @@ -3210,7 +3209,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - sb->s_dirt = 0; ext4_commit_super(sb, 1); } unlock_super(sb); @@ -3271,10 +3269,8 @@ int ext4_force_commit(struct super_block *sb) return 0; journal = EXT4_SB(sb)->s_journal; - if (journal) { - sb->s_dirt = 0; + if (journal) ret = ext4_journal_force_commit(journal); - } return ret; } @@ -3282,15 +3278,13 @@ int ext4_force_commit(struct super_block *sb) /* * Ext4 always journals updates to the superblock itself, so we don't * have to propagate any other updates to the superblock on disk at this - * point. (We can probably nuke this function altogether, and remove - * any mention to sb->s_dirt in all of fs/ext4; eventual cleanup...) + * point if the journalling is enabled. */ static void ext4_write_super(struct super_block *sb) { if (EXT4_SB(sb)->s_journal) { if (mutex_trylock(&sb->s_lock) != 0) BUG(); - sb->s_dirt = 0; } else { ext4_commit_super(sb, 1); } @@ -3302,7 +3296,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { @@ -3324,7 +3317,6 @@ static int ext4_freeze(struct super_block *sb) { int error = 0; journal_t *journal; - sb->s_dirt = 0; if (!(sb->s_flags & MS_RDONLY)) { journal = EXT4_SB(sb)->s_journal; From 9ca92389c5312a51e819c15c762f0abdc7f3129b Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:52:25 -0400 Subject: [PATCH 0559/2061] ext4: Use separate super_operations structure for no_journal filesystems By using a separate super_operations structure for filesystems that have and don't have journals, we can simply ext4_write_super() --- which is only needed when no journal is present --- and ext4_freeze(), ext4_unfreeze(), and ext4_sync_fs(), which are only needed when the journal is present. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 108 +++++++++++++++++++++++++----------------------- 1 file changed, 57 insertions(+), 51 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7c7a08af120..68c3a44c4a9 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -995,7 +995,6 @@ static const struct super_operations ext4_sops = { .dirty_inode = ext4_dirty_inode, .delete_inode = ext4_delete_inode, .put_super = ext4_put_super, - .write_super = ext4_write_super, .sync_fs = ext4_sync_fs, .freeze_fs = ext4_freeze, .unfreeze_fs = ext4_unfreeze, @@ -1010,6 +1009,25 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; +static const struct super_operations ext4_nojournal_sops = { + .alloc_inode = ext4_alloc_inode, + .destroy_inode = ext4_destroy_inode, + .write_inode = ext4_write_inode, + .dirty_inode = ext4_dirty_inode, + .delete_inode = ext4_delete_inode, + .write_super = ext4_write_super, + .put_super = ext4_put_super, + .statfs = ext4_statfs, + .remount_fs = ext4_remount, + .clear_inode = ext4_clear_inode, + .show_options = ext4_show_options, +#ifdef CONFIG_QUOTA + .quota_read = ext4_quota_read, + .quota_write = ext4_quota_write, +#endif + .bdev_try_to_free_page = bdev_try_to_free_page, +}; + static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -2615,7 +2633,11 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - sb->s_op = &ext4_sops; + if (!test_opt(sb, NOLOAD) && + EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) + sb->s_op = &ext4_sops; + else + sb->s_op = &ext4_nojournal_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -3275,19 +3297,9 @@ int ext4_force_commit(struct super_block *sb) return ret; } -/* - * Ext4 always journals updates to the superblock itself, so we don't - * have to propagate any other updates to the superblock on disk at this - * point if the journalling is enabled. - */ static void ext4_write_super(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal) { - if (mutex_trylock(&sb->s_lock) != 0) - BUG(); - } else { - ext4_commit_super(sb, 1); - } + ext4_commit_super(sb, 1); } static int ext4_sync_fs(struct super_block *sb, int wait) @@ -3296,15 +3308,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); - if (EXT4_SB(sb)->s_journal) { - if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, - &target)) { - if (wait) - jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, - target); - } - } else { - ext4_commit_super(sb, wait); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, target); } return ret; } @@ -3318,32 +3324,31 @@ static int ext4_freeze(struct super_block *sb) int error = 0; journal_t *journal; - if (!(sb->s_flags & MS_RDONLY)) { - journal = EXT4_SB(sb)->s_journal; + if (sb->s_flags & MS_RDONLY) + return 0; - if (journal) { - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + journal = EXT4_SB(sb)->s_journal; - /* - * We don't want to clear needs_recovery flag when we - * failed to flush the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; - } + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* Journal blocked and flushed, clear needs_recovery flag. */ - EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - error = ext4_commit_super(sb, 1); - if (error) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to flush + * the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) { + out: + jbd2_journal_unlock_updates(journal); + return error; } + + /* Journal blocked and flushed, clear needs_recovery flag. */ + EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + error = ext4_commit_super(sb, 1); + if (error) + goto out; return 0; -out: - jbd2_journal_unlock_updates(journal); - return error; } /* @@ -3352,14 +3357,15 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (EXT4_SB(sb)->s_journal && !(sb->s_flags & MS_RDONLY)) { - lock_super(sb); - /* Reser the needs_recovery flag before the fs is unlocked. */ - EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, 1); - unlock_super(sb); - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); - } + if (sb->s_flags & MS_RDONLY) + return 0; + + lock_super(sb); + /* Reset the needs_recovery flag before the fs is unlocked. */ + EXT4_SET_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); + ext4_commit_super(sb, 1); + unlock_super(sb); + jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); return 0; } From 8df9675f8b498d0bfa1f0b5b06f56bf1ff366dd5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 08:50:38 -0400 Subject: [PATCH 0560/2061] ext4: Avoid races caused by on-line resizing and SMP memory reordering Ext4's on-line resizing adds a new block group and then, only at the last step adjusts s_groups_count. However, it's possible on SMP systems that another CPU could see the updated the s_group_count and not see the newly initialized data structures for the just-added block group. For this reason, it's important to insert a SMP read barrier after reading s_groups_count and before reading any (for example) the new block group descriptors allowed by the increased value of s_groups_count. Unfortunately, we rather blatently violate this locking protocol documented in fs/ext4/resize.c. Fortunately, (1) on-line resizes happen relatively rarely, and (2) it seems rare that the filesystem code will immediately try to use just-added block group before any memory ordering issues resolve themselves. So apparently problems here are relatively hard to hit, since ext3 has been vulnerable to the same issue for years with no one apparently complaining. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 15 +++++++-------- fs/ext4/ext4.h | 12 ++++++++++++ fs/ext4/ialloc.c | 40 +++++++++++++++++++--------------------- fs/ext4/inode.c | 7 ++++--- fs/ext4/mballoc.c | 45 ++++++++++++++++++++++++--------------------- fs/ext4/super.c | 3 +-- 6 files changed, 67 insertions(+), 55 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 53c72ad8587..a5ba039850c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -88,6 +88,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) { int bit, bit_max; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned free_blocks, group_blocks; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -123,7 +124,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, bit_max += ext4_bg_num_gdb(sb, block_group); } - if (block_group == sbi->s_groups_count - 1) { + if (block_group == ngroups - 1) { /* * Even though mke2fs always initialize first and last group * if some other tool enabled the EXT4_BG_BLOCK_UNINIT we need @@ -131,7 +132,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, */ group_blocks = ext4_blocks_count(sbi->s_es) - le32_to_cpu(sbi->s_es->s_first_data_block) - - (EXT4_BLOCKS_PER_GROUP(sb) * (sbi->s_groups_count - 1)); + (EXT4_BLOCKS_PER_GROUP(sb) * (ngroups - 1)); } else { group_blocks = EXT4_BLOCKS_PER_GROUP(sb); } @@ -205,18 +206,18 @@ struct ext4_group_desc * ext4_get_group_desc(struct super_block *sb, { unsigned int group_desc; unsigned int offset; + ext4_group_t ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (block_group >= sbi->s_groups_count) { + if (block_group >= ngroups) { ext4_error(sb, "ext4_get_group_desc", "block_group >= groups_count - " "block_group = %u, groups_count = %u", - block_group, sbi->s_groups_count); + block_group, ngroups); return NULL; } - smp_rmb(); group_desc = block_group >> EXT4_DESC_PER_BLOCK_BITS(sb); offset = block_group & (EXT4_DESC_PER_BLOCK(sb) - 1); @@ -665,7 +666,7 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) ext4_fsblk_t desc_count; struct ext4_group_desc *gdp; ext4_group_t i; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; ext4_fsblk_t bitmap_count; @@ -677,7 +678,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) bitmap_count = 0; gdp = NULL; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) @@ -700,7 +700,6 @@ ext4_fsblk_t ext4_count_free_blocks(struct super_block *sb) return bitmap_count; #else desc_count = 0; - smp_rmb(); for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d0f15ef56de..02ec44bf38e 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1228,6 +1228,18 @@ struct ext4_group_info *ext4_get_group_info(struct super_block *sb, return grp_info[indexv][indexh]; } +/* + * Reading s_groups_count requires using smp_rmb() afterwards. See + * the locking protocol documented in the comments of ext4_group_add() + * in resize.c + */ +static inline ext4_group_t ext4_get_groups_count(struct super_block *sb) +{ + ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + + smp_rmb(); + return ngroups; +} static inline ext4_group_t ext4_flex_group(struct ext4_sb_info *sbi, ext4_group_t block_group) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18e0a08a6b..55ba419ca00 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -316,7 +316,7 @@ error_return: static int find_group_dir(struct super_block *sb, struct inode *parent, ext4_group_t *best_group) { - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); unsigned int freei, avefreei; struct ext4_group_desc *desc, *best_desc = NULL; ext4_group_t group; @@ -353,7 +353,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t ngroups = ext4_get_groups_count(sb); int flex_size = ext4_flex_bg_size(sbi); ext4_group_t best_flex = parent_fbg_group; int blocks_per_flex = sbi->s_blocks_per_group * flex_size; @@ -362,7 +362,7 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, ext4_group_t n_fbg_groups; ext4_group_t i; - n_fbg_groups = (sbi->s_groups_count + flex_size - 1) >> + n_fbg_groups = (ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; find_close_to_parent: @@ -478,20 +478,21 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; struct ext4_sb_info *sbi = EXT4_SB(sb); - ext4_group_t ngroups = sbi->s_groups_count; + ext4_group_t real_ngroups = ext4_get_groups_count(sb); int inodes_per_group = EXT4_INODES_PER_GROUP(sb); unsigned int freei, avefreei; ext4_fsblk_t freeb, avefreeb; unsigned int ndirs; int max_dirs, min_inodes; ext4_grpblk_t min_blocks; - ext4_group_t i, grp, g; + ext4_group_t i, grp, g, ngroups; struct ext4_group_desc *desc; struct orlov_stats stats; int flex_size = ext4_flex_bg_size(sbi); + ngroups = real_ngroups; if (flex_size > 1) { - ngroups = (ngroups + flex_size - 1) >> + ngroups = (real_ngroups + flex_size - 1) >> sbi->s_log_groups_per_flex; parent_group >>= sbi->s_log_groups_per_flex; } @@ -543,7 +544,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, */ grp *= flex_size; for (i = 0; i < flex_size; i++) { - if (grp+i >= sbi->s_groups_count) + if (grp+i >= real_ngroups) break; desc = ext4_get_group_desc(sb, grp+i, NULL); if (desc && ext4_free_inodes_count(sb, desc)) { @@ -583,7 +584,7 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent, } fallback: - ngroups = sbi->s_groups_count; + ngroups = real_ngroups; avefreei = freei / ngroups; fallback_retry: parent_group = EXT4_I(parent)->i_block_group; @@ -613,9 +614,8 @@ static int find_group_other(struct super_block *sb, struct inode *parent, ext4_group_t *group, int mode) { ext4_group_t parent_group = EXT4_I(parent)->i_block_group; - ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count; + ext4_group_t i, last, ngroups = ext4_get_groups_count(sb); struct ext4_group_desc *desc; - ext4_group_t i, last; int flex_size = ext4_flex_bg_size(EXT4_SB(sb)); /* @@ -799,11 +799,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct super_block *sb; struct buffer_head *inode_bitmap_bh = NULL; struct buffer_head *group_desc_bh; - ext4_group_t group = 0; + ext4_group_t ngroups, group = 0; unsigned long ino = 0; struct inode *inode; struct ext4_group_desc *gdp = NULL; - struct ext4_super_block *es; struct ext4_inode_info *ei; struct ext4_sb_info *sbi; int ret2, err = 0; @@ -818,15 +817,14 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) return ERR_PTR(-EPERM); sb = dir->i_sb; + ngroups = ext4_get_groups_count(sb); trace_mark(ext4_request_inode, "dev %s dir %lu mode %d", sb->s_id, dir->i_ino, mode); inode = new_inode(sb); if (!inode) return ERR_PTR(-ENOMEM); ei = EXT4_I(inode); - sbi = EXT4_SB(sb); - es = sbi->s_es; if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) { ret2 = find_group_flex(sb, dir, &group); @@ -856,7 +854,7 @@ got_group: if (ret2 == -1) goto out; - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { err = -EIO; gdp = ext4_get_group_desc(sb, group, &group_desc_bh); @@ -917,7 +915,7 @@ repeat_in_this_group: * group descriptor metadata has not yet been updated. * So we just go onto the next blockgroup. */ - if (++group == sbi->s_groups_count) + if (++group == ngroups) group = 0; } err = -ENOSPC; @@ -1158,7 +1156,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) { unsigned long desc_count; struct ext4_group_desc *gdp; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); #ifdef EXT4FS_DEBUG struct ext4_super_block *es; unsigned long bitmap_count, x; @@ -1168,7 +1166,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) desc_count = 0; bitmap_count = 0; gdp = NULL; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1190,7 +1188,7 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) return desc_count; #else desc_count = 0; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; @@ -1205,9 +1203,9 @@ unsigned long ext4_count_free_inodes(struct super_block *sb) unsigned long ext4_count_dirs(struct super_block * sb) { unsigned long count = 0; - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { struct ext4_group_desc *gdp = ext4_get_group_desc(sb, i, NULL); if (!gdp) continue; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 96f3366f59f..4e7f363e303 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4965,7 +4965,8 @@ static int ext4_index_trans_blocks(struct inode *inode, int nrblocks, int chunk) */ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) { - int groups, gdpblocks; + ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); + int gdpblocks; int idxblocks; int ret = 0; @@ -4992,8 +4993,8 @@ int ext4_meta_trans_blocks(struct inode *inode, int nrblocks, int chunk) groups += nrblocks; gdpblocks = groups; - if (groups > EXT4_SB(inode->i_sb)->s_groups_count) - groups = EXT4_SB(inode->i_sb)->s_groups_count; + if (groups > ngroups) + groups = ngroups; if (groups > EXT4_SB(inode->i_sb)->s_gdb_count) gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index f871677a798..c3af9e6b666 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -739,6 +739,7 @@ static void ext4_mb_generate_buddy(struct super_block *sb, static int ext4_mb_init_cache(struct page *page, char *incore) { + ext4_group_t ngroups; int blocksize; int blocks_per_page; int groups_per_page; @@ -757,6 +758,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) inode = page->mapping->host; sb = inode->i_sb; + ngroups = ext4_get_groups_count(sb); blocksize = 1 << inode->i_blkbits; blocks_per_page = PAGE_CACHE_SIZE / blocksize; @@ -780,7 +782,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) for (i = 0; i < groups_per_page; i++) { struct ext4_group_desc *desc; - if (first_group + i >= EXT4_SB(sb)->s_groups_count) + if (first_group + i >= ngroups) break; err = -EIO; @@ -852,7 +854,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore) struct ext4_group_info *grinfo; group = (first_block + i) >> 1; - if (group >= EXT4_SB(sb)->s_groups_count) + if (group >= ngroups) break; /* @@ -1788,6 +1790,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) int block, pnum; int blocks_per_page; int groups_per_page; + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t first_group; struct ext4_group_info *grp; @@ -1807,7 +1810,7 @@ int ext4_mb_get_buddy_cache_lock(struct super_block *sb, ext4_group_t group) /* read all groups the page covers into the cache */ for (i = 0; i < groups_per_page; i++) { - if ((first_group + i) >= EXT4_SB(sb)->s_groups_count) + if ((first_group + i) >= ngroups) break; grp = ext4_get_group_info(sb, first_group + i); /* take all groups write allocation @@ -1945,8 +1948,7 @@ err: static noinline_for_stack int ext4_mb_regular_allocator(struct ext4_allocation_context *ac) { - ext4_group_t group; - ext4_group_t i; + ext4_group_t ngroups, group, i; int cr; int err = 0; int bsbits; @@ -1957,6 +1959,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) sb = ac->ac_sb; sbi = EXT4_SB(sb); + ngroups = ext4_get_groups_count(sb); BUG_ON(ac->ac_status == AC_STATUS_FOUND); /* first, try the goal */ @@ -2017,11 +2020,11 @@ repeat: */ group = ac->ac_g_ex.fe_group; - for (i = 0; i < EXT4_SB(sb)->s_groups_count; group++, i++) { + for (i = 0; i < ngroups; group++, i++) { struct ext4_group_info *grp; struct ext4_group_desc *desc; - if (group == EXT4_SB(sb)->s_groups_count) + if (group == ngroups) group = 0; /* quick check to skip empty groups */ @@ -2315,12 +2318,10 @@ static struct file_operations ext4_mb_seq_history_fops = { static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; - group = *pos + 1; return (void *) ((unsigned long) group); } @@ -2328,11 +2329,10 @@ static void *ext4_mb_seq_groups_start(struct seq_file *seq, loff_t *pos) static void *ext4_mb_seq_groups_next(struct seq_file *seq, void *v, loff_t *pos) { struct super_block *sb = seq->private; - struct ext4_sb_info *sbi = EXT4_SB(sb); ext4_group_t group; ++*pos; - if (*pos < 0 || *pos >= sbi->s_groups_count) + if (*pos < 0 || *pos >= ext4_get_groups_count(sb)) return NULL; group = *pos + 1; return (void *) ((unsigned long) group); @@ -2587,6 +2587,7 @@ void ext4_mb_update_group_info(struct ext4_group_info *grp, ext4_grpblk_t add) static int ext4_mb_init_backend(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int metalen; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -2598,7 +2599,7 @@ static int ext4_mb_init_backend(struct super_block *sb) struct ext4_group_desc *desc; /* This is the number of blocks used by GDT */ - num_meta_group_infos = (sbi->s_groups_count + EXT4_DESC_PER_BLOCK(sb) - + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); /* @@ -2644,7 +2645,7 @@ static int ext4_mb_init_backend(struct super_block *sb) for (i = 0; i < num_meta_group_infos; i++) { if ((i + 1) == num_meta_group_infos) metalen = sizeof(*meta_group_info) * - (sbi->s_groups_count - + (ngroups - (i << EXT4_DESC_PER_BLOCK_BITS(sb))); meta_group_info = kmalloc(metalen, GFP_KERNEL); if (meta_group_info == NULL) { @@ -2655,7 +2656,7 @@ static int ext4_mb_init_backend(struct super_block *sb) sbi->s_group_info[i] = meta_group_info; } - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { desc = ext4_get_group_desc(sb, i, NULL); if (desc == NULL) { printk(KERN_ERR @@ -2781,13 +2782,14 @@ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) int ext4_mb_release(struct super_block *sb) { + ext4_group_t ngroups = ext4_get_groups_count(sb); ext4_group_t i; int num_meta_group_infos; struct ext4_group_info *grinfo; struct ext4_sb_info *sbi = EXT4_SB(sb); if (sbi->s_group_info) { - for (i = 0; i < sbi->s_groups_count; i++) { + for (i = 0; i < ngroups; i++) { grinfo = ext4_get_group_info(sb, i); #ifdef DOUBLE_CHECK kfree(grinfo->bb_bitmap); @@ -2797,7 +2799,7 @@ int ext4_mb_release(struct super_block *sb) ext4_unlock_group(sb, i); kfree(grinfo); } - num_meta_group_infos = (sbi->s_groups_count + + num_meta_group_infos = (ngroups + EXT4_DESC_PER_BLOCK(sb) - 1) >> EXT4_DESC_PER_BLOCK_BITS(sb); for (i = 0; i < num_meta_group_infos; i++) @@ -4121,7 +4123,7 @@ static void ext4_mb_return_to_preallocation(struct inode *inode, static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - ext4_group_t i; + ext4_group_t ngroups, i; printk(KERN_ERR "EXT4-fs: Can't allocate:" " Allocation context details:\n"); @@ -4145,7 +4147,8 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) printk(KERN_ERR "EXT4-fs: %lu scanned, %d found\n", ac->ac_ex_scanned, ac->ac_found); printk(KERN_ERR "EXT4-fs: groups: \n"); - for (i = 0; i < EXT4_SB(sb)->s_groups_count; i++) { + ngroups = ext4_get_groups_count(sb); + for (i = 0; i < ngroups; i++) { struct ext4_group_info *grp = ext4_get_group_info(sb, i); struct ext4_prealloc_space *pa; ext4_grpblk_t start; @@ -4469,13 +4472,13 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) static int ext4_mb_discard_preallocations(struct super_block *sb, int needed) { - ext4_group_t i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); int ret; int freed = 0; trace_mark(ext4_mb_discard_preallocations, "dev %s needed %d", sb->s_id, needed); - for (i = 0; i < EXT4_SB(sb)->s_groups_count && needed > 0; i++) { + for (i = 0; i < ngroups && needed > 0; i++) { ret = ext4_mb_discard_group_preallocations(sb, i, needed); freed += ret; needed -= ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68c3a44c4a9..fcd7b24c6df 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3557,9 +3557,8 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) if (test_opt(sb, MINIX_DF)) { sbi->s_overhead_last = 0; } else if (sbi->s_blocks_last != ext4_blocks_count(es)) { - ext4_group_t ngroups = sbi->s_groups_count, i; + ext4_group_t i, ngroups = ext4_get_groups_count(sb); ext4_fsblk_t overhead = 0; - smp_rmb(); /* * Compute the overhead (FS structures). This is constant From 114e9fc90703bd6aac0229fb559e97caa6c49770 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 15:48:07 -0400 Subject: [PATCH 0561/2061] ext4: Remove outdated comment about lock_super() ext4_fill_super() is no longer called by read_super(), and it is no longer called with the superblock locked. The unlock_super()/lock_super() is no longer present, so this comment is entirely superfluous. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index fcd7b24c6df..e3b35f26d5f 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2828,14 +2828,6 @@ no_journal: goto failed_mount4; }; - /* - * akpm: core read_super() calls in here with the superblock locked. - * That deadlocks, because orphan cleanup needs to lock the superblock - * in numerous places. Here we just pop the lock - it's relatively - * harmless, because we are now ready to accept write_super() requests, - * and aviro says that's the only reason for hanging onto the - * superblock lock. - */ EXT4_SB(sb)->s_mount_state |= EXT4_ORPHAN_FS; ext4_orphan_cleanup(sb, es); EXT4_SB(sb)->s_mount_state &= ~EXT4_ORPHAN_FS; From a63c9eb2ce6f5028da90f282798232c4f398ceb8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 01:59:42 -0400 Subject: [PATCH 0562/2061] ext4: ext4_mark_recovery_complete() doesn't need to use lock_super The function ext4_mark_recovery_complete() is called from two call paths: either (a) while mounting the filesystem, in which case there's no danger of any other CPU calling write_super() until the mount is completed, and (b) while remounting the filesystem read-write, in which case the fs core has already locked the superblock. This also allows us to take out a very vile unlock_super()/lock_super() pair in ext4_remount(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e3b35f26d5f..45d0ada9bfc 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3219,13 +3219,11 @@ static void ext4_mark_recovery_complete(struct super_block *sb, if (jbd2_journal_flush(journal) < 0) goto out; - lock_super(sb); if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER) && sb->s_flags & MS_RDONLY) { EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); ext4_commit_super(sb, 1); } - unlock_super(sb); out: jbd2_journal_unlock_updates(journal); @@ -3436,15 +3434,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) (sbi->s_mount_state & EXT4_VALID_FS)) es->s_state = cpu_to_le16(sbi->s_mount_state); - /* - * We have to unlock super so that we can wait for - * transactions. - */ - if (sbi->s_journal) { - unlock_super(sb); + if (sbi->s_journal) ext4_mark_recovery_complete(sb, es); - lock_super(sb); - } } else { int ret; if ((ret = EXT4_HAS_RO_COMPAT_FEATURE(sb, From 3b9d4ed26680771295d904a6b83e88e620780893 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:54:04 -0400 Subject: [PATCH 0563/2061] ext4: Replace lock/unlock_super() with an explicit lock for the orphan list Use a separate lock to protect the orphan list, so we can stop overloading the use of lock_super(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/namei.c | 20 +++++++++++--------- fs/ext4/super.c | 1 + 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 57b71fefbcc..4bda2f75d42 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -71,6 +71,7 @@ struct ext4_sb_info { struct inode *s_journal_inode; struct journal_s *s_journal; struct list_head s_orphan; + struct mutex s_orphan_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 22098e1cd08..8018e49a728 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -1997,7 +1997,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_orphan_lock); if (!list_empty(&EXT4_I(inode)->i_orphan)) goto out_unlock; @@ -2006,9 +2006,13 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) /* @@@ FIXME: Observation from aviro: * I think I can trigger J_ASSERT in ext4_orphan_add(). We block - * here (on lock_super()), so race with ext4_link() which might bump + * here (on s_orphan_lock), so race with ext4_link() which might bump * ->i_nlink. For, say it, character device. Not a regular file, * not a directory, not a symlink and ->i_nlink > 0. + * + * tytso, 4/25/2009: I'm not sure how that could happen; + * shouldn't the fs core protect us from these sort of + * unlink()/link() races? */ J_ASSERT((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) || inode->i_nlink == 0); @@ -2045,7 +2049,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) jbd_debug(4, "orphan inode %lu will point to %d\n", inode->i_ino, NEXT_ORPHAN(inode)); out_unlock: - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_orphan_lock); ext4_std_error(inode->i_sb, err); return err; } @@ -2066,11 +2070,9 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) if (!ext4_handle_valid(handle)) return 0; - lock_super(inode->i_sb); - if (list_empty(&ei->i_orphan)) { - unlock_super(inode->i_sb); - return 0; - } + mutex_lock(&EXT4_SB(inode->i_sb)->s_orphan_lock); + if (list_empty(&ei->i_orphan)) + goto out; ino_next = NEXT_ORPHAN(inode); prev = ei->i_orphan.prev; @@ -2120,7 +2122,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) out_err: ext4_std_error(inode->i_sb, err); out: - unlock_super(inode->i_sb); + mutex_unlock(&EXT4_SB(inode->i_sb)->s_orphan_lock); return err; out_brelse: diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 45d0ada9bfc..7f43fde9554 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2645,6 +2645,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->dq_op = &ext4_quota_operations; #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ + mutex_init(&sbi->s_orphan_lock); sb->s_root = NULL; From 32ed5058ce90024efcd811254b4b1de0468099df Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 25 Apr 2009 22:53:39 -0400 Subject: [PATCH 0564/2061] ext4: Replace lock/unlock_super() with an explicit lock for resizing Use a separate lock to protect s_groups_count and the other block group descriptors which get changed via an on-line resize operation, so we can stop overloading the use of lock_super(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4_sb.h | 1 + fs/ext4/resize.c | 35 ++++++++++++++++++----------------- fs/ext4/super.c | 1 + 3 files changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h index 4bda2f75d42..2d36223d5f5 100644 --- a/fs/ext4/ext4_sb.h +++ b/fs/ext4/ext4_sb.h @@ -72,6 +72,7 @@ struct ext4_sb_info { struct journal_s *s_journal; struct list_head s_orphan; struct mutex s_orphan_lock; + struct mutex s_resize_lock; unsigned long s_commit_interval; u32 s_max_batch_time; u32 s_min_batch_time; diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 546c7dd869e..e8ded13b5cb 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -193,7 +193,7 @@ static int setup_new_group_blocks(struct super_block *sb, if (IS_ERR(handle)) return PTR_ERR(handle); - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { err = -EBUSY; goto exit_journal; @@ -302,7 +302,7 @@ exit_bh: brelse(bh); exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; @@ -643,11 +643,12 @@ exit_free: * important part is that the new block and inode counts are in the backup * superblocks, and the location of the new group metadata in the GDT backups. * - * We do not need lock_super() for this, because these blocks are not - * otherwise touched by the filesystem code when it is mounted. We don't - * need to worry about last changing from sbi->s_groups_count, because the - * worst that can happen is that we do not copy the full number of backups - * at this time. The resize which changed s_groups_count will backup again. + * We do not need take the s_resize_lock for this, because these + * blocks are not otherwise touched by the filesystem code when it is + * mounted. We don't need to worry about last changing from + * sbi->s_groups_count, because the worst that can happen is that we + * do not copy the full number of backups at this time. The resize + * which changed s_groups_count will backup again. */ static void update_backups(struct super_block *sb, int blk_off, char *data, int size) @@ -809,7 +810,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) goto exit_put; } - lock_super(sb); + mutex_lock(&sbi->s_resize_lock); if (input->group != sbi->s_groups_count) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); @@ -840,7 +841,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) /* * OK, now we've set up the new group. Time to make it active. * - * Current kernels don't lock all allocations via lock_super(), + * We do not lock all allocations via s_resize_lock * so we have to be safe wrt. concurrent accesses the group * data. So we need to be careful to set all of the relevant * group descriptor data etc. *before* we enable the group. @@ -900,12 +901,12 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) * * The precise rules we use are: * - * * Writers of s_groups_count *must* hold lock_super + * * Writers of s_groups_count *must* hold s_resize_lock * AND * * Writers must perform a smp_wmb() after updating all dependent * data and before modifying the groups count * - * * Readers must hold lock_super() over the access + * * Readers must hold s_resize_lock over the access * OR * * Readers must perform an smp_rmb() after reading the groups count * and before reading any dependent data. @@ -948,7 +949,7 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input) sb->s_dirt = 1; exit_journal: - unlock_super(sb); + mutex_unlock(&sbi->s_resize_lock); if ((err2 = ext4_journal_stop(handle)) && !err) err = err2; if (!err) { @@ -986,7 +987,7 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, /* We don't need to worry about locking wrt other resizers just * yet: we're going to revalidate es->s_blocks_count after - * taking lock_super() below. */ + * taking the s_resize_lock below. */ o_blocks_count = ext4_blocks_count(es); o_groups_count = EXT4_SB(sb)->s_groups_count; @@ -1056,11 +1057,11 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, goto exit_put; } - lock_super(sb); + mutex_lock(&EXT4_SB(sb)->s_resize_lock); if (o_blocks_count != ext4_blocks_count(es)) { ext4_warning(sb, __func__, "multiple resizers run on filesystem!"); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); err = -EBUSY; goto exit_put; @@ -1070,14 +1071,14 @@ int ext4_group_extend(struct super_block *sb, struct ext4_super_block *es, EXT4_SB(sb)->s_sbh))) { ext4_warning(sb, __func__, "error %d on journal write access", err); - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_journal_stop(handle); goto exit_put; } ext4_blocks_count_set(es, o_blocks_count + add); ext4_handle_dirty_metadata(handle, NULL, EXT4_SB(sb)->s_sbh); sb->s_dirt = 1; - unlock_super(sb); + mutex_unlock(&EXT4_SB(sb)->s_resize_lock); ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count, o_blocks_count + add); /* We add the blocks to the bitmap and set the group need init bit */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f43fde9554..1fbf0906ae2 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2646,6 +2646,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) #endif INIT_LIST_HEAD(&sbi->s_orphan); /* unlinked but open files */ mutex_init(&sbi->s_orphan_lock); + mutex_init(&sbi->s_resize_lock); sb->s_root = NULL; From 701970b3a83cc639c1ec8fc6f40a7871cb99426f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 24 Apr 2009 23:11:22 -0400 Subject: [PATCH 0565/2061] tracing/events: make modules have their own file_operations structure For proper module reference counting, the file_operations that modules use must have the "owner" field set to the module. Unfortunately, the trace events use share file_operations. The same file_operations are used by all both kernel core and all modules. This patch makes the modules allocate their own file_operations and copies the functions from the core kernel. This allows those file operations to be owned by the module. Care is taken to free this code on module unload. Thanks to Greg KH for reminding me that file_operations must be owned by the module to have reference counting take place. [ Impact: fix modular tracepoints / potential crash ] Signed-off-by: Steven Rostedt Acked-by: Greg Kroah-Hartman --- kernel/trace/trace_events.c | 95 ++++++++++++++++++++++++++++++++++--- 1 file changed, 88 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index b9208158808..be4d3a437c1 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -770,7 +770,11 @@ event_subsystem_dir(const char *name, struct dentry *d_events) } static int -event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) +event_create_dir(struct ftrace_event_call *call, struct dentry *d_events, + const struct file_operations *id, + const struct file_operations *enable, + const struct file_operations *filter, + const struct file_operations *format) { struct dentry *entry; int ret; @@ -800,11 +804,11 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) if (call->regfunc) entry = trace_create_file("enable", 0644, call->dir, call, - &ftrace_enable_fops); + enable); if (call->id) entry = trace_create_file("id", 0444, call->dir, call, - &ftrace_event_id_fops); + id); if (call->define_fields) { ret = call->define_fields(); @@ -814,7 +818,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return ret; } entry = trace_create_file("filter", 0644, call->dir, call, - &ftrace_event_filter_fops); + filter); } /* A trace may not want to export its format */ @@ -822,7 +826,7 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) return 0; entry = trace_create_file("format", 0444, call->dir, call, - &ftrace_event_format_fops); + format); return 0; } @@ -833,8 +837,60 @@ event_create_dir(struct ftrace_event_call *call, struct dentry *d_events) event++) #ifdef CONFIG_MODULES + +static LIST_HEAD(ftrace_module_file_list); + +/* + * Modules must own their file_operations to keep up with + * reference counting. + */ +struct ftrace_module_file_ops { + struct list_head list; + struct module *mod; + struct file_operations id; + struct file_operations enable; + struct file_operations format; + struct file_operations filter; +}; + +static struct ftrace_module_file_ops * +trace_create_file_ops(struct module *mod) +{ + struct ftrace_module_file_ops *file_ops; + + /* + * This is a bit of a PITA. To allow for correct reference + * counting, modules must "own" their file_operations. + * To do this, we allocate the file operations that will be + * used in the event directory. + */ + + file_ops = kmalloc(sizeof(*file_ops), GFP_KERNEL); + if (!file_ops) + return NULL; + + file_ops->mod = mod; + + file_ops->id = ftrace_event_id_fops; + file_ops->id.owner = mod; + + file_ops->enable = ftrace_enable_fops; + file_ops->enable.owner = mod; + + file_ops->filter = ftrace_event_filter_fops; + file_ops->filter.owner = mod; + + file_ops->format = ftrace_event_format_fops; + file_ops->format.owner = mod; + + list_add(&file_ops->list, &ftrace_module_file_list); + + return file_ops; +} + static void trace_module_add_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops = NULL; struct ftrace_event_call *call, *start, *end; struct dentry *d_events; @@ -852,14 +908,27 @@ static void trace_module_add_events(struct module *mod) /* The linker may leave blanks */ if (!call->name) continue; + + /* + * This module has events, create file ops for this module + * if not already done. + */ + if (!file_ops) { + file_ops = trace_create_file_ops(mod); + if (!file_ops) + return; + } call->mod = mod; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, + &file_ops->id, &file_ops->enable, + &file_ops->filter, &file_ops->format); } } static void trace_module_remove_events(struct module *mod) { + struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; list_for_each_entry_safe(call, p, &ftrace_events, list) { @@ -874,6 +943,16 @@ static void trace_module_remove_events(struct module *mod) list_del(&call->list); } } + + /* Now free the file_operations */ + list_for_each_entry(file_ops, &ftrace_module_file_list, list) { + if (file_ops->mod == mod) + break; + } + if (&file_ops->list != &ftrace_module_file_list) { + list_del(&file_ops->list); + kfree(file_ops); + } } static int trace_module_notify(struct notifier_block *self, @@ -954,7 +1033,9 @@ static __init int event_trace_init(void) if (!call->name) continue; list_add(&call->list, &ftrace_events); - event_create_dir(call, d_events); + event_create_dir(call, d_events, &ftrace_event_id_fops, + &ftrace_enable_fops, &ftrace_event_filter_fops, + &ftrace_event_format_fops); } ret = register_module_notifier(&trace_module_nb); From 0a3ec21fcd311b26ab0f249d62960e127bc20ca8 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Sun, 26 Apr 2009 23:07:42 +0200 Subject: [PATCH 0566/2061] x86: beautify vmlinux_64.lds.S Beautify vmlinux_64.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments There is no functional changes in this patch The beautification is done to prepare a unification of the _32 and the _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090426210742.GA3464@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_64.lds.S | 430 +++++++++++++++++-------------- 1 file changed, 234 insertions(+), 196 deletions(-) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index c8742507b03..6d5a5b05eaa 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -15,69 +15,79 @@ OUTPUT_ARCH(i386:x86-64) ENTRY(phys_startup_64) jiffies_64 = jiffies; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + user PT_LOAD FLAGS(7); /* RWE */ data.init PT_LOAD FLAGS(7); /* RWE */ #ifdef CONFIG_SMP percpu PT_LOAD FLAGS(7); /* RWE */ #endif data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; - NOTES :text :note + /* Text and read-only data */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; + /* First the code that has to be first for bootstrapping */ + *(.text.head) + _stext = .; + /* Then the rest */ + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + NOTES :text :note - RODATA + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 - . = ALIGN(PAGE_SIZE); /* Align data segment to page size boundary */ - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - _edata = .; /* End of data section */ + RODATA + + /* Align data segment to page size boundary */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + /* End of data section */ + _edata = .; } :data - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + *(.data.cacheline_aligned) + } + + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + } #define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) #define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) #define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) @@ -85,37 +95,53 @@ SECTIONS #define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) #define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { *(.vsyscall_0) } :user - __vsyscall_0 = VSYSCALL_VIRT_ADDR; + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { *(.vsyscall_fn) } - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) - { *(.vsyscall_gtod_data) } - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) - { *(.vsyscall_clock) } - vsyscall_clock = VVIRT(.vsyscall_clock); + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) - { *(.vsyscall_1) } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) - { *(.vsyscall_2) } + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { *(.vgetcpu_mode) } - vgetcpu_mode = VVIRT(.vgetcpu_mode); + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { *(.jiffies) } - jiffies = VVIRT(.jiffies); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) - { *(.vsyscall_3) } + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; #undef VSYSCALL_ADDR #undef VSYSCALL_PHYS_ADDR @@ -125,156 +151,168 @@ SECTIONS #undef VVIRT_OFFSET #undef VVIRT - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); /* init_task */ - *(.data.init_task) - }:data.init + /* init_task */ + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + . = ALIGN(THREAD_SIZE); + *(.data.init_task) + } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + *(.data.page_aligned) + } + + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + . = ALIGN(PAGE_SIZE); + __smp_alt_begin = .; + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + __smp_alt_end = .; + } + + /* Init code and data */ . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + __initdata_begin = .; + INIT_DATA + __initdata_end = .; + } - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + . = ALIGN(16); + __setup_start = .; + *(.init.setup) + __setup_end = .; + } - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + . = ALIGN(8); + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } #ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif #ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) #else - PERCPU(PAGE_SIZE) + PERCPU(PAGE_SIZE) #endif - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 /* use another section data.init2, see PERCPU_VADDR() above */ + __init_end = .; - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) + __bss_stop = .; } - STABS_DEBUG + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } - DWARF_DEBUG + _end = . ; + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } - /* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ #define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load INIT_PER_CPU(gdt_page); INIT_PER_CPU(irq_stack_union); From 51b26ada79b605ed709ddcedbb6012e8f8e0ebed Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 26 Apr 2009 10:12:47 -0700 Subject: [PATCH 0567/2061] x86: unify arch/x86/boot/compressed/vmlinux_*.lds Look at the: diff -u arch/x86/boot/compressed/vmlinux_*.lds output and realize that they're basially exactly the same except for trivial naming differences, and the fact that the 64-bit version has a "pgtable" thing. So unify them. There's some trivial cleanup there (make the output format a Kconfig thing rather than doing #ifdef's for it, and unify both 32-bit and 64-bit BSS end to "_ebss", where 32-bit used to use the traditional "_end"), but other than that it's really very mindless and straigt conversion. For example, I think we should aim to remove "startup_32" vs "startup_64", and just call it "startup", and get rid of one more difference. I didn't do that. Also, notice the comment in the unified vmlinux.lds.S talks about "head_64" and "startup_32" which is an odd and incorrect mix, but that was actually what the old 64-bit only lds file had, so the confusion isn't new, and now that mixing is arguably more accurate thanks to the vmlinux.lds.S file being shared between the two cases ;) [ Impact: cleanup, unification ] Signed-off-by: Linus Torvalds Acked-by: Sam Ravnborg Signed-off-by: Ingo Molnar --- arch/x86/Kconfig | 5 +++ arch/x86/boot/compressed/Makefile | 2 +- arch/x86/boot/compressed/head_32.S | 8 ++-- .../{vmlinux_64.lds => vmlinux.lds.S} | 11 ++++- arch/x86/boot/compressed/vmlinux_32.lds | 43 ------------------- 5 files changed, 20 insertions(+), 49 deletions(-) rename arch/x86/boot/compressed/{vmlinux_64.lds => vmlinux.lds.S} (78%) delete mode 100644 arch/x86/boot/compressed/vmlinux_32.lds diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index bc25b9f5e4c..039c3f04aac 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -47,6 +47,11 @@ config X86 select HAVE_KERNEL_BZIP2 select HAVE_KERNEL_LZMA +config OUTPUT_FORMAT + string + default "elf32-i386" if X86_32 + default "elf64-x86-64" if X86_64 + config ARCH_DEFCONFIG string default "arch/x86/configs/i386_defconfig" if X86_32 diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 65551c9f857..0f4b5e2abd3 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,7 +19,7 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T -$(obj)/vmlinux: $(src)/vmlinux_$(BITS).lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE +$(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 3a8a866fb2e..85bd3285706 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -88,9 +88,9 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _end(%ebp), %esi - leal _end(%ebx), %edi - movl $(_end - startup_32), %ecx + leal _ebss(%ebp), %esi + leal _ebss(%ebx), %edi + movl $(_ebss - startup_32), %ecx std rep movsb @@ -121,7 +121,7 @@ relocated: */ xorl %eax,%eax leal _edata(%ebx),%edi - leal _end(%ebx), %ecx + leal _ebss(%ebx), %ecx subl %edi,%ecx cld rep diff --git a/arch/x86/boot/compressed/vmlinux_64.lds b/arch/x86/boot/compressed/vmlinux.lds.S similarity index 78% rename from arch/x86/boot/compressed/vmlinux_64.lds rename to arch/x86/boot/compressed/vmlinux.lds.S index bef1ac891bc..ffcb19134bf 100644 --- a/arch/x86/boot/compressed/vmlinux_64.lds +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,6 +1,13 @@ -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) +#else +OUTPUT_ARCH(i386) +ENTRY(startup_32) +#endif + SECTIONS { /* Be careful parts of head_64.S assume startup_32 is at @@ -38,11 +45,13 @@ SECTIONS *(.bss) *(.bss.*) *(COMMON) +#ifdef CONFIG_X86_64 . = ALIGN(8); _end_before_pgt = . ; . = ALIGN(4096); pgtable = . ; . = . + 4096 * 6; +#endif _ebss = .; } } diff --git a/arch/x86/boot/compressed/vmlinux_32.lds b/arch/x86/boot/compressed/vmlinux_32.lds deleted file mode 100644 index bb3c48379c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux_32.lds +++ /dev/null @@ -1,43 +0,0 @@ -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(startup_32) -SECTIONS -{ - /* Be careful parts of head_32.S assume startup_32 is at - * address 0. - */ - . = 0; - .text.head : { - _head = . ; - *(.text.head) - _ehead = . ; - } - .rodata.compressed : { - *(.rodata.compressed) - } - .text : { - _text = .; /* Text */ - *(.text) - *(.text.*) - _etext = . ; - } - .rodata : { - _rodata = . ; - *(.rodata) /* read-only data */ - *(.rodata.*) - _erodata = . ; - } - .data : { - _data = . ; - *(.data) - *(.data.*) - _edata = . ; - } - .bss : { - _bss = . ; - *(.bss) - *(.bss.*) - *(COMMON) - _end = . ; - } -} From fc4967b8c6a1540ebce9ac48e44b8d44e7fac971 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 27 Apr 2009 14:06:26 +0900 Subject: [PATCH 0568/2061] sh: update defconfigs for PCI changes. Signed-off-by: Paul Mundt --- arch/sh/configs/ap325rxa_defconfig | 37 ++++++++++-- arch/sh/configs/cayman_defconfig | 71 ++++++++++------------- arch/sh/configs/dreamcast_defconfig | 39 ++++++++++--- arch/sh/configs/edosk7705_defconfig | 29 +++++++-- arch/sh/configs/edosk7760_defconfig | 32 ++++++++-- arch/sh/configs/espt_defconfig | 39 +++++++++++-- arch/sh/configs/hp6xx_defconfig | 35 +++++++++-- arch/sh/configs/landisk_defconfig | 48 ++++++++++++--- arch/sh/configs/lboxre2_defconfig | 44 +++++++++++--- arch/sh/configs/magicpanelr2_defconfig | 30 ++++++++-- arch/sh/configs/microdev_defconfig | 36 ++++++++++-- arch/sh/configs/migor_defconfig | 36 +++++++++--- arch/sh/configs/polaris_defconfig | 30 ++++++++-- arch/sh/configs/r7780mp_defconfig | 36 +++++++++--- arch/sh/configs/r7785rp_defconfig | 36 +++++++++--- arch/sh/configs/rsk7201_defconfig | 37 ++++++++++-- arch/sh/configs/rsk7203_defconfig | 40 ++++++++++--- arch/sh/configs/rts7751r2d1_defconfig | 43 +++++++++++--- arch/sh/configs/rts7751r2dplus_defconfig | 43 +++++++++++--- arch/sh/configs/sdk7780_defconfig | 38 +++++++++--- arch/sh/configs/se7206_defconfig | 35 +++++++++-- arch/sh/configs/se7343_defconfig | 42 ++++++++++++-- arch/sh/configs/se7619_defconfig | 35 +++++++++-- arch/sh/configs/se7705_defconfig | 34 +++++++++-- arch/sh/configs/se7712_defconfig | 30 ++++++++-- arch/sh/configs/se7721_defconfig | 33 +++++++++-- arch/sh/configs/se7722_defconfig | 38 +++++++++--- arch/sh/configs/se7750_defconfig | 34 +++++++++-- arch/sh/configs/se7751_defconfig | 34 +++++++++-- arch/sh/configs/se7780_defconfig | 38 +++++++++--- arch/sh/configs/sh03_defconfig | 43 +++++++++++--- arch/sh/configs/sh7710voipgw_defconfig | 35 +++++++++-- arch/sh/configs/sh7724_generic_defconfig | 4 +- arch/sh/configs/sh7763rdp_defconfig | 35 +++++++++-- arch/sh/configs/sh7785lcr_32bit_defconfig | 37 +++++++++--- arch/sh/configs/sh7785lcr_defconfig | 6 +- arch/sh/configs/shmin_defconfig | 31 ++++++++-- arch/sh/configs/shx3_defconfig | 32 ++++++++-- arch/sh/configs/snapgear_defconfig | 40 ++++++++++--- arch/sh/configs/systemh_defconfig | 39 +++++++++++-- arch/sh/configs/titan_defconfig | 40 ++++++++++--- arch/sh/configs/ul2_defconfig | 37 ++++++++++-- arch/sh/configs/urquell_defconfig | 39 +++++++++++-- 43 files changed, 1247 insertions(+), 303 deletions(-) diff --git a/arch/sh/configs/ap325rxa_defconfig b/arch/sh/configs/ap325rxa_defconfig index c8d982a8a2e..022f70e0ea0 100644 --- a/arch/sh/configs/ap325rxa_defconfig +++ b/arch/sh/configs/ap325rxa_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:46:53 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:42:06 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -74,6 +75,7 @@ CONFIG_EMBEDDED=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -92,12 +94,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -159,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set CONFIG_CPU_SUBTYPE_SH7723=y +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7723=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -573,6 +576,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -695,6 +699,7 @@ CONFIG_DEVKMEM=y # # Non-8250 serial port support # +# CONFIG_SERIAL_MAX3100 is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=6 CONFIG_SERIAL_SH_SCI_CONSOLE=y @@ -1030,6 +1035,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -1051,6 +1057,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1100,6 +1111,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1191,10 +1203,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1303,6 +1329,7 @@ CONFIG_CRYPTO_CBC=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/cayman_defconfig b/arch/sh/configs/cayman_defconfig index fa5fc1e1e98..40301f86a45 100644 --- a/arch/sh/configs/cayman_defconfig +++ b/arch/sh/configs/cayman_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:49:14 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:42:53 2009 # CONFIG_SUPERH=y # CONFIG_SUPERH32 is not set @@ -40,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SWAP=y # CONFIG_SYSVIPC is not set CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set @@ -70,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -89,10 +91,13 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -105,7 +110,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -127,39 +131,6 @@ CONFIG_DEFAULT_IOSCHED="cfq" # System type # CONFIG_CPU_SH5=y -# CONFIG_CPU_SUBTYPE_SH7619 is not set -# CONFIG_CPU_SUBTYPE_SH7201 is not set -# CONFIG_CPU_SUBTYPE_SH7203 is not set -# CONFIG_CPU_SUBTYPE_SH7206 is not set -# CONFIG_CPU_SUBTYPE_SH7263 is not set -# CONFIG_CPU_SUBTYPE_MXG is not set -# CONFIG_CPU_SUBTYPE_SH7705 is not set -# CONFIG_CPU_SUBTYPE_SH7706 is not set -# CONFIG_CPU_SUBTYPE_SH7707 is not set -# CONFIG_CPU_SUBTYPE_SH7708 is not set -# CONFIG_CPU_SUBTYPE_SH7709 is not set -# CONFIG_CPU_SUBTYPE_SH7710 is not set -# CONFIG_CPU_SUBTYPE_SH7712 is not set -# CONFIG_CPU_SUBTYPE_SH7720 is not set -# CONFIG_CPU_SUBTYPE_SH7721 is not set -# CONFIG_CPU_SUBTYPE_SH7750 is not set -# CONFIG_CPU_SUBTYPE_SH7091 is not set -# CONFIG_CPU_SUBTYPE_SH7750R is not set -# CONFIG_CPU_SUBTYPE_SH7750S is not set -# CONFIG_CPU_SUBTYPE_SH7751 is not set -# CONFIG_CPU_SUBTYPE_SH7751R is not set -# CONFIG_CPU_SUBTYPE_SH7760 is not set -# CONFIG_CPU_SUBTYPE_SH4_202 is not set -# CONFIG_CPU_SUBTYPE_SH7723 is not set -# CONFIG_CPU_SUBTYPE_SH7763 is not set -# CONFIG_CPU_SUBTYPE_SH7770 is not set -# CONFIG_CPU_SUBTYPE_SH7780 is not set -# CONFIG_CPU_SUBTYPE_SH7785 is not set -# CONFIG_CPU_SUBTYPE_SH7786 is not set -# CONFIG_CPU_SUBTYPE_SHX3 is not set -# CONFIG_CPU_SUBTYPE_SH7343 is not set -# CONFIG_CPU_SUBTYPE_SH7722 is not set -# CONFIG_CPU_SUBTYPE_SH7366 is not set CONFIG_CPU_SUBTYPE_SH5_101=y # CONFIG_CPU_SUBTYPE_SH5_103 is not set @@ -279,8 +250,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000 # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -492,6 +461,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -568,6 +538,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -591,6 +562,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -779,6 +751,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set # CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_IT87 is not set @@ -1042,7 +1015,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1082,6 +1054,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1104,6 +1077,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1146,8 +1124,13 @@ CONFIG_MINIX_FS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1208,6 +1191,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y CONFIG_SCHEDSTATS=y # CONFIG_TIMER_STATS is not set @@ -1241,15 +1227,21 @@ CONFIG_FRAME_POINTER=y # CONFIG_LATENCYTOP is not set # CONFIG_SYSCTL_SYSCALL_CHECK is not set # CONFIG_PAGE_POISONING is not set +CONFIG_TRACING_SUPPORT=y # # Tracers # # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set # CONFIG_EARLY_SCIF_CONSOLE is not set # CONFIG_DEBUG_BOOTMEM is not set @@ -1354,6 +1346,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/dreamcast_defconfig b/arch/sh/configs/dreamcast_defconfig index 5c112364014..1f3cc98330b 100644 --- a/arch/sh/configs/dreamcast_defconfig +++ b/arch/sh/configs/dreamcast_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:51:48 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:44:27 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -71,6 +72,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -90,6 +92,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set CONFIG_PROFILING=y +# CONFIG_MARKERS is not set # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set @@ -98,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -110,7 +115,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -156,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7091=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -165,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7091=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -271,7 +274,7 @@ CONFIG_CPU_FREQ_GOV_USERSPACE=y CONFIG_SH_DMA_API=y CONFIG_SH_DMA=y CONFIG_SH_DMA_IRQ_MULTI=y -CONFIG_NR_ONCHIP_DMA_CHANNELS=6 +CONFIG_NR_ONCHIP_DMA_CHANNELS=4 CONFIG_NR_DMA_CHANNELS_BOOL=y CONFIG_NR_DMA_CHANNELS=9 # CONFIG_PVR2_DMA is not set @@ -320,7 +323,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 panic=3" CONFIG_MAPLE=y CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -602,6 +604,7 @@ CONFIG_INPUT_MOUSE=y # CONFIG_MOUSE_APPLETOUCH is not set # CONFIG_MOUSE_BCM5974 is not set # CONFIG_MOUSE_VSXXXAA is not set +# CONFIG_MOUSE_MAPLE is not set # CONFIG_INPUT_JOYSTICK is not set # CONFIG_INPUT_TABLET is not set # CONFIG_INPUT_TOUCHSCREEN is not set @@ -812,7 +815,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -866,6 +868,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -910,6 +917,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -947,10 +955,24 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1051,6 +1073,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/edosk7705_defconfig b/arch/sh/configs/edosk7705_defconfig index f4c34b03931..d7092457ddc 100644 --- a/arch/sh/configs/edosk7705_defconfig +++ b/arch/sh/configs/edosk7705_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:54:02 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:45:04 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_FIND_NEXT_BIT=y @@ -56,6 +57,7 @@ CONFIG_EMBEDDED=y # CONFIG_UID16 is not set # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set # CONFIG_PRINTK is not set # CONFIG_BUG is not set @@ -74,12 +76,15 @@ CONFIG_SHMEM=y CONFIG_SLUB=y # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_BASE_SMALL=1 # CONFIG_MODULES is not set @@ -114,6 +119,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -123,8 +129,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -381,6 +385,10 @@ CONFIG_SSB_POSSIBLE=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# + # # Pseudo filesystems # @@ -409,10 +417,22 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -426,6 +446,7 @@ CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/edosk7760_defconfig b/arch/sh/configs/edosk7760_defconfig index 7825c2699f1..a822b1d8c11 100644 --- a/arch/sh/configs/edosk7760_defconfig +++ b/arch/sh/configs/edosk7760_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:54:57 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:45:25 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -40,6 +41,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -77,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -96,6 +98,7 @@ CONFIG_COMPAT_BRK=y CONFIG_SLUB=y # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -103,6 +106,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -115,7 +120,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -161,6 +165,7 @@ CONFIG_CPU_SH4=y CONFIG_CPU_SUBTYPE_SH7760=y # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -170,8 +175,6 @@ CONFIG_CPU_SUBTYPE_SH7760=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -815,6 +818,7 @@ CONFIG_EXT2_FS_XATTR=y # CONFIG_EXT2_FS_SECURITY is not set CONFIG_EXT2_FS_XIP=y CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -838,6 +842,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -883,6 +892,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -964,6 +974,9 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 # CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set CONFIG_TIMER_STATS=y @@ -1001,6 +1014,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1010,9 +1024,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1126,6 +1145,7 @@ CONFIG_CRYPTO_DES=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/espt_defconfig b/arch/sh/configs/espt_defconfig index ebb4c37abaa..c5b50077913 100644 --- a/arch/sh/configs/espt_defconfig +++ b/arch/sh/configs/espt_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 17:58:18 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:46:26 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -78,6 +79,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -117,7 +121,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set CONFIG_CPU_SUBTYPE_SH7763=y # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -548,6 +550,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -919,6 +922,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -942,6 +946,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -985,8 +994,13 @@ CONFIG_CRAMFS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -1075,11 +1089,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1179,6 +1207,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/hp6xx_defconfig b/arch/sh/configs/hp6xx_defconfig index 82b113af08d..8e13027eecc 100644 --- a/arch/sh/configs/hp6xx_defconfig +++ b/arch/sh/configs/hp6xx_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:01:05 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:47:15 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -67,6 +68,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -85,12 +87,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -98,7 +103,6 @@ CONFIG_BASE_SMALL=0 # CONFIG_MODULES is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -144,6 +148,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -153,8 +158,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -385,6 +388,7 @@ CONFIG_BLK_DEV_SD=y # CONFIG_SCSI_SRP_ATTRS is not set CONFIG_SCSI_LOWLEVEL=y # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set @@ -431,6 +435,7 @@ CONFIG_KEYBOARD_HP6XX=y # CONFIG_INPUT_JOYSTICK is not set # CONFIG_INPUT_TABLET is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -673,6 +678,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -719,6 +729,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set # # Partition Types @@ -786,10 +797,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -898,6 +922,7 @@ CONFIG_CRYPTO_MD5=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/landisk_defconfig b/arch/sh/configs/landisk_defconfig index b6fa4a7599d..7f549aef0df 100644 --- a/arch/sh/configs/landisk_defconfig +++ b/arch/sh/configs/landisk_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:02:54 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:47:48 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -69,6 +70,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -88,6 +90,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -292,8 +295,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000 # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -602,6 +603,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -706,6 +708,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -729,6 +732,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -922,6 +926,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y # CONFIG_SOC_CAMERA is not set CONFIG_V4L_USB_DRIVERS=y # CONFIG_USB_VIDEO_CLASS is not set +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y # CONFIG_USB_GSPCA is not set # CONFIG_VIDEO_HDPVR is not set CONFIG_VIDEO_USBVIDEO=m @@ -994,15 +999,17 @@ CONFIG_USB_HID=m # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=m CONFIG_HID_APPLE=m CONFIG_HID_BELKIN=m CONFIG_HID_CHERRY=m CONFIG_HID_CHICONY=m CONFIG_HID_CYPRESS=m +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=m +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=m +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=m # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1197,6 +1204,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1221,6 +1229,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1270,10 +1283,15 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set CONFIG_UFS_FS=m # CONFIG_UFS_FS_WRITE is not set # CONFIG_UFS_DEBUG is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=m CONFIG_NFS_V3=y @@ -1364,10 +1382,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_SH_STANDARD_BIOS=y @@ -1469,6 +1500,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/lboxre2_defconfig b/arch/sh/configs/lboxre2_defconfig index 92c515c4199..a7db539f280 100644 --- a/arch/sh/configs/lboxre2_defconfig +++ b/arch/sh/configs/lboxre2_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:06:51 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:48:54 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -69,6 +70,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -88,6 +90,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -107,7 +112,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -153,6 +157,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -162,8 +167,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -293,8 +296,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 root=/dev/sda1" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -542,6 +543,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -702,6 +704,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -725,6 +728,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -931,7 +935,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1007,6 +1010,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1028,6 +1032,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1073,8 +1082,13 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -1151,10 +1165,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_SH_STANDARD_BIOS=y @@ -1256,6 +1283,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/magicpanelr2_defconfig b/arch/sh/configs/magicpanelr2_defconfig index 26586c2d64c..58bec61506f 100644 --- a/arch/sh/configs/magicpanelr2_defconfig +++ b/arch/sh/configs/magicpanelr2_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:07:39 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:49:32 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -39,6 +40,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y # CONFIG_TASKSTATS is not set @@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -76,6 +77,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -94,6 +96,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -101,6 +104,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -113,7 +118,6 @@ CONFIG_MODVERSIONS=y CONFIG_MODULE_SRCVERSION_ALL=y CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -159,6 +163,7 @@ CONFIG_CPU_SUBTYPE_SH7720=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -168,8 +173,6 @@ CONFIG_CPU_SUBTYPE_SH7720=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -813,6 +816,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set # CONFIG_EXT4_FS is not set CONFIG_JBD=y @@ -830,6 +834,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -884,6 +893,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -965,6 +975,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1000,6 +1011,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1008,9 +1020,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1035,6 +1052,7 @@ CONFIG_DUMP_CODE=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/microdev_defconfig b/arch/sh/configs/microdev_defconfig index 75178355d69..2886fc84bc1 100644 --- a/arch/sh/configs/microdev_defconfig +++ b/arch/sh/configs/microdev_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:11:13 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:50:51 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -74,6 +74,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -92,12 +93,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0 # CONFIG_MODULES is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -151,6 +154,7 @@ CONFIG_CPU_SH4=y # CONFIG_CPU_SUBTYPE_SH7760 is not set CONFIG_CPU_SUBTYPE_SH4_202=y # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH4_202=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -647,6 +649,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -668,6 +671,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -715,6 +723,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -802,10 +811,24 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -914,6 +937,7 @@ CONFIG_CRYPTO_DES=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/migor_defconfig b/arch/sh/configs/migor_defconfig index a8720f9c604..8ecceb4bf27 100644 --- a/arch/sh/configs/migor_defconfig +++ b/arch/sh/configs/migor_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:14:03 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:51:34 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -67,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -76,6 +76,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -104,6 +105,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -115,7 +118,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -165,6 +167,7 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -174,8 +177,6 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7343 is not set CONFIG_CPU_SUBTYPE_SH7722=y # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -577,6 +578,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -873,7 +875,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y # CONFIG_USB_ARCH_HAS_OHCI is not set @@ -1011,6 +1012,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1056,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -1105,11 +1112,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1218,6 +1239,7 @@ CONFIG_CRYPTO_WORKQUEUE=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/polaris_defconfig b/arch/sh/configs/polaris_defconfig index df2d177d534..2b950728618 100644 --- a/arch/sh/configs/polaris_defconfig +++ b/arch/sh/configs/polaris_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:16:48 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:52:19 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -40,6 +41,7 @@ CONFIG_LOCALVERSION="" CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y CONFIG_BSD_PROCESS_ACCT_V3=y # CONFIG_TASKSTATS is not set @@ -76,6 +78,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -94,6 +97,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -101,6 +105,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -113,7 +119,6 @@ CONFIG_MODVERSIONS=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -159,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7709=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7709=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -777,6 +781,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -831,6 +840,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -874,6 +884,9 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 # CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -914,6 +927,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -923,9 +937,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -950,6 +969,7 @@ CONFIG_DUMP_CODE=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index 7def4df46dd..68e5eff5d7f 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:20:17 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:53:28 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -78,6 +79,7 @@ CONFIG_UID16=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -107,6 +109,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_BASE_SMALL=0 @@ -118,7 +122,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -165,6 +168,7 @@ CONFIG_CPU_SH4A=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set CONFIG_CPU_SUBTYPE_SH7780=y @@ -174,8 +178,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -313,8 +315,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -536,6 +536,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -696,6 +697,7 @@ CONFIG_E1000=m # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -719,6 +721,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -920,6 +923,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set # CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_IT87 is not set @@ -1027,7 +1031,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1120,6 +1123,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1142,6 +1146,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set CONFIG_FUSE_FS=m +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1191,6 +1200,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1279,6 +1289,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1316,6 +1329,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1325,11 +1339,16 @@ CONFIG_TRACING=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1449,6 +1468,7 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index cb134ffc211..890fe3cc914 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:24:35 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:55:10 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -43,6 +44,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -78,6 +80,7 @@ CONFIG_UID16=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -108,6 +111,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -120,7 +125,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -168,6 +172,7 @@ CONFIG_CPU_SHX2=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -177,8 +182,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -337,8 +340,6 @@ CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set # CONFIG_PCI_LEGACY is not set @@ -561,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -698,6 +700,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -721,6 +724,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -948,6 +952,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_F71805F is not set # CONFIG_SENSORS_F71882FG is not set # CONFIG_SENSORS_F75375S is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_IT87 is not set @@ -970,6 +975,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set # CONFIG_SENSORS_PCF8591 is not set +# CONFIG_SENSORS_SHT15 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1109,7 +1115,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1202,6 +1207,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1224,6 +1230,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set CONFIG_FUSE_FS=m +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1273,6 +1284,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1359,6 +1371,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1401,6 +1414,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1410,11 +1424,16 @@ CONFIG_TRACING=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1534,6 +1553,7 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/rsk7201_defconfig b/arch/sh/configs/rsk7201_defconfig index a037c744b79..fa4395768d1 100644 --- a/arch/sh/configs/rsk7201_defconfig +++ b/arch/sh/configs/rsk7201_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:29:08 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:56:29 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -74,6 +74,7 @@ CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -100,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_RT_MUTEXES=y CONFIG_BASE_SMALL=0 @@ -110,7 +113,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -157,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7201=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -166,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7201=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -602,6 +603,11 @@ CONFIG_EXT2_FS=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -651,8 +657,13 @@ CONFIG_JFFS2_RTIME=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set # # Partition Types @@ -686,11 +697,24 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -705,6 +729,7 @@ CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/rsk7203_defconfig b/arch/sh/configs/rsk7203_defconfig index 9ae28e88426..e3a65f819f0 100644 --- a/arch/sh/configs/rsk7203_defconfig +++ b/arch/sh/configs/rsk7203_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:30:34 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:57:06 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -39,6 +40,7 @@ CONFIG_LOCALVERSION="" CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -70,7 +72,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -80,6 +81,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_RT_MUTEXES=y CONFIG_BASE_SMALL=0 @@ -116,7 +120,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7203=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7203=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -750,15 +752,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -874,6 +878,7 @@ CONFIG_LEDS_CLASS=y # LED drivers # CONFIG_LEDS_GPIO=y +CONFIG_LEDS_GPIO_PLATFORM=y # # LED Triggers @@ -882,6 +887,7 @@ CONFIG_LEDS_TRIGGERS=y CONFIG_LEDS_TRIGGER_TIMER=y CONFIG_LEDS_TRIGGER_HEARTBEAT=y CONFIG_LEDS_TRIGGER_BACKLIGHT=y +# CONFIG_LEDS_TRIGGER_GPIO is not set CONFIG_LEDS_TRIGGER_DEFAULT_ON=y # @@ -950,6 +956,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -989,8 +1000,13 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -1033,6 +1049,9 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1076,6 +1095,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1083,11 +1103,16 @@ CONFIG_TRACING=y # CONFIG_FUNCTION_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1111,6 +1136,7 @@ CONFIG_DUMP_CODE=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/rts7751r2d1_defconfig b/arch/sh/configs/rts7751r2d1_defconfig index c0f741af6da..a4a59f6205a 100644 --- a/arch/sh/configs/rts7751r2d1_defconfig +++ b/arch/sh/configs/rts7751r2d1_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:33:25 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:58:13 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -70,6 +71,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -110,7 +114,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # # Non-8250 serial port support # +# CONFIG_SERIAL_MAX3100 is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=1 CONFIG_SERIAL_SH_SCI_CONSOLE=y @@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1290,6 +1296,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig index 8feef629e49..2bc1c97d346 100644 --- a/arch/sh/configs/rts7751r2dplus_defconfig +++ b/arch/sh/configs/rts7751r2dplus_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:34:12 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:59:01 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -70,6 +71,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -110,7 +114,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -302,8 +304,6 @@ CONFIG_CMDLINE="console=tty0 console=ttySC0,115200 root=/dev/sda1 earlyprintk=se # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -513,6 +513,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -695,6 +697,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -793,6 +796,7 @@ CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # # Non-8250 serial port support # +# CONFIG_SERIAL_MAX3100 is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=1 CONFIG_SERIAL_SH_SCI_CONSOLE=y @@ -1079,15 +1083,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1290,6 +1296,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1337,6 +1348,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -1417,11 +1429,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1524,6 +1550,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/sdk7780_defconfig b/arch/sh/configs/sdk7780_defconfig index 739e2299ae8..a629b79a184 100644 --- a/arch/sh/configs/sdk7780_defconfig +++ b/arch/sh/configs/sdk7780_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:34:43 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 12:59:32 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -41,6 +42,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -93,6 +96,7 @@ CONFIG_COMPAT_BRK=y CONFIG_SLUB=y # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -100,6 +104,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -112,7 +118,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y CONFIG_LBD=y -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -159,6 +164,7 @@ CONFIG_CPU_SH4A=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set CONFIG_CPU_SUBTYPE_SH7780=y @@ -168,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -310,8 +314,6 @@ CONFIG_CMDLINE="mem=128M console=tty0 console=ttySC0,115200 ip=bootp root=/dev/n # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set # CONFIG_PCI_LEGACY is not set @@ -646,6 +648,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -1091,15 +1094,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1257,6 +1262,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y # CONFIG_EXT2_FS_SECURITY is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set @@ -1280,6 +1286,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1331,6 +1342,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1418,6 +1430,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 # CONFIG_SCHED_DEBUG is not set # CONFIG_SCHEDSTATS is not set CONFIG_TIMER_STATS=y @@ -1455,6 +1470,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1464,9 +1480,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1580,6 +1601,7 @@ CONFIG_CRYPTO_DES=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7206_defconfig b/arch/sh/configs/se7206_defconfig index d30e0a7ad9f..5caf85a3312 100644 --- a/arch/sh/configs/se7206_defconfig +++ b/arch/sh/configs/se7206_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:39:37 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:01:02 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -63,6 +65,7 @@ CONFIG_CGROUP_DEBUG=y CONFIG_CGROUP_NS=y # CONFIG_CGROUP_FREEZER is not set CONFIG_CGROUP_DEVICE=y +# CONFIG_CPUSETS is not set CONFIG_CGROUP_CPUACCT=y CONFIG_RESOURCE_COUNTERS=y CONFIG_CGROUP_MEM_RES_CTLR=y @@ -80,7 +83,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -90,6 +92,7 @@ CONFIG_EMBEDDED=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -116,6 +119,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_RT_MUTEXES=y CONFIG_BASE_SMALL=0 @@ -127,7 +132,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -174,6 +178,7 @@ CONFIG_CPU_SUBTYPE_SH7206=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -183,8 +188,6 @@ CONFIG_CPU_SUBTYPE_SH7206=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -745,6 +748,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -785,8 +793,13 @@ CONFIG_CRAMFS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -831,6 +844,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -869,6 +885,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -876,11 +893,16 @@ CONFIG_TRACING=y # CONFIG_FUNCTION_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -991,6 +1013,7 @@ CONFIG_CRYPTO_LZO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/se7343_defconfig b/arch/sh/configs/se7343_defconfig index fbb72d029e6..004d531716d 100644 --- a/arch/sh/configs/se7343_defconfig +++ b/arch/sh/configs/se7343_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:42:00 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:01:44 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -40,6 +41,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set @@ -73,6 +75,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -91,6 +94,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_BASE_SMALL=0 @@ -109,7 +115,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -158,6 +163,7 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -167,8 +173,6 @@ CONFIG_ARCH_SHMOBILE=y CONFIG_CPU_SUBTYPE_SH7343=y # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -769,6 +773,7 @@ CONFIG_VIDEO_HELPER_CHIPS_AUTO=y # CONFIG_SOC_CAMERA is not set CONFIG_V4L_USB_DRIVERS=y # CONFIG_USB_VIDEO_CLASS is not set +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y CONFIG_USB_GSPCA=m # CONFIG_USB_M5602 is not set # CONFIG_USB_STV06XX is not set @@ -800,6 +805,7 @@ CONFIG_USB_GSPCA=m # CONFIG_VIDEO_PVRUSB2 is not set # CONFIG_VIDEO_HDPVR is not set # CONFIG_VIDEO_EM28XX is not set +# CONFIG_VIDEO_CX231XX is not set # CONFIG_VIDEO_USBVISION is not set # CONFIG_USB_VICAM is not set # CONFIG_USB_IBMCAM is not set @@ -813,6 +819,7 @@ CONFIG_USB_GSPCA=m # CONFIG_USB_STV680 is not set # CONFIG_USB_ZC0301 is not set # CONFIG_USB_PWC is not set +CONFIG_USB_PWC_INPUT_EVDEV=y # CONFIG_USB_ZR364XX is not set # CONFIG_USB_STKWEBCAM is not set # CONFIG_USB_S2255 is not set @@ -914,15 +921,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1050,6 +1059,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1070,6 +1080,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1125,6 +1140,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1174,10 +1190,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1279,6 +1308,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7619_defconfig b/arch/sh/configs/se7619_defconfig index 125304e80f5..edbece52afc 100644 --- a/arch/sh/configs/se7619_defconfig +++ b/arch/sh/configs/se7619_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:44:53 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:02:32 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -61,6 +62,7 @@ CONFIG_EMBEDDED=y # CONFIG_UID16 is not set # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -78,11 +80,14 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_BASE_SMALL=1 @@ -134,6 +139,7 @@ CONFIG_CPU_SUBTYPE_SH7619=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -143,8 +149,6 @@ CONFIG_CPU_SUBTYPE_SH7619=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -513,7 +517,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y # CONFIG_USB_ARCH_HAS_OHCI is not set @@ -563,6 +566,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -601,8 +609,13 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set # # Partition Types @@ -630,10 +643,21 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -647,6 +671,7 @@ CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7705_defconfig b/arch/sh/configs/se7705_defconfig index 0308abf5238..bae161c6683 100644 --- a/arch/sh/configs/se7705_defconfig +++ b/arch/sh/configs/se7705_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:45:56 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:02:52 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -62,7 +63,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -70,6 +70,7 @@ CONFIG_EMBEDDED=y CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -88,12 +89,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -150,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7705=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -159,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7705=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -698,7 +701,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y # CONFIG_USB_ARCH_HAS_OHCI is not set @@ -751,6 +753,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -804,6 +811,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -847,10 +855,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -949,6 +970,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig index a8c24b70348..330043f3c31 100644 --- a/arch/sh/configs/se7712_defconfig +++ b/arch/sh/configs/se7712_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:48:18 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:03:27 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_FIND_NEXT_BIT=y @@ -38,6 +39,7 @@ CONFIG_LOCALVERSION="" CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -69,6 +71,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y # CONFIG_BUG is not set @@ -87,6 +90,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -94,6 +98,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -105,7 +111,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -151,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7712=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -160,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7712=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -585,6 +589,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -812,6 +817,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -832,6 +838,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -887,6 +898,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -927,6 +939,7 @@ CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -961,6 +974,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -969,9 +983,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1090,6 +1109,7 @@ CONFIG_CRYPTO_DEFLATE=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig index 4b79c2567dc..56478918440 100644 --- a/arch/sh/configs/se7721_defconfig +++ b/arch/sh/configs/se7721_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:51:44 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:04:19 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_FIND_NEXT_BIT=y @@ -38,6 +39,7 @@ CONFIG_LOCALVERSION="" CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -73,6 +75,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y # CONFIG_BUG is not set @@ -91,6 +94,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -98,6 +102,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -109,7 +115,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -155,6 +160,7 @@ CONFIG_CPU_SUBTYPE_SH7721=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -164,8 +170,6 @@ CONFIG_CPU_SUBTYPE_SH7721=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -776,15 +780,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -943,6 +949,7 @@ CONFIG_EXT2_FS_POSIX_ACL=y CONFIG_EXT2_FS_SECURITY=y # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -963,6 +970,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1021,6 +1033,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set # CONFIG_NETWORK_FILESYSTEMS is not set # @@ -1085,6 +1098,7 @@ CONFIG_FRAME_WARN=1024 CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1119,6 +1133,7 @@ CONFIG_FRAME_POINTER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1127,9 +1142,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1248,6 +1268,7 @@ CONFIG_CRYPTO_DEFLATE=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7722_defconfig b/arch/sh/configs/se7722_defconfig index 82bdaac45fb..726fdbdb280 100644 --- a/arch/sh/configs/se7722_defconfig +++ b/arch/sh/configs/se7722_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:55:10 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:05:29 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -78,6 +78,7 @@ CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_PROFILING=y +# CONFIG_MARKERS is not set # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set @@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7343 is not set CONFIG_CPU_SUBTYPE_SH7722=y # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -486,6 +488,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -692,7 +695,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y # CONFIG_USB_ARCH_HAS_OHCI is not set @@ -766,6 +768,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -788,6 +791,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -832,6 +840,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -872,11 +881,25 @@ CONFIG_DEBUG_FS=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_SH_STANDARD_BIOS=y @@ -977,6 +1000,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7750_defconfig b/arch/sh/configs/se7750_defconfig index ceef6d9138e..ed1a1230f63 100644 --- a/arch/sh/configs/se7750_defconfig +++ b/arch/sh/configs/se7750_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:57:31 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:06:02 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -70,6 +71,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -88,6 +90,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -95,6 +98,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -106,7 +111,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -152,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7750=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -161,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7750=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -774,6 +778,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -829,6 +838,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -886,10 +896,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -989,6 +1012,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7751_defconfig b/arch/sh/configs/se7751_defconfig index 67fc26b3a7d..55f3b788e0c 100644 --- a/arch/sh/configs/se7751_defconfig +++ b/arch/sh/configs/se7751_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 18:59:59 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:06:44 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -65,7 +66,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -74,6 +74,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -92,6 +93,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -99,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -110,7 +114,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -156,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -165,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -741,6 +743,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -796,6 +803,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -833,10 +841,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -936,6 +957,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/se7780_defconfig b/arch/sh/configs/se7780_defconfig index ebce23cc2ad..c4f0af32efa 100644 --- a/arch/sh/configs/se7780_defconfig +++ b/arch/sh/configs/se7780_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:02:05 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:07:14 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -67,6 +68,7 @@ CONFIG_EMBEDDED=y CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -86,12 +88,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -103,7 +108,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_INTEGRITY is not set # @@ -149,6 +153,7 @@ CONFIG_CPU_SH4A=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set CONFIG_CPU_SUBTYPE_SH7780=y @@ -158,8 +163,6 @@ CONFIG_CPU_SUBTYPE_SH7780=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -285,8 +288,6 @@ CONFIG_CMDLINE="console=ttySC0.115200 root=/dev/sda1" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -558,6 +559,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -957,15 +959,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1121,6 +1125,10 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# + # # CD-ROM/DVD Filesystems # @@ -1245,11 +1253,24 @@ CONFIG_DEBUG_FS=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1345,6 +1366,7 @@ CONFIG_CRYPTO=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/sh03_defconfig b/arch/sh/configs/sh03_defconfig index 6fcdb090cf3..f9c6e51dc0b 100644 --- a/arch/sh/configs/sh03_defconfig +++ b/arch/sh/configs/sh03_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:04:59 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:07:56 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -41,6 +42,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -67,7 +69,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -76,6 +77,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -117,7 +121,6 @@ CONFIG_MODVERSIONS=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -163,6 +166,7 @@ CONFIG_CPU_SUBTYPE_SH7751=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -172,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7751=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -300,8 +302,6 @@ CONFIG_CMDLINE="console=ttySC1,115200 mem=64M root=/dev/nfs" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -562,6 +562,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -656,6 +657,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -679,6 +681,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -887,7 +890,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -929,6 +931,7 @@ CONFIG_EXT2_FS_XATTR=y # CONFIG_EXT2_FS_SECURITY is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y # CONFIG_EXT3_FS_SECURITY is not set @@ -951,6 +954,11 @@ CONFIG_AUTOFS_FS=y CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1001,6 +1009,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1112,11 +1121,26 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_SH_STANDARD_BIOS=y @@ -1228,6 +1252,7 @@ CONFIG_CRYPTO_DEFLATE=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/sh7710voipgw_defconfig b/arch/sh/configs/sh7710voipgw_defconfig index 1ab37c01da6..48b58098cf8 100644 --- a/arch/sh/configs/sh7710voipgw_defconfig +++ b/arch/sh/configs/sh7710voipgw_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:09:01 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:09:16 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -39,6 +40,7 @@ CONFIG_LOCALVERSION_AUTO=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set @@ -72,6 +74,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -90,6 +93,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -97,6 +101,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_BASE_SMALL=0 @@ -108,7 +114,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -154,6 +159,7 @@ CONFIG_CPU_SUBTYPE_SH7710=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -163,8 +169,6 @@ CONFIG_CPU_SUBTYPE_SH7710=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -728,7 +732,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y # CONFIG_USB_ARCH_HAS_OHCI is not set @@ -779,6 +782,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -834,6 +842,7 @@ CONFIG_JFFS2_RTIME=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -871,11 +880,24 @@ CONFIG_DEBUG_FS=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -975,6 +997,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/sh7724_generic_defconfig b/arch/sh/configs/sh7724_generic_defconfig index 268d04ed8cd..ec8f18c7684 100644 --- a/arch/sh/configs/sh7724_generic_defconfig +++ b/arch/sh/configs/sh7724_generic_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.30-rc2 -# Thu Apr 16 15:42:20 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:09:47 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y diff --git a/arch/sh/configs/sh7763rdp_defconfig b/arch/sh/configs/sh7763rdp_defconfig index c79bb84ec30..f77bc7998d2 100644 --- a/arch/sh/configs/sh7763rdp_defconfig +++ b/arch/sh/configs/sh7763rdp_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:10:57 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:10:12 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -78,6 +79,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -106,6 +108,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -117,7 +121,6 @@ CONFIG_MODULES=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -164,6 +167,7 @@ CONFIG_CPU_SH4A=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set CONFIG_CPU_SUBTYPE_SH7763=y # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -173,8 +177,6 @@ CONFIG_CPU_SUBTYPE_SH7763=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -555,6 +557,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -941,6 +944,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -964,6 +968,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1012,6 +1021,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -1100,11 +1110,25 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1204,6 +1228,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/sh7785lcr_32bit_defconfig b/arch/sh/configs/sh7785lcr_32bit_defconfig index a6cf4505741..1c55b800d12 100644 --- a/arch/sh/configs/sh7785lcr_32bit_defconfig +++ b/arch/sh/configs/sh7785lcr_32bit_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:12:18 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:10:53 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -79,6 +80,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -98,6 +100,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set CONFIG_PROFILING=y +# CONFIG_MARKERS is not set # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set @@ -106,6 +109,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -118,7 +123,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -166,6 +170,7 @@ CONFIG_CPU_SHX2=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -175,8 +180,6 @@ CONFIG_CPU_SUBTYPE_SH7785=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -310,8 +313,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000 # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -672,6 +673,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1009,15 +1011,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1218,6 +1222,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1239,6 +1244,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1289,6 +1299,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1377,6 +1388,9 @@ CONFIG_DEBUG_KERNEL=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1413,6 +1427,7 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1422,9 +1437,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1542,6 +1562,7 @@ CONFIG_CRYPTO_DES=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/sh7785lcr_defconfig b/arch/sh/configs/sh7785lcr_defconfig index e4fac2efc05..4385fe97a78 100644 --- a/arch/sh/configs/sh7785lcr_defconfig +++ b/arch/sh/configs/sh7785lcr_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.30-rc2 -# Wed Apr 22 19:17:56 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:11:48 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y @@ -307,8 +307,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000 # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y diff --git a/arch/sh/configs/shmin_defconfig b/arch/sh/configs/shmin_defconfig index d695e906187..4e120256ec6 100644 --- a/arch/sh/configs/shmin_defconfig +++ b/arch/sh/configs/shmin_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:19:03 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:12:41 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_FIND_NEXT_BIT=y @@ -63,6 +64,7 @@ CONFIG_EMBEDDED=y # CONFIG_UID16 is not set # CONFIG_SYSCTL_SYSCALL is not set # CONFIG_KALLSYMS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y # CONFIG_BUG is not set @@ -81,12 +83,15 @@ CONFIG_COMPAT_BRK=y # CONFIG_SLUB is not set CONFIG_SLOB=y # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_BASE_SMALL=1 # CONFIG_MODULES is not set @@ -137,6 +142,7 @@ CONFIG_CPU_SUBTYPE_SH7706=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -146,8 +152,6 @@ CONFIG_CPU_SUBTYPE_SH7706=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -674,6 +678,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -718,6 +727,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -762,10 +772,22 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y CONFIG_SH_STANDARD_BIOS=y @@ -864,6 +886,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/shx3_defconfig b/arch/sh/configs/shx3_defconfig index e3651f57439..c088144925f 100644 --- a/arch/sh/configs/shx3_defconfig +++ b/arch/sh/configs/shx3_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:20:54 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:13:12 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -42,6 +43,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set # CONFIG_TASKSTATS is not set @@ -96,6 +98,7 @@ CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -126,6 +129,8 @@ CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_RT_MUTEXES=y CONFIG_BASE_SMALL=0 @@ -138,7 +143,6 @@ CONFIG_MODULE_UNLOAD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -186,6 +190,7 @@ CONFIG_CPU_SHX3=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -195,8 +200,6 @@ CONFIG_CPU_SUBTYPE_SHX3=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -553,6 +556,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -646,6 +650,7 @@ CONFIG_DEVKMEM=y # # Non-8250 serial port support # +# CONFIG_SERIAL_MAX3100 is not set CONFIG_SERIAL_SH_SCI=y CONFIG_SERIAL_SH_SCI_NR_UARTS=2 CONFIG_SERIAL_SH_SCI_CONSOLE=y @@ -985,6 +990,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1007,6 +1013,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1051,6 +1062,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -1085,6 +1097,9 @@ CONFIG_DEBUG_SHIRQ=y CONFIG_DETECT_SOFTLOCKUP=y # CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC is not set CONFIG_BOOTPARAM_SOFTLOCKUP_PANIC_VALUE=0 +CONFIG_DETECT_HUNG_TASK=y +# CONFIG_BOOTPARAM_HUNG_TASK_PANIC is not set +CONFIG_BOOTPARAM_HUNG_TASK_PANIC_VALUE=0 CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1124,6 +1139,7 @@ CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_RING_BUFFER=y CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1133,11 +1149,16 @@ CONFIG_TRACING=y # CONFIG_PREEMPT_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_FTRACE_STARTUP_TEST is not set # CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1245,6 +1266,7 @@ CONFIG_CRYPTO=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +CONFIG_BINARY_PRINTF=y # # Library routines diff --git a/arch/sh/configs/snapgear_defconfig b/arch/sh/configs/snapgear_defconfig index 6960f60bf52..54a7a3c41f3 100644 --- a/arch/sh/configs/snapgear_defconfig +++ b/arch/sh/configs/snapgear_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:21:39 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:14:00 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -64,7 +65,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -73,6 +73,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -92,12 +93,15 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -105,7 +109,6 @@ CONFIG_BASE_SMALL=0 # CONFIG_MODULES is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -151,6 +154,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -160,8 +164,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -295,8 +297,6 @@ CONFIG_BOOT_LINK_OFFSET=0x00800000 # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -762,6 +762,11 @@ CONFIG_FILE_LOCKING=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -805,8 +810,13 @@ CONFIG_CRAMFS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y # CONFIG_NFS_FS is not set # CONFIG_NFSD is not set @@ -844,10 +854,23 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -862,6 +885,7 @@ CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/systemh_defconfig b/arch/sh/configs/systemh_defconfig index 7ea639bc593..dbe7e546f0b 100644 --- a/arch/sh/configs/systemh_defconfig +++ b/arch/sh/configs/systemh_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:23:31 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:14:33 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -61,7 +62,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -70,6 +70,7 @@ CONFIG_UID16=y # CONFIG_SYSCTL_SYSCALL is not set CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set # CONFIG_HOTPLUG is not set CONFIG_PRINTK=y CONFIG_BUG=y @@ -88,6 +89,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -95,6 +97,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -107,7 +111,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -153,6 +156,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -162,8 +166,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -505,6 +507,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -547,8 +554,13 @@ CONFIG_CRAMFS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set # # Partition Types @@ -577,10 +589,24 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -595,6 +621,7 @@ CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SECURITYFS is not set # CONFIG_SECURITY_FILE_CAPABILITIES is not set # CONFIG_CRYPTO is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig index bbeb4c6ebb9..8ca94ef7427 100644 --- a/arch/sh/configs/titan_defconfig +++ b/arch/sh/configs/titan_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:24:55 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:14:55 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -40,6 +41,7 @@ CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y # CONFIG_BSD_PROCESS_ACCT is not set # CONFIG_TASKSTATS is not set # CONFIG_AUDIT is not set @@ -66,7 +68,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -76,6 +77,7 @@ CONFIG_UID16=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_ALL is not set # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -95,6 +97,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set # CONFIG_PROFILING is not set +# CONFIG_MARKERS is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set CONFIG_HAVE_IOREMAP_PROT=y @@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -114,7 +119,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -160,6 +164,7 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -169,8 +174,6 @@ CONFIG_CPU_SUBTYPE_SH7751R=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -305,8 +308,6 @@ CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" # CONFIG_PCI=y CONFIG_SH_PCIDMA_NONCOHERENT=y -CONFIG_PCI_AUTO=y -CONFIG_PCI_AUTO_UPDATE_RESOURCES=y # CONFIG_PCIEPORTBUS is not set # CONFIG_ARCH_SUPPORTS_MSI is not set CONFIG_PCI_LEGACY=y @@ -789,6 +790,7 @@ CONFIG_SCSI_LOWLEVEL=y # CONFIG_SCSI_MPT2SAS is not set # CONFIG_SCSI_HPTIOP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_FCOE is not set # CONFIG_SCSI_DMX3191D is not set # CONFIG_SCSI_FUTURE_DOMAIN is not set @@ -906,6 +908,7 @@ CONFIG_NETDEV_1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -929,6 +932,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1182,7 +1186,6 @@ CONFIG_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_USB_SUPPORT=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB_ARCH_HAS_OHCI=y @@ -1393,6 +1396,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set # CONFIG_EXT3_FS_XATTR is not set # CONFIG_EXT4_FS is not set CONFIG_JBD=y @@ -1418,6 +1422,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set CONFIG_FUSE_FS=m +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1467,8 +1476,13 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_HPFS_FS is not set # CONFIG_QNX4FS_FS is not set CONFIG_ROMFS_FS=y +CONFIG_ROMFS_BACKED_BY_BLOCK=y +# CONFIG_ROMFS_BACKED_BY_MTD is not set +# CONFIG_ROMFS_BACKED_BY_BOTH is not set +CONFIG_ROMFS_ON_BLOCK=y # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1576,6 +1590,7 @@ CONFIG_MAGIC_SYSRQ=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set CONFIG_SCHED_DEBUG=y # CONFIG_SCHEDSTATS is not set # CONFIG_TIMER_STATS is not set @@ -1610,6 +1625,7 @@ CONFIG_SCHED_DEBUG=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -1618,9 +1634,14 @@ CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y # CONFIG_IRQSOFF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -1741,6 +1762,7 @@ CONFIG_CRYPTO_DEFLATE=y # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/ul2_defconfig b/arch/sh/configs/ul2_defconfig index 34f5192a324..bfb4d980689 100644 --- a/arch/sh/configs/ul2_defconfig +++ b/arch/sh/configs/ul2_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:30:27 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 13:17:05 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -69,7 +70,6 @@ CONFIG_INITRAMFS_SOURCE="" CONFIG_RD_GZIP=y # CONFIG_RD_BZIP2 is not set # CONFIG_RD_LZMA is not set -CONFIG_INITRAMFS_COMPRESSION_NONE=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y CONFIG_ANON_INODES=y @@ -78,6 +78,7 @@ CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -97,6 +98,7 @@ CONFIG_COMPAT_BRK=y CONFIG_SLUB=y # CONFIG_SLOB is not set CONFIG_PROFILING=y +# CONFIG_MARKERS is not set # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set @@ -105,6 +107,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -117,7 +121,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -167,6 +170,7 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -176,8 +180,6 @@ CONFIG_ARCH_SHMOBILE=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set CONFIG_CPU_SUBTYPE_SH7366=y -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -584,6 +586,7 @@ CONFIG_SCSI_WAIT_SCAN=m CONFIG_SCSI_LOWLEVEL=y # CONFIG_ISCSI_TCP is not set # CONFIG_LIBFC is not set +# CONFIG_LIBFCOE is not set # CONFIG_SCSI_DEBUG is not set # CONFIG_SCSI_DH is not set # CONFIG_SCSI_OSD_INITIATOR is not set @@ -936,6 +939,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -957,6 +961,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1005,6 +1014,7 @@ CONFIG_CRAMFS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y # CONFIG_NFS_V3 is not set @@ -1095,10 +1105,24 @@ CONFIG_FRAME_WARN=1024 CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1208,6 +1232,7 @@ CONFIG_CRYPTO_ARC4=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set CONFIG_CRYPTO_HW=y +# CONFIG_BINARY_PRINTF is not set # # Library routines diff --git a/arch/sh/configs/urquell_defconfig b/arch/sh/configs/urquell_defconfig index d174b1a4d80..512664fed66 100644 --- a/arch/sh/configs/urquell_defconfig +++ b/arch/sh/configs/urquell_defconfig @@ -1,10 +1,11 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29 -# Thu Apr 2 19:33:39 2009 +# Linux kernel version: 2.6.30-rc3 +# Mon Apr 27 14:02:55 2009 # CONFIG_SUPERH=y CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" CONFIG_RWSEM_GENERIC_SPINLOCK=y CONFIG_GENERIC_BUG=y @@ -76,6 +77,7 @@ CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y # CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y @@ -94,6 +96,7 @@ CONFIG_SLAB=y # CONFIG_SLUB is not set # CONFIG_SLOB is not set CONFIG_PROFILING=y +# CONFIG_MARKERS is not set # CONFIG_OPROFILE is not set CONFIG_HAVE_OPROFILE=y # CONFIG_KPROBES is not set @@ -102,6 +105,8 @@ CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -114,7 +119,6 @@ CONFIG_MODULE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_BLOCK=y # CONFIG_LBD is not set -# CONFIG_BLK_DEV_IO_TRACE is not set # CONFIG_BLK_DEV_BSG is not set # CONFIG_BLK_DEV_INTEGRITY is not set @@ -162,6 +166,7 @@ CONFIG_CPU_SHX3=y # CONFIG_CPU_SUBTYPE_SH7760 is not set # CONFIG_CPU_SUBTYPE_SH4_202 is not set # CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set # CONFIG_CPU_SUBTYPE_SH7763 is not set # CONFIG_CPU_SUBTYPE_SH7770 is not set # CONFIG_CPU_SUBTYPE_SH7780 is not set @@ -171,8 +176,6 @@ CONFIG_CPU_SUBTYPE_SH7786=y # CONFIG_CPU_SUBTYPE_SH7343 is not set # CONFIG_CPU_SUBTYPE_SH7722 is not set # CONFIG_CPU_SUBTYPE_SH7366 is not set -# CONFIG_CPU_SUBTYPE_SH5_101 is not set -# CONFIG_CPU_SUBTYPE_SH5_103 is not set # # Memory management options @@ -900,15 +903,17 @@ CONFIG_USB_HID=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +# CONFIG_HID_KYE is not set CONFIG_HID_GYRATION=y +# CONFIG_HID_KENSINGTON is not set CONFIG_HID_LOGITECH=y # CONFIG_LOGITECH_FF is not set # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1046,6 +1051,7 @@ CONFIG_EXT2_FS=y # CONFIG_EXT2_FS_XATTR is not set # CONFIG_EXT2_FS_XIP is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y # CONFIG_EXT3_FS_POSIX_ACL is not set # CONFIG_EXT3_FS_SECURITY is not set @@ -1067,6 +1073,11 @@ CONFIG_INOTIFY_USER=y # CONFIG_AUTOFS4_FS is not set # CONFIG_FUSE_FS is not set +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -1117,6 +1128,7 @@ CONFIG_MINIX_FS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -1209,10 +1221,24 @@ CONFIG_SYSCTL_SYSCALL_CHECK=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_TRACING_SUPPORT=y # # Tracers # +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_BLK_DEV_IO_TRACE is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_SH_STANDARD_BIOS is not set @@ -1322,6 +1348,7 @@ CONFIG_CRYPTO_DES=y # # CONFIG_CRYPTO_ANSI_CPRNG is not set # CONFIG_CRYPTO_HW is not set +# CONFIG_BINARY_PRINTF is not set # # Library routines From 5be7c0a4d3dfe25091f2e4e524103e81d9e7e180 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 27 Apr 2009 14:40:47 +0900 Subject: [PATCH 0569/2061] sh: select GENERIC_TIME for new CMT driver. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 4c68fdedfa1..164748945f9 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -476,10 +476,11 @@ config SH_TIMER_CMT bool "CMT clockevents driver" depends on SYS_SUPPORTS_CMT && !SH_CMT select GENERIC_CLOCKEVENTS + select GENERIC_TIME config SH_MTU2 bool "MTU2 timer support" - depends on CPU_SH2A + depends on CPU_SH2A && !GENERIC_TIME default y help This enables the use of the MTU2 as the system timer. From 148be2c15d4a866fbc7a8f55342e4fd4de73be61 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 27 Apr 2009 08:02:14 +0200 Subject: [PATCH 0570/2061] perf_counter tools: move helper library to util/* Clean up the top level directory a bit by moving all the helper libraries to util/*.[ch]. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 58 ++++++++++--------- Documentation/perf_counter/builtin-help.c | 10 ++-- Documentation/perf_counter/builtin-stat.c | 2 +- Documentation/perf_counter/builtin-top.c | 2 +- Documentation/perf_counter/builtin.h | 4 +- Documentation/perf_counter/perf.c | 8 +-- .../perf_counter/{ => util}/PERF-VERSION-GEN | 0 .../perf_counter/{ => util}/abspath.c | 0 Documentation/perf_counter/{ => util}/alias.c | 0 Documentation/perf_counter/{ => util}/cache.h | 0 .../perf_counter/{ => util}/config.c | 0 Documentation/perf_counter/{ => util}/ctype.c | 0 .../perf_counter/{ => util}/exec_cmd.c | 0 .../perf_counter/{ => util}/exec_cmd.h | 0 .../{ => util}/generate-cmdlist.sh | 0 Documentation/perf_counter/{ => util}/help.c | 2 +- Documentation/perf_counter/{ => util}/help.h | 0 .../perf_counter/{ => util}/levenshtein.c | 0 .../perf_counter/{ => util}/levenshtein.h | 0 .../perf_counter/{ => util}/parse-options.c | 3 - .../perf_counter/{ => util}/parse-options.h | 0 Documentation/perf_counter/{ => util}/path.c | 0 Documentation/perf_counter/{ => util}/quote.c | 0 Documentation/perf_counter/{ => util}/quote.h | 2 +- .../perf_counter/{ => util}/run-command.c | 0 .../perf_counter/{ => util}/run-command.h | 0 .../perf_counter/{ => util}/strbuf.c | 0 .../perf_counter/{ => util}/strbuf.h | 0 Documentation/perf_counter/{ => util}/usage.c | 0 Documentation/perf_counter/{ => util}/util.h | 0 .../perf_counter/{ => util}/wrapper.c | 0 31 files changed, 45 insertions(+), 46 deletions(-) rename Documentation/perf_counter/{ => util}/PERF-VERSION-GEN (100%) rename Documentation/perf_counter/{ => util}/abspath.c (100%) rename Documentation/perf_counter/{ => util}/alias.c (100%) rename Documentation/perf_counter/{ => util}/cache.h (100%) rename Documentation/perf_counter/{ => util}/config.c (100%) rename Documentation/perf_counter/{ => util}/ctype.c (100%) rename Documentation/perf_counter/{ => util}/exec_cmd.c (100%) rename Documentation/perf_counter/{ => util}/exec_cmd.h (100%) rename Documentation/perf_counter/{ => util}/generate-cmdlist.sh (100%) rename Documentation/perf_counter/{ => util}/help.c (99%) rename Documentation/perf_counter/{ => util}/help.h (100%) rename Documentation/perf_counter/{ => util}/levenshtein.c (100%) rename Documentation/perf_counter/{ => util}/levenshtein.h (100%) rename Documentation/perf_counter/{ => util}/parse-options.c (99%) rename Documentation/perf_counter/{ => util}/parse-options.h (100%) rename Documentation/perf_counter/{ => util}/path.c (100%) rename Documentation/perf_counter/{ => util}/quote.c (100%) rename Documentation/perf_counter/{ => util}/quote.h (97%) rename Documentation/perf_counter/{ => util}/run-command.c (100%) rename Documentation/perf_counter/{ => util}/run-command.h (100%) rename Documentation/perf_counter/{ => util}/strbuf.c (100%) rename Documentation/perf_counter/{ => util}/strbuf.h (100%) rename Documentation/perf_counter/{ => util}/usage.c (100%) rename Documentation/perf_counter/{ => util}/util.h (100%) rename Documentation/perf_counter/{ => util}/wrapper.c (100%) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 690045e4969..543ccf28ac4 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -147,7 +147,7 @@ all:: # broken, or spawning external process is slower than built-in grep perf has). PERF-VERSION-FILE: .FORCE-PERF-VERSION-FILE - @$(SHELL_PATH) ./PERF-VERSION-GEN + @$(SHELL_PATH) util/PERF-VERSION-GEN -include PERF-VERSION-FILE uname_S := $(shell sh -c 'uname -s 2>/dev/null || echo not') @@ -287,26 +287,28 @@ export PERL_PATH LIB_FILE=libperf.a LIB_H += ../../include/linux/perf_counter.h -LIB_H += levenshtein.h -LIB_H += parse-options.h -LIB_H += quote.h -LIB_H += strbuf.h -LIB_H += run-command.h +LIB_H += util/levenshtein.h +LIB_H += util/parse-options.h +LIB_H += util/quote.h +LIB_H += util/util.h +LIB_H += util/help.h +LIB_H += util/strbuf.h +LIB_H += util/run-command.h -LIB_OBJS += abspath.o -LIB_OBJS += alias.o -LIB_OBJS += config.o -LIB_OBJS += ctype.o -LIB_OBJS += exec_cmd.o -LIB_OBJS += help.o -LIB_OBJS += levenshtein.o -LIB_OBJS += parse-options.o -LIB_OBJS += path.o -LIB_OBJS += run-command.o -LIB_OBJS += quote.o -LIB_OBJS += strbuf.o -LIB_OBJS += usage.o -LIB_OBJS += wrapper.o +LIB_OBJS += util/abspath.o +LIB_OBJS += util/alias.o +LIB_OBJS += util/config.o +LIB_OBJS += util/ctype.o +LIB_OBJS += util/exec_cmd.o +LIB_OBJS += util/help.o +LIB_OBJS += util/levenshtein.o +LIB_OBJS += util/parse-options.o +LIB_OBJS += util/path.o +LIB_OBJS += util/run-command.o +LIB_OBJS += util/quote.o +LIB_OBJS += util/strbuf.o +LIB_OBJS += util/usage.o +LIB_OBJS += util/wrapper.o BUILTIN_OBJS += builtin-help.o BUILTIN_OBJS += builtin-record.o @@ -620,10 +622,10 @@ $(BUILT_INS): perf$X ln -s perf$X $@ 2>/dev/null || \ cp perf$X $@ -common-cmds.h: ./generate-cmdlist.sh command-list.txt +common-cmds.h: util/generate-cmdlist.sh command-list.txt common-cmds.h: $(wildcard Documentation/perf-*.txt) - $(QUIET_GEN)./generate-cmdlist.sh > $@+ && mv $@+ $@ + $(QUIET_GEN)util/generate-cmdlist.sh > $@+ && mv $@+ $@ $(patsubst %.sh,%,$(SCRIPT_SH)) : % : %.sh $(QUIET_GEN)$(RM) $@ $@+ && \ @@ -656,7 +658,7 @@ perf.o perf.spec \ %.o: %.S $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) $< -exec_cmd.o: exec_cmd.c PERF-CFLAGS +util/exec_cmd.o: util/exec_cmd.c PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) \ '-DPERF_EXEC_PATH="$(perfexecdir_SQ)"' \ '-DBINDIR="$(bindir_relative_SQ)"' \ @@ -666,7 +668,7 @@ exec_cmd.o: exec_cmd.c PERF-CFLAGS builtin-init-db.o: builtin-init-db.c PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DDEFAULT_PERF_TEMPLATE_DIR='"$(template_dir_SQ)"' $< -config.o: config.c PERF-CFLAGS +util/config.o: util/config.c PERF-CFLAGS $(QUIET_CC)$(CC) -o $*.o -c $(ALL_CFLAGS) -DETC_PERFCONFIG='"$(ETC_PERFCONFIG_SQ)"' $< perf-%$X: %.o $(PERFLIBS) @@ -729,7 +731,7 @@ export NO_SVN_TESTS check: common-cmds.h if sparse; \ then \ - for i in *.c; \ + for i in *.c */*.c; \ do \ sparse $(ALL_CFLAGS) $(SPARSE_FLAGS) $$i || exit; \ done; \ @@ -795,7 +797,7 @@ distclean: clean $(RM) configure clean: - $(RM) *.o $(LIB_FILE) + $(RM) *.o */*.o $(LIB_FILE) $(RM) $(ALL_PROGRAMS) $(BUILT_INS) perf$X $(RM) $(TEST_PROGRAMS) $(RM) *.spec *.pyc *.pyo */*.pyc */*.pyo common-cmds.h TAGS tags cscope* @@ -836,7 +838,7 @@ coverage-build: coverage-clean -j1 test coverage-report: - gcov -b *.c - grep '^function.*called 0 ' *.c.gcov \ + gcov -b *.c */*.c + grep '^function.*called 0 ' *.c.gcov */*.c.gcov \ | sed -e 's/\([^:]*\)\.gcov: *function \([^ ]*\) called.*/\1: \2/' \ | tee coverage-untested-functions diff --git a/Documentation/perf_counter/builtin-help.c b/Documentation/perf_counter/builtin-help.c index a136d619db3..6616de0ef05 100644 --- a/Documentation/perf_counter/builtin-help.c +++ b/Documentation/perf_counter/builtin-help.c @@ -3,13 +3,13 @@ * * Builtin help command */ -#include "cache.h" +#include "util/cache.h" #include "builtin.h" -#include "exec_cmd.h" +#include "util/exec_cmd.h" #include "common-cmds.h" -#include "parse-options.h" -#include "run-command.h" -#include "help.h" +#include "util/parse-options.h" +#include "util/run-command.h" +#include "util/help.h" static struct man_viewer_list { struct man_viewer_list *next; diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 169a2d1783f..d7ace631fc4 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -61,7 +61,7 @@ * Released under the GPL v2. (and only v2, not any later version) */ -#include "util.h" +#include "util/util.h" #include #include diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 98e8690b6bc..dea016fa41e 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -42,7 +42,7 @@ * Released under the GPL v2. (and only v2, not any later version) */ -#include "util.h" +#include "util/util.h" #include #include diff --git a/Documentation/perf_counter/builtin.h b/Documentation/perf_counter/builtin.h index 800f86c1d44..d32318aed8c 100644 --- a/Documentation/perf_counter/builtin.h +++ b/Documentation/perf_counter/builtin.h @@ -1,8 +1,8 @@ #ifndef BUILTIN_H #define BUILTIN_H -#include "util.h" -#include "strbuf.h" +#include "util/util.h" +#include "util/strbuf.h" extern const char perf_version_string[]; extern const char perf_usage_string[]; diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index 8d6faecdc15..594d270be39 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -1,8 +1,8 @@ #include "builtin.h" -#include "exec_cmd.h" -#include "cache.h" -#include "quote.h" -#include "run-command.h" +#include "util/exec_cmd.h" +#include "util/cache.h" +#include "util/quote.h" +#include "util/run-command.h" const char perf_usage_string[] = "perf [--version] [--help] COMMAND [ARGS]"; diff --git a/Documentation/perf_counter/PERF-VERSION-GEN b/Documentation/perf_counter/util/PERF-VERSION-GEN similarity index 100% rename from Documentation/perf_counter/PERF-VERSION-GEN rename to Documentation/perf_counter/util/PERF-VERSION-GEN diff --git a/Documentation/perf_counter/abspath.c b/Documentation/perf_counter/util/abspath.c similarity index 100% rename from Documentation/perf_counter/abspath.c rename to Documentation/perf_counter/util/abspath.c diff --git a/Documentation/perf_counter/alias.c b/Documentation/perf_counter/util/alias.c similarity index 100% rename from Documentation/perf_counter/alias.c rename to Documentation/perf_counter/util/alias.c diff --git a/Documentation/perf_counter/cache.h b/Documentation/perf_counter/util/cache.h similarity index 100% rename from Documentation/perf_counter/cache.h rename to Documentation/perf_counter/util/cache.h diff --git a/Documentation/perf_counter/config.c b/Documentation/perf_counter/util/config.c similarity index 100% rename from Documentation/perf_counter/config.c rename to Documentation/perf_counter/util/config.c diff --git a/Documentation/perf_counter/ctype.c b/Documentation/perf_counter/util/ctype.c similarity index 100% rename from Documentation/perf_counter/ctype.c rename to Documentation/perf_counter/util/ctype.c diff --git a/Documentation/perf_counter/exec_cmd.c b/Documentation/perf_counter/util/exec_cmd.c similarity index 100% rename from Documentation/perf_counter/exec_cmd.c rename to Documentation/perf_counter/util/exec_cmd.c diff --git a/Documentation/perf_counter/exec_cmd.h b/Documentation/perf_counter/util/exec_cmd.h similarity index 100% rename from Documentation/perf_counter/exec_cmd.h rename to Documentation/perf_counter/util/exec_cmd.h diff --git a/Documentation/perf_counter/generate-cmdlist.sh b/Documentation/perf_counter/util/generate-cmdlist.sh similarity index 100% rename from Documentation/perf_counter/generate-cmdlist.sh rename to Documentation/perf_counter/util/generate-cmdlist.sh diff --git a/Documentation/perf_counter/help.c b/Documentation/perf_counter/util/help.c similarity index 99% rename from Documentation/perf_counter/help.c rename to Documentation/perf_counter/util/help.c index ec011672166..edde541d238 100644 --- a/Documentation/perf_counter/help.c +++ b/Documentation/perf_counter/util/help.c @@ -1,5 +1,5 @@ #include "cache.h" -#include "builtin.h" +#include "../builtin.h" #include "exec_cmd.h" #include "levenshtein.h" #include "help.h" diff --git a/Documentation/perf_counter/help.h b/Documentation/perf_counter/util/help.h similarity index 100% rename from Documentation/perf_counter/help.h rename to Documentation/perf_counter/util/help.h diff --git a/Documentation/perf_counter/levenshtein.c b/Documentation/perf_counter/util/levenshtein.c similarity index 100% rename from Documentation/perf_counter/levenshtein.c rename to Documentation/perf_counter/util/levenshtein.c diff --git a/Documentation/perf_counter/levenshtein.h b/Documentation/perf_counter/util/levenshtein.h similarity index 100% rename from Documentation/perf_counter/levenshtein.h rename to Documentation/perf_counter/util/levenshtein.h diff --git a/Documentation/perf_counter/parse-options.c b/Documentation/perf_counter/util/parse-options.c similarity index 99% rename from Documentation/perf_counter/parse-options.c rename to Documentation/perf_counter/util/parse-options.c index 7464f34e540..28b34c1c29c 100644 --- a/Documentation/perf_counter/parse-options.c +++ b/Documentation/perf_counter/util/parse-options.c @@ -469,9 +469,6 @@ int parse_options_usage(const char * const *usagestr, } -/*----- some often used options -----*/ -#include "cache.h" - int parse_opt_verbosity_cb(const struct option *opt, const char *arg, int unset) { diff --git a/Documentation/perf_counter/parse-options.h b/Documentation/perf_counter/util/parse-options.h similarity index 100% rename from Documentation/perf_counter/parse-options.h rename to Documentation/perf_counter/util/parse-options.h diff --git a/Documentation/perf_counter/path.c b/Documentation/perf_counter/util/path.c similarity index 100% rename from Documentation/perf_counter/path.c rename to Documentation/perf_counter/util/path.c diff --git a/Documentation/perf_counter/quote.c b/Documentation/perf_counter/util/quote.c similarity index 100% rename from Documentation/perf_counter/quote.c rename to Documentation/perf_counter/util/quote.c diff --git a/Documentation/perf_counter/quote.h b/Documentation/perf_counter/util/quote.h similarity index 97% rename from Documentation/perf_counter/quote.h rename to Documentation/perf_counter/util/quote.h index 66730f2bff3..5dfad89816d 100644 --- a/Documentation/perf_counter/quote.h +++ b/Documentation/perf_counter/util/quote.h @@ -18,7 +18,7 @@ * run the command on the other side: * * sprintf(cmd, "git-diff-tree %s %s", sq_quote(arg0), sq_quote(arg1)); - * sprintf(rcmd, "ssh %s %s", sq_quote(host), sq_quote(cmd)); + * sprintf(rcmd, "ssh %s %s", sq_util/quote.host), sq_quote(cmd)); * * Note that the above examples leak memory! Remember to free result from * sq_quote() in a real application. diff --git a/Documentation/perf_counter/run-command.c b/Documentation/perf_counter/util/run-command.c similarity index 100% rename from Documentation/perf_counter/run-command.c rename to Documentation/perf_counter/util/run-command.c diff --git a/Documentation/perf_counter/run-command.h b/Documentation/perf_counter/util/run-command.h similarity index 100% rename from Documentation/perf_counter/run-command.h rename to Documentation/perf_counter/util/run-command.h diff --git a/Documentation/perf_counter/strbuf.c b/Documentation/perf_counter/util/strbuf.c similarity index 100% rename from Documentation/perf_counter/strbuf.c rename to Documentation/perf_counter/util/strbuf.c diff --git a/Documentation/perf_counter/strbuf.h b/Documentation/perf_counter/util/strbuf.h similarity index 100% rename from Documentation/perf_counter/strbuf.h rename to Documentation/perf_counter/util/strbuf.h diff --git a/Documentation/perf_counter/usage.c b/Documentation/perf_counter/util/usage.c similarity index 100% rename from Documentation/perf_counter/usage.c rename to Documentation/perf_counter/util/usage.c diff --git a/Documentation/perf_counter/util.h b/Documentation/perf_counter/util/util.h similarity index 100% rename from Documentation/perf_counter/util.h rename to Documentation/perf_counter/util/util.h diff --git a/Documentation/perf_counter/wrapper.c b/Documentation/perf_counter/util/wrapper.c similarity index 100% rename from Documentation/perf_counter/wrapper.c rename to Documentation/perf_counter/util/wrapper.c From 47c8a08bbe77ad3c06f63919a14b0f0b0cd54390 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 27 Apr 2009 17:34:39 +0900 Subject: [PATCH 0571/2061] sh: rtc-generic support. This adds rtc-generic support for SUPERH32. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 1 + arch/sh/include/asm/rtc.h | 11 +++++++++++ arch/sh/kernel/time_32.c | 23 +++++++++++++++++++++++ drivers/rtc/Kconfig | 2 +- 4 files changed, 36 insertions(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 164748945f9..9db02ced57a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -30,6 +30,7 @@ config SUPERH32 select HAVE_DYNAMIC_FTRACE select HAVE_ARCH_KGDB select ARCH_HIBERNATION_POSSIBLE if MMU + select RTC_LIB config SUPERH64 def_bool ARCH = "sh64" diff --git a/arch/sh/include/asm/rtc.h b/arch/sh/include/asm/rtc.h index f7b010d48af..52b0c2dba97 100644 --- a/arch/sh/include/asm/rtc.h +++ b/arch/sh/include/asm/rtc.h @@ -6,6 +6,17 @@ extern void (*board_time_init)(void); extern void (*rtc_sh_get_time)(struct timespec *); extern int (*rtc_sh_set_time)(const time_t); +/* some dummy definitions */ +#define RTC_BATT_BAD 0x100 /* battery bad */ +#define RTC_SQWE 0x08 /* enable square-wave output */ +#define RTC_DM_BINARY 0x04 /* all time/date values are BCD if clear */ +#define RTC_24H 0x02 /* 24 hour mode - else hours bit 7 means pm */ +#define RTC_DST_EN 0x01 /* auto switch DST - works f. USA only */ + +struct rtc_time; +unsigned int get_rtc_time(struct rtc_time *); +int set_rtc_time(struct rtc_time *); + #define RTC_CAP_4_DIGIT_YEAR (1 << 0) struct sh_rtc_platform_info { diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index c770413c321..109409f5ca5 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -19,6 +19,7 @@ #include /* for rtc_lock */ #include #include +#include #include #include #include @@ -45,6 +46,28 @@ static int null_rtc_set_time(const time_t secs) void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; +unsigned int get_rtc_time(struct rtc_time *tm) +{ + if (rtc_sh_get_time != null_rtc_get_time) { + struct timespec tv; + + rtc_sh_get_time(&tv); + rtc_time_to_tm(tv.tv_sec, tm); + } + + return RTC_24H; +} +EXPORT_SYMBOL(get_rtc_time); + +int set_rtc_time(struct rtc_time *tm) +{ + unsigned long secs; + + rtc_tm_to_time(tm, &secs); + return rtc_sh_set_time(secs); +} +EXPORT_SYMBOL(set_rtc_time); + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { diff --git a/drivers/rtc/Kconfig b/drivers/rtc/Kconfig index 4e9851fc174..277d35d232f 100644 --- a/drivers/rtc/Kconfig +++ b/drivers/rtc/Kconfig @@ -692,7 +692,7 @@ config RTC_DRV_GENERIC tristate "Generic RTC support" # Please consider writing a new RTC driver instead of using the generic # RTC abstraction - depends on PARISC || M68K || PPC + depends on PARISC || M68K || PPC || SUPERH32 help Say Y or M here to enable RTC support on systems using the generic RTC abstraction. If you do not know what you are doing, you should From c2e0090c668fc99f5be65fd9907da781cb6a2ef5 Mon Sep 17 00:00:00 2001 From: Jean-Christophe PLAGNIOL-VILLARD Date: Mon, 27 Apr 2009 17:54:41 +0900 Subject: [PATCH 0572/2061] sh: mach-r2d: add physmap-flash support for R2D+ boards. The RTS7751R2D_1 boards only support 1MB of socket-mounted MBM29F040 flash, which we just leave alone as it's not terribly interesting. This adds support for the s29gl256p on the r2d+ boards that makes a bit more sense to expose to the user. Signed-off-by: Jean-Christophe PLAGNIOL-VILLARD Signed-off-by: Paul Mundt --- arch/sh/boards/mach-r2d/setup.c | 50 ++++++++++++++ arch/sh/configs/rts7751r2dplus_defconfig | 87 +++++++++++++++++++++++- 2 files changed, 136 insertions(+), 1 deletion(-) diff --git a/arch/sh/boards/mach-r2d/setup.c b/arch/sh/boards/mach-r2d/setup.c index c585be00956..a625ecb93e4 100644 --- a/arch/sh/boards/mach-r2d/setup.c +++ b/arch/sh/boards/mach-r2d/setup.c @@ -10,6 +10,9 @@ */ #include #include +#include +#include +#include #include #include #include @@ -181,6 +184,50 @@ static struct platform_device sm501_device = { .resource = sm501_resources, }; +static struct mtd_partition r2d_partitions[] = { + { + .name = "U-Boot", + .offset = 0x00000000, + .size = 0x00040000, + .mask_flags = MTD_WRITEABLE, + }, { + .name = "Environment", + .offset = MTDPART_OFS_NXTBLK, + .size = 0x00040000, + .mask_flags = MTD_WRITEABLE, + }, { + .name = "Kernel", + .offset = MTDPART_OFS_NXTBLK, + .size = 0x001c0000, + }, { + .name = "Flash_FS", + .offset = MTDPART_OFS_NXTBLK, + .size = MTDPART_SIZ_FULL, + } +}; + +static struct physmap_flash_data flash_data = { + .width = 2, + .nr_parts = ARRAY_SIZE(r2d_partitions), + .parts = r2d_partitions, +}; + +static struct resource flash_resource = { + .start = 0x00000000, + .end = 0x02000000, + .flags = IORESOURCE_MEM, +}; + +static struct platform_device flash_device = { + .name = "physmap-flash", + .id = -1, + .resource = &flash_resource, + .num_resources = 1, + .dev = { + .platform_data = &flash_data, + }, +}; + static struct platform_device *rts7751r2d_devices[] __initdata = { &sm501_device, &heartbeat_device, @@ -203,6 +250,9 @@ static int __init rts7751r2d_devices_setup(void) if (register_trapped_io(&cf_trapped_io) == 0) platform_device_register(&cf_ide_device); + if (mach_is_r2d_plus()) + platform_device_register(&flash_device); + spi_register_board_info(spi_bus, ARRAY_SIZE(spi_bus)); return platform_add_devices(rts7751r2d_devices, diff --git a/arch/sh/configs/rts7751r2dplus_defconfig b/arch/sh/configs/rts7751r2dplus_defconfig index 2bc1c97d346..a860435b884 100644 --- a/arch/sh/configs/rts7751r2dplus_defconfig +++ b/arch/sh/configs/rts7751r2dplus_defconfig @@ -424,7 +424,92 @@ CONFIG_FIRMWARE_IN_KERNEL=y CONFIG_EXTRA_FIRMWARE="" # CONFIG_SYS_HYPERVISOR is not set # CONFIG_CONNECTOR is not set -# CONFIG_MTD is not set +CONFIG_MTD=y +# CONFIG_MTD_DEBUG is not set +CONFIG_MTD_CONCAT=y +CONFIG_MTD_PARTITIONS=y +# CONFIG_MTD_TESTS is not set +# CONFIG_MTD_REDBOOT_PARTS is not set +CONFIG_MTD_CMDLINE_PARTS=y +# CONFIG_MTD_AR7_PARTS is not set + +# +# User Modules And Translation Layers +# +CONFIG_MTD_CHAR=y +# CONFIG_MTD_BLKDEVS is not set +# CONFIG_MTD_BLOCK is not set +# CONFIG_MTD_BLOCK_RO is not set +# CONFIG_FTL is not set +# CONFIG_NFTL is not set +# CONFIG_INFTL is not set +# CONFIG_RFD_FTL is not set +# CONFIG_SSFDC is not set +# CONFIG_MTD_OOPS is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=y +# CONFIG_MTD_JEDECPROBE is not set +CONFIG_MTD_GEN_PROBE=y +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +# CONFIG_MTD_MAP_BANK_WIDTH_8 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_16 is not set +# CONFIG_MTD_MAP_BANK_WIDTH_32 is not set +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +# CONFIG_MTD_CFI_I4 is not set +# CONFIG_MTD_CFI_I8 is not set +# CONFIG_MTD_CFI_INTELEXT is not set +CONFIG_MTD_CFI_AMDSTD=y +# CONFIG_MTD_CFI_STAA is not set +CONFIG_MTD_CFI_UTIL=y +# CONFIG_MTD_RAM is not set +# CONFIG_MTD_ROM is not set +# CONFIG_MTD_ABSENT is not set + +# +# Mapping drivers for chip access +# +# CONFIG_MTD_COMPLEX_MAPPINGS is not set +CONFIG_MTD_PHYSMAP=y +# CONFIG_MTD_PHYSMAP_COMPAT is not set +# CONFIG_MTD_INTEL_VR_NOR is not set +# CONFIG_MTD_PLATRAM is not set + +# +# Self-contained MTD device drivers +# +# CONFIG_MTD_PMC551 is not set +# CONFIG_MTD_DATAFLASH is not set +# CONFIG_MTD_M25P80 is not set +# CONFIG_MTD_SLRAM is not set +# CONFIG_MTD_PHRAM is not set +# CONFIG_MTD_MTDRAM is not set +# CONFIG_MTD_BLOCK2MTD is not set + +# +# Disk-On-Chip Device Drivers +# +# CONFIG_MTD_DOC2000 is not set +# CONFIG_MTD_DOC2001 is not set +# CONFIG_MTD_DOC2001PLUS is not set +# CONFIG_MTD_NAND is not set +# CONFIG_MTD_ONENAND is not set + +# +# LPDDR flash memory drivers +# +# CONFIG_MTD_LPDDR is not set + +# +# UBI - Unsorted block images +# +# CONFIG_MTD_UBI is not set # CONFIG_PARPORT is not set CONFIG_BLK_DEV=y # CONFIG_BLK_CPQ_CISS_DA is not set From ac2ff946a53e7bd0ae98f4e5d1d6c1b1dced82e5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 12:38:32 +0900 Subject: [PATCH 0573/2061] mg_disk: fix locking IRQ and timeout handlers call functions which expect locked queue lock without locking it. Fix it. While at it, convert 0s used as null pointer constant to NULLs. [ Impact: fix locking, cleanup ] Signed-off-by: Tejun Heo Cc: unsik Kim --- drivers/block/mg_disk.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index fb39d9aa3cd..d3e72ad08db 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -160,11 +160,16 @@ static irqreturn_t mg_irq(int irq, void *dev_id) struct mg_host *host = dev_id; void (*handler)(struct mg_host *) = host->mg_do_intr; - host->mg_do_intr = 0; + spin_lock(&host->lock); + + host->mg_do_intr = NULL; del_timer(&host->timer); if (!handler) handler = mg_unexpected_intr; handler(host); + + spin_unlock(&host->lock); + return IRQ_HANDLED; } @@ -319,7 +324,7 @@ static void mg_read(struct request *req) remains = req->nr_sectors; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, 0) != + if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) != MG_ERR_NONE) mg_bad_rw_intr(host); @@ -363,7 +368,7 @@ static void mg_write(struct request *req) remains = req->nr_sectors; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, 0) != + if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; @@ -521,9 +526,11 @@ void mg_times_out(unsigned long data) char *name; struct request *req; + spin_lock_irq(&host->lock); + req = elv_next_request(host->breq); if (!req) - return; + goto out_unlock; host->mg_do_intr = NULL; @@ -534,6 +541,8 @@ void mg_times_out(unsigned long data) mg_bad_rw_intr(host); mg_request(host->breq); +out_unlock: + spin_unlock_irq(&host->lock); } static void mg_request_poll(struct request_queue *q) From 7090a0a97f55cbf47547a140fcc5a349f32c598c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Apr 2009 12:38:33 +0900 Subject: [PATCH 0574/2061] mg_disk: fix CONFIG_LBD=y warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit drivers/block/mg_disk.c: In function ‘mg_dump_status’: drivers/block/mg_disk.c:265: warning: format ‘%ld’ expects type ‘long int’, but argument 2 has type ‘sector_t’ [ Impact: kill build warning ] Cc: unsik Kim Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Tejun Heo --- drivers/block/mg_disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index d3e72ad08db..f3898353d0a 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -79,7 +79,7 @@ static void mg_dump_status(const char *msg, unsigned int stat, if (host->breq) { req = elv_next_request(host->breq); if (req) - printk(", sector=%ld", req->sector); + printk(", sector=%u", (u32)req->sector); } } From e93b9fb7d85da4fd9d5171649e5ddcac1dd572bf Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 12:38:33 +0900 Subject: [PATCH 0575/2061] hd: fix locking hd dance around local irq and HD_IRQ enable without achieving much. It ends up transferring data from irq handler with both local irq and HD_IRQ disabled. The only place it actually does something is while transferring the first block of a request which it does with HD_IRQ disabled but local irq enabled. Unfortunately, the dancing is horribly broken from locking POV. IRQ and timeout handlers access block queue without grabbing the queue lock and running the driver in SMP configuration crashes the whole machine pretty quickly. Remove meaningless irq enable/disable dancing and add proper locking in issue, irq and timeout paths. Signed-off-by: Tejun Heo --- drivers/block/hd.c | 17 +++++++---------- 1 file changed, 7 insertions(+), 10 deletions(-) diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 3c11f062a18..baaa9e486e5 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -509,7 +509,6 @@ ok_to_write: if (i > 0) { SET_HANDLER(&write_intr); outsw(HD_DATA, req->buffer, 256); - local_irq_enable(); } else { #if (HD_DELAY > 0) last_req = read_timer(); @@ -541,8 +540,7 @@ static void hd_times_out(unsigned long dummy) if (!CURRENT) return; - disable_irq(HD_IRQ); - local_irq_enable(); + spin_lock_irq(hd_queue->queue_lock); reset = 1; name = CURRENT->rq_disk->disk_name; printk("%s: timeout\n", name); @@ -552,9 +550,8 @@ static void hd_times_out(unsigned long dummy) #endif end_request(CURRENT, 0); } - local_irq_disable(); hd_request(); - enable_irq(HD_IRQ); + spin_unlock_irq(hd_queue->queue_lock); } static int do_special_op(struct hd_i_struct *disk, struct request *req) @@ -592,7 +589,6 @@ static void hd_request(void) return; repeat: del_timer(&device_timer); - local_irq_enable(); req = CURRENT; if (!req) { @@ -601,7 +597,6 @@ repeat: } if (reset) { - local_irq_disable(); reset_hd(); return; } @@ -660,9 +655,7 @@ repeat: static void do_hd_request(struct request_queue *q) { - disable_irq(HD_IRQ); hd_request(); - enable_irq(HD_IRQ); } static int hd_getgeo(struct block_device *bdev, struct hd_geometry *geo) @@ -684,12 +677,16 @@ static irqreturn_t hd_interrupt(int irq, void *dev_id) { void (*handler)(void) = do_hd; + spin_lock(hd_queue->queue_lock); + do_hd = NULL; del_timer(&device_timer); if (!handler) handler = unexpected_hd_interrupt; handler(); - local_irq_enable(); + + spin_unlock(hd_queue->queue_lock); + return IRQ_HANDLED; } From e686307fdc84f249490e6c9da92fcb2424491f14 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 17 Apr 2009 08:41:21 +0200 Subject: [PATCH 0576/2061] loop: use BIO list management functions Now that the bio list management stuff is generic, convert loop to use bio lists instead of its own private bio list implementation. Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Akinobu Mita Signed-off-by: Jens Axboe --- drivers/block/loop.c | 26 +++++++------------------- include/linux/bio.h | 2 +- include/linux/loop.h | 3 +-- 3 files changed, 9 insertions(+), 22 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index ddae8082589..9ca4bb01465 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -511,11 +511,7 @@ out: */ static void loop_add_bio(struct loop_device *lo, struct bio *bio) { - if (lo->lo_biotail) { - lo->lo_biotail->bi_next = bio; - lo->lo_biotail = bio; - } else - lo->lo_bio = lo->lo_biotail = bio; + bio_list_add(&lo->lo_bio_list, bio); } /* @@ -523,16 +519,7 @@ static void loop_add_bio(struct loop_device *lo, struct bio *bio) */ static struct bio *loop_get_bio(struct loop_device *lo) { - struct bio *bio; - - if ((bio = lo->lo_bio)) { - if (bio == lo->lo_biotail) - lo->lo_biotail = NULL; - lo->lo_bio = bio->bi_next; - bio->bi_next = NULL; - } - - return bio; + return bio_list_pop(&lo->lo_bio_list); } static int loop_make_request(struct request_queue *q, struct bio *old_bio) @@ -609,12 +596,13 @@ static int loop_thread(void *data) set_user_nice(current, -20); - while (!kthread_should_stop() || lo->lo_bio) { + while (!kthread_should_stop() || !bio_list_empty(&lo->lo_bio_list)) { wait_event_interruptible(lo->lo_event, - lo->lo_bio || kthread_should_stop()); + !bio_list_empty(&lo->lo_bio_list) || + kthread_should_stop()); - if (!lo->lo_bio) + if (bio_list_empty(&lo->lo_bio_list)) continue; spin_lock_irq(&lo->lo_lock); bio = loop_get_bio(lo); @@ -841,7 +829,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, lo->old_gfp_mask = mapping_gfp_mask(mapping); mapping_set_gfp_mask(mapping, lo->old_gfp_mask & ~(__GFP_IO|__GFP_FS)); - lo->lo_bio = lo->lo_biotail = NULL; + bio_list_init(&lo->lo_bio_list); /* * set queue make_request_fn, and add limits based on lower level diff --git a/include/linux/bio.h b/include/linux/bio.h index 7b214fd672a..f37ca8c726b 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -506,7 +506,7 @@ static inline int bio_has_data(struct bio *bio) } /* - * BIO list managment for use by remapping drivers (e.g. DM or MD). + * BIO list management for use by remapping drivers (e.g. DM or MD) and loop. * * A bio_list anchors a singly-linked list of bios chained through the bi_next * member of the bio. The bio_list also caches the last list member to allow diff --git a/include/linux/loop.h b/include/linux/loop.h index 40725447f5e..66c194e2d9b 100644 --- a/include/linux/loop.h +++ b/include/linux/loop.h @@ -56,8 +56,7 @@ struct loop_device { gfp_t old_gfp_mask; spinlock_t lo_lock; - struct bio *lo_bio; - struct bio *lo_biotail; + struct bio_list lo_bio_list; int lo_state; struct mutex lo_ctl_mutex; struct task_struct *lo_thread; From 924cec7789f65ab7f022256f6533ecba0747b5f3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0577/2061] block: clear req->errors on bio completion only for fs requests Impact: subtle behavior change For fs requests, rq is only carrier of bios and rq error status as a whole doesn't mean much. This is the reason why rq->errors is being cleared on each partial completion of a request as on each partial completion the error status is transferred to the respective bios. For pc requests, rq->errors is used to carry error status to the issuer and thus __end_that_request_first() doesn't clear it on such cases. The condition was fine till now as only fs and pc requests have used bio and thus the bio completion path. However, future changes will unify data accesses to bio and all non fs users care about rq error status. Clear rq->errors on bio completion only for fs requests. In general, the implicit clearing is a bit too subtle especially as the meaning of rq->errors is completely dependent on low level drivers. Unifying / cleaning up rq->errors usage and letting llds manage it would be better. TODO comment added. Signed-off-by: Tejun Heo Acked-by: Jens Axboe --- block/blk-core.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2998fe3a237..41bc0ff75e2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1741,10 +1741,14 @@ static int __end_that_request_first(struct request *req, int error, trace_block_rq_complete(req->q, req); /* - * for a REQ_TYPE_BLOCK_PC request, we want to carry any eventual - * sense key with us all the way through + * For fs requests, rq is just carrier of independent bio's + * and each partial completion should be handled separately. + * Reset per-request error on each partial completion. + * + * TODO: tj: This is too subtle. It would be better to let + * low level drivers do what they see fit. */ - if (!blk_pc_request(req)) + if (blk_fs_request(req)) req->errors = 0; if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { From 0de57fb93b1daaeaecb658a98b3299ae460c02e9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0578/2061] ide-tape: remove back-to-back REQUEST_SENSE detection Impact: fix an oops which always triggers ide_tape_issue_pc() assumed drive->pc isn't NULL on invocation when checking for back-to-back request sense issues but drive->pc can be NULL and even when it's not NULL, it's not safe to dereference it once the previous command is complete because pc could have been freed or was on stack. Kill back-to-back REQUEST_SENSE detection. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index cb942a9b580..3a53e0834cf 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -614,12 +614,6 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, { idetape_tape_t *tape = drive->driver_data; - if (drive->pc->c[0] == REQUEST_SENSE && - pc->c[0] == REQUEST_SENSE) { - printk(KERN_ERR "ide-tape: possible ide-tape.c bug - " - "Two request sense in serial were issued\n"); - } - if (drive->failed_pc == NULL && pc->c[0] != REQUEST_SENSE) drive->failed_pc = pc; From 220d06b5531e7b8a6226b2fdfb21198c3ccc4f76 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0579/2061] ide: use blk_run_queue() instead of blk_start_queueing() blk_start_queueing() is being phased out in favor of [__]blk_run_queue(). Switch. Signed-off-by: Tejun Heo --- drivers/ide/ide-park.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/ide/ide-park.c b/drivers/ide/ide-park.c index 310d03f2b5b..a914023d6d0 100644 --- a/drivers/ide/ide-park.c +++ b/drivers/ide/ide-park.c @@ -24,11 +24,8 @@ static void issue_park_cmd(ide_drive_t *drive, unsigned long timeout) start_queue = 1; spin_unlock_irq(&hwif->lock); - if (start_queue) { - spin_lock_irq(q->queue_lock); - blk_start_queueing(q); - spin_unlock_irq(q->queue_lock); - } + if (start_queue) + blk_run_queue(q); return; } spin_unlock_irq(&hwif->lock); From b2963ac1738542d30305d7e12c8c078a383a425c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0580/2061] ide: don't set REQ_SOFTBARRIER ide doesn't have to worry about REQ_SOFTBARRIER. Don't set it. Signed-off-by: Tejun Heo --- drivers/ide/ide-disk.c | 1 - drivers/ide/ide-ioctls.c | 1 - 2 files changed, 2 deletions(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index a9fbe2c3121..c2438804d3c 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -411,7 +411,6 @@ static void idedisk_prepare_flush(struct request_queue *q, struct request *rq) cmd->protocol = ATA_PROT_NODATA; rq->cmd_type = REQ_TYPE_ATA_TASKFILE; - rq->cmd_flags |= REQ_SOFTBARRIER; rq->special = cmd; } diff --git a/drivers/ide/ide-ioctls.c b/drivers/ide/ide-ioctls.c index c1c25ebbaa1..5991b23793f 100644 --- a/drivers/ide/ide-ioctls.c +++ b/drivers/ide/ide-ioctls.c @@ -231,7 +231,6 @@ static int generic_drive_reset(ide_drive_t *drive) rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_len = 1; rq->cmd[0] = REQ_DRIVE_RESET; - rq->cmd_flags |= REQ_SOFTBARRIER; if (blk_execute_rq(drive->queue, NULL, rq, 1)) ret = rq->errors; blk_put_request(rq); From 214ae19104141404f18ad6651752eb38e3dd584e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0581/2061] ide kill unused ide_cmd->special Impact: removal of unused field No one uses ide_cmd->special anymore. Kill it. Signed-off-by: Tejun Heo --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) diff --git a/include/linux/ide.h b/include/linux/ide.h index ff65fffb078..846a1e13240 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -324,7 +324,6 @@ struct ide_cmd { unsigned int cursg_ofs; struct request *rq; /* copy of request */ - void *special; /* valid_t generally */ }; /* ATAPI packet command flags */ From 59a4f6f355fc718581ddcf1bb45a469d4756c035 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:41 +0900 Subject: [PATCH 0582/2061] ide-cd: clear sense buffer before issuing request sense Impact: code simplification ide_cd_request_sense_fixup() clears the tail of the sense buffer if the device didn't completely fill it. This patch makes cdrom_queue_request_sense() clear the sense buffer before issuing the command instead of clearing it afterwards. This simplifies code and eases future changes. Signed-off-by: Tejun Heo --- drivers/ide/ide-cd.c | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 3d4e0996976..b6a0d126b57 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -217,6 +217,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, if (sense == NULL) sense = &info->sense_data; + memset(sense, 0, 18); + /* stuff the sense request in front of our current request */ blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_ATA_PC; @@ -504,14 +506,8 @@ static void ide_cd_request_sense_fixup(ide_drive_t *drive, struct ide_cmd *cmd) * and some drives don't send them. Sigh. */ if (rq->cmd[0] == GPCMD_REQUEST_SENSE && - cmd->nleft > 0 && cmd->nleft <= 5) { - unsigned int ofs = cmd->nbytes - cmd->nleft; - - while (cmd->nleft > 0) { - *((u8 *)rq->data + ofs++) = 0; - cmd->nleft--; - } - } + cmd->nleft > 0 && cmd->nleft <= 5) + cmd->nleft = 0; } int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, From 8968932e54db35cf9d69cfbbd50c26dfaaa586c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0583/2061] ide-floppy: block pc always uses bio Impact: remove unnecessary code path Block pc requests always use bio and rq->data is always NULL. No need to worry about !rq->bio cases in idefloppy_block_pc_cmd(). Note that ide-atapi uses ide_pio_bytes() for bio PIO transfer which handle sg fine. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-floppy.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 2b4868d95f8..3b22e066287 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -216,15 +216,13 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - if (rq->data_len && rq_data_dir(rq) == WRITE) - pc->flags |= PC_FLAG_WRITING; - pc->buf = rq->data; - if (rq->bio) + if (rq->data_len) { pc->flags |= PC_FLAG_DMA_OK; - /* - * possibly problematic, doesn't look like ide-floppy correctly - * handled scattered requests if dma fails... - */ + if (rq_data_dir(rq) == WRITE) + pc->flags |= PC_FLAG_WRITING; + } + /* pio will be performed by ide_pio_bytes() which handles sg fine */ + pc->buf = NULL; pc->req_xfer = pc->buf_size = rq->data_len; } From d868ca24302e99a0e8a86071ca2c66273edf97d9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0584/2061] ide-taskfile: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide_raw_taskfile() directly uses rq->buffer to carry pointer to the data buffer. This complicates both block interface and ide backend request handling. Use blk_rq_map_kern() instead and drop special handling for REQ_TYPE_ATA_TASKFILE from ide_map_sg(). Note that REQ_RW setting is moved upwards as blk_rq_map_kern() uses it to initialize bio rw flag. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 5 +---- drivers/ide/ide-taskfile.c | 18 +++++++++++------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 35dc38d3b2c..9b9e8b1aae5 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -248,10 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) struct scatterlist *sg = hwif->sg_table; struct request *rq = cmd->rq; - if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) { - sg_init_one(sg, rq->buffer, rq->nr_sectors * SECTOR_SIZE); - cmd->sg_nents = 1; - } else if (!rq->bio) { + if (!rq->bio) { sg_init_one(sg, rq->data, rq->data_len); cmd->sg_nents = 1; } else diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index 4aa6223c11b..f400eb4d4af 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -424,7 +424,9 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_ATA_TASKFILE; - rq->buffer = buf; + + if (cmd->tf_flags & IDE_TFLAG_WRITE) + rq->cmd_flags |= REQ_RW; /* * (ks) We transfer currently only whole sectors. @@ -432,18 +434,20 @@ int ide_raw_taskfile(ide_drive_t *drive, struct ide_cmd *cmd, u8 *buf, * if we would find a solution to transfer any size. * To support special commands like READ LONG. */ - rq->hard_nr_sectors = rq->nr_sectors = nsect; - rq->hard_cur_sectors = rq->current_nr_sectors = nsect; - - if (cmd->tf_flags & IDE_TFLAG_WRITE) - rq->cmd_flags |= REQ_RW; + if (nsect) { + error = blk_rq_map_kern(drive->queue, rq, buf, + nsect * SECTOR_SIZE, __GFP_WAIT); + if (error) + goto put_req; + } rq->special = cmd; cmd->rq = rq; error = blk_execute_rq(drive->queue, NULL, rq, 0); - blk_put_request(rq); +put_req: + blk_put_request(rq); return error; } From ac0b0113ddbab3ed2388132d368c97292f9f3c84 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0585/2061] ide-atapi: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide-atapi uses rq->buffer as private opaque value for internal special requests. rq->special isn't used for these cases (the only case where rq->special is used is for ide-tape rw requests). Use rq->special instead. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-floppy.c | 2 +- drivers/ide/ide-tape.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 7201b176d75..2894577237b 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -90,7 +90,7 @@ static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, blk_rq_init(NULL, rq); rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd_flags |= REQ_PREEMPT; - rq->buffer = (char *)pc; + rq->special = (char *)pc; rq->rq_disk = disk; if (pc->req_xfer) { @@ -119,7 +119,7 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; - rq->buffer = (char *)pc; + rq->special = (char *)pc; if (pc->req_xfer) { rq->data = pc->buf; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 3b22e066287..94600331a27 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -264,7 +264,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); } else if (blk_special_request(rq)) { - pc = (struct ide_atapi_pc *) rq->buffer; + pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; idefloppy_blockpc_cmd(floppy, pc, rq); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 3a53e0834cf..aadf53cfac6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -828,7 +828,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, goto out; } if (rq->cmd[13] & REQ_IDETAPE_PC1) { - pc = (struct ide_atapi_pc *) rq->buffer; + pc = (struct ide_atapi_pc *)rq->special; rq->cmd[13] &= ~(REQ_IDETAPE_PC1); rq->cmd[13] |= REQ_IDETAPE_PC2; goto out; From 1f181d2b1569dfb88a584a6e1847e9e1c7645951 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0586/2061] ide-cd: don't abuse rq->buffer Impact: rq->buffer usage cleanup ide-cd uses rq->buffer to carry pointer to the original request when issuing REQUEST_SENSE. Use rq->special instead. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-cd.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index b6a0d126b57..bb77c79c101 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -232,8 +232,8 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, rq->cmd_type = REQ_TYPE_SENSE; rq->cmd_flags |= REQ_PREEMPT; - /* NOTE! Save the failed command in "rq->buffer" */ - rq->buffer = (void *) failed_command; + /* NOTE! Save the failed command in "rq->special" */ + rq->special = (void *)failed_command; if (failed_command) ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x", @@ -247,10 +247,10 @@ static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* - * For REQ_TYPE_SENSE, "rq->buffer" points to the original + * For REQ_TYPE_SENSE, "rq->special" points to the original * failed request */ - struct request *failed = (struct request *)rq->buffer; + struct request *failed = (struct request *)rq->special; struct cdrom_info *info = drive->driver_data; void *sense = &info->sense_data; From e69d800f7e8797a8e3423380ee9d8ca1cb90c388 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0587/2061] ide: add helpers for preparing sense requests This is in preparation of removing the queueing of a sense request out of the IRQ handler path. Use struct request_sense as a general sense buffer for all ATAPI devices ide-{floppy,tape,cd}. tj: * blk_get_request(__GFP_WAIT) can't be called from do_request() as it can cause deadlock. Converted to use inline struct request and blk_rq_init(). * Added xfer / cdb len selection depending on device type. * All sense prep logics folded into ide_prep_sense() which never fails. * hwif->rq clearing and sense_rq used handling moved into ide_queue_sense_rq(). * blk_rq_map_kern() conversion is moved to later patch. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 61 +++++++++++++++++++++++++++++++++++++++++ include/linux/ide.h | 11 ++++++++ 2 files changed, 72 insertions(+) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2894577237b..c6e03485a63 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -191,6 +191,67 @@ void ide_create_request_sense_cmd(ide_drive_t *drive, struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_create_request_sense_cmd); +void ide_prep_sense(ide_drive_t *drive, struct request *rq) +{ + struct request_sense *sense = &drive->sense_data; + struct request *sense_rq = &drive->sense_rq; + unsigned int cmd_len, sense_len; + + debug_log("%s: enter\n", __func__); + + switch (drive->media) { + case ide_floppy: + cmd_len = 255; + sense_len = 18; + break; + case ide_tape: + cmd_len = 20; + sense_len = 20; + break; + default: + cmd_len = 18; + sense_len = 18; + } + + BUG_ON(sense_len > sizeof(*sense)); + + if (blk_sense_request(rq) || drive->sense_rq_armed) + return; + + memset(sense, 0, sizeof(*sense)); + + blk_rq_init(rq->q, sense_rq); + sense_rq->rq_disk = rq->rq_disk; + + sense_rq->data = sense; + sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; + sense_rq->cmd[4] = cmd_len; + sense_rq->data_len = sense_len; + + sense_rq->cmd_type = REQ_TYPE_SENSE; + sense_rq->cmd_flags |= REQ_PREEMPT; + + if (drive->media == ide_tape) + sense_rq->cmd[13] = REQ_IDETAPE_PC1; + + drive->sense_rq_armed = true; +} +EXPORT_SYMBOL_GPL(ide_prep_sense); + +void ide_queue_sense_rq(ide_drive_t *drive, void *special) +{ + BUG_ON(!drive->sense_rq_armed); + + drive->sense_rq.special = special; + drive->sense_rq_armed = false; + + drive->hwif->rq = NULL; + + elv_add_request(drive->queue, &drive->sense_rq, + ELEVATOR_INSERT_FRONT, 0); +} +EXPORT_SYMBOL_GPL(ide_queue_sense_rq); + /* * Called when an error was detected during the last packet command. * We queue a request sense packet command in the head of the request list. diff --git a/include/linux/ide.h b/include/linux/ide.h index 846a1e13240..a69ccac5641 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -26,6 +26,9 @@ #include #include +/* for request_sense */ +#include + #if defined(CONFIG_CRIS) || defined(CONFIG_FRV) || defined(CONFIG_MN10300) # define SUPPORT_VLB_SYNC 0 #else @@ -602,6 +605,11 @@ struct ide_drive_s { struct ide_atapi_pc request_sense_pc; struct request request_sense_rq; + + /* current sense rq and buffer */ + bool sense_rq_armed; + struct request sense_rq; + struct request_sense sense_data; }; typedef struct ide_drive_s ide_drive_t; @@ -1175,6 +1183,9 @@ int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_prep_sense(ide_drive_t *drive, struct request *rq); +void ide_queue_sense_rq(ide_drive_t *drive, void *special); + int ide_cd_expiry(ide_drive_t *); int ide_cd_get_xferlen(struct request *); From c457ce874a0f3dfa3d5e9f2309789f6f34e24325 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0588/2061] ide-cd: convert to using generic sense request Preallocate a sense request in the ->do_request method and reinitialize it only on demand, in case it's been consumed in the IRQ handler path. The reason for this is that we don't want to be mapping rq to bio in the IRQ path and introduce all kinds of unnecessary hacks to the block layer. tj: * Both user and kernel PC requests expect sense data to be stored in separate storage other than drive->sense_data. Copy sense data to rq->sense on completion if rq->sense is not NULL. This fixes bogus sense data on PC requests. As a result, remove cdrom_queue_request_sense. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-cd.c | 54 ++++++++++---------------------------------- drivers/ide/ide-cd.h | 4 ---- 2 files changed, 12 insertions(+), 46 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index bb77c79c101..061d7bbcd34 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -206,44 +206,6 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, ide_cd_log_error(drive->name, failed_command, sense); } -static void cdrom_queue_request_sense(ide_drive_t *drive, void *sense, - struct request *failed_command) -{ - struct cdrom_info *info = drive->driver_data; - struct request *rq = &drive->request_sense_rq; - - ide_debug_log(IDE_DBG_SENSE, "enter"); - - if (sense == NULL) - sense = &info->sense_data; - - memset(sense, 0, 18); - - /* stuff the sense request in front of our current request */ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_ATA_PC; - rq->rq_disk = info->disk; - - rq->data = sense; - rq->cmd[0] = GPCMD_REQUEST_SENSE; - rq->cmd[4] = 18; - rq->data_len = 18; - - rq->cmd_type = REQ_TYPE_SENSE; - rq->cmd_flags |= REQ_PREEMPT; - - /* NOTE! Save the failed command in "rq->special" */ - rq->special = (void *)failed_command; - - if (failed_command) - ide_debug_log(IDE_DBG_SENSE, "failed_cmd: 0x%x", - failed_command->cmd[0]); - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* @@ -251,11 +213,16 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) * failed request */ struct request *failed = (struct request *)rq->special; - struct cdrom_info *info = drive->driver_data; - void *sense = &info->sense_data; + struct request_sense *sense = &drive->sense_data; if (failed) { if (failed->sense) { + /* + * Sense is always read into drive->sense_data. + * Copy back if the failed request has its + * sense pointer set. + */ + memcpy(failed->sense, sense, 18); sense = failed->sense; failed->sense_len = rq->sense_len; } @@ -431,7 +398,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - cdrom_queue_request_sense(drive, NULL, NULL); + ide_queue_sense_rq(drive, NULL); return 1; end_request: @@ -445,7 +412,7 @@ end_request: hwif->rq = NULL; - cdrom_queue_request_sense(drive, rq->sense, rq); + ide_queue_sense_rq(drive, rq); return 1; } else return 2; @@ -893,6 +860,9 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, goto out_end; } + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-cd.h b/drivers/ide/ide-cd.h index 1d97101099c..93a3cf1b0f3 100644 --- a/drivers/ide/ide-cd.h +++ b/drivers/ide/ide-cd.h @@ -87,10 +87,6 @@ struct cdrom_info { struct atapi_toc *toc; - /* The result of the last successful request sense command - on this device. */ - struct request_sense sense_data; - u8 max_speed; /* Max speed of the drive. */ u8 current_speed; /* Current speed of the drive. */ From 068753203e6cd085664a62e0fc0636e19b148a12 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0589/2061] ide-atapi: convert ide-{floppy,tape} to using preallocated sense buffer Since we're issuing REQ_TYPE_SENSE now we need to allow those types of rqs in the ->do_request callbacks. As a future improvement, sense_len assignment might be unified across all ATAPI devices. Borislav to check with specs and test. As a result, get rid of ide_queue_pc_head() and drive->request_sense_rq. tj: * Init request sense ide_atapi_pc from sense request. In the longer timer, it would probably better to fold ide_create_request_sense_cmd() into its only current user - ide_floppy_get_format_progress(). * ide_retry_pc() no longer takes @disk. CC: Bartlomiej Zolnierkiewicz CC: FUJITA Tomonori Signed-off-by: Borislav Petkov Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 48 ++++++++++++---------------------------- drivers/ide/ide-floppy.c | 4 +++- drivers/ide/ide-tape.c | 7 ++++-- include/linux/ide.h | 3 +-- 4 files changed, 23 insertions(+), 39 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index c6e03485a63..972c522516f 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -79,34 +79,6 @@ void ide_init_pc(struct ide_atapi_pc *pc) } EXPORT_SYMBOL_GPL(ide_init_pc); -/* - * Generate a new packet command request in front of the request queue, before - * the current request, so that it will be processed immediately, on the next - * pass through the driver. - */ -static void ide_queue_pc_head(ide_drive_t *drive, struct gendisk *disk, - struct ide_atapi_pc *pc, struct request *rq) -{ - blk_rq_init(NULL, rq); - rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_PREEMPT; - rq->special = (char *)pc; - rq->rq_disk = disk; - - if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; - } - - memcpy(rq->cmd, pc->c, 12); - if (drive->media == ide_tape) - rq->cmd[13] = REQ_IDETAPE_PC1; - - drive->hwif->rq = NULL; - - elv_add_request(drive->queue, rq, ELEVATOR_INSERT_FRONT, 0); -} - /* * Add a special packet command request to the tail of the request queue, * and wait for it to be serviced. @@ -254,18 +226,26 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq); /* * Called when an error was detected during the last packet command. - * We queue a request sense packet command in the head of the request list. + * We queue a request sense packet command at the head of the request + * queue. */ -void ide_retry_pc(ide_drive_t *drive, struct gendisk *disk) +void ide_retry_pc(ide_drive_t *drive) { - struct request *rq = &drive->request_sense_rq; + struct request *sense_rq = &drive->sense_rq; struct ide_atapi_pc *pc = &drive->request_sense_pc; (void)ide_read_error(drive); - ide_create_request_sense_cmd(drive, pc); + + /* init pc from sense_rq */ + ide_init_pc(pc); + memcpy(pc->c, sense_rq->cmd, 12); + pc->buf = sense_rq->data; + pc->req_xfer = sense_rq->data_len; + if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_pc_head(drive, disk, pc, rq); + + ide_queue_sense_rq(drive, pc); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -404,7 +384,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) debug_log("[cmd %x]: check condition\n", rq->cmd[0]); /* Retry operation */ - ide_retry_pc(drive, rq->rq_disk); + ide_retry_pc(drive); /* queued, but not started */ return ide_stopped; diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 94600331a27..d3302cc891e 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -263,7 +263,7 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, } pc = &floppy->queued_pc; idefloppy_create_rw_cmd(drive, pc, rq, (unsigned long)block); - } else if (blk_special_request(rq)) { + } else if (blk_special_request(rq) || blk_sense_request(rq)) { pc = (struct ide_atapi_pc *)rq->special; } else if (blk_pc_request(rq)) { pc = &floppy->queued_pc; @@ -273,6 +273,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index aadf53cfac6..8324dfa78a3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -695,7 +695,7 @@ static ide_startstop_t idetape_media_access_finished(ide_drive_t *drive) printk(KERN_ERR "ide-tape: %s: I/O error, ", tape->name); /* Retry operation */ - ide_retry_pc(drive, tape->disk); + ide_retry_pc(drive); return ide_stopped; } pc->error = 0; @@ -752,7 +752,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - if (!blk_special_request(rq)) { + if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. */ printk(KERN_NOTICE "ide-tape: %s: Unsupported request in " "request queue (%d)\n", drive->name, rq->cmd_type); @@ -840,6 +840,9 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, BUG(); out: + /* prepare sense request for this command */ + ide_prep_sense(drive, rq); + memset(&cmd, 0, sizeof(cmd)); if (rq_data_dir(rq)) diff --git a/include/linux/ide.h b/include/linux/ide.h index a69ccac5641..9e67ccac3c1 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -604,7 +604,6 @@ struct ide_drive_s { unsigned long atapi_flags; struct ide_atapi_pc request_sense_pc; - struct request request_sense_rq; /* current sense rq and buffer */ bool sense_rq_armed; @@ -1181,7 +1180,7 @@ int ide_do_test_unit_ready(ide_drive_t *, struct gendisk *); int ide_do_start_stop(ide_drive_t *, struct gendisk *, int); int ide_set_media_lock(ide_drive_t *, struct gendisk *, int); void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); -void ide_retry_pc(ide_drive_t *, struct gendisk *); +void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); void ide_queue_sense_rq(ide_drive_t *drive, void *special); From 02e7cf8f848841ca21864ccd019e480b73c323b7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:42 +0900 Subject: [PATCH 0590/2061] ide-cd,atapi: use bio for internal commands Impact: unify request data buffer handling rq->data is used mostly to pass kernel buffer through request queue without using bio. There are only a couple of places which still do this in kernel and converting to bio isn't difficult. This patch converts ide-cd and atapi to use bio instead of rq->data for request sense and internal pc commands. With previous change to unify sense request handling, this is relatively easily achieved by adding blk_rq_map_kern() during sense_rq prep and PC issue. If blk_rq_map_kern() fails for sense, the error is deferred till sense issue and aborts the failed command which triggered the sense. Note that this is a slim possibility as sense prep is done on each command issue, so for the above condition to actually trigger, all preps since the last sense issue till the issue of the request which would require a sense should fail. * do_request functions might sleep now. This should be okay as ide request_fn - do_ide_request() - is invoked only from make_request and plug work. Make sure this is the case by adding might_sleep() to do_ide_request(). * Functions which access the read sense data before the sense request is complete now should access bio_data(sense_rq->bio) as the sense buffer might have been copied during blk_rq_map_kern(). * ide-tape updated to map sg. * cdrom_do_block_pc() now doesn't have to deal with REQ_TYPE_ATA_PC special case. Simplified. * tp_ops->output/input_data path dropped from ide_pc_intr(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 52 ++++++++++++++++++++++++----------------- drivers/ide/ide-cd.c | 28 +++++++++++----------- drivers/ide/ide-io.c | 3 +++ drivers/ide/ide-tape.c | 3 +++ include/linux/ide.h | 2 +- 5 files changed, 52 insertions(+), 36 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 972c522516f..5cefe12f562 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -94,16 +94,18 @@ int ide_queue_pc_tail(ide_drive_t *drive, struct gendisk *disk, rq->special = (char *)pc; if (pc->req_xfer) { - rq->data = pc->buf; - rq->data_len = pc->req_xfer; + error = blk_rq_map_kern(drive->queue, rq, pc->buf, pc->req_xfer, + GFP_NOIO); + if (error) + goto put_req; } memcpy(rq->cmd, pc->c, 12); if (drive->media == ide_tape) rq->cmd[13] = REQ_IDETAPE_PC1; error = blk_execute_rq(drive->queue, disk, rq, 0); +put_req: blk_put_request(rq); - return error; } EXPORT_SYMBOL_GPL(ide_queue_pc_tail); @@ -168,6 +170,7 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) struct request_sense *sense = &drive->sense_data; struct request *sense_rq = &drive->sense_rq; unsigned int cmd_len, sense_len; + int err; debug_log("%s: enter\n", __func__); @@ -193,13 +196,19 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) memset(sense, 0, sizeof(*sense)); blk_rq_init(rq->q, sense_rq); - sense_rq->rq_disk = rq->rq_disk; - sense_rq->data = sense; + err = blk_rq_map_kern(drive->queue, sense_rq, sense, sense_len, + GFP_NOIO); + if (unlikely(err)) { + if (printk_ratelimit()) + printk(KERN_WARNING "%s: failed to map sense buffer\n", + drive->name); + return; + } + + sense_rq->rq_disk = rq->rq_disk; sense_rq->cmd[0] = GPCMD_REQUEST_SENSE; sense_rq->cmd[4] = cmd_len; - sense_rq->data_len = sense_len; - sense_rq->cmd_type = REQ_TYPE_SENSE; sense_rq->cmd_flags |= REQ_PREEMPT; @@ -210,9 +219,14 @@ void ide_prep_sense(ide_drive_t *drive, struct request *rq) } EXPORT_SYMBOL_GPL(ide_prep_sense); -void ide_queue_sense_rq(ide_drive_t *drive, void *special) +int ide_queue_sense_rq(ide_drive_t *drive, void *special) { - BUG_ON(!drive->sense_rq_armed); + /* deferred failure from ide_prep_sense() */ + if (!drive->sense_rq_armed) { + printk(KERN_WARNING "%s: failed queue sense request\n", + drive->name); + return -ENOMEM; + } drive->sense_rq.special = special; drive->sense_rq_armed = false; @@ -221,6 +235,7 @@ void ide_queue_sense_rq(ide_drive_t *drive, void *special) elv_add_request(drive->queue, &drive->sense_rq, ELEVATOR_INSERT_FRONT, 0); + return 0; } EXPORT_SYMBOL_GPL(ide_queue_sense_rq); @@ -239,13 +254,14 @@ void ide_retry_pc(ide_drive_t *drive) /* init pc from sense_rq */ ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); - pc->buf = sense_rq->data; + pc->buf = bio_data(sense_rq->bio); /* pointer to mapped address */ pc->req_xfer = sense_rq->data_len; if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - ide_queue_sense_rq(drive, pc); + if (ide_queue_sense_rq(drive, pc)) + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); } EXPORT_SYMBOL_GPL(ide_retry_pc); @@ -317,7 +333,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) struct ide_cmd *cmd = &hwif->cmd; struct request *rq = hwif->rq; const struct ide_tp_ops *tp_ops = hwif->tp_ops; - xfer_func_t *xferfunc; unsigned int timeout, done; u16 bcount; u8 stat, ireason, dsc = 0; @@ -411,7 +426,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape) + if (drive->media == ide_tape && !rq->bio) done = ide_rq_bytes(rq); /* FIXME */ else done = blk_rq_bytes(rq); @@ -448,16 +463,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - xferfunc = write ? tp_ops->output_data : tp_ops->input_data; - - if (drive->media == ide_floppy && pc->buf == NULL) { + if (drive->media == ide_tape && pc->bh) + done = drive->pc_io_buffers(drive, pc, bcount, write); + else { done = min_t(unsigned int, bcount, cmd->nleft); ide_pio_bytes(drive, cmd, write, done); - } else if (drive->media == ide_tape && pc->bh) { - done = drive->pc_io_buffers(drive, pc, bcount, write); - } else { - done = min_t(unsigned int, bcount, pc->req_xfer - pc->xferred); - xferfunc(drive, NULL, pc->cur_pos, done); } /* Update the current position */ diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 061d7bbcd34..673628790f1 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -210,10 +210,12 @@ static void ide_cd_complete_failed_rq(ide_drive_t *drive, struct request *rq) { /* * For REQ_TYPE_SENSE, "rq->special" points to the original - * failed request + * failed request. Also, the sense data should be read + * directly from rq which might be different from the original + * sense buffer if it got copied during mapping. */ struct request *failed = (struct request *)rq->special; - struct request_sense *sense = &drive->sense_data; + void *sense = bio_data(rq->bio); if (failed) { if (failed->sense) { @@ -398,7 +400,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) /* if we got a CHECK_CONDITION status, queue a request sense command */ if (stat & ATA_ERR) - ide_queue_sense_rq(drive, NULL); + return ide_queue_sense_rq(drive, NULL) ? 2 : 1; return 1; end_request: @@ -412,8 +414,7 @@ end_request: hwif->rq = NULL; - ide_queue_sense_rq(drive, rq); - return 1; + return ide_queue_sense_rq(drive, rq) ? 2 : 1; } else return 2; } @@ -507,8 +508,12 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, rq->cmd_flags |= cmd_flags; rq->timeout = timeout; if (buffer) { - rq->data = buffer; - rq->data_len = *bufflen; + error = blk_rq_map_kern(drive->queue, rq, buffer, + *bufflen, GFP_NOIO); + if (error) { + blk_put_request(rq); + return error; + } } error = blk_execute_rq(drive->queue, info->disk, rq, 0); @@ -802,15 +807,10 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) drive->dma = 0; /* sg request */ - if (rq->bio || ((rq->cmd_type == REQ_TYPE_ATA_PC) && rq->data_len)) { + if (rq->bio) { struct request_queue *q = drive->queue; + char *buf = bio_data(rq->bio); unsigned int alignment; - char *buf; - - if (rq->bio) - buf = bio_data(rq->bio); - else - buf = rq->data; drive->dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 9b9e8b1aae5..3245c2dbda3 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -481,6 +481,9 @@ void do_ide_request(struct request_queue *q) spin_unlock_irq(q->queue_lock); + /* HLD do_request() callback might sleep, make sure it's okay */ + might_sleep(); + if (ide_lock_host(host, hwif)) goto plug_device_2; diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8324dfa78a3..9b762a2d5d9 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -850,6 +850,9 @@ out: cmd.rq = rq; + ide_init_sg_cmd(&cmd, pc->req_xfer); + ide_map_sg(drive, &cmd); + return ide_tape_issue_pc(drive, &cmd, pc); } diff --git a/include/linux/ide.h b/include/linux/ide.h index 9e67ccac3c1..1957461ac76 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1183,7 +1183,7 @@ void ide_create_request_sense_cmd(ide_drive_t *, struct ide_atapi_pc *); void ide_retry_pc(ide_drive_t *drive); void ide_prep_sense(ide_drive_t *drive, struct request *rq); -void ide_queue_sense_rq(ide_drive_t *drive, void *special); +int ide_queue_sense_rq(ide_drive_t *drive, void *special); int ide_cd_expiry(ide_drive_t *); From 765139ef5f1a4b1d5cb1f1a7a12de7ee61f6500f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 07:00:43 +0900 Subject: [PATCH 0591/2061] ide-pm: don't abuse rq->data Impact: cleanup rq->data usage ide-pm uses rq->data to carry pointer to struct request_pm_state through request queue and rq->special is used to carray pointer to local struct ide_cmd, which isn't necessary. Use rq->special for request_pm_state instead and use local ide_cmd in ide_start_power_step(). Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-pm.c | 38 +++++++++++++++----------------------- 2 files changed, 16 insertions(+), 24 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 3245c2dbda3..6e3094e2277 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -368,7 +368,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) if (rq->cmd_type == REQ_TYPE_ATA_TASKFILE) return execute_drive_cmd(drive, rq); else if (blk_pm_request(rq)) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; #ifdef DEBUG_PM printk("%s: start_power_step(step: %d)\n", drive->name, pm->pm_step); diff --git a/drivers/ide/ide-pm.c b/drivers/ide/ide-pm.c index 0d8a151c0a0..ba1488bd843 100644 --- a/drivers/ide/ide-pm.c +++ b/drivers/ide/ide-pm.c @@ -7,7 +7,6 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - struct ide_cmd cmd; int ret; /* call ACPI _GTM only once */ @@ -15,11 +14,9 @@ int generic_ide_suspend(struct device *dev, pm_message_t mesg) ide_acpi_get_timing(hwif); memset(&rqpm, 0, sizeof(rqpm)); - memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_SUSPEND; - rq->special = &cmd; - rq->data = &rqpm; + rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_SUSPEND; if (mesg.event == PM_EVENT_PRETHAW) mesg.event = PM_EVENT_FREEZE; @@ -41,7 +38,6 @@ int generic_ide_resume(struct device *dev) ide_hwif_t *hwif = drive->hwif; struct request *rq; struct request_pm_state rqpm; - struct ide_cmd cmd; int err; /* call ACPI _PS0 / _STM only once */ @@ -53,12 +49,10 @@ int generic_ide_resume(struct device *dev) ide_acpi_exec_tfs(drive); memset(&rqpm, 0, sizeof(rqpm)); - memset(&cmd, 0, sizeof(cmd)); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_PM_RESUME; rq->cmd_flags |= REQ_PREEMPT; - rq->special = &cmd; - rq->data = &rqpm; + rq->special = &rqpm; rqpm.pm_step = IDE_PM_START_RESUME; rqpm.pm_state = PM_EVENT_ON; @@ -77,7 +71,7 @@ int generic_ide_resume(struct device *dev) void ide_complete_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; #ifdef DEBUG_PM printk(KERN_INFO "%s: complete_power_step(step: %d)\n", @@ -107,10 +101,8 @@ void ide_complete_power_step(ide_drive_t *drive, struct request *rq) ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; - struct ide_cmd *cmd = rq->special; - - memset(cmd, 0, sizeof(*cmd)); + struct request_pm_state *pm = rq->special; + struct ide_cmd cmd = { }; switch (pm->pm_step) { case IDE_PM_FLUSH_CACHE: /* Suspend step 1 (flush cache) */ @@ -123,12 +115,12 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; } if (ata_id_flush_ext_enabled(drive->id)) - cmd->tf.command = ATA_CMD_FLUSH_EXT; + cmd.tf.command = ATA_CMD_FLUSH_EXT; else - cmd->tf.command = ATA_CMD_FLUSH; + cmd.tf.command = ATA_CMD_FLUSH; goto out_do_tf; case IDE_PM_STANDBY: /* Suspend step 2 (standby) */ - cmd->tf.command = ATA_CMD_STANDBYNOW1; + cmd.tf.command = ATA_CMD_STANDBYNOW1; goto out_do_tf; case IDE_PM_RESTORE_PIO: /* Resume step 1 (restore PIO) */ ide_set_max_pio(drive); @@ -141,7 +133,7 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) ide_complete_power_step(drive, rq); return ide_stopped; case IDE_PM_IDLE: /* Resume step 2 (idle) */ - cmd->tf.command = ATA_CMD_IDLEIMMEDIATE; + cmd.tf.command = ATA_CMD_IDLEIMMEDIATE; goto out_do_tf; case IDE_PM_RESTORE_DMA: /* Resume step 3 (restore DMA) */ /* @@ -163,11 +155,11 @@ ide_startstop_t ide_start_power_step(ide_drive_t *drive, struct request *rq) return ide_stopped; out_do_tf: - cmd->valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; - cmd->valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; - cmd->protocol = ATA_PROT_NODATA; + cmd.valid.out.tf = IDE_VALID_OUT_TF | IDE_VALID_DEVICE; + cmd.valid.in.tf = IDE_VALID_IN_TF | IDE_VALID_DEVICE; + cmd.protocol = ATA_PROT_NODATA; - return do_rw_taskfile(drive, cmd); + return do_rw_taskfile(drive, &cmd); } /** @@ -181,7 +173,7 @@ out_do_tf: void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; unsigned long flags; ide_complete_power_step(drive, rq); @@ -207,7 +199,7 @@ void ide_complete_pm_rq(ide_drive_t *drive, struct request *rq) void ide_check_pm_state(ide_drive_t *drive, struct request *rq) { - struct request_pm_state *pm = rq->data; + struct request_pm_state *pm = rq->special; if (blk_pm_suspend_request(rq) && pm->pm_step == IDE_PM_START_SUSPEND) From 08f370f0a2fb223bf48d0bfa2a173be0393c19dc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0592/2061] ide-tape,floppy: fix failed command completion after request sense Impact: fix infinite retry loop After a command failed, ide-tape and floppy inserts REQUEST_SENSE in front of the failed command and according to the result, sets pc->retries, flags and errors. After REQUEST_SENSE is complete, the failed command is again at the front of the queue and if the verdict was to terminate the request, the issue functions tries to complete it directly by calling drive->pc_callback() and returning ide_stopped. However, drive->pc_callback() doesn't complete a request. It only prepares for completion of the request. As a result, this creates an infinite loop where the failed request is retried perpetually. Fix it by actually ending the request by calling ide_complete_rq(). Signed-off-by: Tejun Heo --- drivers/ide/ide-floppy.c | 1 + drivers/ide/ide-tape.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index d3302cc891e..d20704ac318 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -141,6 +141,7 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->failed_pc = NULL; drive->pc_callback(drive, 0); + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 9b762a2d5d9..2b9a13671c5 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -643,6 +643,7 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, } drive->failed_pc = NULL; drive->pc_callback(drive, 0); + ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); From eb6a61bb9543aa54d62595e27206b3d3c0293bcc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0593/2061] ide-atapi,tape,floppy: allow ->pc_callback() to change rq->data_len Impact: allow residual count implementation in ->pc_callback() rq->data_len has two duties - carrying the number of input bytes on issue and carrying residual count back to the issuer on completion. ide-atapi completion callback ->pc_callback() is the right place to do this but currently ide-atapi depends on rq->data_len carrying the original request size after calling ->pc_callback() to complete the pc request. This patch makes ide_pc_intr(), ide_tape_issue_pc() and ide_floppy_issue_pc() cache length to complete before calling ->pc_callback() so that it can modify rq->data_len as necessary. Note: As using rq->data_len for two purposes can make cases like this incorrect in subtle ways, future changes will introduce separate field for residual count. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 16 ++++++++++------ drivers/ide/ide-floppy.c | 5 ++++- drivers/ide/ide-tape.c | 5 ++++- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 5cefe12f562..3df5442de71 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -409,6 +409,16 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0) dsc = 1; + /* + * ->pc_callback() might change rq->data_len for + * residual count, cache total length. + */ + if (!blk_special_request(rq) && + (drive->media == ide_tape && !rq->bio)) + done = ide_rq_bytes(rq); /* FIXME */ + else + done = blk_rq_bytes(rq); + /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); @@ -417,7 +427,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if (blk_special_request(rq)) { rq->errors = 0; - done = blk_rq_bytes(rq); error = 0; } else { @@ -426,11 +435,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) rq->errors = -EIO; } - if (drive->media == ide_tape && !rq->bio) - done = ide_rq_bytes(rq); /* FIXME */ - else - done = blk_rq_bytes(rq); - error = uptodate ? 0 : -EIO; } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index d20704ac318..537b7c55803 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -134,14 +134,17 @@ static ide_startstop_t ide_floppy_issue_pc(ide_drive_t *drive, drive->pc = pc; if (pc->retries > IDEFLOPPY_MAX_PC_RETRIES) { + unsigned int done = blk_rq_bytes(drive->hwif->rq); + if (!(pc->flags & PC_FLAG_SUPPRESS_ERROR)) ide_floppy_report_error(floppy, pc); + /* Giving up */ pc->error = IDE_DRV_ERROR_GENERAL; drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); + ide_complete_rq(drive, -EIO, done); return ide_stopped; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2b9a13671c5..8226d52504d 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -622,6 +622,8 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, if (pc->retries > IDETAPE_MAX_PC_RETRIES || (pc->flags & PC_FLAG_ABORT)) { + unsigned int done = blk_rq_bytes(drive->hwif->rq); + /* * We will "abort" retrying a packet command in case legitimate * error code was received (crossing a filemark, or end of the @@ -641,9 +643,10 @@ static ide_startstop_t ide_tape_issue_pc(ide_drive_t *drive, /* Giving up */ pc->error = IDE_DRV_ERROR_GENERAL; } + drive->failed_pc = NULL; drive->pc_callback(drive, 0); - ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); + ide_complete_rq(drive, -EIO, done); return ide_stopped; } debug_log(DBG_SENSE, "Retry #%d, cmd = %02X\n", pc->retries, pc->c[0]); From 7b13354eeaabaf6283b8c669a7d67d104ce7c638 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0594/2061] ide-tape: use single continuous buffer Impact: simpler buffer allocation and handling, kills OOM, fix DMA transfers ide-tape has its own multiple buffer mechanism using struct idetape_bh. It allocates buffer with decreasing order-of-two allocations so that it results in minimum number of segments. However, the implementation is quite complex and works in a way that no other block or ide driver works necessitating a lot of special case handling. The benefit this complex allocation scheme brings is questionable as PIO or DMA the number of segments (16 maximum) doesn't make any noticeable difference and it also doesn't negate the need for multiple order allocation which can fail under memory pressure or high fragmentation although it does lower the highest order necessary by one when the buffer size isn't power of two. As the first step to remove the custom buffer management, this patch makes ide-tape allocate single continous buffer. The maximum order is four. I doubt the change would cause any trouble but if it ever matters, it should be converted to regular sg mechanism like everyone else and even in that case dropping custom buffer handling and moving to standard mechanism first make sense as an intermediate step. This patch makes the first bh to contain the whole buffer and drops multi bh handling code. Following patches will make further changes. This patch has the side effect of killing OOM triggered by allocation path and fixing DMA transfers. Previously, bug in alloc path triggered OOM on command issue and commands were passed to DMA engine without DMA-mapping all the segments. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 254 +++++++++-------------------------------- 1 file changed, 57 insertions(+), 197 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8226d52504d..b2afbc7bcb6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -134,7 +134,6 @@ enum { struct idetape_bh { u32 b_size; atomic_t b_count; - struct idetape_bh *b_reqnext; char *b_data; }; @@ -228,10 +227,6 @@ typedef struct ide_tape_obj { char *b_data; int b_count; - int pages_per_buffer; - /* Wasted space in each stage */ - int excess_bh_size; - /* Measures average tape speed */ unsigned long avg_time; int avg_size; @@ -303,9 +298,7 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, struct idetape_bh *bh = pc->bh; int count; - while (bcount) { - if (bh == NULL) - break; + if (bcount && bh) { count = min( (unsigned int)(bh->b_size - atomic_read(&bh->b_count)), bcount); @@ -313,15 +306,10 @@ static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, atomic_read(&bh->b_count), count); bcount -= count; atomic_add(count, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) { - bh = bh->b_reqnext; - if (bh) - atomic_set(&bh->b_count, 0); - } + if (atomic_read(&bh->b_count) == bh->b_size) + pc->bh = NULL; } - pc->bh = bh; - return bcount; } @@ -331,22 +319,14 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, struct idetape_bh *bh = pc->bh; int count; - while (bcount) { - if (bh == NULL) - break; + if (bcount && bh) { count = min((unsigned int)pc->b_count, (unsigned int)bcount); drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count); bcount -= count; pc->b_data += count; pc->b_count -= count; - if (!pc->b_count) { - bh = bh->b_reqnext; - pc->bh = bh; - if (bh) { - pc->b_data = bh->b_data; - pc->b_count = atomic_read(&bh->b_count); - } - } + if (!pc->b_count) + pc->bh = NULL; } return bcount; @@ -355,24 +335,20 @@ static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) { struct idetape_bh *bh = pc->bh; - int count; unsigned int bcount = pc->xferred; if (pc->flags & PC_FLAG_WRITING) return; - while (bcount) { - if (bh == NULL) { + if (bcount) { + if (bh == NULL || bcount > bh->b_size) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return; } - count = min((unsigned int)bh->b_size, (unsigned int)bcount); - atomic_set(&bh->b_count, count); + atomic_set(&bh->b_count, bcount); if (atomic_read(&bh->b_count) == bh->b_size) - bh = bh->b_reqnext; - bcount -= count; + pc->bh = NULL; } - pc->bh = bh; } /* @@ -439,24 +415,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) /* Free data buffers completely. */ static void ide_tape_kfree_buffer(idetape_tape_t *tape) { - struct idetape_bh *prev_bh, *bh = tape->merge_bh; + struct idetape_bh *bh = tape->merge_bh; - while (bh) { - u32 size = bh->b_size; - - while (size) { - unsigned int order = fls(size >> PAGE_SHIFT)-1; - - if (bh->b_data) - free_pages((unsigned long)bh->b_data, order); - - size &= (order-1); - bh->b_data += (1 << order) * PAGE_SIZE; - } - prev_bh = bh; - bh = bh->b_reqnext; - kfree(prev_bh); - } + kfree(bh->b_data); + kfree(bh); } static void ide_tape_handle_dsc(ide_drive_t *); @@ -861,117 +823,50 @@ out: } /* - * The function below uses __get_free_pages to allocate a data buffer of size - * tape->buffer_size (or a bit more). We attempt to combine sequential pages as - * much as possible. - * - * It returns a pointer to the newly allocated buffer, or NULL in case of - * failure. + * It returns a pointer to the newly allocated buffer, or NULL in case + * of failure. */ static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape, - int full, int clear) + int full) { - struct idetape_bh *prev_bh, *bh, *merge_bh; - int pages = tape->pages_per_buffer; - unsigned int order, b_allocd; - char *b_data = NULL; + struct idetape_bh *bh; - merge_bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - bh = merge_bh; - if (bh == NULL) - goto abort; + bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); + if (!bh) + return NULL; - order = fls(pages) - 1; - bh->b_data = (char *) __get_free_pages(GFP_KERNEL, order); - if (!bh->b_data) - goto abort; - b_allocd = (1 << order) * PAGE_SIZE; - pages &= (order-1); - - if (clear) - memset(bh->b_data, 0, b_allocd); - bh->b_reqnext = NULL; - bh->b_size = b_allocd; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - - while (pages) { - order = fls(pages) - 1; - b_data = (char *) __get_free_pages(GFP_KERNEL, order); - if (!b_data) - goto abort; - b_allocd = (1 << order) * PAGE_SIZE; - - if (clear) - memset(b_data, 0, b_allocd); - - /* newly allocated page frames below buffer header or ...*/ - if (bh->b_data == b_data + b_allocd) { - bh->b_size += b_allocd; - bh->b_data -= b_allocd; - if (full) - atomic_add(b_allocd, &bh->b_count); - continue; - } - /* they are above the header */ - if (b_data == bh->b_data + bh->b_size) { - bh->b_size += b_allocd; - if (full) - atomic_add(b_allocd, &bh->b_count); - continue; - } - prev_bh = bh; - bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - if (!bh) { - free_pages((unsigned long) b_data, order); - goto abort; - } - bh->b_reqnext = NULL; - bh->b_data = b_data; - bh->b_size = b_allocd; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - prev_bh->b_reqnext = bh; - - pages &= (order-1); + bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!bh->b_data) { + kfree(bh); + return NULL; } - bh->b_size -= tape->excess_bh_size; - if (full) - atomic_sub(tape->excess_bh_size, &bh->b_count); - return merge_bh; -abort: - ide_tape_kfree_buffer(tape); - return NULL; + bh->b_size = tape->buffer_size; + atomic_set(&bh->b_count, full ? bh->b_size : 0); + + return bh; } static int idetape_copy_stage_from_user(idetape_tape_t *tape, const char __user *buf, int n) { struct idetape_bh *bh = tape->bh; - int count; int ret = 0; - while (n) { - if (bh == NULL) { + if (n) { + if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return 1; } - count = min((unsigned int) - (bh->b_size - atomic_read(&bh->b_count)), - (unsigned int)n); if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf, - count)) + n)) ret = 1; - n -= count; - atomic_add(count, &bh->b_count); - buf += count; - if (atomic_read(&bh->b_count) == bh->b_size) { - bh = bh->b_reqnext; - if (bh) - atomic_set(&bh->b_count, 0); - } + atomic_add(n, &bh->b_count); + if (atomic_read(&bh->b_count) == bh->b_size) + tape->bh = NULL; } - tape->bh = bh; + return ret; } @@ -979,30 +874,20 @@ static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf, int n) { struct idetape_bh *bh = tape->bh; - int count; int ret = 0; - while (n) { - if (bh == NULL) { + if (n) { + if (bh == NULL || n > tape->b_count) { printk(KERN_ERR "ide-tape: bh == NULL in %s\n", __func__); return 1; } - count = min(tape->b_count, n); - if (copy_to_user(buf, tape->b_data, count)) + if (copy_to_user(buf, tape->b_data, n)) ret = 1; - n -= count; - tape->b_data += count; - tape->b_count -= count; - buf += count; - if (!tape->b_count) { - bh = bh->b_reqnext; - tape->bh = bh; - if (bh) { - tape->b_data = bh->b_data; - tape->b_count = atomic_read(&bh->b_count); - } - } + tape->b_data += n; + tape->b_count -= n; + if (!tape->b_count) + tape->bh = NULL; } return ret; } @@ -1254,7 +1139,7 @@ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; - int blocks, min; + int blocks; struct idetape_bh *bh; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { @@ -1269,31 +1154,16 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) if (tape->merge_bh_size) { blocks = tape->merge_bh_size / tape->blk_size; if (tape->merge_bh_size % tape->blk_size) { - unsigned int i; - + unsigned int i = tape->blk_size - + tape->merge_bh_size % tape->blk_size; blocks++; - i = tape->blk_size - tape->merge_bh_size % - tape->blk_size; - bh = tape->bh->b_reqnext; - while (bh) { - atomic_set(&bh->b_count, 0); - bh = bh->b_reqnext; - } bh = tape->bh; - while (i) { - if (bh == NULL) { - printk(KERN_INFO "ide-tape: bug," - " bh NULL\n"); - break; - } - min = min(i, (unsigned int)(bh->b_size - - atomic_read(&bh->b_count))); + if (bh) { memset(bh->b_data + atomic_read(&bh->b_count), - 0, min); - atomic_add(min, &bh->b_count); - i -= min; - bh = bh->b_reqnext; - } + 0, i); + atomic_add(i, &bh->b_count); + } else + printk(KERN_INFO "ide-tape: bug, bh NULL\n"); } (void) idetape_add_chrdev_write_request(drive, blocks); tape->merge_bh_size = 0; @@ -1321,7 +1191,7 @@ static int idetape_init_read(ide_drive_t *drive) " 0 now\n"); tape->merge_bh_size = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); if (!tape->merge_bh) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_READ; @@ -1368,23 +1238,18 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; - struct idetape_bh *bh; + struct idetape_bh *bh = tape->merge_bh; int blocks; while (bcount) { unsigned int count; - bh = tape->merge_bh; count = min(tape->buffer_size, bcount); bcount -= count; blocks = count / tape->blk_size; - while (count) { - atomic_set(&bh->b_count, - min(count, (unsigned int)bh->b_size)); - memset(bh->b_data, 0, atomic_read(&bh->b_count)); - count -= atomic_read(&bh->b_count); - bh = bh->b_reqnext; - } + atomic_set(&bh->b_count, count); + memset(bh->b_data, 0, atomic_read(&bh->b_count)); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, tape->merge_bh); } @@ -1596,7 +1461,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, "should be 0 now\n"); tape->merge_bh_size = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); if (!tape->merge_bh) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_WRITE; @@ -1970,7 +1835,7 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor) idetape_tape_t *tape = drive->driver_data; ide_tape_flush_merge_buffer(drive); - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1, 0); + tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1); if (tape->merge_bh != NULL) { idetape_pad_zeros(drive, tape->blk_size * (tape->user_bs_factor - 1)); @@ -2201,11 +2066,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) tape->buffer_size = *ctl * tape->blk_size; } buffer_size = tape->buffer_size; - tape->pages_per_buffer = buffer_size / PAGE_SIZE; - if (buffer_size % PAGE_SIZE) { - tape->pages_per_buffer++; - tape->excess_bh_size = PAGE_SIZE - buffer_size % PAGE_SIZE; - } /* select the "best" DSC read/write polling freq */ speed = max(*(u16 *)&tape->caps[14], *(u16 *)&tape->caps[8]); From e998f30b45efb99a3c3ce7b5483f76317a17abed Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0595/2061] ide-tape: use standard data transfer mechanism Impact: use standard way to transfer data ide-tape uses rq in an interesting way. For r/w requests, rq->special is used to carry a private buffer management structure idetape_bh and rq->nr_sectors and current_nr_sectors are initialized to the number of idetape blocks which isn't necessary 512 bytes. Also, rq->current_nr_sectors is used to report back the residual count in units of idetape blocks. This peculiarity taxes both block layer and ide. ide-atapi has different paths and hooks to accomodate it and what a rq means becomes quite confusing and making changes at the block layer becomes quite difficult and error-prone. This patch makes ide-tape use bio instead. With the previous patch, ide-tape currently is using single contiguos buffer so replacing it isn't difficult. Data buffer is mapped into bio using blk_rq_map_kern() in idetape_queue_rw_tail(). idetape_io_buffers() and idetape_update_buffers() are dropped and pc->bh is set to null to tell ide-atapi to use standard data transfer mechanism and idetape_bh byte counts are updated by the issuer on completion using the residual count. This change also nicely removes the FIXME in ide_pc_intr() where ide-tape rqs need to be completed using ide_rq_bytes() instead of blk_rq_bytes() (although this didn't really matter as the request didn't have bio). Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-atapi.c | 6 +-- drivers/ide/ide-tape.c | 112 +++++++++------------------------------- 2 files changed, 24 insertions(+), 94 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 3df5442de71..b9dd4503cbc 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -413,11 +413,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) * ->pc_callback() might change rq->data_len for * residual count, cache total length. */ - if (!blk_special_request(rq) && - (drive->media == ide_tape && !rq->bio)) - done = ide_rq_bytes(rq); /* FIXME */ - else - done = blk_rq_bytes(rq); + done = blk_rq_bytes(rq); /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index b2afbc7bcb6..b373ac87635 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -292,65 +292,6 @@ static struct ide_tape_obj *ide_tape_chrdev_get(unsigned int i) return tape; } -static int idetape_input_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount) -{ - struct idetape_bh *bh = pc->bh; - int count; - - if (bcount && bh) { - count = min( - (unsigned int)(bh->b_size - atomic_read(&bh->b_count)), - bcount); - drive->hwif->tp_ops->input_data(drive, NULL, bh->b_data + - atomic_read(&bh->b_count), count); - bcount -= count; - atomic_add(count, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) - pc->bh = NULL; - } - - return bcount; -} - -static int idetape_output_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount) -{ - struct idetape_bh *bh = pc->bh; - int count; - - if (bcount && bh) { - count = min((unsigned int)pc->b_count, (unsigned int)bcount); - drive->hwif->tp_ops->output_data(drive, NULL, pc->b_data, count); - bcount -= count; - pc->b_data += count; - pc->b_count -= count; - if (!pc->b_count) - pc->bh = NULL; - } - - return bcount; -} - -static void idetape_update_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc) -{ - struct idetape_bh *bh = pc->bh; - unsigned int bcount = pc->xferred; - - if (pc->flags & PC_FLAG_WRITING) - return; - if (bcount) { - if (bh == NULL || bcount > bh->b_size) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return; - } - atomic_set(&bh->b_count, bcount); - if (atomic_read(&bh->b_count) == bh->b_size) - pc->bh = NULL; - } -} - /* * called on each failed packet command retry to analyze the request sense. We * currently do not utilize this information. @@ -368,12 +309,10 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) pc->c[0], tape->sense_key, tape->asc, tape->ascq); /* Correct pc->xferred by asking the tape. */ - if (pc->flags & PC_FLAG_DMA_ERROR) { + if (pc->flags & PC_FLAG_DMA_ERROR) pc->xferred = pc->req_xfer - tape->blk_size * get_unaligned_be32(&sense[3]); - idetape_update_buffers(drive, pc); - } /* * If error was the result of a zero-length read or write command, @@ -458,7 +397,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->current_nr_sectors -= blocks; + rq->data_len -= blocks * tape->blk_size; if (pc->error) { uptodate = 0; @@ -520,19 +459,6 @@ static void ide_tape_handle_dsc(ide_drive_t *drive) idetape_postpone_request(drive); } -static int ide_tape_io_buffers(ide_drive_t *drive, struct ide_atapi_pc *pc, - unsigned int bcount, int write) -{ - unsigned int bleft; - - if (write) - bleft = idetape_output_buffers(drive, pc, bcount); - else - bleft = idetape_input_buffers(drive, pc, bcount); - - return bcount - bleft; -} - /* * Packet Command Interface * @@ -683,7 +609,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = bh; + pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; @@ -1063,10 +989,12 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, struct idetape_bh *bh) { idetape_tape_t *tape = drive->driver_data; + size_t size = blocks * tape->blk_size; struct request *rq; - int ret, errors; + int ret; debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd); + BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; @@ -1074,21 +1002,29 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, rq->rq_disk = tape->disk; rq->special = (void *)bh; rq->sector = tape->first_frame; - rq->nr_sectors = blocks; - rq->current_nr_sectors = blocks; + + if (size) { + ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size, + __GFP_WAIT); + if (ret) + goto out_put; + } + blk_execute_rq(drive->queue, tape->disk, rq, 0); - errors = rq->errors; - ret = tape->blk_size * (blocks - rq->current_nr_sectors); - blk_put_request(rq); + /* calculate the number of transferred bytes and update bh */ + size -= rq->data_len; + if (cmd == REQ_IDETAPE_READ) + atomic_add(size, &bh->b_count); - if ((cmd & (REQ_IDETAPE_READ | REQ_IDETAPE_WRITE)) == 0) - return 0; + ret = size; + if (rq->errors == IDE_DRV_ERROR_GENERAL) + ret = -EIO; if (tape->merge_bh) idetape_init_merge_buffer(tape); - if (errors == IDE_DRV_ERROR_GENERAL) - return -EIO; +out_put: + blk_put_request(rq); return ret; } @@ -2034,8 +1970,6 @@ static void idetape_setup(ide_drive_t *drive, idetape_tape_t *tape, int minor) u16 *ctl = (u16 *)&tape->caps[12]; drive->pc_callback = ide_tape_callback; - drive->pc_update_buffers = idetape_update_buffers; - drive->pc_io_buffers = ide_tape_io_buffers; drive->dev_flags |= IDE_DFLAG_DSC_OVERLAP; From 6cf3d545f7d71b183e5b89960d4cc850a42c410d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0596/2061] ide-tape: kill idetape_bh Impact: kill now unnecessary idetape_bh With everything using standard mechanisms, there is no need for idetape_bh anymore. Kill it and use tape->buf, cur and valid to describe data buffer instead. Changes worth mentioning are... * idetape_queue_rq_tail() now always queue tape->buf and and adjusts buffer state properly before completion. * idetape_pad_zeros() clears the buffer only once. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 305 ++++++++++++----------------------------- 1 file changed, 84 insertions(+), 221 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index b373ac87635..d2e9c09b521 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -131,12 +131,6 @@ enum { IDETAPE_DIR_WRITE = (1 << 2), }; -struct idetape_bh { - u32 b_size; - atomic_t b_count; - char *b_data; -}; - /* Tape door status */ #define DOOR_UNLOCKED 0 #define DOOR_LOCKED 1 @@ -218,14 +212,12 @@ typedef struct ide_tape_obj { /* Data buffer size chosen based on the tape's recommendation */ int buffer_size; - /* merge buffer */ - struct idetape_bh *merge_bh; - /* size of the merge buffer */ - int merge_bh_size; - /* pointer to current buffer head within the merge buffer */ - struct idetape_bh *bh; - char *b_data; - int b_count; + /* Staging buffer of buffer_size bytes */ + void *buf; + /* The read/write cursor */ + void *cur; + /* The number of valid bytes in buf */ + size_t valid; /* Measures average tape speed */ unsigned long avg_time; @@ -351,15 +343,6 @@ static void idetape_analyze_error(ide_drive_t *drive, u8 *sense) } } -/* Free data buffers completely. */ -static void ide_tape_kfree_buffer(idetape_tape_t *tape) -{ - struct idetape_bh *bh = tape->merge_bh; - - kfree(bh->b_data); - kfree(bh); -} - static void ide_tape_handle_dsc(ide_drive_t *); static int ide_tape_callback(ide_drive_t *drive, int dsc) @@ -603,8 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, struct ide_atapi_pc *pc, struct request *rq, u8 opcode) { - struct idetape_bh *bh = (struct idetape_bh *)rq->special; - unsigned int length = rq->current_nr_sectors; + unsigned int length = rq->nr_sectors; ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); @@ -616,14 +598,11 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, if (pc->req_xfer == tape->buffer_size) pc->flags |= PC_FLAG_DMA_OK; - if (opcode == READ_6) { + if (opcode == READ_6) pc->c[0] = READ_6; - atomic_set(&bh->b_count, 0); - } else if (opcode == WRITE_6) { + else if (opcode == WRITE_6) { pc->c[0] = WRITE_6; pc->flags |= PC_FLAG_WRITING; - pc->b_data = bh->b_data; - pc->b_count = atomic_read(&bh->b_count); } memcpy(rq->cmd, pc->c, 12); @@ -639,10 +618,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu," - " current_nr_sectors: %u\n", - (unsigned long long)rq->sector, rq->nr_sectors, - rq->current_nr_sectors); + debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n" + (unsigned long long)rq->sector, rq->nr_sectors); if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. */ @@ -748,89 +725,6 @@ out: return ide_tape_issue_pc(drive, &cmd, pc); } -/* - * It returns a pointer to the newly allocated buffer, or NULL in case - * of failure. - */ -static struct idetape_bh *ide_tape_kmalloc_buffer(idetape_tape_t *tape, - int full) -{ - struct idetape_bh *bh; - - bh = kmalloc(sizeof(struct idetape_bh), GFP_KERNEL); - if (!bh) - return NULL; - - bh->b_data = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!bh->b_data) { - kfree(bh); - return NULL; - } - - bh->b_size = tape->buffer_size; - atomic_set(&bh->b_count, full ? bh->b_size : 0); - - return bh; -} - -static int idetape_copy_stage_from_user(idetape_tape_t *tape, - const char __user *buf, int n) -{ - struct idetape_bh *bh = tape->bh; - int ret = 0; - - if (n) { - if (bh == NULL || n > bh->b_size - atomic_read(&bh->b_count)) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return 1; - } - if (copy_from_user(bh->b_data + atomic_read(&bh->b_count), buf, - n)) - ret = 1; - atomic_add(n, &bh->b_count); - if (atomic_read(&bh->b_count) == bh->b_size) - tape->bh = NULL; - } - - return ret; -} - -static int idetape_copy_stage_to_user(idetape_tape_t *tape, char __user *buf, - int n) -{ - struct idetape_bh *bh = tape->bh; - int ret = 0; - - if (n) { - if (bh == NULL || n > tape->b_count) { - printk(KERN_ERR "ide-tape: bh == NULL in %s\n", - __func__); - return 1; - } - if (copy_to_user(buf, tape->b_data, n)) - ret = 1; - tape->b_data += n; - tape->b_count -= n; - if (!tape->b_count) - tape->bh = NULL; - } - return ret; -} - -static void idetape_init_merge_buffer(idetape_tape_t *tape) -{ - struct idetape_bh *bh = tape->merge_bh; - tape->bh = tape->merge_bh; - - if (tape->chrdev_dir == IDETAPE_DIR_WRITE) - atomic_set(&bh->b_count, 0); - else { - tape->b_data = bh->b_data; - tape->b_count = atomic_read(&bh->b_count); - } -} - /* * Write a filemark if write_filemark=1. Flush the device buffers without * writing a filemark otherwise. @@ -928,10 +822,10 @@ static void __ide_tape_discard_merge_buffer(ide_drive_t *drive) return; clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags); - tape->merge_bh_size = 0; - if (tape->merge_bh != NULL) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + tape->valid = 0; + if (tape->buf != NULL) { + kfree(tape->buf); + tape->buf = NULL; } tape->chrdev_dir = IDETAPE_DIR_NONE; @@ -985,8 +879,7 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive, * Generate a read/write request for the block device interface and wait for it * to be serviced. */ -static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, - struct idetape_bh *bh) +static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks) { idetape_tape_t *tape = drive->driver_data; size_t size = blocks * tape->blk_size; @@ -1000,11 +893,10 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; - rq->special = (void *)bh; rq->sector = tape->first_frame; if (size) { - ret = blk_rq_map_kern(drive->queue, rq, bh->b_data, size, + ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, __GFP_WAIT); if (ret) goto out_put; @@ -1012,17 +904,17 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks, blk_execute_rq(drive->queue, tape->disk, rq, 0); - /* calculate the number of transferred bytes and update bh */ + /* calculate the number of transferred bytes and update buffer state */ size -= rq->data_len; + tape->cur = tape->buf; if (cmd == REQ_IDETAPE_READ) - atomic_add(size, &bh->b_count); + tape->valid = size; + else + tape->valid = 0; ret = size; if (rq->errors == IDE_DRV_ERROR_GENERAL) ret = -EIO; - - if (tape->merge_bh) - idetape_init_merge_buffer(tape); out_put: blk_put_request(rq); return ret; @@ -1064,49 +956,33 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) /* Queue up a character device originated write request. */ static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) { - idetape_tape_t *tape = drive->driver_data; - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, - blocks, tape->merge_bh); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks); } static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; int blocks; - struct idetape_bh *bh; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer" " but we are not writing.\n"); return; } - if (tape->merge_bh_size > tape->buffer_size) { - printk(KERN_ERR "ide-tape: bug: merge_buffer too big\n"); - tape->merge_bh_size = tape->buffer_size; - } - if (tape->merge_bh_size) { - blocks = tape->merge_bh_size / tape->blk_size; - if (tape->merge_bh_size % tape->blk_size) { - unsigned int i = tape->blk_size - - tape->merge_bh_size % tape->blk_size; + if (tape->buf) { + blocks = tape->valid / tape->blk_size; + if (tape->valid % tape->blk_size) { blocks++; - bh = tape->bh; - if (bh) { - memset(bh->b_data + atomic_read(&bh->b_count), - 0, i); - atomic_add(i, &bh->b_count); - } else - printk(KERN_INFO "ide-tape: bug, bh NULL\n"); + memset(tape->buf + tape->valid, 0, + tape->blk_size - tape->valid % tape->blk_size); } (void) idetape_add_chrdev_write_request(drive, blocks); - tape->merge_bh_size = 0; } - if (tape->merge_bh != NULL) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + if (tape->buf != NULL) { + kfree(tape->buf); + tape->buf = NULL; } tape->chrdev_dir = IDETAPE_DIR_NONE; } @@ -1122,15 +998,15 @@ static int idetape_init_read(ide_drive_t *drive) ide_tape_flush_merge_buffer(drive); idetape_flush_tape_buffers(drive); } - if (tape->merge_bh || tape->merge_bh_size) { - printk(KERN_ERR "ide-tape: merge_bh_size should be" - " 0 now\n"); - tape->merge_bh_size = 0; + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); - if (!tape->merge_bh) + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_READ; + tape->cur = tape->buf; /* * Issue a read 0 command to ensure that DSC handshake is @@ -1140,11 +1016,10 @@ static int idetape_init_read(ide_drive_t *drive) */ if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { bytes_read = idetape_queue_rw_tail(drive, - REQ_IDETAPE_READ, 0, - tape->merge_bh); + REQ_IDETAPE_READ, 0); if (bytes_read < 0) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; tape->chrdev_dir = IDETAPE_DIR_NONE; return bytes_read; } @@ -1157,8 +1032,6 @@ static int idetape_init_read(ide_drive_t *drive) /* called from idetape_chrdev_read() to service a chrdev read request. */ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) { - idetape_tape_t *tape = drive->driver_data; - debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks); /* If we are at a filemark, return a read length of 0 */ @@ -1167,27 +1040,21 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) idetape_init_read(drive); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks, - tape->merge_bh); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); } static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; - struct idetape_bh *bh = tape->merge_bh; - int blocks; + + memset(tape->buf, 0, tape->buffer_size); while (bcount) { - unsigned int count; + unsigned int count = min(tape->buffer_size, bcount); - count = min(tape->buffer_size, bcount); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, + count / tape->blk_size); bcount -= count; - blocks = count / tape->blk_size; - atomic_set(&bh->b_count, count); - memset(bh->b_data, 0, atomic_read(&bh->b_count)); - - idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks, - tape->merge_bh); } } @@ -1267,7 +1134,7 @@ static int idetape_space_over_filemarks(ide_drive_t *drive, short mt_op, } if (tape->chrdev_dir == IDETAPE_DIR_READ) { - tape->merge_bh_size = 0; + tape->valid = 0; if (test_and_clear_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) ++count; ide_tape_discard_merge_buffer(drive, 0); @@ -1333,20 +1200,20 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, return rc; if (count == 0) return (0); - if (tape->merge_bh_size) { - actually_read = min((unsigned int)(tape->merge_bh_size), - (unsigned int)count); - if (idetape_copy_stage_to_user(tape, buf, actually_read)) + if (tape->valid) { + actually_read = min_t(unsigned int, tape->valid, count); + if (copy_to_user(buf, tape->cur, actually_read)) ret = -EFAULT; buf += actually_read; - tape->merge_bh_size -= actually_read; count -= actually_read; + tape->cur += actually_read; + tape->valid -= actually_read; } while (count >= tape->buffer_size) { bytes_read = idetape_add_chrdev_read_request(drive, ctl); if (bytes_read <= 0) goto finish; - if (idetape_copy_stage_to_user(tape, buf, bytes_read)) + if (copy_to_user(buf, tape->cur, bytes_read)) ret = -EFAULT; buf += bytes_read; count -= bytes_read; @@ -1357,10 +1224,11 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, if (bytes_read <= 0) goto finish; temp = min((unsigned long)count, (unsigned long)bytes_read); - if (idetape_copy_stage_to_user(tape, buf, temp)) + if (copy_to_user(buf, tape->cur, temp)) ret = -EFAULT; actually_read += temp; - tape->merge_bh_size = bytes_read-temp; + tape->cur += temp; + tape->valid -= temp; } finish: if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { @@ -1392,16 +1260,15 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { if (tape->chrdev_dir == IDETAPE_DIR_READ) ide_tape_discard_merge_buffer(drive, 1); - if (tape->merge_bh || tape->merge_bh_size) { - printk(KERN_ERR "ide-tape: merge_bh_size " - "should be 0 now\n"); - tape->merge_bh_size = 0; + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; } - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 0); - if (!tape->merge_bh) + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) return -ENOMEM; tape->chrdev_dir = IDETAPE_DIR_WRITE; - idetape_init_merge_buffer(tape); + tape->cur = tape->buf; /* * Issue a write 0 command to ensure that DSC handshake is @@ -1411,11 +1278,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, */ if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { ssize_t retval = idetape_queue_rw_tail(drive, - REQ_IDETAPE_WRITE, 0, - tape->merge_bh); + REQ_IDETAPE_WRITE, 0); if (retval < 0) { - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; tape->chrdev_dir = IDETAPE_DIR_NONE; return retval; } @@ -1423,23 +1289,19 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } if (count == 0) return (0); - if (tape->merge_bh_size) { - if (tape->merge_bh_size >= tape->buffer_size) { - printk(KERN_ERR "ide-tape: bug: merge buf too big\n"); - tape->merge_bh_size = 0; - } - actually_written = min((unsigned int) - (tape->buffer_size - tape->merge_bh_size), - (unsigned int)count); - if (idetape_copy_stage_from_user(tape, buf, actually_written)) - ret = -EFAULT; + if (tape->valid < tape->buffer_size) { + actually_written = min_t(unsigned int, + tape->buffer_size - tape->valid, + count); + if (copy_from_user(tape->cur, buf, actually_written)) + ret = -EFAULT; buf += actually_written; - tape->merge_bh_size += actually_written; count -= actually_written; + tape->cur += actually_written; + tape->valid += actually_written; - if (tape->merge_bh_size == tape->buffer_size) { + if (tape->valid == tape->buffer_size) { ssize_t retval; - tape->merge_bh_size = 0; retval = idetape_add_chrdev_write_request(drive, ctl); if (retval <= 0) return (retval); @@ -1447,7 +1309,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } while (count >= tape->buffer_size) { ssize_t retval; - if (idetape_copy_stage_from_user(tape, buf, tape->buffer_size)) + if (copy_from_user(tape->cur, buf, tape->buffer_size)) ret = -EFAULT; buf += tape->buffer_size; count -= tape->buffer_size; @@ -1458,9 +1320,10 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, } if (count) { actually_written += count; - if (idetape_copy_stage_from_user(tape, buf, count)) + if (copy_from_user(tape->cur, buf, count)) ret = -EFAULT; - tape->merge_bh_size += count; + tape->cur += count; + tape->valid += count; } return ret ? ret : actually_written; } @@ -1623,7 +1486,7 @@ static int idetape_chrdev_ioctl(struct inode *inode, struct file *file, idetape_flush_tape_buffers(drive); } if (cmd == MTIOCGET || cmd == MTIOCPOS) { - block_offset = tape->merge_bh_size / + block_offset = tape->valid / (tape->blk_size * tape->user_bs_factor); position = idetape_read_position(drive); if (position < 0) @@ -1771,12 +1634,12 @@ static void idetape_write_release(ide_drive_t *drive, unsigned int minor) idetape_tape_t *tape = drive->driver_data; ide_tape_flush_merge_buffer(drive); - tape->merge_bh = ide_tape_kmalloc_buffer(tape, 1); - if (tape->merge_bh != NULL) { + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (tape->buf != NULL) { idetape_pad_zeros(drive, tape->blk_size * (tape->user_bs_factor - 1)); - ide_tape_kfree_buffer(tape); - tape->merge_bh = NULL; + kfree(tape->buf); + tape->buf = NULL; } idetape_write_filemark(drive); idetape_flush_tape_buffers(drive); @@ -2042,7 +1905,7 @@ static void ide_tape_release(struct device *dev) ide_drive_t *drive = tape->drive; struct gendisk *g = tape->disk; - BUG_ON(tape->merge_bh_size); + BUG_ON(tape->valid); drive->dev_flags &= ~IDE_DFLAG_DSC_OVERLAP; drive->driver_data = NULL; From 3596b66452491a3cff26256a5e6e6061a66c4142 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:02 +0900 Subject: [PATCH 0597/2061] ide-tape: unify r/w init paths Impact: cleanup Read and write init paths are almost identical. Unify them into idetape_init_rw(). Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 110 +++++++++++++++++------------------------ 1 file changed, 46 insertions(+), 64 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d2e9c09b521..4da7fa9bca1 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -987,42 +987,50 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) tape->chrdev_dir = IDETAPE_DIR_NONE; } -static int idetape_init_read(ide_drive_t *drive) +static int idetape_init_rw(ide_drive_t *drive, int dir) { idetape_tape_t *tape = drive->driver_data; - int bytes_read; + int rc; - /* Initialize read operation */ - if (tape->chrdev_dir != IDETAPE_DIR_READ) { - if (tape->chrdev_dir == IDETAPE_DIR_WRITE) { - ide_tape_flush_merge_buffer(drive); - idetape_flush_tape_buffers(drive); - } - if (tape->buf || tape->valid) { - printk(KERN_ERR "ide-tape: valid should be 0 now\n"); - tape->valid = 0; - } - tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!tape->buf) - return -ENOMEM; - tape->chrdev_dir = IDETAPE_DIR_READ; - tape->cur = tape->buf; + BUG_ON(dir != IDETAPE_DIR_READ && dir != IDETAPE_DIR_WRITE); - /* - * Issue a read 0 command to ensure that DSC handshake is - * switched from completion mode to buffer available mode. - * No point in issuing this if DSC overlap isn't supported, some - * drives (Seagate STT3401A) will return an error. - */ - if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { - bytes_read = idetape_queue_rw_tail(drive, - REQ_IDETAPE_READ, 0); - if (bytes_read < 0) { - kfree(tape->buf); - tape->buf = NULL; - tape->chrdev_dir = IDETAPE_DIR_NONE; - return bytes_read; - } + if (tape->chrdev_dir == dir) + return 0; + + if (tape->chrdev_dir == IDETAPE_DIR_READ) + ide_tape_discard_merge_buffer(drive, 1); + else if (tape->chrdev_dir == IDETAPE_DIR_WRITE) { + ide_tape_flush_merge_buffer(drive); + idetape_flush_tape_buffers(drive); + } + + if (tape->buf || tape->valid) { + printk(KERN_ERR "ide-tape: valid should be 0 now\n"); + tape->valid = 0; + } + + tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); + if (!tape->buf) + return -ENOMEM; + tape->chrdev_dir = dir; + tape->cur = tape->buf; + + /* + * Issue a 0 rw command to ensure that DSC handshake is + * switched from completion mode to buffer available mode. No + * point in issuing this if DSC overlap isn't supported, some + * drives (Seagate STT3401A) will return an error. + */ + if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { + int cmd = dir == IDETAPE_DIR_READ ? REQ_IDETAPE_READ + : REQ_IDETAPE_WRITE; + + rc = idetape_queue_rw_tail(drive, cmd, 0); + if (rc < 0) { + kfree(tape->buf); + tape->buf = NULL; + tape->chrdev_dir = IDETAPE_DIR_NONE; + return rc; } } @@ -1038,7 +1046,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) return 0; - idetape_init_read(drive); + idetape_init_rw(drive, IDETAPE_DIR_READ); return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); } @@ -1195,7 +1203,7 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, (count % tape->blk_size) == 0) tape->user_bs_factor = count / tape->blk_size; } - rc = idetape_init_read(drive); + rc = idetape_init_rw(drive, IDETAPE_DIR_READ); if (rc < 0) return rc; if (count == 0) @@ -1249,6 +1257,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ssize_t actually_written = 0; ssize_t ret = 0; u16 ctl = *(u16 *)&tape->caps[12]; + int rc; /* The drive is write protected. */ if (tape->write_prot) @@ -1257,36 +1266,9 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); /* Initialize write operation */ - if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { - if (tape->chrdev_dir == IDETAPE_DIR_READ) - ide_tape_discard_merge_buffer(drive, 1); - if (tape->buf || tape->valid) { - printk(KERN_ERR "ide-tape: valid should be 0 now\n"); - tape->valid = 0; - } - tape->buf = kmalloc(tape->buffer_size, GFP_KERNEL); - if (!tape->buf) - return -ENOMEM; - tape->chrdev_dir = IDETAPE_DIR_WRITE; - tape->cur = tape->buf; - - /* - * Issue a write 0 command to ensure that DSC handshake is - * switched from completion mode to buffer available mode. No - * point in issuing this if DSC overlap isn't supported, some - * drives (Seagate STT3401A) will return an error. - */ - if (drive->dev_flags & IDE_DFLAG_DSC_OVERLAP) { - ssize_t retval = idetape_queue_rw_tail(drive, - REQ_IDETAPE_WRITE, 0); - if (retval < 0) { - kfree(tape->buf); - tape->buf = NULL; - tape->chrdev_dir = IDETAPE_DIR_NONE; - return retval; - } - } - } + rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); + if (rc < 0) + return rc; if (count == 0) return (0); if (tape->valid < tape->buffer_size) { From 71294cf93d22ceaa75448cc6ebee2c65897be8c2 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0598/2061] ide-tape: use byte size instead of sectors on rw issue functions Impact: cleanup Byte size is what most issue functions deal with, make idetape_queue_rw_tail() and its wrappers take byte size instead of sector counts. idetape_chrdev_read() and write() functions are converted to use tape->buffer_size instead of ctl from tape->cap. This cleans up code a little bit and will ease the next r/w reimplementation. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 45 +++++++++++++++++++----------------------- 1 file changed, 20 insertions(+), 25 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 4da7fa9bca1..d5e9bb286e3 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -879,15 +879,15 @@ static void ide_tape_discard_merge_buffer(ide_drive_t *drive, * Generate a read/write request for the block device interface and wait for it * to be serviced. */ -static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int blocks) +static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) { idetape_tape_t *tape = drive->driver_data; - size_t size = blocks * tape->blk_size; struct request *rq; int ret; debug_log(DBG_SENSE, "%s: cmd=%d\n", __func__, cmd); BUG_ON(cmd != REQ_IDETAPE_READ && cmd != REQ_IDETAPE_WRITE); + BUG_ON(size < 0 || size % tape->blk_size); rq = blk_get_request(drive->queue, READ, __GFP_WAIT); rq->cmd_type = REQ_TYPE_SPECIAL; @@ -954,17 +954,16 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) } /* Queue up a character device originated write request. */ -static int idetape_add_chrdev_write_request(ide_drive_t *drive, int blocks) +static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size) { debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, blocks); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size); } static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; - int blocks; if (tape->chrdev_dir != IDETAPE_DIR_WRITE) { printk(KERN_ERR "ide-tape: bug: Trying to empty merge buffer" @@ -972,15 +971,10 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) return; } if (tape->buf) { - blocks = tape->valid / tape->blk_size; - if (tape->valid % tape->blk_size) { - blocks++; - memset(tape->buf + tape->valid, 0, - tape->blk_size - tape->valid % tape->blk_size); - } - (void) idetape_add_chrdev_write_request(drive, blocks); - } - if (tape->buf != NULL) { + size_t aligned = roundup(tape->valid, tape->blk_size); + + memset(tape->cur, 0, aligned - tape->valid); + idetape_add_chrdev_write_request(drive, aligned); kfree(tape->buf); tape->buf = NULL; } @@ -1038,9 +1032,9 @@ static int idetape_init_rw(ide_drive_t *drive, int dir) } /* called from idetape_chrdev_read() to service a chrdev read request. */ -static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) +static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size) { - debug_log(DBG_PROCS, "Enter %s, %d blocks\n", __func__, blocks); + debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size); /* If we are at a filemark, return a read length of 0 */ if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) @@ -1048,7 +1042,7 @@ static int idetape_add_chrdev_read_request(ide_drive_t *drive, int blocks) idetape_init_rw(drive, IDETAPE_DIR_READ); - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, blocks); + return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size); } static void idetape_pad_zeros(ide_drive_t *drive, int bcount) @@ -1060,8 +1054,7 @@ static void idetape_pad_zeros(ide_drive_t *drive, int bcount) while (bcount) { unsigned int count = min(tape->buffer_size, bcount); - idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, - count / tape->blk_size); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, count); bcount -= count; } } @@ -1193,7 +1186,6 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, ide_drive_t *drive = tape->drive; ssize_t bytes_read, temp, actually_read = 0, rc; ssize_t ret = 0; - u16 ctl = *(u16 *)&tape->caps[12]; debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); @@ -1218,7 +1210,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, tape->valid -= actually_read; } while (count >= tape->buffer_size) { - bytes_read = idetape_add_chrdev_read_request(drive, ctl); + bytes_read = idetape_add_chrdev_read_request(drive, + tape->buffer_size); if (bytes_read <= 0) goto finish; if (copy_to_user(buf, tape->cur, bytes_read)) @@ -1228,7 +1221,8 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, actually_read += bytes_read; } if (count) { - bytes_read = idetape_add_chrdev_read_request(drive, ctl); + bytes_read = idetape_add_chrdev_read_request(drive, + tape->buffer_size); if (bytes_read <= 0) goto finish; temp = min((unsigned long)count, (unsigned long)bytes_read); @@ -1256,7 +1250,6 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ide_drive_t *drive = tape->drive; ssize_t actually_written = 0; ssize_t ret = 0; - u16 ctl = *(u16 *)&tape->caps[12]; int rc; /* The drive is write protected. */ @@ -1284,7 +1277,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, if (tape->valid == tape->buffer_size) { ssize_t retval; - retval = idetape_add_chrdev_write_request(drive, ctl); + retval = idetape_add_chrdev_write_request(drive, + tape->buffer_size); if (retval <= 0) return (retval); } @@ -1295,7 +1289,8 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, ret = -EFAULT; buf += tape->buffer_size; count -= tape->buffer_size; - retval = idetape_add_chrdev_write_request(drive, ctl); + retval = idetape_add_chrdev_write_request(drive, + tape->buffer_size); actually_written += tape->buffer_size; if (retval <= 0) return (retval); From 4344d07fb8dbf0cbfec1f7d7c1afeccaceaaa120 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0599/2061] ide-tape: simplify read/write functions Impact: cleanup idetape_chrdev_read/write() functions are unnecessarily complex when everything can be handled in a single loop. Collapse idetape_add_chrdev_read/write_request() into the rw functions and simplify the implementation. Signed-off-by: Tejun Heo --- drivers/ide/ide-tape.c | 147 ++++++++++++++--------------------------- 1 file changed, 49 insertions(+), 98 deletions(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index d5e9bb286e3..2599579e417 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -953,14 +953,6 @@ static void idetape_create_space_cmd(struct ide_atapi_pc *pc, int count, u8 cmd) pc->flags |= PC_FLAG_WAIT_FOR_DSC; } -/* Queue up a character device originated write request. */ -static int idetape_add_chrdev_write_request(ide_drive_t *drive, int size) -{ - debug_log(DBG_CHRDEV, "Enter %s\n", __func__); - - return idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, size); -} - static void ide_tape_flush_merge_buffer(ide_drive_t *drive) { idetape_tape_t *tape = drive->driver_data; @@ -974,7 +966,7 @@ static void ide_tape_flush_merge_buffer(ide_drive_t *drive) size_t aligned = roundup(tape->valid, tape->blk_size); memset(tape->cur, 0, aligned - tape->valid); - idetape_add_chrdev_write_request(drive, aligned); + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, aligned); kfree(tape->buf); tape->buf = NULL; } @@ -1031,20 +1023,6 @@ static int idetape_init_rw(ide_drive_t *drive, int dir) return 0; } -/* called from idetape_chrdev_read() to service a chrdev read request. */ -static int idetape_add_chrdev_read_request(ide_drive_t *drive, int size) -{ - debug_log(DBG_PROCS, "Enter %s, %d bytes\n", __func__, size); - - /* If we are at a filemark, return a read length of 0 */ - if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) - return 0; - - idetape_init_rw(drive, IDETAPE_DIR_READ); - - return idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, size); -} - static void idetape_pad_zeros(ide_drive_t *drive, int bcount) { idetape_tape_t *tape = drive->driver_data; @@ -1184,8 +1162,9 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, { struct ide_tape_obj *tape = file->private_data; ide_drive_t *drive = tape->drive; - ssize_t bytes_read, temp, actually_read = 0, rc; + size_t done = 0; ssize_t ret = 0; + int rc; debug_log(DBG_CHRDEV, "Enter %s, count %Zd\n", __func__, count); @@ -1195,52 +1174,43 @@ static ssize_t idetape_chrdev_read(struct file *file, char __user *buf, (count % tape->blk_size) == 0) tape->user_bs_factor = count / tape->blk_size; } + rc = idetape_init_rw(drive, IDETAPE_DIR_READ); if (rc < 0) return rc; - if (count == 0) - return (0); - if (tape->valid) { - actually_read = min_t(unsigned int, tape->valid, count); - if (copy_to_user(buf, tape->cur, actually_read)) + + while (done < count) { + size_t todo; + + /* refill if staging buffer is empty */ + if (!tape->valid) { + /* If we are at a filemark, nothing more to read */ + if (test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) + break; + /* read */ + if (idetape_queue_rw_tail(drive, REQ_IDETAPE_READ, + tape->buffer_size) <= 0) + break; + } + + /* copy out */ + todo = min_t(size_t, count - done, tape->valid); + if (copy_to_user(buf + done, tape->cur, todo)) ret = -EFAULT; - buf += actually_read; - count -= actually_read; - tape->cur += actually_read; - tape->valid -= actually_read; + + tape->cur += todo; + tape->valid -= todo; + done += todo; } - while (count >= tape->buffer_size) { - bytes_read = idetape_add_chrdev_read_request(drive, - tape->buffer_size); - if (bytes_read <= 0) - goto finish; - if (copy_to_user(buf, tape->cur, bytes_read)) - ret = -EFAULT; - buf += bytes_read; - count -= bytes_read; - actually_read += bytes_read; - } - if (count) { - bytes_read = idetape_add_chrdev_read_request(drive, - tape->buffer_size); - if (bytes_read <= 0) - goto finish; - temp = min((unsigned long)count, (unsigned long)bytes_read); - if (copy_to_user(buf, tape->cur, temp)) - ret = -EFAULT; - actually_read += temp; - tape->cur += temp; - tape->valid -= temp; - } -finish: - if (!actually_read && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { + + if (!done && test_bit(IDE_AFLAG_FILEMARK, &drive->atapi_flags)) { debug_log(DBG_SENSE, "%s: spacing over filemark\n", tape->name); idetape_space_over_filemarks(drive, MTFSF, 1); return 0; } - return ret ? ret : actually_read; + return ret ? ret : done; } static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, @@ -1248,7 +1218,7 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, { struct ide_tape_obj *tape = file->private_data; ide_drive_t *drive = tape->drive; - ssize_t actually_written = 0; + size_t done = 0; ssize_t ret = 0; int rc; @@ -1262,47 +1232,28 @@ static ssize_t idetape_chrdev_write(struct file *file, const char __user *buf, rc = idetape_init_rw(drive, IDETAPE_DIR_WRITE); if (rc < 0) return rc; - if (count == 0) - return (0); - if (tape->valid < tape->buffer_size) { - actually_written = min_t(unsigned int, - tape->buffer_size - tape->valid, - count); - if (copy_from_user(tape->cur, buf, actually_written)) - ret = -EFAULT; - buf += actually_written; - count -= actually_written; - tape->cur += actually_written; - tape->valid += actually_written; - if (tape->valid == tape->buffer_size) { - ssize_t retval; - retval = idetape_add_chrdev_write_request(drive, - tape->buffer_size); - if (retval <= 0) - return (retval); - } - } - while (count >= tape->buffer_size) { - ssize_t retval; - if (copy_from_user(tape->cur, buf, tape->buffer_size)) + while (done < count) { + size_t todo; + + /* flush if staging buffer is full */ + if (tape->valid == tape->buffer_size && + idetape_queue_rw_tail(drive, REQ_IDETAPE_WRITE, + tape->buffer_size) <= 0) + return rc; + + /* copy in */ + todo = min_t(size_t, count - done, + tape->buffer_size - tape->valid); + if (copy_from_user(tape->cur, buf + done, todo)) ret = -EFAULT; - buf += tape->buffer_size; - count -= tape->buffer_size; - retval = idetape_add_chrdev_write_request(drive, - tape->buffer_size); - actually_written += tape->buffer_size; - if (retval <= 0) - return (retval); + + tape->cur += todo; + tape->valid += todo; + done += todo; } - if (count) { - actually_written += count; - if (copy_from_user(tape->cur, buf, count)) - ret = -EFAULT; - tape->cur += count; - tape->valid += count; - } - return ret ? ret : actually_written; + + return ret ? ret : done; } static int idetape_write_filemark(ide_drive_t *drive) From 29d1a4371035e01b0d079bc5aa88b50f5af7a566 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0600/2061] ide-atapi: kill unused fields and callbacks Impact: remove fields and code paths which are no longer necessary Now that ide-tape uses standard mechanisms to transfer data, special case handling for bh handling can be dropped from ide-atapi. Drop the followings. * pc->cur_pos, b_count, bh and b_data * drive->pc_update_buffers() and pc_io_buffers(). Signed-off-by: Tejun Heo --- drivers/ide/ide-atapi.c | 17 ++++------------- drivers/ide/ide-tape.c | 1 - include/linux/ide.h | 12 ------------ 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index b9dd4503cbc..afe5a432387 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -359,11 +359,8 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) drive->name, rq_data_dir(pc->rq) ? "write" : "read"); pc->flags |= PC_FLAG_DMA_ERROR; - } else { + } else pc->xferred = pc->req_xfer; - if (drive->pc_update_buffers) - drive->pc_update_buffers(drive, pc); - } debug_log("%s: DMA finished\n", drive->name); } @@ -463,16 +460,11 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) return ide_do_reset(drive); } - if (drive->media == ide_tape && pc->bh) - done = drive->pc_io_buffers(drive, pc, bcount, write); - else { - done = min_t(unsigned int, bcount, cmd->nleft); - ide_pio_bytes(drive, cmd, write, done); - } + done = min_t(unsigned int, bcount, cmd->nleft); + ide_pio_bytes(drive, cmd, write, done); - /* Update the current position */ + /* Update transferred byte count */ pc->xferred += done; - pc->cur_pos += done; bcount -= done; @@ -650,7 +642,6 @@ ide_startstop_t ide_issue_pc(ide_drive_t *drive, struct ide_cmd *cmd) /* We haven't transferred any data yet */ pc->xferred = 0; - pc->cur_pos = pc->buf; valid_tf = IDE_VALID_DEVICE; bcount = ((drive->media == ide_tape) ? diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 2599579e417..8dfc68892d6 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -591,7 +591,6 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); pc->c[1] = 1; - pc->bh = NULL; pc->buf = NULL; pc->buf_size = length * tape->blk_size; pc->req_xfer = pc->buf_size; diff --git a/include/linux/ide.h b/include/linux/ide.h index 1957461ac76..34c128f0a33 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -362,11 +362,7 @@ struct ide_atapi_pc { /* data buffer */ u8 *buf; - /* current buffer position */ - u8 *cur_pos; int buf_size; - /* missing/available data on the current buffer */ - int b_count; /* the corresponding request */ struct request *rq; @@ -379,10 +375,6 @@ struct ide_atapi_pc { */ u8 pc_buf[IDE_PC_BUFFER_SIZE]; - /* idetape only */ - struct idetape_bh *bh; - char *b_data; - unsigned long timeout; }; @@ -595,10 +587,6 @@ struct ide_drive_s { /* callback for packet commands */ int (*pc_callback)(struct ide_drive_s *, int); - void (*pc_update_buffers)(struct ide_drive_s *, struct ide_atapi_pc *); - int (*pc_io_buffers)(struct ide_drive_s *, struct ide_atapi_pc *, - unsigned int, int); - ide_startstop_t (*irq_handler)(struct ide_drive_s *); unsigned long atapi_flags; From 5ad960fe8d0e4f99fe2b8dded45e8251137293c9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Sun, 19 Apr 2009 08:46:03 +0900 Subject: [PATCH 0601/2061] ide: drop rq->data handling from ide_map_sg() Impact: remove code path which is no longer necessary All IDE data transfers now use rq->bio. Simplify ide_map_sg() accordingly. Signed-off-by: Tejun Heo Cc: Jens Axboe --- drivers/ide/ide-io.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 6e3094e2277..a0309ea661a 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -248,11 +248,7 @@ void ide_map_sg(ide_drive_t *drive, struct ide_cmd *cmd) struct scatterlist *sg = hwif->sg_table; struct request *rq = cmd->rq; - if (!rq->bio) { - sg_init_one(sg, rq->data, rq->data_len); - cmd->sg_nents = 1; - } else - cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); + cmd->sg_nents = blk_rq_map_sg(drive->queue, rq, sg); } EXPORT_SYMBOL_GPL(ide_map_sg); From 586cf2681f527ce8b85b9bd57c8b9f7945fbe051 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 21 Apr 2009 12:16:56 +0900 Subject: [PATCH 0602/2061] ide-dma: don't reset request fields on dma_timeout_retry() Impact: drop unnecessary code Now that everything uses bio and block operations, there is no need to reset request fields manually when retrying a request. Every field is guaranteed to be always valid. Drop unnecessary request field resetting from ide_dma_timeout_retry(). Signed-off-by: Tejun Heo --- drivers/ide/ide-dma.c | 20 ++++---------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index a0b8cab1d9a..d9123ecae4a 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -510,23 +510,11 @@ ide_startstop_t ide_dma_timeout_retry(ide_drive_t *drive, int error) /* * un-busy drive etc and make sure request is sane */ - rq = hwif->rq; - if (!rq) - goto out; - - hwif->rq = NULL; - - rq->errors = 0; - - if (!rq->bio) - goto out; - - rq->sector = rq->bio->bi_sector; - rq->current_nr_sectors = bio_iovec(rq->bio)->bv_len >> 9; - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->buffer = bio_data(rq->bio); -out: + if (rq) { + hwif->rq = NULL; + rq->errors = 0; + } return ret; } From db29a6b49674085f136331014ba0eee249c16a2c Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 21 Apr 2009 09:27:03 +0200 Subject: [PATCH 0603/2061] block: enable by default support for large devices and files on 32-bit archs Enable by default support for large devices and files (CONFIG_LBD): - With 1TB disks being a commodity hardware it is quite easy to hit 2TB limitation while building RAIDs etc. and many distros have been using CONFIG_LBD=y by default already (at least Fedora 10 and openSUSE 11.1). - This should also prevent a subtle ext4 filesystem compatibility issue: mke2fs.ext4 defaults to creating filesystems with huge_files feature enabled and such filesystems cannot be later mounted read-write on machines with CONFIG_LBD=n (it should be quite easy to hit this issue when trying to use filesystem created using distro kernel on system running the self-build kernel, think about USB disk enclosures & co.). While at it: - Clarify config option help text w.r.t. mounting ext4 filesystems (they can be mounted with CONFIG_LBD=n but in the read-only mode). Cc: "Theodore Ts'o" Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- block/Kconfig | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/block/Kconfig b/block/Kconfig index e7d12782bcf..2c39527aa7d 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -26,6 +26,7 @@ if BLOCK config LBD bool "Support for large block devices and files" depends on !64BIT + default y help Enable block devices or files of size 2TB and larger. @@ -38,11 +39,13 @@ config LBD The ext4 filesystem requires that this feature be enabled in order to support filesystems that have the huge_file feature - enabled. Otherwise, it will refuse to mount any filesystems - that use the huge_file feature, which is enabled by default - by mke2fs.ext4. The GFS2 filesystem also requires this feature. + enabled. Otherwise, it will refuse to mount in the read-write + mode any filesystems that use the huge_file feature, which is + enabled by default by mke2fs.ext4. - If unsure, say N. + The GFS2 filesystem also requires this feature. + + If unsure, say Y. config BLK_DEV_BSG bool "Block layer SG support v4 (EXPERIMENTAL)" From db2dbb12dc47a50c7a4c5678f526014063e486f6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 22 Apr 2009 14:08:13 +0200 Subject: [PATCH 0604/2061] block: implement blkdev_readpages Doing a proper block dev ->readpages() speeds up the crazy dump(8) approach of using interleaved process IO. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- fs/block_dev.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/block_dev.c b/fs/block_dev.c index f45dbc18dd1..a85fe310fc6 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -331,6 +331,12 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1399,6 +1405,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, + .readpages = blkdev_readpages, .writepage = blkdev_writepage, .sync_page = block_sync_page, .write_begin = blkdev_write_begin, From a538cd03be6f363d039daa94199c28cfbd508455 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:17 +0900 Subject: [PATCH 0605/2061] block: merge blk_invoke_request_fn() into __blk_run_queue() __blk_run_queue wraps blk_invoke_request_fn() such that it additionally removes plug and bails out early if the queue is empty. Both extra operations have their own pending mechanisms and don't cause any harm correctness-wise when they are done superflously. The only user of blk_invoke_request_fn() being blk_start_queue(), there isn't much reason to keep both functions around. Merge blk_invoke_request_fn() into __blk_run_queue() and make blk_start_queue() use __blk_run_queue() instead. [ Impact: merge two subtly different internal functions ] Signed-off-by: Tejun Heo --- block/blk-core.c | 35 ++++++++++++++--------------------- 1 file changed, 14 insertions(+), 21 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 41bc0ff75e2..02f53bc00e4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -333,24 +333,6 @@ void blk_unplug(struct request_queue *q) } EXPORT_SYMBOL(blk_unplug); -static void blk_invoke_request_fn(struct request_queue *q) -{ - if (unlikely(blk_queue_stopped(q))) - return; - - /* - * one level of recursion is ok and is much faster than kicking - * the unplug handling - */ - if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { - q->request_fn(q); - queue_flag_clear(QUEUE_FLAG_REENTER, q); - } else { - queue_flag_set(QUEUE_FLAG_PLUGGED, q); - kblockd_schedule_work(q, &q->unplug_work); - } -} - /** * blk_start_queue - restart a previously stopped queue * @q: The &struct request_queue in question @@ -365,7 +347,7 @@ void blk_start_queue(struct request_queue *q) WARN_ON(!irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); - blk_invoke_request_fn(q); + __blk_run_queue(q); } EXPORT_SYMBOL(blk_start_queue); @@ -425,12 +407,23 @@ void __blk_run_queue(struct request_queue *q) { blk_remove_plug(q); + if (unlikely(blk_queue_stopped(q))) + return; + + if (elv_queue_empty(q)) + return; + /* * Only recurse once to avoid overrunning the stack, let the unplug * handling reinvoke the handler shortly if we already got there. */ - if (!elv_queue_empty(q)) - blk_invoke_request_fn(q); + if (!queue_flag_test_and_set(QUEUE_FLAG_REENTER, q)) { + q->request_fn(q); + queue_flag_clear(QUEUE_FLAG_REENTER, q); + } else { + queue_flag_set(QUEUE_FLAG_PLUGGED, q); + kblockd_schedule_work(q, &q->unplug_work); + } } EXPORT_SYMBOL(__blk_run_queue); From a7f557923441186a3cdbabc54f1bcacf42b63bf5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:17 +0900 Subject: [PATCH 0606/2061] block: kill blk_start_queueing() blk_start_queueing() is identical to __blk_run_queue() except that it doesn't check for recursion. None of the current users depends on blk_start_queueing() running request_fn directly. Replace usages of blk_start_queueing() with [__]blk_run_queue() and kill it. [ Impact: removal of mostly duplicate interface function ] Signed-off-by: Tejun Heo --- block/as-iosched.c | 6 +----- block/blk-core.c | 28 ++-------------------------- block/cfq-iosched.c | 6 +++--- block/elevator.c | 7 +++---- include/linux/blkdev.h | 1 - 5 files changed, 9 insertions(+), 39 deletions(-) diff --git a/block/as-iosched.c b/block/as-iosched.c index c48fa670d22..45bd07059c2 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -1312,12 +1312,8 @@ static void as_merged_requests(struct request_queue *q, struct request *req, static void as_work_handler(struct work_struct *work) { struct as_data *ad = container_of(work, struct as_data, antic_work); - struct request_queue *q = ad->q; - unsigned long flags; - spin_lock_irqsave(q->queue_lock, flags); - blk_start_queueing(q); - spin_unlock_irqrestore(q->queue_lock, flags); + blk_run_queue(ad->q); } static int as_may_queue(struct request_queue *q, int rw) diff --git a/block/blk-core.c b/block/blk-core.c index 02f53bc00e4..8b4a0af7d69 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -433,9 +433,7 @@ EXPORT_SYMBOL(__blk_run_queue); * * Description: * Invoke request handling on this queue, if it has pending work to do. - * May be used to restart queueing when a request has completed. Also - * See @blk_start_queueing. - * + * May be used to restart queueing when a request has completed. */ void blk_run_queue(struct request_queue *q) { @@ -894,28 +892,6 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) } EXPORT_SYMBOL(blk_get_request); -/** - * blk_start_queueing - initiate dispatch of requests to device - * @q: request queue to kick into gear - * - * This is basically a helper to remove the need to know whether a queue - * is plugged or not if someone just wants to initiate dispatch of requests - * for this queue. Should be used to start queueing on a device outside - * of ->request_fn() context. Also see @blk_run_queue. - * - * The queue lock must be held with interrupts disabled. - */ -void blk_start_queueing(struct request_queue *q) -{ - if (!blk_queue_plugged(q)) { - if (unlikely(blk_queue_stopped(q))) - return; - q->request_fn(q); - } else - __generic_unplug_device(q); -} -EXPORT_SYMBOL(blk_start_queueing); - /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted @@ -984,7 +960,7 @@ void blk_insert_request(struct request_queue *q, struct request *rq, drive_stat_acct(rq, 1); __elv_add_request(q, rq, where, 0); - blk_start_queueing(q); + __blk_run_queue(q); spin_unlock_irqrestore(q->queue_lock, flags); } EXPORT_SYMBOL(blk_insert_request); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index a55a9bd75bd..def0c698a4b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -2088,7 +2088,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, if (blk_rq_bytes(rq) > PAGE_CACHE_SIZE || cfqd->busy_queues > 1) { del_timer(&cfqd->idle_slice_timer); - blk_start_queueing(cfqd->queue); + __blk_run_queue(cfqd->queue); } cfq_mark_cfqq_must_dispatch(cfqq); } @@ -2100,7 +2100,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, * this new queue is RT and the current one is BE */ cfq_preempt_queue(cfqd, cfqq); - blk_start_queueing(cfqd->queue); + __blk_run_queue(cfqd->queue); } } @@ -2345,7 +2345,7 @@ static void cfq_kick_queue(struct work_struct *work) struct request_queue *q = cfqd->queue; spin_lock_irq(q->queue_lock); - blk_start_queueing(q); + __blk_run_queue(cfqd->queue); spin_unlock_irq(q->queue_lock); } diff --git a/block/elevator.c b/block/elevator.c index 7073a907257..2e0fb21485b 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -599,7 +599,7 @@ void elv_quiesce_start(struct request_queue *q) */ elv_drain_elevator(q); while (q->rq.elvpriv) { - blk_start_queueing(q); + __blk_run_queue(q); spin_unlock_irq(q->queue_lock); msleep(10); spin_lock_irq(q->queue_lock); @@ -643,8 +643,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) * with anything. There's no point in delaying queue * processing. */ - blk_remove_plug(q); - blk_start_queueing(q); + __blk_run_queue(q); break; case ELEVATOR_INSERT_SORT: @@ -971,7 +970,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); - blk_start_queueing(q); + __blk_run_queue(q); } } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 2755d5c6da2..12e20de44b6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -797,7 +797,6 @@ extern void blk_sync_queue(struct request_queue *q); extern void __blk_stop_queue(struct request_queue *q); extern void __blk_run_queue(struct request_queue *); extern void blk_run_queue(struct request_queue *); -extern void blk_start_queueing(struct request_queue *); extern int blk_rq_map_user(struct request_queue *, struct request *, struct rq_map_data *, void __user *, unsigned long, gfp_t); From e4025f6c21f1389696c069be2dc647f364925c45 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:17 +0900 Subject: [PATCH 0607/2061] block: don't set REQ_NOMERGE unnecessarily RQ_NOMERGE_FLAGS already clears defines which REQ flags aren't mergeable. There is no reason to specify it superflously. It only adds to confusion. Don't set REQ_NOMERGE for barriers and requests with specific queueing directive. REQ_NOMERGE is now exclusively used by the merging code. [ Impact: cleanup ] Signed-off-by: Tejun Heo --- block/blk-core.c | 5 +---- block/blk-exec.c | 1 - 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 8b4a0af7d69..7e0fab53e93 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1082,16 +1082,13 @@ void init_request_from_bio(struct request *req, struct bio *bio) if (bio_failfast_driver(bio)) req->cmd_flags |= REQ_FAILFAST_DRIVER; - /* - * REQ_BARRIER implies no merging, but lets make it explicit - */ if (unlikely(bio_discard(bio))) { req->cmd_flags |= REQ_DISCARD; if (bio_barrier(bio)) req->cmd_flags |= REQ_SOFTBARRIER; req->q->prepare_discard_fn(req->q, req); } else if (unlikely(bio_barrier(bio))) - req->cmd_flags |= (REQ_HARDBARRIER | REQ_NOMERGE); + req->cmd_flags |= REQ_HARDBARRIER; if (bio_sync(bio)) req->cmd_flags |= REQ_RW_SYNC; diff --git a/block/blk-exec.c b/block/blk-exec.c index 6af716d1e54..49557e91f0d 100644 --- a/block/blk-exec.c +++ b/block/blk-exec.c @@ -51,7 +51,6 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, int where = at_head ? ELEVATOR_INSERT_FRONT : ELEVATOR_INSERT_BACK; rq->rq_disk = bd_disk; - rq->cmd_flags |= REQ_NOMERGE; rq->end_io = done; WARN_ON(irqs_disabled()); spin_lock_irq(q->queue_lock); From 10732f5661fb7adf62e20733b0030fc0fc93b0c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0608/2061] block: cleanup REQ_SOFTBARRIER usages blk_insert_request() doesn't need to worry about REQ_SOFTBARRIER. Don't set it. Combined with recent ide updates, REQ_SOFTBARRIER is now only used in elevator proper and for discard requests. [ Impact: cleanup ] Signed-off-by: Tejun Heo --- block/blk-core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 7e0fab53e93..cf10dfcda99 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -946,7 +946,6 @@ void blk_insert_request(struct request_queue *q, struct request *rq, * barrier */ rq->cmd_type = REQ_TYPE_SPECIAL; - rq->cmd_flags |= REQ_SOFTBARRIER; rq->special = data; From 2eef33e439ba9ae387cdc3f1abcef2f3f6c4e7a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0609/2061] block: clean up misc stuff after block layer timeout conversion * In blk_rq_timed_out_timer(), else { if } to else if * In blk_add_timer(), simplify if/else block [ Impact: cleanup ] Signed-off-by: Tejun Heo --- block/blk-timeout.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 1ec0d503cac..1ba7e0aca87 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -122,10 +122,8 @@ void blk_rq_timed_out_timer(unsigned long data) if (blk_mark_rq_complete(rq)) continue; blk_rq_timed_out(rq); - } else { - if (!next || time_after(next, rq->deadline)) - next = rq->deadline; - } + } else if (!next || time_after(next, rq->deadline)) + next = rq->deadline; } /* @@ -176,16 +174,14 @@ void blk_add_timer(struct request *req) BUG_ON(!list_empty(&req->timeout_list)); BUG_ON(test_bit(REQ_ATOM_COMPLETE, &req->atomic_flags)); - if (req->timeout) - req->deadline = jiffies + req->timeout; - else { - req->deadline = jiffies + q->rq_timeout; - /* - * Some LLDs, like scsi, peek at the timeout to prevent - * a command from being retried forever. - */ + /* + * Some LLDs, like scsi, peek at the timeout to prevent a + * command from being retried forever. + */ + if (!req->timeout) req->timeout = q->rq_timeout; - } + + req->deadline = jiffies + req->timeout; list_add_tail(&req->timeout_list, &q->timeout_list); /* From 5efccd17ceb0fc43837a331297c2c407969d7201 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0610/2061] block: reorder request completion functions Reorder request completion functions such that * All request completion functions are located together. * Functions which are used by only one caller is put right above the caller. * end_request() is put after other completion functions but before blk_update_request(). This change is for completion function cleanup which will follow. [ Impact: cleanup, code reorganization ] Signed-off-by: Tejun Heo --- block/blk-core.c | 144 ++++++++++++++++++++--------------------- include/linux/blkdev.h | 16 ++--- 2 files changed, 80 insertions(+), 80 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index cf10dfcda99..406a93e526b 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1683,6 +1683,35 @@ static void blk_account_io_done(struct request *req) } } +/** + * blk_rq_bytes - Returns bytes left to complete in the entire request + * @rq: the request being processed + **/ +unsigned int blk_rq_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->hard_nr_sectors << 9; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_bytes); + +/** + * blk_rq_cur_bytes - Returns bytes left to complete in the current segment + * @rq: the request being processed + **/ +unsigned int blk_rq_cur_bytes(struct request *rq) +{ + if (blk_fs_request(rq)) + return rq->current_nr_sectors << 9; + + if (rq->bio) + return rq->bio->bi_size; + + return rq->data_len; +} +EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); + /** * __end_that_request_first - end I/O on a request * @req: the request being processed @@ -1797,6 +1826,22 @@ static int __end_that_request_first(struct request *req, int error, return 1; } +static int end_that_request_data(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) +{ + if (rq->bio) { + if (__end_that_request_first(rq, error, nr_bytes)) + return 1; + + /* Bidi request must be completed as a whole */ + if (blk_bidi_rq(rq) && + __end_that_request_first(rq->next_rq, error, bidi_bytes)) + return 1; + } + + return 0; +} + /* * queue lock must be held */ @@ -1825,78 +1870,6 @@ static void end_that_request_last(struct request *req, int error) } } -/** - * blk_rq_bytes - Returns bytes left to complete in the entire request - * @rq: the request being processed - **/ -unsigned int blk_rq_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->hard_nr_sectors << 9; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_bytes); - -/** - * blk_rq_cur_bytes - Returns bytes left to complete in the current segment - * @rq: the request being processed - **/ -unsigned int blk_rq_cur_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->current_nr_sectors << 9; - - if (rq->bio) - return rq->bio->bi_size; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); - -/** - * end_request - end I/O on the current segment of the request - * @req: the request being processed - * @uptodate: error value or %0/%1 uptodate flag - * - * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. - **/ -void end_request(struct request *req, int uptodate) -{ - int error = 0; - - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; - - __blk_end_request(req, error, req->hard_cur_sectors << 9); -} -EXPORT_SYMBOL(end_request); - -static int end_that_request_data(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) -{ - if (rq->bio) { - if (__end_that_request_first(rq, error, nr_bytes)) - return 1; - - /* Bidi request must be completed as a whole */ - if (blk_bidi_rq(rq) && - __end_that_request_first(rq->next_rq, error, bidi_bytes)) - return 1; - } - - return 0; -} - /** * blk_end_io - Generic end_io function to complete a request. * @rq: the request being processed @@ -2006,6 +1979,33 @@ int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, } EXPORT_SYMBOL_GPL(blk_end_bidi_request); +/** + * end_request - end I/O on the current segment of the request + * @req: the request being processed + * @uptodate: error value or %0/%1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Use blk_end_request() or __blk_end_request() to end a request. + **/ +void end_request(struct request *req, int uptodate) +{ + int error = 0; + + if (uptodate <= 0) + error = uptodate ? uptodate : -EIO; + + __blk_end_request(req, error, req->hard_cur_sectors << 9); +} +EXPORT_SYMBOL(end_request); + /** * blk_update_request - Special helper function for request stacking drivers * @rq: the request being processed diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12e20de44b6..156ffd9de96 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -831,6 +831,14 @@ static inline void blk_run_address_space(struct address_space *mapping) extern void blkdev_dequeue_request(struct request *req); +/* + * blk_end_request() takes bytes instead of sectors as a complete size. + * blk_rq_bytes() returns bytes left to complete in the entire request. + * blk_rq_cur_bytes() returns bytes left to complete in the current segment. + */ +extern unsigned int blk_rq_bytes(struct request *rq); +extern unsigned int blk_rq_cur_bytes(struct request *rq); + /* * blk_end_request() and friends. * __blk_end_request() and end_request() must be called with @@ -857,14 +865,6 @@ extern void blk_abort_queue(struct request_queue *); extern void blk_update_request(struct request *rq, int error, unsigned int nr_bytes); -/* - * blk_end_request() takes bytes instead of sectors as a complete size. - * blk_rq_bytes() returns bytes left to complete in the entire request. - * blk_rq_cur_bytes() returns bytes left to complete in the current segment. - */ -extern unsigned int blk_rq_bytes(struct request *rq); -extern unsigned int blk_rq_cur_bytes(struct request *rq); - /* * Access functions for manipulating queue properties */ From 158dbda0068e63c7cce7bd47c123bd1dfa5a902c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0611/2061] block: reorganize request fetching functions Impact: code reorganization elv_next_request() and elv_dequeue_request() are public block layer interface than actual elevator implementation. They mostly deal with how requests interact with block layer and low level drivers at the beginning of rqeuest processing whereas __elv_next_request() is the actual eleveator request fetching interface. Move the two functions to blk-core.c. This prepares for further interface cleanup. Signed-off-by: Tejun Heo --- block/blk-core.c | 95 +++++++++++++++++++++++++++++++++++ block/blk.h | 37 ++++++++++++++ block/elevator.c | 128 ----------------------------------------------- 3 files changed, 132 insertions(+), 128 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 406a93e526b..678ede23ed0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1712,6 +1712,101 @@ unsigned int blk_rq_cur_bytes(struct request *rq) } EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); +struct request *elv_next_request(struct request_queue *q) +{ + struct request *rq; + int ret; + + while ((rq = __elv_next_request(q)) != NULL) { + if (!(rq->cmd_flags & REQ_STARTED)) { + /* + * This is the first time the device driver + * sees this request (possibly after + * requeueing). Notify IO scheduler. + */ + if (blk_sorted_rq(rq)) + elv_activate_rq(q, rq); + + /* + * just mark as started even if we don't start + * it, a request that has been delayed should + * not be passed by new incoming requests + */ + rq->cmd_flags |= REQ_STARTED; + trace_block_rq_issue(q, rq); + } + + if (!q->boundary_rq || q->boundary_rq == rq) { + q->end_sector = rq_end_sector(rq); + q->boundary_rq = NULL; + } + + if (rq->cmd_flags & REQ_DONTPREP) + break; + + if (q->dma_drain_size && rq->data_len) { + /* + * make sure space for the drain appears we + * know we can do this because max_hw_segments + * has been adjusted to be one fewer than the + * device can handle + */ + rq->nr_phys_segments++; + } + + if (!q->prep_rq_fn) + break; + + ret = q->prep_rq_fn(q, rq); + if (ret == BLKPREP_OK) { + break; + } else if (ret == BLKPREP_DEFER) { + /* + * the request may have been (partially) prepped. + * we need to keep this request in the front to + * avoid resource deadlock. REQ_STARTED will + * prevent other fs requests from passing this one. + */ + if (q->dma_drain_size && rq->data_len && + !(rq->cmd_flags & REQ_DONTPREP)) { + /* + * remove the space for the drain we added + * so that we don't add it again + */ + --rq->nr_phys_segments; + } + + rq = NULL; + break; + } else if (ret == BLKPREP_KILL) { + rq->cmd_flags |= REQ_QUIET; + __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); + } else { + printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); + break; + } + } + + return rq; +} +EXPORT_SYMBOL(elv_next_request); + +void elv_dequeue_request(struct request_queue *q, struct request *rq) +{ + BUG_ON(list_empty(&rq->queuelist)); + BUG_ON(ELV_ON_HASH(rq)); + + list_del_init(&rq->queuelist); + + /* + * the time frame between a request being removed from the lists + * and to it is freed is accounted as io that is in progress at + * the driver side. + */ + if (blk_account_rq(rq)) + q->in_flight++; +} + /** * __end_that_request_first - end I/O on a request * @req: the request being processed diff --git a/block/blk.h b/block/blk.h index 79c85f7c9ff..9b2c324e440 100644 --- a/block/blk.h +++ b/block/blk.h @@ -43,6 +43,43 @@ static inline void blk_clear_rq_complete(struct request *rq) clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); } +/* + * Internal elevator interface + */ +#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) + +static inline struct request *__elv_next_request(struct request_queue *q) +{ + struct request *rq; + + while (1) { + while (!list_empty(&q->queue_head)) { + rq = list_entry_rq(q->queue_head.next); + if (blk_do_ordered(q, &rq)) + return rq; + } + + if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) + return NULL; + } +} + +static inline void elv_activate_rq(struct request_queue *q, struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_activate_req_fn) + e->ops->elevator_activate_req_fn(q, rq); +} + +static inline void elv_deactivate_rq(struct request_queue *q, struct request *rq) +{ + struct elevator_queue *e = q->elevator; + + if (e->ops->elevator_deactivate_req_fn) + e->ops->elevator_deactivate_req_fn(q, rq); +} + #ifdef CONFIG_FAIL_IO_TIMEOUT int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); diff --git a/block/elevator.c b/block/elevator.c index 2e0fb21485b..b03b8752e18 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -53,7 +53,6 @@ static const int elv_hash_shift = 6; (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) #define ELV_HASH_ENTRIES (1 << elv_hash_shift) #define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) -#define ELV_ON_HASH(rq) (!hlist_unhashed(&(rq)->hash)) DEFINE_TRACE(block_rq_insert); DEFINE_TRACE(block_rq_issue); @@ -310,22 +309,6 @@ void elevator_exit(struct elevator_queue *e) } EXPORT_SYMBOL(elevator_exit); -static void elv_activate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->ops->elevator_activate_req_fn) - e->ops->elevator_activate_req_fn(q, rq); -} - -static void elv_deactivate_rq(struct request_queue *q, struct request *rq) -{ - struct elevator_queue *e = q->elevator; - - if (e->ops->elevator_deactivate_req_fn) - e->ops->elevator_deactivate_req_fn(q, rq); -} - static inline void __elv_rqhash_del(struct request *rq) { hlist_del_init(&rq->hash); @@ -758,117 +741,6 @@ void elv_add_request(struct request_queue *q, struct request *rq, int where, } EXPORT_SYMBOL(elv_add_request); -static inline struct request *__elv_next_request(struct request_queue *q) -{ - struct request *rq; - - while (1) { - while (!list_empty(&q->queue_head)) { - rq = list_entry_rq(q->queue_head.next); - if (blk_do_ordered(q, &rq)) - return rq; - } - - if (!q->elevator->ops->elevator_dispatch_fn(q, 0)) - return NULL; - } -} - -struct request *elv_next_request(struct request_queue *q) -{ - struct request *rq; - int ret; - - while ((rq = __elv_next_request(q)) != NULL) { - if (!(rq->cmd_flags & REQ_STARTED)) { - /* - * This is the first time the device driver - * sees this request (possibly after - * requeueing). Notify IO scheduler. - */ - if (blk_sorted_rq(rq)) - elv_activate_rq(q, rq); - - /* - * just mark as started even if we don't start - * it, a request that has been delayed should - * not be passed by new incoming requests - */ - rq->cmd_flags |= REQ_STARTED; - trace_block_rq_issue(q, rq); - } - - if (!q->boundary_rq || q->boundary_rq == rq) { - q->end_sector = rq_end_sector(rq); - q->boundary_rq = NULL; - } - - if (rq->cmd_flags & REQ_DONTPREP) - break; - - if (q->dma_drain_size && rq->data_len) { - /* - * make sure space for the drain appears we - * know we can do this because max_hw_segments - * has been adjusted to be one fewer than the - * device can handle - */ - rq->nr_phys_segments++; - } - - if (!q->prep_rq_fn) - break; - - ret = q->prep_rq_fn(q, rq); - if (ret == BLKPREP_OK) { - break; - } else if (ret == BLKPREP_DEFER) { - /* - * the request may have been (partially) prepped. - * we need to keep this request in the front to - * avoid resource deadlock. REQ_STARTED will - * prevent other fs requests from passing this one. - */ - if (q->dma_drain_size && rq->data_len && - !(rq->cmd_flags & REQ_DONTPREP)) { - /* - * remove the space for the drain we added - * so that we don't add it again - */ - --rq->nr_phys_segments; - } - - rq = NULL; - break; - } else if (ret == BLKPREP_KILL) { - rq->cmd_flags |= REQ_QUIET; - __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); - } else { - printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); - break; - } - } - - return rq; -} -EXPORT_SYMBOL(elv_next_request); - -void elv_dequeue_request(struct request_queue *q, struct request *rq) -{ - BUG_ON(list_empty(&rq->queuelist)); - BUG_ON(ELV_ON_HASH(rq)); - - list_del_init(&rq->queuelist); - - /* - * the time frame between a request being removed from the lists - * and to it is freed is accounted as io that is in progress at - * the driver side. - */ - if (blk_account_rq(rq)) - q->in_flight++; -} - int elv_queue_empty(struct request_queue *q) { struct elevator_queue *e = q->elevator; From 0b302d5aa7975006fa2ec3d66386610b9b36c669 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0612/2061] block: kill blk_end_request_callback() With recent IDE updates, blk_end_request_callback() doesn't have any user now. Kill it. [ Impact: removal of unused convoluted interface ] Signed-off-by: Tejun Heo --- block/blk-core.c | 48 +++--------------------------------------- include/linux/blkdev.h | 3 --- 2 files changed, 3 insertions(+), 48 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 678ede23ed0..2f277ea0e59 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1971,10 +1971,6 @@ static void end_that_request_last(struct request *req, int error) * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. @@ -1985,8 +1981,7 @@ static void end_that_request_last(struct request *req, int error) * %1 - this request is not freed yet, it still has pending buffers. **/ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes, - int (drv_callback)(struct request *)) + unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags = 0UL; @@ -1994,10 +1989,6 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) return 1; - /* Special feature for tricky drivers */ - if (drv_callback && drv_callback(rq)) - return 1; - add_disk_randomness(rq->rq_disk); spin_lock_irqsave(q->queue_lock, flags); @@ -2023,7 +2014,7 @@ static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, **/ int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) { - return blk_end_io(rq, error, nr_bytes, 0, NULL); + return blk_end_io(rq, error, nr_bytes, 0); } EXPORT_SYMBOL_GPL(blk_end_request); @@ -2070,7 +2061,7 @@ EXPORT_SYMBOL_GPL(__blk_end_request); int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes) { - return blk_end_io(rq, error, nr_bytes, bidi_bytes, NULL); + return blk_end_io(rq, error, nr_bytes, bidi_bytes); } EXPORT_SYMBOL_GPL(blk_end_bidi_request); @@ -2131,39 +2122,6 @@ void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) } EXPORT_SYMBOL_GPL(blk_update_request); -/** - * blk_end_request_callback - Special helper function for tricky drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * @drv_callback: function called between completion of bios in the request - * and completion of the request. - * If the callback returns non %0, this helper returns without - * completion of the request. - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is used only for existing tricky drivers. - * (e.g. cdrom_newpc_intr() of ide-cd) - * This interface will be removed when such drivers are rewritten. - * Don't use this interface in other places anymore. - * - * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet. - * this request still has pending buffers or - * the driver doesn't want to finish this request yet. - **/ -int blk_end_request_callback(struct request *rq, int error, - unsigned int nr_bytes, - int (drv_callback)(struct request *)) -{ - return blk_end_io(rq, error, nr_bytes, 0, drv_callback); -} -EXPORT_SYMBOL_GPL(blk_end_request_callback); - void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) { diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 156ffd9de96..1fa9dcf9aa6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -855,9 +855,6 @@ extern int __blk_end_request(struct request *rq, int error, extern int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, unsigned int bidi_bytes); extern void end_request(struct request *, int); -extern int blk_end_request_callback(struct request *rq, int error, - unsigned int nr_bytes, - int (drv_callback)(struct request *)); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); From 2e60e02297cf54e367567f2d85b2ca56b1c4a906 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0613/2061] block: clean up request completion API Request completion has gone through several changes and became a bit messy over the time. Clean it up. 1. end_that_request_data() is a thin wrapper around end_that_request_data_first() which checks whether bio is NULL before doing anything and handles bidi completion. blk_update_request() is a thin wrapper around end_that_request_data() which clears nr_sectors on the last iteration but doesn't use the bidi completion. Clean it up by moving the initial bio NULL check and nr_sectors clearing on the last iteration into end_that_request_data() and renaming it to blk_update_request(), which makes blk_end_io() the only user of end_that_request_data(). Collapse end_that_request_data() into blk_end_io(). 2. There are four visible completion variants - blk_end_request(), __blk_end_request(), blk_end_bidi_request() and end_request(). blk_end_request() and blk_end_bidi_request() uses blk_end_request() as the backend but __blk_end_request() and end_request() use separate implementation in __blk_end_request() due to different locking rules. blk_end_bidi_request() is identical to blk_end_io(). Collapse blk_end_io() into blk_end_bidi_request(), separate out request update into internal helper blk_update_bidi_request() and add __blk_end_bidi_request(). Redefine [__]blk_end_request() as thin inline wrappers around [__]blk_end_bidi_request(). 3. As the whole request issue/completion usages are about to be modified and audited, it's a good chance to convert completion functions return bool which better indicates the intended meaning of return values. 4. The function name end_that_request_last() is from the days when it was a public interface and slighly confusing. Give it a proper internal name - blk_finish_request(). 5. Add description explaning that blk_end_bidi_request() can be safely used for uni requests as suggested by Boaz Harrosh. The only visible behavior change is from #1. nr_sectors counts are cleared after the final iteration no matter which function is used to complete the request. I couldn't find any place where the code assumes those nr_sectors counters contain the values for the last segment and this change is good as it makes the API much more consistent as the end result is now same whether a request is completed using [__]blk_end_request() alone or in combination with blk_update_request(). API further cleaned up per Christoph's suggestion. [ Impact: cleanup, rq->*nr_sectors always updated after req completion ] Signed-off-by: Tejun Heo Reviewed-by: Boaz Harrosh Cc: Christoph Hellwig --- block/blk-core.c | 238 ++++++++++++++--------------------------- include/linux/blkdev.h | 94 +++++++++++++--- 2 files changed, 163 insertions(+), 169 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 2f277ea0e59..89cc05d9a7a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1808,25 +1808,35 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) } /** - * __end_that_request_first - end I/O on a request - * @req: the request being processed + * blk_update_request - Special helper function for request stacking drivers + * @rq: the request being processed * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete + * @nr_bytes: number of bytes to complete @rq * * Description: - * Ends I/O on a number of bytes attached to @req, and sets it up - * for the next range of segments (if any) in the cluster. + * Ends I/O on a number of bytes attached to @rq, but doesn't complete + * the request structure even if @rq doesn't have leftover. + * If @rq has leftover, sets it up for the next range of segments. + * + * This special helper function is only for request stacking drivers + * (e.g. request-based dm) so that they can handle partial completion. + * Actual device drivers should use blk_end_request instead. + * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. * * Return: - * %0 - we are done with this request, call end_that_request_last() - * %1 - still buffers pending for this request + * %false - this request doesn't have any more data + * %true - this request has more data **/ -static int __end_that_request_first(struct request *req, int error, - int nr_bytes) +bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) { int total_bytes, bio_nbytes, next_idx = 0; struct bio *bio; + if (!req->bio) + return false; + trace_block_rq_complete(req->q, req); /* @@ -1903,8 +1913,16 @@ static int __end_that_request_first(struct request *req, int error, /* * completely done */ - if (!req->bio) - return 0; + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + req->nr_sectors = req->hard_nr_sectors = 0; + req->current_nr_sectors = req->hard_cur_sectors = 0; + return false; + } /* * if the request wasn't completed, update state @@ -1918,29 +1936,31 @@ static int __end_that_request_first(struct request *req, int error, blk_recalc_rq_sectors(req, total_bytes >> 9); blk_recalc_rq_segments(req); - return 1; + return true; } +EXPORT_SYMBOL_GPL(blk_update_request); -static int end_that_request_data(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool blk_update_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes) { - if (rq->bio) { - if (__end_that_request_first(rq, error, nr_bytes)) - return 1; + if (blk_update_request(rq, error, nr_bytes)) + return true; - /* Bidi request must be completed as a whole */ - if (blk_bidi_rq(rq) && - __end_that_request_first(rq->next_rq, error, bidi_bytes)) - return 1; - } + /* Bidi request must be completed as a whole */ + if (unlikely(blk_bidi_rq(rq)) && + blk_update_request(rq->next_rq, error, bidi_bytes)) + return true; - return 0; + add_disk_randomness(rq->rq_disk); + + return false; } /* * queue lock must be held */ -static void end_that_request_last(struct request *req, int error) +static void blk_finish_request(struct request *req, int error) { if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); @@ -1966,161 +1986,65 @@ static void end_that_request_last(struct request *req, int error) } /** - * blk_end_io - Generic end_io function to complete a request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * @bidi_bytes: number of bytes to complete @rq->next_rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %0 - we are done with this request - * %1 - this request is not freed yet, it still has pending buffers. - **/ -static int blk_end_io(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) -{ - struct request_queue *q = rq->q; - unsigned long flags = 0UL; - - if (end_that_request_data(rq, error, nr_bytes, bidi_bytes)) - return 1; - - add_disk_randomness(rq->rq_disk); - - spin_lock_irqsave(q->queue_lock, flags); - end_that_request_last(rq, error); - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -} - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - return blk_end_io(rq, error, nr_bytes, 0); -} -EXPORT_SYMBOL_GPL(blk_end_request); - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request - **/ -int __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (rq->bio && __end_that_request_first(rq, error, nr_bytes)) - return 1; - - add_disk_randomness(rq->rq_disk); - - end_that_request_last(rq, error); - - return 0; -} -EXPORT_SYMBOL_GPL(__blk_end_request); - -/** - * blk_end_bidi_request - Helper function for drivers to complete bidi request. - * @rq: the bidi request being processed + * blk_end_bidi_request - Complete a bidi request + * @rq: the request to complete * @error: %0 for success, < %0 for error * @nr_bytes: number of bytes to complete @rq * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: * Ends I/O on a number of bytes attached to @rq and @rq->next_rq. + * Drivers that supports bidi can safely call this member for any + * type of request, bidi or uni. In the later case @bidi_bytes is + * just ignored. * * Return: - * %0 - we are done with this request - * %1 - still buffers pending for this request + * %false - we are done with this request + * %true - still buffers pending for this request **/ -int blk_end_bidi_request(struct request *rq, int error, unsigned int nr_bytes, - unsigned int bidi_bytes) +bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { - return blk_end_io(rq, error, nr_bytes, bidi_bytes); + struct request_queue *q = rq->q; + unsigned long flags; + + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + return true; + + spin_lock_irqsave(q->queue_lock, flags); + blk_finish_request(rq, error); + spin_unlock_irqrestore(q->queue_lock, flags); + + return false; } EXPORT_SYMBOL_GPL(blk_end_bidi_request); /** - * end_request - end I/O on the current segment of the request - * @req: the request being processed - * @uptodate: error value or %0/%1 uptodate flag + * __blk_end_bidi_request - Complete a bidi request with queue lock held + * @rq: the request to complete + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete @rq + * @bidi_bytes: number of bytes to complete @rq->next_rq * * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. + * Identical to blk_end_bidi_request() except that queue lock is + * assumed to be locked on entry and remains so on return. * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request **/ -void end_request(struct request *req, int uptodate) +bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { - int error = 0; + if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) + return true; - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; + blk_finish_request(rq, error); - __blk_end_request(req, error, req->hard_cur_sectors << 9); + return false; } -EXPORT_SYMBOL(end_request); - -/** - * blk_update_request - Special helper function for request stacking drivers - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete @rq - * - * Description: - * Ends I/O on a number of bytes attached to @rq, but doesn't complete - * the request structure even if @rq doesn't have leftover. - * If @rq has leftover, sets it up for the next range of segments. - * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_end_request instead. - */ -void blk_update_request(struct request *rq, int error, unsigned int nr_bytes) -{ - if (!end_that_request_data(rq, error, nr_bytes, 0)) { - /* - * These members are not updated in end_that_request_data() - * when all bios are completed. - * Update them so that the request stacking driver can find - * how many bytes remain in the request later. - */ - rq->nr_sectors = rq->hard_nr_sectors = 0; - rq->current_nr_sectors = rq->hard_cur_sectors = 0; - } -} -EXPORT_SYMBOL_GPL(blk_update_request); +EXPORT_SYMBOL_GPL(__blk_end_bidi_request); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1fa9dcf9aa6..501f6845cc7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -840,27 +840,97 @@ extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); /* - * blk_end_request() and friends. - * __blk_end_request() and end_request() must be called with - * the request queue spinlock acquired. + * Request completion related functions. + * + * blk_update_request() completes given number of bytes and updates + * the request without completing it. + * + * blk_end_request() and friends. __blk_end_request() and + * end_request() must be called with the request queue spinlock + * acquired. * * Several drivers define their own end_request and call * blk_end_request() for parts of the original function. * This prevents code duplication in drivers. */ -extern int blk_end_request(struct request *rq, int error, - unsigned int nr_bytes); -extern int __blk_end_request(struct request *rq, int error, - unsigned int nr_bytes); -extern int blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes); -extern void end_request(struct request *, int); +extern bool blk_update_request(struct request *rq, int error, + unsigned int nr_bytes); +extern bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes); +extern bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, + unsigned int bidi_bytes); + +/** + * blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +static inline bool blk_end_request(struct request *rq, int error, + unsigned int nr_bytes) +{ + return blk_end_bidi_request(rq, error, nr_bytes, 0); +} + +/** + * __blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Must be called with queue lock held unlike blk_end_request(). + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +static inline bool __blk_end_request(struct request *rq, int error, + unsigned int nr_bytes) +{ + return __blk_end_bidi_request(rq, error, nr_bytes, 0); +} + +/** + * end_request - end I/O on the current segment of the request + * @rq: the request being processed + * @uptodate: error value or %0/%1 uptodate flag + * + * Description: + * Ends I/O on the current segment of a request. If that is the only + * remaining segment, the request is also completed and freed. + * + * This is a remnant of how older block drivers handled I/O completions. + * Modern drivers typically end I/O on the full request in one go, unless + * they have a residual value to account for. For that case this function + * isn't really useful, unless the residual just happens to be the + * full current segment. In other words, don't use this function in new + * code. Use blk_end_request() or __blk_end_request() to end a request. + **/ +static inline void end_request(struct request *rq, int uptodate) +{ + int error = 0; + + if (uptodate <= 0) + error = uptodate ? uptodate : -EIO; + + __blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0); +} + extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); extern void blk_abort_request(struct request *); extern void blk_abort_queue(struct request_queue *); -extern void blk_update_request(struct request *rq, int error, - unsigned int nr_bytes); /* * Access functions for manipulating queue properties From b243ddcbe9be146172baa544dadecebf156eda0e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:18 +0900 Subject: [PATCH 0614/2061] block: move rq->start_time initialization to blk_rq_init() rq->start_time was initialized in init_request_from_bio() so special requests didn't have start_time set. This has been okay as start_time has been used only for fs requests; however, there is no indication of this actually is the case or not. Set rq->start_time in blk_rq_init() and guarantee that all initialized rq's have its start_time set. This improves consistency at virtually no cost and future changes will make use of the timestamp for !bio requests. [ Impact: rq->start_time is valid for all requests ] Signed-off-by: Tejun Heo --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index 89cc05d9a7a..b84250d3019 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -134,6 +134,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) rq->cmd_len = BLK_MAX_CDB; rq->tag = -1; rq->ref_count = 1; + rq->start_time = jiffies; } EXPORT_SYMBOL(blk_rq_init); @@ -1099,7 +1100,6 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->errors = 0; req->hard_sector = req->sector = bio->bi_sector; req->ioprio = bio_prio(bio); - req->start_time = jiffies; blk_rq_bio_prep(req->q, req, bio); } From 40cbbb781d3eba5d6ac0860db078af490e5c7c6b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:19 +0900 Subject: [PATCH 0615/2061] block: implement and use [__]blk_end_request_all() There are many [__]blk_end_request() call sites which call it with full request length and expect full completion. Many of them ensure that the request actually completes by doing BUG_ON() the return value, which is awkward and error-prone. This patch adds [__]blk_end_request_all() which takes @rq and @error and fully completes the request. BUG_ON() is added to to ensure that this actually happens. Most conversions are simple but there are a few noteworthy ones. * cdrom/viocd: viocd_end_request() replaced with direct calls to __blk_end_request_all(). * s390/block/dasd: dasd_end_request() replaced with direct calls to __blk_end_request_all(). * s390/char/tape_block: tapeblock_end_request() replaced with direct calls to blk_end_request_all(). [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Russell King Cc: Stephen Rothwell Cc: Mike Miller Cc: Martin Schwidefsky Cc: Jeff Garzik Cc: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Alex Dubov Cc: James Bottomley --- arch/arm/plat-omap/mailbox.c | 11 +++------- block/blk-barrier.c | 9 ++------ block/blk-core.c | 2 +- block/elevator.c | 2 +- drivers/block/cpqarray.c | 3 +-- drivers/block/sx8.c | 3 +-- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 4 +--- drivers/cdrom/gdrom.c | 2 +- drivers/cdrom/viocd.c | 25 ++++------------------ drivers/memstick/core/mspro_block.c | 2 +- drivers/s390/block/dasd.c | 17 ++++----------- drivers/s390/char/tape_block.c | 15 ++++---------- drivers/scsi/scsi_lib.c | 2 +- include/linux/blkdev.h | 32 +++++++++++++++++++++++++++++ 15 files changed, 58 insertions(+), 73 deletions(-) diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 0abfbaa5987..cf81bad8aec 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -192,8 +192,7 @@ static void mbox_tx_work(struct work_struct *work) } spin_lock(q->queue_lock); - if (__blk_end_request(rq, 0, 0)) - BUG(); + __blk_end_request_all(rq, 0); spin_unlock(q->queue_lock); } } @@ -224,10 +223,7 @@ static void mbox_rx_work(struct work_struct *work) break; msg = (mbox_msg_t) rq->data; - - if (blk_end_request(rq, 0, 0)) - BUG(); - + blk_end_request_all(rq, 0); mbox->rxq->callback((void *)msg); } } @@ -337,8 +333,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) *p = (mbox_msg_t) rq->data; - if (blk_end_request(rq, 0, 0)) - BUG(); + blk_end_request_all(rq, 0); if (unlikely(mbox_seq_test(mbox, *p))) { pr_info("mbox: Illegal seq bit!(%08x) ignored\n", *p); diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 20b4111fa05..c8d087655ef 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -106,10 +106,7 @@ bool blk_ordered_complete_seq(struct request_queue *q, unsigned seq, int error) */ q->ordseq = 0; rq = q->orig_bar_rq; - - if (__blk_end_request(rq, q->orderr, blk_rq_bytes(rq))) - BUG(); - + __blk_end_request_all(rq, q->orderr); return true; } @@ -252,9 +249,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) * with prejudice. */ elv_dequeue_request(q, rq); - if (__blk_end_request(rq, -EOPNOTSUPP, - blk_rq_bytes(rq))) - BUG(); + __blk_end_request_all(rq, -EOPNOTSUPP); *rqp = NULL; return false; } diff --git a/block/blk-core.c b/block/blk-core.c index b84250d3019..0520cc70458 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1780,7 +1780,7 @@ struct request *elv_next_request(struct request_queue *q) break; } else if (ret == BLKPREP_KILL) { rq->cmd_flags |= REQ_QUIET; - __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); + __blk_end_request_all(rq, -EIO); } else { printk(KERN_ERR "%s: bad return=%d\n", __func__, ret); break; diff --git a/block/elevator.c b/block/elevator.c index b03b8752e18..1af5d9f04af 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -810,7 +810,7 @@ void elv_abort_queue(struct request_queue *q) rq = list_entry_rq(q->queue_head.next); rq->cmd_flags |= REQ_QUIET; trace_block_rq_abort(q, rq); - __blk_end_request(rq, -EIO, blk_rq_bytes(rq)); + __blk_end_request_all(rq, -EIO); } } EXPORT_SYMBOL(elv_abort_queue); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index ca268ca1115..488a8f4a60a 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -1024,8 +1024,7 @@ static inline void complete_command(cmdlist_t *cmd, int timeout) cmd->req.sg[i].size, ddir); DBGPX(printk("Done with %p\n", rq);); - if (__blk_end_request(rq, error, blk_rq_bytes(rq))) - BUG(); + __blk_end_request_all(rq, error); } /* diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index ff0448e4bf0..60e85bb6f79 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -749,8 +749,7 @@ static inline void carm_end_request_queued(struct carm_host *host, struct request *req = crq->rq; int rc; - rc = __blk_end_request(req, error, blk_rq_bytes(req)); - assert(rc == 0); + __blk_end_request_all(req, error); rc = carm_put_request(host, crq); assert(rc == 0); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 5d34764c8a8..50745e64414 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -62,7 +62,7 @@ static void blk_done(struct virtqueue *vq) break; } - __blk_end_request(vbr->req, error, blk_rq_bytes(vbr->req)); + __blk_end_request_all(vbr->req, error); list_del(&vbr->list); mempool_free(vbr, vblk->pool); } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 8f905089b72..cd6cfe3b51e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -551,7 +551,6 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) for (i = info->ring.rsp_cons; i != rp; i++) { unsigned long id; - int ret; bret = RING_GET_RESPONSE(&info->ring, i); id = bret->id; @@ -578,8 +577,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) dev_dbg(&info->xbdev->dev, "Bad return from blkdev data " "request: %x\n", bret->status); - ret = __blk_end_request(req, error, blk_rq_bytes(req)); - BUG_ON(ret); + __blk_end_request_all(req, error); break; default: BUG(); diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 2eecb779437..fee9a9e83fc 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -632,7 +632,7 @@ static void gdrom_readdisk_dma(struct work_struct *work) * before handling ending the request */ spin_lock(&gdrom_lock); list_del_init(&req->queuelist); - __blk_end_request(req, err, blk_rq_bytes(req)); + __blk_end_request_all(req, err); } spin_unlock(&gdrom_lock); kfree(read_command); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 13929356135..cc3efa096e1 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -291,23 +291,6 @@ static int send_request(struct request *req) return 0; } -static void viocd_end_request(struct request *req, int error) -{ - int nsectors = req->hard_nr_sectors; - - /* - * Make sure it's fully ended, and ensure that we process - * at least one sector. - */ - if (blk_pc_request(req)) - nsectors = (req->data_len + 511) >> 9; - if (!nsectors) - nsectors = 1; - - if (__blk_end_request(req, error, nsectors << 9)) - BUG(); -} - static int rwreq; static void do_viocd_request(struct request_queue *q) @@ -316,11 +299,11 @@ static void do_viocd_request(struct request_queue *q) while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) { if (!blk_fs_request(req)) - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); else if (send_request(req) < 0) { printk(VIOCD_KERN_WARNING "unable to send message to OS/400!"); - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } else rwreq++; } @@ -531,9 +514,9 @@ return_complete: "with rc %d:0x%04X: %s\n", req, event->xRc, bevent->sub_result, err->msg); - viocd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } else - viocd_end_request(req, 0); + __blk_end_request_all(req, 0); /* restart handling of incoming requests */ spin_unlock_irqrestore(&viocd_reqlock, flags); diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index de143deb06f..a41634699f8 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -826,7 +826,7 @@ static void mspro_block_submit_req(struct request_queue *q) if (msb->eject) { while ((req = elv_next_request(q)) != NULL) - __blk_end_request(req, -ENODEV, blk_rq_bytes(req)); + __blk_end_request_all(req, -ENODEV); return; } diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index d1815272c43..fabec95686b 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1613,15 +1613,6 @@ void dasd_block_clear_timer(struct dasd_block *block) del_timer(&block->timer); } -/* - * posts the buffer_cache about a finalized request - */ -static inline void dasd_end_request(struct request *req, int error) -{ - if (__blk_end_request(req, error, blk_rq_bytes(req))) - BUG(); -} - /* * Process finished error recovery ccw. */ @@ -1676,7 +1667,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "Rejecting write request %p", req); blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); continue; } cqr = basedev->discipline->build_cp(basedev, block, req); @@ -1705,7 +1696,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "on request %p", PTR_ERR(cqr), req); blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); continue; } /* @@ -1731,7 +1722,7 @@ static void __dasd_cleanup_cqr(struct dasd_ccw_req *cqr) status = cqr->block->base->discipline->free_cp(cqr, req); if (status <= 0) error = status ? status : -EIO; - dasd_end_request(req, error); + __blk_end_request_all(req, error); } /* @@ -2040,7 +2031,7 @@ static void dasd_flush_request_queue(struct dasd_block *block) spin_lock_irq(&block->request_queue_lock); while ((req = elv_next_request(block->request_queue))) { blkdev_dequeue_request(req); - dasd_end_request(req, -EIO); + __blk_end_request_all(req, -EIO); } spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index f32e89e7c4f..86596d3813b 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -73,13 +73,6 @@ tapeblock_trigger_requeue(struct tape_device *device) /* * Post finished request. */ -static void -tapeblock_end_request(struct request *req, int error) -{ - if (blk_end_request(req, error, blk_rq_bytes(req))) - BUG(); -} - static void __tapeblock_end_request(struct tape_request *ccw_req, void *data) { @@ -90,7 +83,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data) device = ccw_req->device; req = (struct request *) data; - tapeblock_end_request(req, (ccw_req->rc == 0) ? 0 : -EIO); + blk_end_request_all(req, (ccw_req->rc == 0) ? 0 : -EIO); if (ccw_req->rc == 0) /* Update position. */ device->blk_data.block_position = @@ -118,7 +111,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req) ccw_req = device->discipline->bread(device, req); if (IS_ERR(ccw_req)) { DBF_EVENT(1, "TBLOCK: bread failed\n"); - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); return PTR_ERR(ccw_req); } ccw_req->callback = __tapeblock_end_request; @@ -131,7 +124,7 @@ tapeblock_start_request(struct tape_device *device, struct request *req) * Start/enqueueing failed. No retries in * this case. */ - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); device->discipline->free_bread(ccw_req); } @@ -177,7 +170,7 @@ tapeblock_requeue(struct work_struct *work) { DBF_EVENT(1, "TBLOCK: Rejecting write request\n"); blkdev_dequeue_request(req); spin_unlock_irq(&device->blk_data.request_queue_lock); - tapeblock_end_request(req, -EIO); + blk_end_request_all(req, -EIO); spin_lock_irq(&device->blk_data.request_queue_lock); continue; } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index d1cb64ad1a3..756ac7c93de 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -922,7 +922,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) if (driver_byte(result) & DRIVER_SENSE) scsi_print_sense("", cmd); } - blk_end_request(req, -EIO, blk_rq_bytes(req)); + blk_end_request_all(req, -EIO); scsi_next_command(cmd); break; case ACTION_REPREP: diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 501f6845cc7..e33c8356b3d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -882,6 +882,22 @@ static inline bool blk_end_request(struct request *rq, int error, return blk_end_bidi_request(rq, error, nr_bytes, 0); } +/** + * blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. + */ +static inline void blk_end_request_all(struct request *rq, int error) +{ + bool pending; + + pending = blk_end_request(rq, error, blk_rq_bytes(rq)); + BUG_ON(pending); +} + /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed @@ -901,6 +917,22 @@ static inline bool __blk_end_request(struct request *rq, int error, return __blk_end_bidi_request(rq, error, nr_bytes, 0); } +/** + * __blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. Must be called with queue lock held. + */ +static inline void __blk_end_request_all(struct request *rq, int error) +{ + bool pending; + + pending = __blk_end_request(rq, error, blk_rq_bytes(rq)); + BUG_ON(pending); +} + /** * end_request - end I/O on the current segment of the request * @rq: the request being processed From f06d9a2b52e246a66b606130cea3f0d7b7be17a7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:19 +0900 Subject: [PATCH 0616/2061] block: replace end_request() with [__]blk_end_request_cur() end_request() has been kept around for backward compatibility; however, it's about time for it to go away. * There aren't too many users left. * Its use of @updtodate is pretty confusing. * In some cases, newer code ends up using mixture of end_request() and [__]blk_end_request[_all](), which is way too confusing. So, add [__]blk_end_request_cur() and replace end_request() with it. Most conversions are straightforward. Noteworthy ones are... * paride/pcd: next_request() updated to take 0/-errno instead of 1/0. * paride/pf: pf_end_request() and next_request() updated to take 0/-errno instead of 1/0. * xd: xd_readwrite() updated to return 0/-errno instead of 1/0. * mtd/mtd_blkdevs: blktrans_discard_request() updated to return 0/-errno instead of 1/0. Unnecessary local variable res initialization removed from mtd_blktrans_thread(). [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Joerg Dorchain Acked-by: Geert Uytterhoeven Acked-by: Grant Likely Acked-by: Laurent Vivier Cc: Tim Waugh Cc: Stephen Rothwell Cc: Paul Mackerras Cc: Jeremy Fitzhardinge Cc: Markus Lidel Cc: David Woodhouse Cc: Pete Zaitcev Cc: unsik Kim --- drivers/block/amiflop.c | 10 +++---- drivers/block/ataflop.c | 14 +++++----- drivers/block/hd.c | 14 +++++----- drivers/block/mg_disk.c | 16 ++++++------ drivers/block/paride/pcd.c | 12 ++++----- drivers/block/paride/pd.c | 5 ++-- drivers/block/paride/pf.c | 28 ++++++++++---------- drivers/block/ps3disk.c | 6 ++--- drivers/block/swim.c | 14 +++++----- drivers/block/swim3.c | 26 +++++++++---------- drivers/block/xd.c | 15 ++++++----- drivers/block/xen-blkfront.c | 2 +- drivers/block/xsysace.c | 4 +-- drivers/block/z2ram.c | 4 +-- drivers/cdrom/gdrom.c | 6 ++--- drivers/message/i2o/i2o_block.c | 2 +- drivers/mtd/mtd_blkdevs.c | 22 ++++++++-------- drivers/sbus/char/jsflash.c | 8 +++--- include/linux/blkdev.h | 46 ++++++++++++++++----------------- 19 files changed, 128 insertions(+), 126 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8df436ff706..b99a2a606d0 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1359,7 +1359,7 @@ static void redo_fd_request(void) #endif block = CURRENT->sector + cnt; if ((int)block > floppy->blocks) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1373,11 +1373,11 @@ static void redo_fd_request(void) if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) { printk(KERN_WARNING "do_fd_request: unknown command\n"); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } if (get_track(drive, track) == -1) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1391,7 +1391,7 @@ static void redo_fd_request(void) /* keep the drive spinning while writes are scheduled */ if (!fd_motor_on(drive)) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } /* @@ -1410,7 +1410,7 @@ static void redo_fd_request(void) CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); goto repeat; } diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 4234c11c1e4..44a8702136a 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -612,7 +612,7 @@ static void fd_error( void ) CURRENT->errors++; if (CURRENT->errors >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); } else if (CURRENT->errors == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); @@ -734,7 +734,7 @@ static void do_fd_action( int drive ) /* all sectors finished */ CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); redo_fd_request(); return; } @@ -1141,7 +1141,7 @@ static void fd_rwsec_done1(int status) /* all sectors finished */ CURRENT->nr_sectors -= CURRENT->current_nr_sectors; CURRENT->sector += CURRENT->current_nr_sectors; - end_request(CURRENT, 1); + __blk_end_request_cur(CURRENT, 0); redo_fd_request(); } return; @@ -1414,7 +1414,7 @@ repeat: if (!UD.connected) { /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1430,12 +1430,12 @@ repeat: /* user supplied disk type */ if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } type = minor2disktype[type].index; @@ -1445,7 +1445,7 @@ repeat: } if (CURRENT->sector + 1 > UDT->blocks) { - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); goto repeat; } diff --git a/drivers/block/hd.c b/drivers/block/hd.c index baaa9e486e5..5cb300b81c6 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -410,7 +410,7 @@ static void bad_rw_intr(void) if (req != NULL) { struct hd_i_struct *disk = req->rq_disk->private_data; if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); disk->special_op = disk->recalibrate = 1; } else if (req->errors % RESET_FREQ == 0) reset = 1; @@ -466,7 +466,7 @@ ok_to_read: req->buffer+512); #endif if (req->current_nr_sectors <= 0) - end_request(req, 1); + __blk_end_request_cur(req, 0); if (i > 0) { SET_HANDLER(&read_intr); return; @@ -505,7 +505,7 @@ ok_to_write: --req->current_nr_sectors; req->buffer += 512; if (!i || (req->bio && req->current_nr_sectors <= 0)) - end_request(req, 1); + __blk_end_request_cur(req, 0); if (i > 0) { SET_HANDLER(&write_intr); outsw(HD_DATA, req->buffer, 256); @@ -548,7 +548,7 @@ static void hd_times_out(unsigned long dummy) #ifdef DEBUG printk("%s: too many errors\n", name); #endif - end_request(CURRENT, 0); + __blk_end_request_cur(CURRENT, -EIO); } hd_request(); spin_unlock_irq(hd_queue->queue_lock); @@ -563,7 +563,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req) } if (disk->head > 16) { printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } disk->special_op = 0; return 1; @@ -607,7 +607,7 @@ repeat: ((block+nsect) > get_capacity(req->rq_disk))) { printk("%s: bad access: block=%d, count=%d\n", req->rq_disk->disk_name, block, nsect); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); goto repeat; } @@ -647,7 +647,7 @@ repeat: break; default: printk("unknown hd-command\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } } diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index f3898353d0a..408c2bd8a43 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -285,7 +285,7 @@ static void mg_bad_rw_intr(struct mg_host *host) if (req != NULL) if (++req->errors >= MG_MAX_ERRORS || host->error == MG_ERR_TIMEOUT) - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } static unsigned int mg_out(struct mg_host *host, @@ -351,7 +351,7 @@ static void mg_read(struct request *req) if (req->current_nr_sectors <= 0) { MG_DBG("remain : %d sects\n", remains); - end_request(req, 1); + __blk_end_request_cur(req, 0); if (remains > 0) req = elv_next_request(host->breq); } @@ -395,7 +395,7 @@ static void mg_write(struct request *req) if (req->current_nr_sectors <= 0) { MG_DBG("remain : %d sects\n", remains); - end_request(req, 1); + __blk_end_request_cur(req, 0); if (remains > 0) req = elv_next_request(host->breq); } @@ -448,7 +448,7 @@ ok_to_read: /* let know if current segment done */ if (req->current_nr_sectors <= 0) - end_request(req, 1); + __blk_end_request_cur(req, 0); /* set handler if read remains */ if (i > 0) { @@ -497,7 +497,7 @@ ok_to_write: /* let know if current segment or all done */ if (!i || (req->bio && req->current_nr_sectors <= 0)) - end_request(req, 1); + __blk_end_request_cur(req, 0); /* write 1 sector and set handler if remains */ if (i > 0) { @@ -563,7 +563,7 @@ static void mg_request_poll(struct request_queue *q) default: printk(KERN_WARNING "%s:%d unknown command\n", __func__, __LINE__); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } } @@ -617,7 +617,7 @@ static unsigned int mg_issue_req(struct request *req, default: printk(KERN_WARNING "%s:%d unknown command\n", __func__, __LINE__); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; } return MG_ERR_NONE; @@ -655,7 +655,7 @@ static void mg_request(struct request_queue *q) "%s: bad access: sector=%d, count=%d\n", req->rq_disk->disk_name, sect_num, sect_cnt); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index e91d4b4b014..9fd57c2aa46 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -735,16 +735,16 @@ static void do_pcd_request(struct request_queue * q) ps_set_intr(do_pcd_read, NULL, 0, nice); return; } else - end_request(pcd_req, 0); + __blk_end_request_cur(pcd_req, -EIO); } } -static inline void next_request(int success) +static inline void next_request(int err) { unsigned long saved_flags; spin_lock_irqsave(&pcd_lock, saved_flags); - end_request(pcd_req, success); + __blk_end_request_cur(pcd_req, err); pcd_busy = 0; do_pcd_request(pcd_queue); spin_unlock_irqrestore(&pcd_lock, saved_flags); @@ -781,7 +781,7 @@ static void pcd_start(void) if (pcd_command(pcd_current, rd_cmd, 2048, "read block")) { pcd_bufblk = -1; - next_request(0); + next_request(-EIO); return; } @@ -796,7 +796,7 @@ static void do_pcd_read(void) pcd_retries = 0; pcd_transfer(); if (!pcd_count) { - next_request(1); + next_request(0); return; } @@ -815,7 +815,7 @@ static void do_pcd_read_drq(void) return; } pcd_bufblk = -1; - next_request(0); + next_request(-EIO); return; } diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 9299455b0af..0732df4e901 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -410,7 +410,8 @@ static void run_fsm(void) pd_claimed = 0; phase = NULL; spin_lock_irqsave(&pd_lock, saved_flags); - end_request(pd_req, res); + __blk_end_request_cur(pd_req, + res == Ok ? 0 : -EIO); pd_req = elv_next_request(pd_queue); if (!pd_req) stop = 1; @@ -477,7 +478,7 @@ static int pd_next_buf(void) if (pd_count) return 0; spin_lock_irqsave(&pd_lock, saved_flags); - end_request(pd_req, 1); + __blk_end_request_cur(pd_req, 0); pd_count = pd_req->current_nr_sectors; pd_buf = pd_req->buffer; spin_unlock_irqrestore(&pd_lock, saved_flags); diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index bef3b997ba3..3871e3586d6 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -750,10 +750,10 @@ static int pf_ready(void) static struct request_queue *pf_queue; -static void pf_end_request(int uptodate) +static void pf_end_request(int err) { if (pf_req) { - end_request(pf_req, uptodate); + __blk_end_request_cur(pf_req, err); pf_req = NULL; } } @@ -773,7 +773,7 @@ repeat: pf_count = pf_req->current_nr_sectors; if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { - pf_end_request(0); + pf_end_request(-EIO); goto repeat; } @@ -788,7 +788,7 @@ repeat: pi_do_claimed(pf_current->pi, do_pf_write); else { pf_busy = 0; - pf_end_request(0); + pf_end_request(-EIO); goto repeat; } } @@ -805,7 +805,7 @@ static int pf_next_buf(void) return 1; if (!pf_count) { spin_lock_irqsave(&pf_spin_lock, saved_flags); - pf_end_request(1); + pf_end_request(0); pf_req = elv_next_request(pf_queue); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); if (!pf_req) @@ -816,12 +816,12 @@ static int pf_next_buf(void) return 0; } -static inline void next_request(int success) +static inline void next_request(int err) { unsigned long saved_flags; spin_lock_irqsave(&pf_spin_lock, saved_flags); - pf_end_request(success); + pf_end_request(err); pf_busy = 0; do_pf_request(pf_queue); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); @@ -844,7 +844,7 @@ static void do_pf_read_start(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(0); + next_request(-EIO); return; } pf_mask = STAT_DRQ; @@ -863,7 +863,7 @@ static void do_pf_read_drq(void) pi_do_claimed(pf_current->pi, do_pf_read_start); return; } - next_request(0); + next_request(-EIO); return; } pi_read_block(pf_current->pi, pf_buf, 512); @@ -871,7 +871,7 @@ static void do_pf_read_drq(void) break; } pi_disconnect(pf_current->pi); - next_request(1); + next_request(0); } static void do_pf_write(void) @@ -890,7 +890,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; } @@ -903,7 +903,7 @@ static void do_pf_write_start(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; } pi_write_block(pf_current->pi, pf_buf, 512); @@ -923,11 +923,11 @@ static void do_pf_write_done(void) pi_do_claimed(pf_current->pi, do_pf_write_start); return; } - next_request(0); + next_request(-EIO); return; } pi_disconnect(pf_current->pi); - next_request(1); + next_request(0); } static int __init pf_init(void) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index bccc42bb921..d23b54bc2f5 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -158,7 +158,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, __LINE__, op, res); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); return 0; } @@ -180,7 +180,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", __func__, __LINE__, res); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); return 0; } @@ -205,7 +205,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, break; } else { blk_dump_rq_flags(req, DEVICE_NAME " bad request"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index d22cc385693..6544a7b06bf 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -532,39 +532,39 @@ static void redo_fd_request(struct request_queue *q) fs = req->rq_disk->private_data; if (req->sector < 0 || req->sector >= fs->total_secs) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (req->current_nr_sectors == 0) { - end_request(req, 1); + __blk_end_request_cur(req, 0); continue; } if (!fs->disk_in) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rq_data_dir(req) == WRITE) { if (fs->write_protected) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } switch (rq_data_dir(req)) { case WRITE: /* NOT IMPLEMENTED */ - end_request(req, 0); + __blk_end_request_cur(req, -EIO); break; case READ: if (floppy_read_sectors(fs, req->sector, req->current_nr_sectors, req->buffer)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } req->nr_sectors -= req->current_nr_sectors; req->sector += req->current_nr_sectors; req->buffer += req->current_nr_sectors * 512; - end_request(req, 1); + __blk_end_request_cur(req, 0); break; } } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 612965307ba..5904f7b73c6 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -320,15 +320,15 @@ static void start_request(struct floppy_state *fs) #endif if (req->sector < 0 || req->sector >= fs->total_secs) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (req->current_nr_sectors == 0) { - end_request(req, 1); + __blk_end_request_cur(req, 0); continue; } if (fs->ejected) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } @@ -336,7 +336,7 @@ static void start_request(struct floppy_state *fs) if (fs->write_prot < 0) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } } @@ -508,7 +508,7 @@ static void act(struct floppy_state *fs) case do_transfer: if (fs->cur_cyl != fs->req_cyl) { if (fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; return; } @@ -540,7 +540,7 @@ static void scan_timeout(unsigned long data) out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if (fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } else { @@ -559,7 +559,7 @@ static void seek_timeout(unsigned long data) out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); printk(KERN_ERR "swim3: seek timeout\n"); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -583,7 +583,7 @@ static void settle_timeout(unsigned long data) return; } printk(KERN_ERR "swim3: seek settle timeout\n"); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -615,7 +615,7 @@ static void xfer_timeout(unsigned long data) fd_req->current_nr_sectors -= s; printk(KERN_ERR "swim3: timeout %sing sector %ld\n", (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } @@ -646,7 +646,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); } else { @@ -731,7 +731,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk("swim3: error %sing block %ld (err=%x)\n", rq_data_dir(fd_req) == WRITE? "writ": "read", (long)fd_req->sector, err); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; } } else { @@ -740,7 +740,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid); printk(KERN_ERR " state=%d, dir=%x, intr=%x, err=%x\n", fs->state, rq_data_dir(fd_req), intr, err); - end_request(fd_req, 0); + __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); break; @@ -749,7 +749,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) fd_req->current_nr_sectors -= fs->scount; fd_req->buffer += fs->scount * 512; if (fd_req->current_nr_sectors <= 0) { - end_request(fd_req, 1); + __blk_end_request_cur(fd_req, 0); fs->state = idle; } else { fs->req_sector += fs->scount; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 64b496fce98..6f6ad82ec0c 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -314,21 +314,22 @@ static void do_xd_request (struct request_queue * q) int retry; if (!blk_fs_request(req)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (block + count > get_capacity(req->rq_disk)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rw != READ && rw != WRITE) { printk("do_xd_request: unknown request\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } for (retry = 0; (retry < XD_RETRIES) && !res; retry++) res = xd_readwrite(rw, disk, req->buffer, block, count); - end_request(req, res); /* wrap up, 0 = fail, 1 = success */ + /* wrap up, 0 = success, -errno = fail */ + __blk_end_request_cur(req, res); } } @@ -418,7 +419,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ printk("xd%c: %s timeout, recalibrating drive\n",'a'+drive,(operation == READ ? "read" : "write")); xd_recalibrate(drive); spin_lock_irq(&xd_lock); - return (0); + return -EIO; case 2: if (sense[0] & 0x30) { printk("xd%c: %s - ",'a'+drive,(operation == READ ? "reading" : "writing")); @@ -439,7 +440,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ else printk(" - no valid disk address\n"); spin_lock_irq(&xd_lock); - return (0); + return -EIO; } if (xd_dma_buffer) for (i=0; i < (temp * 0x200); i++) @@ -448,7 +449,7 @@ static int xd_readwrite (u_char operation,XD_INFO *p,char *buffer,u_int block,u_ count -= temp, buffer += temp * 0x200, block += temp; } spin_lock_irq(&xd_lock); - return (1); + return 0; } /* xd_recalibrate: recalibrate a given drive and reset controller if necessary */ diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index cd6cfe3b51e..b4564479f64 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -302,7 +302,7 @@ static void do_blkif_request(struct request_queue *rq) while ((req = elv_next_request(rq)) != NULL) { info = req->rq_disk->private_data; if (!blk_fs_request(req)) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 4aecf5dc6a9..b1e1d7e5ab1 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -466,7 +466,7 @@ struct request *ace_get_next_request(struct request_queue * q) while ((req = elv_next_request(q)) != NULL) { if (blk_fs_request(req)) break; - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } return req; } @@ -494,7 +494,7 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Drop all pending requests */ while ((req = elv_next_request(ace->queue)) != NULL) - end_request(req, 0); + __blk_end_request_cur(req, -EIO); /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 80754cdd311..b66ad58a3c3 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -77,7 +77,7 @@ static void do_z2_request(struct request_queue *q) if (start + len > z2ram_size) { printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", req->sector, req->current_nr_sectors); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } while (len) { @@ -93,7 +93,7 @@ static void do_z2_request(struct request_queue *q) start += size; len -= size; } - end_request(req, 1); + __blk_end_request_cur(req, 0); } } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index fee9a9e83fc..cab2b1fb2fe 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -654,17 +654,17 @@ static void gdrom_request(struct request_queue *rq) while ((req = elv_next_request(rq)) != NULL) { if (!blk_fs_request(req)) { printk(KERN_DEBUG "GDROM: Non-fs request ignored\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } if (rq_data_dir(req) != READ) { printk(KERN_NOTICE "GDROM: Read only device -"); printk(" write request ignored\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } if (req->nr_sectors) gdrom_request_handler_dma(req); else - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } } diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index a443e136dc4..221317e6a00 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -923,7 +923,7 @@ static void i2o_block_request_fn(struct request_queue *q) break; } } else - end_request(req, 0); + __blk_end_request_cur(req, -EIO); } }; diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index a49a9c8f2cb..76c4c8d1307 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -54,33 +54,33 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && req->cmd[0] == REQ_LB_OP_DISCARD) - return !tr->discard(dev, block, nsect); + return tr->discard(dev, block, nsect); if (!blk_fs_request(req)) - return 0; + return -EIO; if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk)) - return 0; + return -EIO; switch(rq_data_dir(req)) { case READ: for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->readsect(dev, block, buf)) - return 0; - return 1; + return -EIO; + return 0; case WRITE: if (!tr->writesect) - return 0; + return -EIO; for (; nsect > 0; nsect--, block++, buf += tr->blksize) if (tr->writesect(dev, block, buf)) - return 0; - return 1; + return -EIO; + return 0; default: printk(KERN_NOTICE "Unknown request %u\n", rq_data_dir(req)); - return 0; + return -EIO; } } @@ -96,7 +96,7 @@ static int mtd_blktrans_thread(void *arg) while (!kthread_should_stop()) { struct request *req; struct mtd_blktrans_dev *dev; - int res = 0; + int res; req = elv_next_request(rq); @@ -119,7 +119,7 @@ static int mtd_blktrans_thread(void *arg) spin_lock_irq(rq->queue_lock); - end_request(req, res); + __blk_end_request_cur(req, res); } spin_unlock_irq(rq->queue_lock); diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index a85ad05e854..09617884a50 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -192,25 +192,25 @@ static void jsfd_do_request(struct request_queue *q) size_t len = req->current_nr_sectors << 9; if ((offset + len) > jdp->dsize) { - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if (rq_data_dir(req) != READ) { printk(KERN_ERR "jsfd: write\n"); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } if ((jdp->dbase & 0xff000000) != 0x20000000) { printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase); - end_request(req, 0); + __blk_end_request_cur(req, -EIO); continue; } jsfd_read(req->buffer, jdp->dbase + offset, len); - end_request(req, 1); + __blk_end_request_cur(req, 0); } } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e33c8356b3d..cfeb3c2feb2 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -845,9 +845,8 @@ extern unsigned int blk_rq_cur_bytes(struct request *rq); * blk_update_request() completes given number of bytes and updates * the request without completing it. * - * blk_end_request() and friends. __blk_end_request() and - * end_request() must be called with the request queue spinlock - * acquired. + * blk_end_request() and friends. __blk_end_request() must be called + * with the request queue spinlock acquired. * * Several drivers define their own end_request and call * blk_end_request() for parts of the original function. @@ -898,6 +897,19 @@ static inline void blk_end_request_all(struct request *rq, int error) BUG_ON(pending); } +/** + * blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. + */ +static inline void blk_end_request_cur(struct request *rq, int error) +{ + blk_end_request(rq, error, rq->hard_cur_sectors << 9); +} + /** * __blk_end_request - Helper function for drivers to complete the request. * @rq: the request being processed @@ -934,29 +946,17 @@ static inline void __blk_end_request_all(struct request *rq, int error) } /** - * end_request - end I/O on the current segment of the request - * @rq: the request being processed - * @uptodate: error value or %0/%1 uptodate flag + * __blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error * * Description: - * Ends I/O on the current segment of a request. If that is the only - * remaining segment, the request is also completed and freed. - * - * This is a remnant of how older block drivers handled I/O completions. - * Modern drivers typically end I/O on the full request in one go, unless - * they have a residual value to account for. For that case this function - * isn't really useful, unless the residual just happens to be the - * full current segment. In other words, don't use this function in new - * code. Use blk_end_request() or __blk_end_request() to end a request. - **/ -static inline void end_request(struct request *rq, int uptodate) + * Complete the current consecutively mapped chunk from @rq. Must + * be called with queue lock held. + */ +static inline void __blk_end_request_cur(struct request *rq, int error) { - int error = 0; - - if (uptodate <= 0) - error = uptodate ? uptodate : -EIO; - - __blk_end_bidi_request(rq, error, rq->hard_cur_sectors << 9, 0); + __blk_end_request(rq, error, rq->hard_cur_sectors << 9); } extern void blk_complete_request(struct request *); From ec24751a6b57e1373a12361e581b2458bc9bb791 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:20 +0900 Subject: [PATCH 0617/2061] arm-omap: don't abuse rq->data omap mailbox uses rq->data as the second opaque pointer to carry mbox_msg_t and rq->special message argument which is needed only for tx. Add and use omap_msg_tx_data struct for tx and use rq->special for mbox_msg_t for rx such that only rq->special is used as opaque pointer. [ Impact: cleanup rq->data usage, extra kmalloc in msg_send ] Signed-off-by: Tejun Heo Cc: Russell King --- arch/arm/plat-omap/mailbox.c | 43 ++++++++++++++++++++++++++---------- 1 file changed, 31 insertions(+), 12 deletions(-) diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index cf81bad8aec..538ba7541d3 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -147,24 +147,40 @@ static int __mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void *arg) return ret; } +struct omap_msg_tx_data { + mbox_msg_t msg; + void *arg; +}; + +static void omap_msg_tx_end_io(struct request *rq, int error) +{ + kfree(rq->special); + __blk_put_request(rq->q, rq); +} + int omap_mbox_msg_send(struct omap_mbox *mbox, mbox_msg_t msg, void* arg) { + struct omap_msg_tx_data *tx_data; struct request *rq; struct request_queue *q = mbox->txq->queue; - int ret = 0; + + tx_data = kmalloc(sizeof(*tx_data), GFP_ATOMIC); + if (unlikely(!tx_data)) + return -ENOMEM; rq = blk_get_request(q, WRITE, GFP_ATOMIC); if (unlikely(!rq)) { - ret = -ENOMEM; - goto fail; + kfree(tx_data); + return -ENOMEM; } - rq->data = (void *)msg; - blk_insert_request(q, rq, 0, arg); + tx_data->msg = msg; + tx_data->arg = arg; + rq->end_io = omap_msg_tx_end_io; + blk_insert_request(q, rq, 0, tx_data); schedule_work(&mbox->txq->work); - fail: - return ret; + return 0; } EXPORT_SYMBOL(omap_mbox_msg_send); @@ -178,6 +194,8 @@ static void mbox_tx_work(struct work_struct *work) struct request_queue *q = mbox->txq->queue; while (1) { + struct omap_msg_tx_data *tx_data; + spin_lock(q->queue_lock); rq = elv_next_request(q); spin_unlock(q->queue_lock); @@ -185,7 +203,9 @@ static void mbox_tx_work(struct work_struct *work) if (!rq) break; - ret = __mbox_msg_send(mbox, (mbox_msg_t) rq->data, rq->special); + tx_data = rq->special; + + ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg); if (ret) { enable_mbox_irq(mbox, IRQ_TX); return; @@ -222,7 +242,7 @@ static void mbox_rx_work(struct work_struct *work) if (!rq) break; - msg = (mbox_msg_t) rq->data; + msg = (mbox_msg_t)rq->special; blk_end_request_all(rq, 0); mbox->rxq->callback((void *)msg); } @@ -260,7 +280,6 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox) goto nomem; msg = mbox_fifo_read(mbox); - rq->data = (void *)msg; if (unlikely(mbox_seq_test(mbox, msg))) { pr_info("mbox: Illegal seq bit!(%08x)\n", msg); @@ -268,7 +287,7 @@ static void __mbox_rx_interrupt(struct omap_mbox *mbox) mbox->err_notify(); } - blk_insert_request(q, rq, 0, NULL); + blk_insert_request(q, rq, 0, (void *)msg); if (mbox->ops->type == OMAP_MBOX_TYPE1) break; } @@ -331,7 +350,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) if (!rq) break; - *p = (mbox_msg_t) rq->data; + *p = (mbox_msg_t)rq->special; blk_end_request_all(rq, 0); From 731ec497e5888c6792ad62613ae9be97eebcd7ca Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 23 Apr 2009 11:05:20 +0900 Subject: [PATCH 0618/2061] block: kill rq->data Now that all block request data transfer is done via bio, rq->data isn't used. Kill it. While at it, make the roles of rq->special and buffer clear. [ Impact: drop now unncessary field from struct request ] Signed-off-by: Tejun Heo Cc: Boaz Harrosh --- block/blk-core.c | 5 ++--- block/blk-map.c | 6 +++--- block/scsi_ioctl.c | 1 - drivers/scsi/scsi_lib.c | 1 - include/linux/blkdev.h | 5 ++--- 5 files changed, 7 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 0520cc70458..6dd180cf15d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -189,10 +189,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) (unsigned long long)rq->sector, rq->nr_sectors, rq->current_nr_sectors); - printk(KERN_INFO " bio %p, biotail %p, buffer %p, data %p, len %u\n", + printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", rq->bio, rq->biotail, - rq->buffer, rq->data, - rq->data_len); + rq->buffer, rq->data_len); if (blk_pc_request(rq)) { printk(KERN_INFO " cdb: "); diff --git a/block/blk-map.c b/block/blk-map.c index f103729b462..694fefad34e 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -156,7 +156,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, if (!bio_flagged(bio, BIO_USER_MAPPED)) rq->cmd_flags |= REQ_COPY_USER; - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; unmap_rq: blk_rq_unmap_user(bio); @@ -235,7 +235,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, blk_queue_bounce(q, &bio); bio_get(bio); blk_rq_bio_prep(q, rq, bio); - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_user_iov); @@ -313,7 +313,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, blk_rq_bio_prep(q, rq, bio); blk_queue_bounce(q, &rq->bio); - rq->buffer = rq->data = NULL; + rq->buffer = NULL; return 0; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 82a0ca2f672..bb9aa43551b 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -500,7 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, WRITE, __GFP_WAIT); rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->data = NULL; rq->data_len = 0; rq->extra_len = 0; rq->timeout = BLK_DEFAULT_SG_TIMEOUT; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 756ac7c93de..aa9fc572e45 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1088,7 +1088,6 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) return ret; } else { BUG_ON(req->data_len); - BUG_ON(req->data); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); req->buffer = NULL; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index cfeb3c2feb2..12c545e2737 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -211,8 +211,8 @@ struct request { unsigned short ioprio; - void *special; - char *buffer; + void *special; /* opaque pointer available for LLD use */ + char *buffer; /* kaddr of the current segment if available */ int tag; int errors; @@ -229,7 +229,6 @@ struct request { unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; - void *data; void *sense; unsigned long deadline; From c2553b5844b06910435e40cfab9e6f384840cb97 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Apr 2009 08:10:11 +0200 Subject: [PATCH 0619/2061] block: make blk_do_io_stat() do the full "is this rq accountable" checks We currently check for file system requests outside of blk_do_io_stat(rq), but we may as well just include it. Signed-off-by: Jens Axboe --- block/blk-core.c | 12 +++--------- block/blk.h | 9 ++++++++- 2 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 6dd180cf15d..1e3b97f0ae6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -68,7 +68,7 @@ static void drive_stat_acct(struct request *rq, int new_io) int rw = rq_data_dir(rq); int cpu; - if (!blk_fs_request(rq) || !blk_do_io_stat(rq)) + if (!blk_do_io_stat(rq)) return; cpu = part_stat_lock(); @@ -1639,10 +1639,7 @@ EXPORT_SYMBOL(blkdev_dequeue_request); static void blk_account_io_completion(struct request *req, unsigned int bytes) { - if (!blk_do_io_stat(req)) - return; - - if (blk_fs_request(req)) { + if (blk_do_io_stat(req)) { const int rw = rq_data_dir(req); struct hd_struct *part; int cpu; @@ -1656,15 +1653,12 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) static void blk_account_io_done(struct request *req) { - if (!blk_do_io_stat(req)) - return; - /* * Account IO completion. bar_rq isn't accounted as a normal * IO on queueing nor completion. Accounting the containing * request is enough. */ - if (blk_fs_request(req) && req != &req->q->bar_rq) { + if (blk_do_io_stat(req) && req != &req->q->bar_rq) { unsigned long duration = jiffies - req->start_time; const int rw = rq_data_dir(req); struct hd_struct *part; diff --git a/block/blk.h b/block/blk.h index 9b2c324e440..404c10b25ca 100644 --- a/block/blk.h +++ b/block/blk.h @@ -149,9 +149,16 @@ static inline int blk_cpu_to_group(int cpu) #endif } +/* + * Contribute to IO statistics IFF: + * + * a) it's attached to a gendisk, and + * b) the queue had IO stats enabled when this request was started, and + * c) it's a file system request + */ static inline int blk_do_io_stat(struct request *rq) { - return rq->rq_disk && blk_rq_io_stat(rq); + return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq); } #endif From c69d48540c201394d08cb4d48b905e001313d9b8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Apr 2009 08:12:19 +0200 Subject: [PATCH 0620/2061] block: include discard requests in IO accounting We currently don't do merging on discard requests, but we potentially could. If we do, then we need to include discard requests in the IO accounting, or merging would end up decrementing in_flight IO counters for an IO which never incremented them. So enable accounting for discard requests. Problem found by Nikanth Karthikesan Signed-off-by: Jens Axboe --- block/blk.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/block/blk.h b/block/blk.h index 404c10b25ca..51115599df9 100644 --- a/block/blk.h +++ b/block/blk.h @@ -158,7 +158,8 @@ static inline int blk_cpu_to_group(int cpu) */ static inline int blk_do_io_stat(struct request *rq) { - return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq); + return rq->rq_disk && blk_rq_io_stat(rq) && blk_fs_request(rq) && + blk_discard_rq(rq); } #endif From 9eb55b030c4b3227334ee4482402096cd1d1a6fe Mon Sep 17 00:00:00 2001 From: Nikanth Karthikesan Date: Mon, 27 Apr 2009 14:53:54 +0200 Subject: [PATCH 0621/2061] block: catch trying to use more bits than request->cmd_flags has Signed-off-by: Nikanth Karthikesan Signed-off-by: Jens Axboe --- block/blk-core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index 1e3b97f0ae6..394c5bd8127 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2097,6 +2097,9 @@ EXPORT_SYMBOL(kblockd_schedule_work); int __init blk_dev_init(void) { + BUILD_BUG_ON(__REQ_NR_BITS > 8 * + sizeof(((struct request *)0)->cmd_flags)); + kblockd_workqueue = create_workqueue("kblockd"); if (!kblockd_workqueue) panic("Failed to create kblockd\n"); From 9fd8d0e1bcb848257968d9a7d73ca4d890ea8bd1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:04 +0900 Subject: [PATCH 0622/2061] block: make blk_end_request_cur() return bool In the process of mindlessly copying [__]blk_end_request_all(), [__]blk_end_request_cur() ended up returning void even though they're partial completion functions. Fix it. [ Impact: fix braindead API ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 12c545e2737..3a5b1bd6582 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -903,10 +903,14 @@ static inline void blk_end_request_all(struct request *rq, int error) * * Description: * Complete the current consecutively mapped chunk from @rq. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request */ -static inline void blk_end_request_cur(struct request *rq, int error) +static inline bool blk_end_request_cur(struct request *rq, int error) { - blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return blk_end_request(rq, error, rq->hard_cur_sectors << 9); } /** @@ -952,10 +956,14 @@ static inline void __blk_end_request_all(struct request *rq, int error) * Description: * Complete the current consecutively mapped chunk from @rq. Must * be called with queue lock held. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request */ -static inline void __blk_end_request_cur(struct request *rq, int error) +static inline bool __blk_end_request_cur(struct request *rq, int error) { - __blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return __blk_end_request(rq, error, rq->hard_cur_sectors << 9); } extern void blk_complete_request(struct request *); From 4c94dece1baf320d925cedb231489c4e0358ac5a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:05 +0900 Subject: [PATCH 0623/2061] block: don't init rq fields unnecessarily blk_get_request() always returns properly zeroed requests. Don't set fields to zero/NULL unnecessarily. [ Impact: cleanup ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/scsi_ioctl.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index bb9aa43551b..58cf4560f74 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -500,8 +500,6 @@ static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, rq = blk_get_request(q, WRITE, __GFP_WAIT); rq->cmd_type = REQ_TYPE_BLOCK_PC; - rq->data_len = 0; - rq->extra_len = 0; rq->timeout = BLK_DEFAULT_SG_TIMEOUT; rq->cmd[0] = cmd; rq->cmd[4] = data; From 5b5c5d12b91cb6b2a2967f06aef35d59008dc2e7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:06 +0900 Subject: [PATCH 0624/2061] amiflop,ataflop,xd,mg_disk: clean up unnecessary stuff from block drivers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rq_data_dir() can only be READ or WRITE and rq->sector and nr_sectors are always automatically updated after partial request completion. Don't worry about rq_data_dir() not being either READ or WRITE or manually update sector and nr_sectors. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Jörg Dorchain Cc: Geert Uytterhoeven Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 7 ------- drivers/block/ataflop.c | 4 ---- drivers/block/mg_disk.c | 10 ---------- drivers/block/xd.c | 9 ++------- 4 files changed, 2 insertions(+), 28 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index b99a2a606d0..8ff95f2c0ed 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1371,11 +1371,6 @@ static void redo_fd_request(void) "0x%08lx\n", track, sector, data); #endif - if ((rq_data_dir(CURRENT) != READ) && (rq_data_dir(CURRENT) != WRITE)) { - printk(KERN_WARNING "do_fd_request: unknown command\n"); - __blk_end_request_cur(CURRENT, -EIO); - goto repeat; - } if (get_track(drive, track) == -1) { __blk_end_request_cur(CURRENT, -EIO); goto repeat; @@ -1407,8 +1402,6 @@ static void redo_fd_request(void) break; } } - CURRENT->nr_sectors -= CURRENT->current_nr_sectors; - CURRENT->sector += CURRENT->current_nr_sectors; __blk_end_request_cur(CURRENT, 0); goto repeat; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 44a8702136a..25067287211 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -732,8 +732,6 @@ static void do_fd_action( int drive ) } else { /* all sectors finished */ - CURRENT->nr_sectors -= CURRENT->current_nr_sectors; - CURRENT->sector += CURRENT->current_nr_sectors; __blk_end_request_cur(CURRENT, 0); redo_fd_request(); return; @@ -1139,8 +1137,6 @@ static void fd_rwsec_done1(int status) } else { /* all sectors finished */ - CURRENT->nr_sectors -= CURRENT->current_nr_sectors; - CURRENT->sector += CURRENT->current_nr_sectors; __blk_end_request_cur(CURRENT, 0); redo_fd_request(); } diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 408c2bd8a43..2c00ad90cd6 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -560,11 +560,6 @@ static void mg_request_poll(struct request_queue *q) case WRITE: mg_write(req); break; - default: - printk(KERN_WARNING "%s:%d unknown command\n", - __func__, __LINE__); - __blk_end_request_cur(req, -EIO); - break; } } } @@ -614,11 +609,6 @@ static unsigned int mg_issue_req(struct request *req, outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); break; - default: - printk(KERN_WARNING "%s:%d unknown command\n", - __func__, __LINE__); - __blk_end_request_cur(req, -EIO); - break; } return MG_ERR_NONE; } diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 6f6ad82ec0c..14be4c1ed1a 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -308,7 +308,6 @@ static void do_xd_request (struct request_queue * q) while ((req = elv_next_request(q)) != NULL) { unsigned block = req->sector; unsigned count = req->nr_sectors; - int rw = rq_data_dir(req); XD_INFO *disk = req->rq_disk->private_data; int res = 0; int retry; @@ -321,13 +320,9 @@ static void do_xd_request (struct request_queue * q) __blk_end_request_cur(req, -EIO); continue; } - if (rw != READ && rw != WRITE) { - printk("do_xd_request: unknown request\n"); - __blk_end_request_cur(req, -EIO); - continue; - } for (retry = 0; (retry < XD_RETRIES) && !res; retry++) - res = xd_readwrite(rw, disk, req->buffer, block, count); + res = xd_readwrite(rq_data_dir(req), disk, req->buffer, + block, count); /* wrap up, 0 = success, -errno = fail */ __blk_end_request_cur(req, res); } From cd4c34ebec155e5c10f897d9bebf618248121e70 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:07 +0900 Subject: [PATCH 0625/2061] ps3disk: simplify request completion ps3disk_interrupt() always completes requests fully but it uses rq->hard_cur_sectors for FLUSH requests for some reason. Drop them and simply use __blk_end_request_all(). [ Impact: cleanup ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index d23b54bc2f5..f6586e4d351 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -231,7 +231,6 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) struct request *req; int res, read, error; u64 tag, status; - unsigned long num_sectors; const char *op; res = lv1_storage_get_async_status(dev->sbd.dev_id, &tag, &status); @@ -261,11 +260,9 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) if (req->cmd_type == REQ_TYPE_LINUX_BLOCK && req->cmd[0] == REQ_LB_OP_FLUSH) { read = 0; - num_sectors = req->hard_cur_sectors; op = "flush"; } else { read = !rq_data_dir(req); - num_sectors = req->nr_sectors; op = read ? "read" : "write"; } if (status) { @@ -281,7 +278,7 @@ static irqreturn_t ps3disk_interrupt(int irq, void *data) } spin_lock(&priv->lock); - __blk_end_request(req, error, num_sectors << 9); + __blk_end_request_all(req, error); priv->req = NULL; ps3disk_do_request(dev, priv->queue); spin_unlock(&priv->lock); From 044208506d35bd62396c4673176e2c12393905b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:08 +0900 Subject: [PATCH 0626/2061] sunvdc: kill vdc_end_request() vdc_end_request() is a thin silly wrapper on top of __blk_end_request(). Kill it. [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: David S. Miller Signed-off-by: Jens Axboe --- drivers/block/sunvdc.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 5861e33efe6..f59887c5ffb 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -212,11 +212,6 @@ static void vdc_end_special(struct vdc_port *port, struct vio_disk_desc *desc) vdc_finish(&port->vio, -err, WAITING_FOR_GEN_CMD); } -static void vdc_end_request(struct request *req, int error, int num_sectors) -{ - __blk_end_request(req, error, num_sectors << 9); -} - static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, unsigned int index) { @@ -239,7 +234,7 @@ static void vdc_end_one(struct vdc_port *port, struct vio_dring_state *dr, rqe->req = NULL; - vdc_end_request(req, (desc->status ? -EIO : 0), desc->size >> 9); + __blk_end_request(req, (desc->status ? -EIO : 0), desc->size); if (blk_queue_stopped(port->disk->queue)) blk_start_queue(port->disk->queue); @@ -453,7 +448,7 @@ static void do_vdc_request(struct request_queue *q) blkdev_dequeue_request(req); if (__send_request(req) < 0) - vdc_end_request(req, -EIO, req->hard_nr_sectors); + __blk_end_request_all(req, -EIO); } } From 4d6c84d91d1a539ebc47d1a36a35e9390ba11fdc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:09 +0900 Subject: [PATCH 0627/2061] ubd: cleanup completion path ubd had its own block request partial completion mechanism, which is unnecessary as block layer already does it. Kill ubd_end_request() and ubd_finish() and replace them with direct call to blk_end_request(). [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Jeff Dike Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 23 +---------------------- 1 file changed, 1 insertion(+), 22 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index f934225fd8e..36ca9fa89d0 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -451,23 +451,6 @@ static void do_ubd_request(struct request_queue * q); /* Only changed by ubd_init, which is an initcall. */ static int thread_fd = -1; - -static void ubd_end_request(struct request *req, int bytes, int error) -{ - blk_end_request(req, error, bytes); -} - -/* Callable only from interrupt context - otherwise you need to do - * spin_lock_irq()/spin_lock_irqsave() */ -static inline void ubd_finish(struct request *req, int bytes) -{ - if(bytes < 0){ - ubd_end_request(req, 0, -EIO); - return; - } - ubd_end_request(req, bytes, 0); -} - static LIST_HEAD(restart); /* XXX - move this inside ubd_intr. */ @@ -475,7 +458,6 @@ static LIST_HEAD(restart); static void ubd_handler(void) { struct io_thread_req *req; - struct request *rq; struct ubd *ubd; struct list_head *list, *next_ele; unsigned long flags; @@ -492,10 +474,7 @@ static void ubd_handler(void) return; } - rq = req->req; - rq->nr_sectors -= req->length >> 9; - if(rq->nr_sectors == 0) - ubd_finish(rq, rq->hard_nr_sectors << 9); + blk_end_request(req->req, 0, req->length); kfree(req); } reactivate_fd(thread_fd, UBD_IRQ); From f81f2f7c9fee307e371f37424577d46f9eaf8692 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:10 +0900 Subject: [PATCH 0628/2061] ubd: drop unnecessary rq->sector manipulation ubd curiously updates rq->sector while issuing the request in multiple pieces. Don't do it and simply use local copy of sector. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Jeff Dike Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 36ca9fa89d0..433012764a3 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1222,7 +1222,8 @@ static void do_ubd_request(struct request_queue *q) { struct io_thread_req *io_req; struct request *req; - int n, last_sectors; + sector_t sector; + int n; while(1){ struct ubd *dev = q->queuedata; @@ -1238,11 +1239,10 @@ static void do_ubd_request(struct request_queue *q) } req = dev->request; - last_sectors = 0; + sector = req->sector; while(dev->start_sg < dev->end_sg){ struct scatterlist *sg = &dev->sg[dev->start_sg]; - req->sector += last_sectors; io_req = kmalloc(sizeof(struct io_thread_req), GFP_ATOMIC); if(io_req == NULL){ @@ -1251,10 +1251,10 @@ static void do_ubd_request(struct request_queue *q) return; } prepare_request(req, io_req, - (unsigned long long) req->sector << 9, + (unsigned long long)sector << 9, sg->offset, sg->length, sg_page(sg)); - last_sectors = sg->length >> 9; + sector += sg->length >> 9; n = os_write_file(thread_fd, &io_req, sizeof(struct io_thread_req *)); if(n != sizeof(struct io_thread_req *)){ From e091eb67af957bac4e4f7410c5d1aa263ee483a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:11 +0900 Subject: [PATCH 0629/2061] hd: clean up request completion paths hd read/write_intr() functions manually manipulate request to incrementally complete it, which block layer already supports. Simply use block layer completion routines instead of manual partial completion. While at it, clear unnecessary elv_next_request() check at the tail of read_intr(). This also makes read and write_intr() more consistent. [ Impact: cleanup ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/hd.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 5cb300b81c6..75b9ca95c4e 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -452,32 +452,25 @@ static void read_intr(void) bad_rw_intr(); hd_request(); return; + ok_to_read: req = CURRENT; insw(HD_DATA, req->buffer, 256); - req->sector++; - req->buffer += 512; - req->errors = 0; - i = --req->nr_sectors; - --req->current_nr_sectors; #ifdef DEBUG printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n", - req->rq_disk->disk_name, req->sector, req->nr_sectors, + req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1, req->buffer+512); #endif - if (req->current_nr_sectors <= 0) - __blk_end_request_cur(req, 0); - if (i > 0) { + if (__blk_end_request(req, 0, 512)) { SET_HANDLER(&read_intr); return; } + (void) inb_p(HD_STATUS); #if (HD_DELAY > 0) last_req = read_timer(); #endif - if (elv_next_request(QUEUE)) - hd_request(); - return; + hd_request(); } static void write_intr(void) @@ -499,23 +492,18 @@ static void write_intr(void) bad_rw_intr(); hd_request(); return; + ok_to_write: - req->sector++; - i = --req->nr_sectors; - --req->current_nr_sectors; - req->buffer += 512; - if (!i || (req->bio && req->current_nr_sectors <= 0)) - __blk_end_request_cur(req, 0); - if (i > 0) { + if (__blk_end_request(req, 0, 512)) { SET_HANDLER(&write_intr); outsw(HD_DATA, req->buffer, 256); - } else { -#if (HD_DELAY > 0) - last_req = read_timer(); -#endif - hd_request(); + return; } - return; + +#if (HD_DELAY > 0) + last_req = read_timer(); +#endif + hd_request(); } static void recal_intr(void) From 467ca759fc83fc35cb7d15aec0d74c62cffc4481 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:12 +0900 Subject: [PATCH 0630/2061] swim3: clean up request completion paths swim3 curiously tries to update request parameters before calling __blk_end_request() when __blk_end_request() will do it anyway, and it updates request for partial completion manually instead of using blk_update_request(). Also, it does some spurious checks on rq such as testing whether rq->sector is negative or current_nr_sectors is zero right after fetching. Drop unnecessary stuff and use standard block layer mechanisms. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Benjamin Herrenschmidt Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 31 +++++-------------------------- 1 file changed, 5 insertions(+), 26 deletions(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 5904f7b73c6..424855945b9 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -319,14 +319,10 @@ static void start_request(struct floppy_state *fs) req->errors, req->current_nr_sectors); #endif - if (req->sector < 0 || req->sector >= fs->total_secs) { + if (req->sector >= fs->total_secs) { __blk_end_request_cur(req, -EIO); continue; } - if (req->current_nr_sectors == 0) { - __blk_end_request_cur(req, 0); - continue; - } if (fs->ejected) { __blk_end_request_cur(req, -EIO); continue; @@ -593,8 +589,6 @@ static void xfer_timeout(unsigned long data) struct floppy_state *fs = (struct floppy_state *) data; struct swim3 __iomem *sw = fs->swim3; struct dbdma_regs __iomem *dr = fs->dma; - struct dbdma_cmd *cp = fs->dma_cmd; - unsigned long s; int n; fs->timeout_pending = 0; @@ -605,14 +599,6 @@ static void xfer_timeout(unsigned long data) out_8(&sw->intr_enable, 0); out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION); out_8(&sw->select, RELAX); - if (rq_data_dir(fd_req) == WRITE) - ++cp; - if (ld_le16(&cp->xfer_status) != 0) - s = fs->scount - ((ld_le16(&cp->res_count) + 511) >> 9); - else - s = 0; - fd_req->sector += s; - fd_req->current_nr_sectors -= s; printk(KERN_ERR "swim3: timeout %sing sector %ld\n", (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector); __blk_end_request_cur(fd_req, -EIO); @@ -719,9 +705,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) if (intr & ERROR_INTR) { n = fs->scount - 1 - resid / 512; if (n > 0) { - fd_req->sector += n; - fd_req->current_nr_sectors -= n; - fd_req->buffer += n * 512; + blk_update_request(fd_req, 0, n << 9); fs->req_sector += n; } if (fs->retries < 5) { @@ -745,13 +729,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) start_request(fs); break; } - fd_req->sector += fs->scount; - fd_req->current_nr_sectors -= fs->scount; - fd_req->buffer += fs->scount * 512; - if (fd_req->current_nr_sectors <= 0) { - __blk_end_request_cur(fd_req, 0); - fs->state = idle; - } else { + if (__blk_end_request(fd_req, 0, fs->scount << 9)) { fs->req_sector += fs->scount; if (fs->req_sector > fs->secpertrack) { fs->req_sector -= fs->secpertrack; @@ -761,7 +739,8 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) } } act(fs); - } + } else + fs->state = idle; } if (fs->state == idle) start_request(fs); From e138b4e08ef65771000fbe6d93d67e3960ff862b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:13 +0900 Subject: [PATCH 0631/2061] swim: clean up request completion paths swim curiously tries to update request parameters before calling __blk_end_request() when __blk_end_request() will do it anyway and unnecessarily checks whether current_nr_sectors is zero right after fetching. Drop unnecessary stuff and use standard block layer mechanisms. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Laurent Vivier Signed-off-by: Jens Axboe --- drivers/block/swim.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 6544a7b06bf..97ef4266c4c 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -535,10 +535,6 @@ static void redo_fd_request(struct request_queue *q) __blk_end_request_cur(req, -EIO); continue; } - if (req->current_nr_sectors == 0) { - __blk_end_request_cur(req, 0); - continue; - } if (!fs->disk_in) { __blk_end_request_cur(req, -EIO); continue; @@ -561,9 +557,6 @@ static void redo_fd_request(struct request_queue *q) __blk_end_request_cur(req, -EIO); continue; } - req->nr_sectors -= req->current_nr_sectors; - req->sector += req->current_nr_sectors; - req->buffer += req->current_nr_sectors * 512; __blk_end_request_cur(req, 0); break; } From eec9462088a26c046d4db3100796a340a50890b8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:14 +0900 Subject: [PATCH 0632/2061] mg_disk: fold mg_disk.h into mg_disk.c include/linux/mg_disk.h is used only by drivers/block/mg_disk.c. No reason to put it in a separate header. Fold it into mg_disk.c. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 186 +++++++++++++++++++++++++++++++++++- include/linux/mg_disk.h | 206 ---------------------------------------- 2 files changed, 185 insertions(+), 207 deletions(-) delete mode 100644 include/linux/mg_disk.h diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 2c00ad90cd6..2c6127ee834 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -22,10 +22,194 @@ #include #include #include -#include #define MG_RES_SEC (CONFIG_MG_DISK_RES << 1) +/* name for block device */ +#define MG_DISK_NAME "mgd" +/* name for platform device */ +#define MG_DEV_NAME "mg_disk" + +#define MG_DISK_MAJ 0 +#define MG_DISK_MAX_PART 16 +#define MG_SECTOR_SIZE 512 +#define MG_MAX_SECTS 256 + +/* Register offsets */ +#define MG_BUFF_OFFSET 0x8000 +#define MG_STORAGE_BUFFER_SIZE 0x200 +#define MG_REG_OFFSET 0xC000 +#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ +#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ +#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) +#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) +#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) +#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) +#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) +#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ +#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ +#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) +#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) + +/* "Drive Select/Head Register" bit values */ +#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ +#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) +#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) + + +/* "Device Control Register" bit values */ +#define MG_REG_CTRL_INTR_ENABLE 0x0 +#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) +#define MG_REG_CTRL_RESET (0x1<<2) +#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 +#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) +#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 +#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) +#define MG_REG_CTRL_DPD_DISABLE 0x0 +#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) + +/* Status register bit */ +/* error bit in status register */ +#define MG_REG_STATUS_BIT_ERROR 0x01 +/* corrected error in status register */ +#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 +/* data request bit in status register */ +#define MG_REG_STATUS_BIT_DATA_REQ 0x08 +/* DSC - Drive Seek Complete */ +#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 +/* DWF - Drive Write Fault */ +#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 +#define MG_REG_STATUS_BIT_READY 0x40 +#define MG_REG_STATUS_BIT_BUSY 0x80 + +/* handy status */ +#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) +#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ + (MG_REG_STATUS_BIT_BUSY | \ + MG_REG_STATUS_BIT_WRITE_FAULT | \ + MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) + +/* Error register */ +#define MG_REG_ERR_AMNF 0x01 +#define MG_REG_ERR_ABRT 0x04 +#define MG_REG_ERR_IDNF 0x10 +#define MG_REG_ERR_UNC 0x40 +#define MG_REG_ERR_BBK 0x80 + +/* error code for others */ +#define MG_ERR_NONE 0 +#define MG_ERR_TIMEOUT 0x100 +#define MG_ERR_INIT_STAT 0x101 +#define MG_ERR_TRANSLATION 0x102 +#define MG_ERR_CTRL_RST 0x103 +#define MG_ERR_INV_STAT 0x104 +#define MG_ERR_RSTOUT 0x105 + +#define MG_MAX_ERRORS 6 /* Max read/write errors */ + +/* command */ +#define MG_CMD_RD 0x20 +#define MG_CMD_WR 0x30 +#define MG_CMD_SLEEP 0x99 +#define MG_CMD_WAKEUP 0xC3 +#define MG_CMD_ID 0xEC +#define MG_CMD_WR_CONF 0x3C +#define MG_CMD_RD_CONF 0x40 + +/* operation mode */ +#define MG_OP_CASCADE (1 << 0) +#define MG_OP_CASCADE_SYNC_RD (1 << 1) +#define MG_OP_CASCADE_SYNC_WR (1 << 2) +#define MG_OP_INTERLEAVE (1 << 3) + +/* synchronous */ +#define MG_BURST_LAT_4 (3 << 4) +#define MG_BURST_LAT_5 (4 << 4) +#define MG_BURST_LAT_6 (5 << 4) +#define MG_BURST_LAT_7 (6 << 4) +#define MG_BURST_LAT_8 (7 << 4) +#define MG_BURST_LEN_4 (1 << 1) +#define MG_BURST_LEN_8 (2 << 1) +#define MG_BURST_LEN_16 (3 << 1) +#define MG_BURST_LEN_32 (4 << 1) +#define MG_BURST_LEN_CONT (0 << 1) + +/* timeout value (unit: ms) */ +#define MG_TMAX_CONF_TO_CMD 1 +#define MG_TMAX_WAIT_RD_DRQ 10 +#define MG_TMAX_WAIT_WR_DRQ 500 +#define MG_TMAX_RST_TO_BUSY 10 +#define MG_TMAX_HDRST_TO_RDY 500 +#define MG_TMAX_SWRST_TO_RDY 500 +#define MG_TMAX_RSTOUT 3000 + +/* device attribution */ +/* use mflash as boot device */ +#define MG_BOOT_DEV (1 << 0) +/* use mflash as storage device */ +#define MG_STORAGE_DEV (1 << 1) +/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ +#define MG_STORAGE_DEV_SKIP_RST (1 << 2) + +#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) + +/* names of GPIO resource */ +#define MG_RST_PIN "mg_rst" +/* except MG_BOOT_DEV, reset-out pin should be assigned */ +#define MG_RSTOUT_PIN "mg_rstout" + +/* private driver data */ +struct mg_drv_data { + /* disk resource */ + u32 use_polling; + + /* device attribution */ + u32 dev_attr; + + /* internally used */ + struct mg_host *host; +}; + +/* main structure for mflash driver */ +struct mg_host { + struct device *dev; + + struct request_queue *breq; + spinlock_t lock; + struct gendisk *gd; + + struct timer_list timer; + void (*mg_do_intr) (struct mg_host *); + + u16 id[ATA_ID_WORDS]; + + u16 cyls; + u16 heads; + u16 sectors; + u32 n_sectors; + u32 nres_sectors; + + void __iomem *dev_base; + unsigned int irq; + unsigned int rst; + unsigned int rstout; + + u32 major; + u32 error; +}; + +/* + * Debugging macro and defines + */ +#undef DO_MG_DEBUG +#ifdef DO_MG_DEBUG +# define MG_DBG(fmt, args...) \ + printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) +#else /* CONFIG_MG_DEBUG */ +# define MG_DBG(fmt, args...) do { } while (0) +#endif /* CONFIG_MG_DEBUG */ + static void mg_request(struct request_queue *); static void mg_dump_status(const char *msg, unsigned int stat, diff --git a/include/linux/mg_disk.h b/include/linux/mg_disk.h deleted file mode 100644 index 1f76b1ebf62..00000000000 --- a/include/linux/mg_disk.h +++ /dev/null @@ -1,206 +0,0 @@ -/* - * include/linux/mg_disk.c - * - * Support for the mGine m[g]flash IO mode. - * Based on legacy hd.c - * - * (c) 2008 mGine Co.,LTD - * (c) 2008 unsik Kim - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. - */ - -#ifndef __MG_DISK_H__ -#define __MG_DISK_H__ - -#include -#include - -/* name for block device */ -#define MG_DISK_NAME "mgd" -/* name for platform device */ -#define MG_DEV_NAME "mg_disk" - -#define MG_DISK_MAJ 0 -#define MG_DISK_MAX_PART 16 -#define MG_SECTOR_SIZE 512 -#define MG_MAX_SECTS 256 - -/* Register offsets */ -#define MG_BUFF_OFFSET 0x8000 -#define MG_STORAGE_BUFFER_SIZE 0x200 -#define MG_REG_OFFSET 0xC000 -#define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ -#define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ -#define MG_REG_SECT_CNT (MG_REG_OFFSET + 4) -#define MG_REG_SECT_NUM (MG_REG_OFFSET + 6) -#define MG_REG_CYL_LOW (MG_REG_OFFSET + 8) -#define MG_REG_CYL_HIGH (MG_REG_OFFSET + 0xA) -#define MG_REG_DRV_HEAD (MG_REG_OFFSET + 0xC) -#define MG_REG_COMMAND (MG_REG_OFFSET + 0xE) /* write case */ -#define MG_REG_STATUS (MG_REG_OFFSET + 0xE) /* read case */ -#define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) -#define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) - -/* "Drive Select/Head Register" bit values */ -#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ -#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) - - -/* "Device Control Register" bit values */ -#define MG_REG_CTRL_INTR_ENABLE 0x0 -#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) -#define MG_REG_CTRL_RESET (0x1<<2) -#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 -#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) -#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 -#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) -#define MG_REG_CTRL_DPD_DISABLE 0x0 -#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) - -/* Status register bit */ -/* error bit in status register */ -#define MG_REG_STATUS_BIT_ERROR 0x01 -/* corrected error in status register */ -#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 -/* data request bit in status register */ -#define MG_REG_STATUS_BIT_DATA_REQ 0x08 -/* DSC - Drive Seek Complete */ -#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 -/* DWF - Drive Write Fault */ -#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 -#define MG_REG_STATUS_BIT_READY 0x40 -#define MG_REG_STATUS_BIT_BUSY 0x80 - -/* handy status */ -#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) -#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ - (MG_REG_STATUS_BIT_BUSY | \ - MG_REG_STATUS_BIT_WRITE_FAULT | \ - MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) - -/* Error register */ -#define MG_REG_ERR_AMNF 0x01 -#define MG_REG_ERR_ABRT 0x04 -#define MG_REG_ERR_IDNF 0x10 -#define MG_REG_ERR_UNC 0x40 -#define MG_REG_ERR_BBK 0x80 - -/* error code for others */ -#define MG_ERR_NONE 0 -#define MG_ERR_TIMEOUT 0x100 -#define MG_ERR_INIT_STAT 0x101 -#define MG_ERR_TRANSLATION 0x102 -#define MG_ERR_CTRL_RST 0x103 -#define MG_ERR_INV_STAT 0x104 -#define MG_ERR_RSTOUT 0x105 - -#define MG_MAX_ERRORS 6 /* Max read/write errors */ - -/* command */ -#define MG_CMD_RD 0x20 -#define MG_CMD_WR 0x30 -#define MG_CMD_SLEEP 0x99 -#define MG_CMD_WAKEUP 0xC3 -#define MG_CMD_ID 0xEC -#define MG_CMD_WR_CONF 0x3C -#define MG_CMD_RD_CONF 0x40 - -/* operation mode */ -#define MG_OP_CASCADE (1 << 0) -#define MG_OP_CASCADE_SYNC_RD (1 << 1) -#define MG_OP_CASCADE_SYNC_WR (1 << 2) -#define MG_OP_INTERLEAVE (1 << 3) - -/* synchronous */ -#define MG_BURST_LAT_4 (3 << 4) -#define MG_BURST_LAT_5 (4 << 4) -#define MG_BURST_LAT_6 (5 << 4) -#define MG_BURST_LAT_7 (6 << 4) -#define MG_BURST_LAT_8 (7 << 4) -#define MG_BURST_LEN_4 (1 << 1) -#define MG_BURST_LEN_8 (2 << 1) -#define MG_BURST_LEN_16 (3 << 1) -#define MG_BURST_LEN_32 (4 << 1) -#define MG_BURST_LEN_CONT (0 << 1) - -/* timeout value (unit: ms) */ -#define MG_TMAX_CONF_TO_CMD 1 -#define MG_TMAX_WAIT_RD_DRQ 10 -#define MG_TMAX_WAIT_WR_DRQ 500 -#define MG_TMAX_RST_TO_BUSY 10 -#define MG_TMAX_HDRST_TO_RDY 500 -#define MG_TMAX_SWRST_TO_RDY 500 -#define MG_TMAX_RSTOUT 3000 - -/* device attribution */ -/* use mflash as boot device */ -#define MG_BOOT_DEV (1 << 0) -/* use mflash as storage device */ -#define MG_STORAGE_DEV (1 << 1) -/* same as MG_STORAGE_DEV, but bootloader already done reset sequence */ -#define MG_STORAGE_DEV_SKIP_RST (1 << 2) - -#define MG_DEV_MASK (MG_BOOT_DEV | MG_STORAGE_DEV | MG_STORAGE_DEV_SKIP_RST) - -/* names of GPIO resource */ -#define MG_RST_PIN "mg_rst" -/* except MG_BOOT_DEV, reset-out pin should be assigned */ -#define MG_RSTOUT_PIN "mg_rstout" - -/* private driver data */ -struct mg_drv_data { - /* disk resource */ - u32 use_polling; - - /* device attribution */ - u32 dev_attr; - - /* internally used */ - struct mg_host *host; -}; - -/* main structure for mflash driver */ -struct mg_host { - struct device *dev; - - struct request_queue *breq; - spinlock_t lock; - struct gendisk *gd; - - struct timer_list timer; - void (*mg_do_intr) (struct mg_host *); - - u16 id[ATA_ID_WORDS]; - - u16 cyls; - u16 heads; - u16 sectors; - u32 n_sectors; - u32 nres_sectors; - - void __iomem *dev_base; - unsigned int irq; - unsigned int rst; - unsigned int rstout; - - u32 major; - u32 error; -}; - -/* - * Debugging macro and defines - */ -#undef DO_MG_DEBUG -#ifdef DO_MG_DEBUG -# define MG_DBG(fmt, args...) \ - printk(KERN_DEBUG "%s:%d "fmt, __func__, __LINE__, ##args) -#else /* CONFIG_MG_DEBUG */ -# define MG_DBG(fmt, args...) do { } while (0) -#endif /* CONFIG_MG_DEBUG */ - -#endif From a03bb5a32fff4aa23f081a3cff7e98d4084104cd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 28 Apr 2009 13:06:15 +0900 Subject: [PATCH 0633/2061] mg_disk: clean up request completion paths mg_disk implements its own partial completion. Convert to standard block layer partial completion. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 117 ++++++++++++---------------------------- 1 file changed, 33 insertions(+), 84 deletions(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 2c6127ee834..1a4cc968cfe 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -503,95 +503,68 @@ static unsigned int mg_out(struct mg_host *host, static void mg_read(struct request *req) { - u32 remains, j; + u32 j; struct mg_host *host = req->rq_disk->private_data; - remains = req->nr_sectors; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) != MG_ERR_NONE) mg_bad_rw_intr(host); MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - remains, req->sector, req->buffer); + req->nr_sectors, req->sector, req->buffer); + + do { + u16 *buff = (u16 *)req->buffer; - while (remains) { if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) { - *(u16 *)req->buffer = - inw((unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - req->buffer += 2; - } - - req->sector++; - req->errors = 0; - remains = --req->nr_sectors; - --req->current_nr_sectors; - - if (req->current_nr_sectors <= 0) { - MG_DBG("remain : %d sects\n", remains); - __blk_end_request_cur(req, 0); - if (remains > 0) - req = elv_next_request(host->breq); - } + for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) + *buff++ = inw((unsigned long)host->dev_base + + MG_BUFF_OFFSET + (j << 1)); outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - } + } while (__blk_end_request(req, 0, MG_SECTOR_SIZE)); } static void mg_write(struct request *req) { - u32 remains, j; + u32 j; struct mg_host *host = req->rq_disk->private_data; - remains = req->nr_sectors; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - remains, req->sector, req->buffer); - while (remains) { + req->nr_sectors, req->sector, req->buffer); + + do { + u16 *buff = (u16 *)req->buffer; + if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) { - outw(*(u16 *)req->buffer, - (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - req->buffer += 2; - } - req->sector++; - remains = --req->nr_sectors; - --req->current_nr_sectors; - - if (req->current_nr_sectors <= 0) { - MG_DBG("remain : %d sects\n", remains); - __blk_end_request_cur(req, 0); - if (remains > 0) - req = elv_next_request(host->breq); - } + for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) + outw(*buff++, (unsigned long)host->dev_base + + MG_BUFF_OFFSET + (j << 1)); outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - } + } while (__blk_end_request(req, 0, MG_SECTOR_SIZE)); } static void mg_read_intr(struct mg_host *host) { u32 i; + u16 *buff; struct request *req; /* check status */ @@ -612,39 +585,24 @@ static void mg_read_intr(struct mg_host *host) ok_to_read: /* get current segment of request */ req = elv_next_request(host->breq); + buff = (u16 *)req->buffer; /* read 1 sector */ - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) { - *(u16 *)req->buffer = - inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); - req->buffer += 2; - } + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); - /* manipulate request */ MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", req->sector, req->nr_sectors - 1, req->buffer); - req->sector++; - req->errors = 0; - i = --req->nr_sectors; - --req->current_nr_sectors; - - /* let know if current segment done */ - if (req->current_nr_sectors <= 0) - __blk_end_request_cur(req, 0); - - /* set handler if read remains */ - if (i > 0) { - host->mg_do_intr = mg_read_intr; - mod_timer(&host->timer, jiffies + 3 * HZ); - } - /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - /* goto next request */ - if (!i) + if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) { + /* set handler if read remains */ + host->mg_do_intr = mg_read_intr; + mod_timer(&host->timer, jiffies + 3 * HZ); + } else /* goto next request */ mg_request(host->breq); } @@ -653,6 +611,7 @@ static void mg_write_intr(struct mg_host *host) u32 i, j; u16 *buff; struct request *req; + bool rem; /* get current segment of request */ req = elv_next_request(host->breq); @@ -673,18 +632,8 @@ static void mg_write_intr(struct mg_host *host) return; ok_to_write: - /* manipulate request */ - req->sector++; - i = --req->nr_sectors; - --req->current_nr_sectors; - req->buffer += MG_SECTOR_SIZE; - - /* let know if current segment or all done */ - if (!i || (req->bio && req->current_nr_sectors <= 0)) - __blk_end_request_cur(req, 0); - - /* write 1 sector and set handler if remains */ - if (i > 0) { + if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) { + /* write 1 sector and set handler if remains */ buff = (u16 *)req->buffer; for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) { outw(*buff, (unsigned long)host->dev_base + @@ -700,7 +649,7 @@ ok_to_write: /* send write confirm */ outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - if (!i) + if (!rem) mg_request(host->breq); } From 8a11a789c39cd6f61de4243234cf44a46c23ffd0 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Apr 2009 13:06:16 +0900 Subject: [PATCH 0634/2061] mg_disk: fix dependency on libata Add local copies of ata_id_string() and ata_id_c_string() to mg_disk so there is no need for the driver to depend on ATA and SCSI. [ Impact: break dependency on libata by copying ata id string functions ] Cc: unsik Kim Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/Kconfig | 2 +- drivers/block/mg_disk.c | 44 +++++++++++++++++++++++++++++++++++++---- 2 files changed, 41 insertions(+), 5 deletions(-) diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig index ddea8e485cc..f42fa50d355 100644 --- a/drivers/block/Kconfig +++ b/drivers/block/Kconfig @@ -412,7 +412,7 @@ config ATA_OVER_ETH config MG_DISK tristate "mGine mflash, gflash support" - depends on ARM && ATA && GPIOLIB + depends on ARM && GPIOLIB help mGine mFlash(gFlash) block device driver diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1a4cc968cfe..8e53cae9fa9 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -357,6 +357,42 @@ static irqreturn_t mg_irq(int irq, void *dev_id) return IRQ_HANDLED; } +/* local copy of ata_id_string() */ +static void mg_id_string(const u16 *id, unsigned char *s, + unsigned int ofs, unsigned int len) +{ + unsigned int c; + + BUG_ON(len & 1); + + while (len > 0) { + c = id[ofs] >> 8; + *s = c; + s++; + + c = id[ofs] & 0xff; + *s = c; + s++; + + ofs++; + len -= 2; + } +} + +/* local copy of ata_id_c_string() */ +static void mg_id_c_string(const u16 *id, unsigned char *s, + unsigned int ofs, unsigned int len) +{ + unsigned char *p; + + mg_id_string(id, s, ofs, len - 1); + + p = s + strnlen(s, len - 1); + while (p > s && p[-1] == ' ') + p--; + *p = '\0'; +} + static int mg_get_disk_id(struct mg_host *host) { u32 i; @@ -403,9 +439,9 @@ static int mg_get_disk_id(struct mg_host *host) host->n_sectors -= host->nres_sectors; } - ata_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); - ata_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); - ata_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); + mg_id_c_string(id, fwrev, ATA_ID_FW_REV, sizeof(fwrev)); + mg_id_c_string(id, model, ATA_ID_PROD, sizeof(model)); + mg_id_c_string(id, serial, ATA_ID_SERNO, sizeof(serial)); printk(KERN_INFO "mg_disk: model: %s\n", model); printk(KERN_INFO "mg_disk: firm: %.8s\n", fwrev); printk(KERN_INFO "mg_disk: serial: %s\n", serial); From f68adec3c7155a8bbf32a90cb4c4d0737df045d9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Tue, 28 Apr 2009 13:06:17 +0900 Subject: [PATCH 0635/2061] mg_disk: use defines from While at it: - remove MG_REG_HEAD_MUST_BE_ON define - remove MG_REG_CTRL_INTR_ENABLE define - remove MG_REG_HEAD_LBA_MODE define - remove unused defines Cc: unsik Kim Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 144 ++++++++++++---------------------------- 1 file changed, 43 insertions(+), 101 deletions(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 8e53cae9fa9..71e56cc28ca 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -51,51 +51,10 @@ #define MG_REG_DRV_CTRL (MG_REG_OFFSET + 0x10) #define MG_REG_BURST_CTRL (MG_REG_OFFSET + 0x12) -/* "Drive Select/Head Register" bit values */ -#define MG_REG_HEAD_MUST_BE_ON 0xA0 /* These 2 bits are always on */ -#define MG_REG_HEAD_DRIVE_MASTER (0x00 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_DRIVE_SLAVE (0x10 | MG_REG_HEAD_MUST_BE_ON) -#define MG_REG_HEAD_LBA_MODE (0x40 | MG_REG_HEAD_MUST_BE_ON) - - -/* "Device Control Register" bit values */ -#define MG_REG_CTRL_INTR_ENABLE 0x0 -#define MG_REG_CTRL_INTR_DISABLE (0x1<<1) -#define MG_REG_CTRL_RESET (0x1<<2) -#define MG_REG_CTRL_INTR_POLA_ACTIVE_HIGH 0x0 -#define MG_REG_CTRL_INTR_POLA_ACTIVE_LOW (0x1<<4) -#define MG_REG_CTRL_DPD_POLA_ACTIVE_LOW 0x0 -#define MG_REG_CTRL_DPD_POLA_ACTIVE_HIGH (0x1<<5) -#define MG_REG_CTRL_DPD_DISABLE 0x0 -#define MG_REG_CTRL_DPD_ENABLE (0x1<<6) - -/* Status register bit */ -/* error bit in status register */ -#define MG_REG_STATUS_BIT_ERROR 0x01 -/* corrected error in status register */ -#define MG_REG_STATUS_BIT_CORRECTED_ERROR 0x04 -/* data request bit in status register */ -#define MG_REG_STATUS_BIT_DATA_REQ 0x08 -/* DSC - Drive Seek Complete */ -#define MG_REG_STATUS_BIT_SEEK_DONE 0x10 -/* DWF - Drive Write Fault */ -#define MG_REG_STATUS_BIT_WRITE_FAULT 0x20 -#define MG_REG_STATUS_BIT_READY 0x40 -#define MG_REG_STATUS_BIT_BUSY 0x80 - /* handy status */ -#define MG_STAT_READY (MG_REG_STATUS_BIT_READY | MG_REG_STATUS_BIT_SEEK_DONE) -#define MG_READY_OK(s) (((s) & (MG_STAT_READY | \ - (MG_REG_STATUS_BIT_BUSY | \ - MG_REG_STATUS_BIT_WRITE_FAULT | \ - MG_REG_STATUS_BIT_ERROR))) == MG_STAT_READY) - -/* Error register */ -#define MG_REG_ERR_AMNF 0x01 -#define MG_REG_ERR_ABRT 0x04 -#define MG_REG_ERR_IDNF 0x10 -#define MG_REG_ERR_UNC 0x40 -#define MG_REG_ERR_BBK 0x80 +#define MG_STAT_READY (ATA_DRDY | ATA_DSC) +#define MG_READY_OK(s) (((s) & (MG_STAT_READY | (ATA_BUSY | ATA_DF | \ + ATA_ERR))) == MG_STAT_READY) /* error code for others */ #define MG_ERR_NONE 0 @@ -225,41 +184,39 @@ static void mg_dump_status(const char *msg, unsigned int stat, } printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); - if (stat & MG_REG_STATUS_BIT_BUSY) + if (stat & ATA_BUSY) printk("Busy "); - if (stat & MG_REG_STATUS_BIT_READY) + if (stat & ATA_DRDY) printk("DriveReady "); - if (stat & MG_REG_STATUS_BIT_WRITE_FAULT) + if (stat & ATA_DF) printk("WriteFault "); - if (stat & MG_REG_STATUS_BIT_SEEK_DONE) + if (stat & ATA_DSC) printk("SeekComplete "); - if (stat & MG_REG_STATUS_BIT_DATA_REQ) + if (stat & ATA_DRQ) printk("DataRequest "); - if (stat & MG_REG_STATUS_BIT_CORRECTED_ERROR) + if (stat & ATA_CORR) printk("CorrectedError "); - if (stat & MG_REG_STATUS_BIT_ERROR) + if (stat & ATA_ERR) printk("Error "); printk("}\n"); - if ((stat & MG_REG_STATUS_BIT_ERROR) == 0) { + if ((stat & ATA_ERR) == 0) { host->error = 0; } else { host->error = inb((unsigned long)host->dev_base + MG_REG_ERROR); printk(KERN_ERR "%s: %s: error=0x%02x { ", name, msg, host->error & 0xff); - if (host->error & MG_REG_ERR_BBK) + if (host->error & ATA_BBK) printk("BadSector "); - if (host->error & MG_REG_ERR_UNC) + if (host->error & ATA_UNC) printk("UncorrectableError "); - if (host->error & MG_REG_ERR_IDNF) + if (host->error & ATA_IDNF) printk("SectorIdNotFound "); - if (host->error & MG_REG_ERR_ABRT) + if (host->error & ATA_ABORTED) printk("DriveStatusError "); - if (host->error & MG_REG_ERR_AMNF) + if (host->error & ATA_AMNF) printk("AddrMarkNotFound "); printk("}"); - if (host->error & - (MG_REG_ERR_BBK | MG_REG_ERR_UNC | - MG_REG_ERR_IDNF | MG_REG_ERR_AMNF)) { + if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) { if (host->breq) { req = elv_next_request(host->breq); if (req) @@ -284,12 +241,12 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) do { cur_jiffies = jiffies; - if (status & MG_REG_STATUS_BIT_BUSY) { - if (expect == MG_REG_STATUS_BIT_BUSY) + if (status & ATA_BUSY) { + if (expect == ATA_BUSY) break; } else { /* Check the error condition! */ - if (status & MG_REG_STATUS_BIT_ERROR) { + if (status & ATA_ERR) { mg_dump_status("mg_wait", status, host); break; } @@ -298,8 +255,8 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) if (MG_READY_OK(status)) break; - if (expect == MG_REG_STATUS_BIT_DATA_REQ) - if (status & MG_REG_STATUS_BIT_DATA_REQ) + if (expect == ATA_DRQ) + if (status & ATA_DRQ) break; } if (!msec) { @@ -404,12 +361,10 @@ static int mg_get_disk_id(struct mg_host *host) char serial[ATA_ID_SERNO_LEN + 1]; if (!prv_data->use_polling) - outb(MG_REG_CTRL_INTR_DISABLE, - (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); outb(MG_CMD_ID, (unsigned long)host->dev_base + MG_REG_COMMAND); - err = mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_RD_DRQ); + err = mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ); if (err) return err; @@ -449,8 +404,7 @@ static int mg_get_disk_id(struct mg_host *host) host->n_sectors, host->nres_sectors); if (!prv_data->use_polling) - outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); return err; } @@ -464,7 +418,7 @@ static int mg_disk_init(struct mg_host *host) /* hdd rst low */ gpio_set_value(host->rst, 0); - err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY); + err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); if (err) return err; @@ -475,17 +429,14 @@ static int mg_disk_init(struct mg_host *host) return err; /* soft reset on */ - outb(MG_REG_CTRL_RESET | - (prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE : - MG_REG_CTRL_INTR_ENABLE), + outb(ATA_SRST | (prv_data->use_polling ? ATA_NIEN : 0), (unsigned long)host->dev_base + MG_REG_DRV_CTRL); - err = mg_wait(host, MG_REG_STATUS_BIT_BUSY, MG_TMAX_RST_TO_BUSY); + err = mg_wait(host, ATA_BUSY, MG_TMAX_RST_TO_BUSY); if (err) return err; /* soft reset off */ - outb(prv_data->use_polling ? MG_REG_CTRL_INTR_DISABLE : - MG_REG_CTRL_INTR_ENABLE, + outb(prv_data->use_polling ? ATA_NIEN : 0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); err = mg_wait(host, MG_STAT_READY, MG_TMAX_SWRST_TO_RDY); if (err) @@ -531,7 +482,7 @@ static unsigned int mg_out(struct mg_host *host, MG_REG_CYL_LOW); outb((u8)(sect_num >> 16), (unsigned long)host->dev_base + MG_REG_CYL_HIGH); - outb((u8)((sect_num >> 24) | MG_REG_HEAD_LBA_MODE), + outb((u8)((sect_num >> 24) | ATA_LBA | ATA_DEVICE_OBS), (unsigned long)host->dev_base + MG_REG_DRV_HEAD); outb(cmd, (unsigned long)host->dev_base + MG_REG_COMMAND); return MG_ERR_NONE; @@ -552,8 +503,8 @@ static void mg_read(struct request *req) do { u16 *buff = (u16 *)req->buffer; - if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, - MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { + if (mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } @@ -583,8 +534,7 @@ static void mg_write(struct request *req) do { u16 *buff = (u16 *)req->buffer; - if (mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, - MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } @@ -606,11 +556,11 @@ static void mg_read_intr(struct mg_host *host) /* check status */ do { i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & MG_REG_STATUS_BIT_BUSY) + if (i & ATA_BUSY) break; if (!MG_READY_OK(i)) break; - if (i & MG_REG_STATUS_BIT_DATA_REQ) + if (i & ATA_DRQ) goto ok_to_read; } while (0); mg_dump_status("mg_read_intr", i, host); @@ -655,11 +605,11 @@ static void mg_write_intr(struct mg_host *host) /* check status */ do { i = inb((unsigned long)host->dev_base + MG_REG_STATUS); - if (i & MG_REG_STATUS_BIT_BUSY) + if (i & ATA_BUSY) break; if (!MG_READY_OK(i)) break; - if ((req->nr_sectors <= 1) || (i & MG_REG_STATUS_BIT_DATA_REQ)) + if ((req->nr_sectors <= 1) || (i & ATA_DRQ)) goto ok_to_write; } while (0); mg_dump_status("mg_write_intr", i, host); @@ -752,18 +702,15 @@ static unsigned int mg_issue_req(struct request *req, break; case WRITE: /* TODO : handler */ - outb(MG_REG_CTRL_INTR_DISABLE, - (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); if (mg_out(host, sect_num, sect_cnt, MG_CMD_WR, &mg_write_intr) != MG_ERR_NONE) { mg_bad_rw_intr(host); return host->error; } del_timer(&host->timer); - mg_wait(host, MG_REG_STATUS_BIT_DATA_REQ, MG_TMAX_WAIT_WR_DRQ); - outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ); + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); if (host->error) { mg_bad_rw_intr(host); return host->error; @@ -849,9 +796,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) return -EIO; if (!prv_data->use_polling) - outb(MG_REG_CTRL_INTR_DISABLE, - (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(ATA_NIEN, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); outb(MG_CMD_SLEEP, (unsigned long)host->dev_base + MG_REG_COMMAND); /* wait until mflash deep sleep */ @@ -859,9 +804,7 @@ static int mg_suspend(struct platform_device *plat_dev, pm_message_t state) if (mg_wait(host, MG_STAT_READY, MG_TMAX_CONF_TO_CMD)) { if (!prv_data->use_polling) - outb(MG_REG_CTRL_INTR_ENABLE, - (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); return -EIO; } @@ -884,8 +827,7 @@ static int mg_resume(struct platform_device *plat_dev) return -EIO; if (!prv_data->use_polling) - outb(MG_REG_CTRL_INTR_ENABLE, (unsigned long)host->dev_base + - MG_REG_DRV_CTRL); + outb(0, (unsigned long)host->dev_base + MG_REG_DRV_CTRL); return 0; } From 5b644c7a218702668d7b610994e7dcbc3d4705d3 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 28 Apr 2009 08:17:54 +0000 Subject: [PATCH 0636/2061] clocksource: improve sh_cmt clocksource overflow handling This patch improves the sh_cmt clocksource handling. Currently the counter value is ignored in the case of overflow. With this patch the overflow flag is read before and after reading the counter, removing any counter value and overflow flag mismatch issues. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index c2475648961..d607ac2d516 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -111,16 +111,21 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p, int *has_wrapped) { unsigned long v1, v2, v3; + int o1, o2; + + o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit; /* Make sure the timer value is stable. Stolen from acpi_pm.c */ do { + o2 = o1; v1 = sh_cmt_read(p, CMCNT); v2 = sh_cmt_read(p, CMCNT); v3 = sh_cmt_read(p, CMCNT); - } while (unlikely((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) - || (v3 > v1 && v3 < v2))); + o1 = sh_cmt_read(p, CMCSR) & p->overflow_bit; + } while (unlikely((o1 != o2) || (v1 > v2 && v1 < v3) + || (v2 > v3 && v2 < v1) || (v3 > v1 && v3 < v2))); - *has_wrapped = sh_cmt_read(p, CMCSR) & p->overflow_bit; + *has_wrapped = o1; return v2; } @@ -394,7 +399,7 @@ static cycle_t sh_cmt_clocksource_read(struct clocksource *cs) raw = sh_cmt_get_counter(p, &has_wrapped); if (unlikely(has_wrapped)) - raw = p->match_value; + raw += p->match_value; spin_unlock_irqrestore(&p->lock, flags); return value + raw; From 8e0b842948156e3463879caed12b4ce51bed772e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 28 Apr 2009 08:19:50 +0000 Subject: [PATCH 0637/2061] sh: setup timers in late_time_init() This patch moves the SuperH timer setup code from time_init() to late_time_init(). Good things about this change: - interrupts: they are enabled at late_time_init() - mm: regular kmalloc() can be used at late_time_init() Together with moving to late_time_init() this patch changes the sh_cmt driver to always allocate with kmalloc(). This simplifies the code a bit and also fixes section mismatches. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/time_32.c | 34 ++++++++++++++++++++-------------- drivers/clocksource/sh_cmt.c | 13 ++----------- 2 files changed, 22 insertions(+), 25 deletions(-) diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index 109409f5ca5..a2458bfdda2 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -237,21 +237,8 @@ unsigned long long sched_clock(void) } #endif -void __init time_init(void) +static void __init sh_late_time_init(void) { - if (board_time_init) - board_time_init(); - - clk_init(); - - rtc_sh_get_time(&xtime); - set_normalized_timespec(&wall_to_monotonic, - -xtime.tv_sec, -xtime.tv_nsec); - -#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST - local_timer_setup(smp_processor_id()); -#endif - /* * Make sure all compiled-in early timers register themselves. * Run probe() for one "earlytimer" device. @@ -270,3 +257,22 @@ void __init time_init(void) printk(KERN_INFO "Using %s for system timer\n", sys_timer->name); } + +void __init time_init(void) +{ + if (board_time_init) + board_time_init(); + + clk_init(); + + rtc_sh_get_time(&xtime); + set_normalized_timespec(&wall_to_monotonic, + -xtime.tv_sec, -xtime.tv_nsec); + +#ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST + local_timer_setup(smp_processor_id()); +#endif + + late_time_init = sh_late_time_init; +} + diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index d607ac2d516..bf3e4c11fd3 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -18,7 +18,6 @@ */ #include -#include #include #include #include @@ -645,11 +644,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev) return 0; } - if (is_early_platform_device(pdev)) - p = alloc_bootmem(sizeof(*p)); - else - p = kmalloc(sizeof(*p), GFP_KERNEL); - + p = kmalloc(sizeof(*p), GFP_KERNEL); if (p == NULL) { dev_err(&pdev->dev, "failed to allocate driver data\n"); return -ENOMEM; @@ -657,11 +652,7 @@ static int __devinit sh_cmt_probe(struct platform_device *pdev) ret = sh_cmt_setup(p, pdev); if (ret) { - if (is_early_platform_device(pdev)) - free_bootmem(__pa(p), sizeof(*p)); - else - kfree(p); - + kfree(p); platform_set_drvdata(pdev, NULL); } return ret; From 4278600644dee621bd50d7498e244b135612e0f6 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 28 Apr 2009 23:12:10 +0900 Subject: [PATCH 0638/2061] sh: register the rtc-generic platform device properly. The device registration was accidentally omitted, add it back in. Do some basic device probing as well, so this doesn't show up for platforms that tie in to the RTC interface properly. Signed-off-by: Paul Mundt --- arch/sh/kernel/time_32.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index a2458bfdda2..457332116e1 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -3,7 +3,7 @@ * * Copyright (C) 1999 Tetsuya Okada & Niibe Yutaka * Copyright (C) 2000 Philipp Rumpf - * Copyright (C) 2002 - 2008 Paul Mundt + * Copyright (C) 2002 - 2009 Paul Mundt * Copyright (C) 2002 M. R. Brown * * Some code taken from i386 version. @@ -68,6 +68,21 @@ int set_rtc_time(struct rtc_time *tm) } EXPORT_SYMBOL(set_rtc_time); +static int __init rtc_generic_init(void) +{ + struct platform_device *pdev; + + if (rtc_sh_get_time == null_rtc_get_time) + return -ENODEV; + + pdev = platform_device_register_simple("rtc-generic", -1, NULL, 0); + if (IS_ERR(pdev)) + return PTR_ERR(pdev); + + return 0; +} +module_init(rtc_generic_init); + #ifndef CONFIG_GENERIC_TIME void do_gettimeofday(struct timeval *tv) { From 1130b0296184bc21806225fd06d533515a99d2db Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 28 Apr 2009 14:56:18 +0200 Subject: [PATCH 0639/2061] perf_counter tools: fix Documentation/perf_counter build error MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mike Galbraith reported: > marge:..Documentation/perf_counter # make > CC builtin-stat.o > In file included from builtin-stat.c:71: > /usr/include/ctype.h:102: error: expected expression before ‘]’ token Remove the ctype.h include. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-stat.c | 1 - Documentation/perf_counter/builtin-top.c | 1 - 2 files changed, 2 deletions(-) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index d7ace631fc4..112b94ed329 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -68,7 +68,6 @@ #include #include #include -#include #include #include #include diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index dea016fa41e..6a276d2b2bb 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -49,7 +49,6 @@ #include #include #include -#include #include #include #include From cd891ae0305601bdb4d2e7e85282961c4ff256cd Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 28 Apr 2009 11:39:34 -0400 Subject: [PATCH 0640/2061] tracing: convert ftrace_dump spinlocks to raw ftrace_dump is used for printing out the contents of the ftrace ring buffer to the console on failure. Currently it uses a spinlock to synchronize the output from multiple failures on different CPUs. This spin lock currently is a normal spinlock and can cause issues with lockdep and lock tracing. This patch converts it to raw since it is for error handling only. The lock is local to the ftrace_dump and is not used by any other infrastructure. [ Impact: prevent ftrace_dump from locking up by internal tracing ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index b6183bc9eca..5d704a41f83 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4114,7 +4114,8 @@ trace_printk_seq(struct trace_seq *s) static void __ftrace_dump(bool disable_tracing) { - static DEFINE_SPINLOCK(ftrace_dump_lock); + static raw_spinlock_t ftrace_dump_lock = + (raw_spinlock_t)__RAW_SPIN_LOCK_UNLOCKED; /* use static because iter can be a bit big for the stack */ static struct trace_iterator iter; unsigned int old_userobj; @@ -4123,7 +4124,8 @@ static void __ftrace_dump(bool disable_tracing) int cnt = 0, cpu; /* only one dump */ - spin_lock_irqsave(&ftrace_dump_lock, flags); + local_irq_save(flags); + __raw_spin_lock(&ftrace_dump_lock); if (dump_ran) goto out; @@ -4195,7 +4197,8 @@ static void __ftrace_dump(bool disable_tracing) } out: - spin_unlock_irqrestore(&ftrace_dump_lock, flags); + __raw_spin_unlock(&ftrace_dump_lock); + local_irq_restore(flags); } /* By default: disable tracing after the dump */ From edc953fa4ebc0265ef3b1754fe116a9fd4264e15 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 28 Apr 2009 11:13:46 -0400 Subject: [PATCH 0641/2061] x86: clean up alternative.h Alternative header duplicates assembly that could be merged in one single macro. Merging this into this macro also allows to directly declare ALTERNATIVE() statements within assembly code. Uses a __stringify() of the feature bits rather than passing a "i" operand. Leave the old %0 operand as-is (set to 0), unused to stay compatible with API. (v2: tab alignment fixes) [ Impact: cleanup ] Signed-off-by: Mathieu Desnoyers LKML-Reference: <20090428151346.GA31212@Krystal> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/alternative.h | 59 ++++++++++++------------------ 1 file changed, 23 insertions(+), 36 deletions(-) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index f6aa18eadf7..1a37bcdc860 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -3,6 +3,7 @@ #include #include +#include #include /* @@ -74,6 +75,22 @@ static inline void alternatives_smp_switch(int smp) {} const unsigned char *const *find_nop_table(void); +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, newinstr, feature) \ + \ + "661:\n\t" oldinstr "\n662:\n" \ + ".section .altinstructions,\"a\"\n" \ + _ASM_ALIGN "\n" \ + _ASM_PTR "661b\n" /* label */ \ + _ASM_PTR "663f\n" /* new instruction */ \ + " .byte " __stringify(feature) "\n" /* feature bit */ \ + " .byte 662b-661b\n" /* sourcelen */ \ + " .byte 664f-663f\n" /* replacementlen */ \ + ".previous\n" \ + ".section .altinstr_replacement, \"ax\"\n" \ + "663:\n\t" newinstr "\n664:\n" /* replacement */ \ + ".previous" + /* * Alternative instructions for different CPU types or capabilities. * @@ -87,18 +104,7 @@ const unsigned char *const *find_nop_table(void); * without volatile and memory clobber. */ #define alternative(oldinstr, newinstr, feature) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature) : "memory") + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") /* * Alternative inline assembly with input. @@ -109,35 +115,16 @@ const unsigned char *const *find_nop_table(void); * Best is to use constraints that are fixed size (like (%1) ... "r") * If you use variable sized constraints like "m" or "g" in the * replacement make sure to pad to the worst case length. + * Leaving an unused argument 0 to keep API compatibility. */ #define alternative_input(oldinstr, newinstr, feature, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c0\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" :: "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : : "i" (0), ## input) /* Like alternative_input, but with a single output argument */ #define alternative_io(oldinstr, newinstr, feature, output, input...) \ - asm volatile ("661:\n\t" oldinstr "\n662:\n" \ - ".section .altinstructions,\"a\"\n" \ - _ASM_ALIGN "\n" \ - _ASM_PTR "661b\n" /* label */ \ - _ASM_PTR "663f\n" /* new instruction */ \ - " .byte %c[feat]\n" /* feature bit */ \ - " .byte 662b-661b\n" /* sourcelen */ \ - " .byte 664f-663f\n" /* replacementlen */ \ - ".previous\n" \ - ".section .altinstr_replacement,\"ax\"\n" \ - "663:\n\t" newinstr "\n664:\n" /* replacement */ \ - ".previous" : output : [feat] "i" (feature), ##input) + asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) \ + : output : "i" (0), ## input) /* * use this macro(s) if you need more than one output parameter From 5beae6efd1004b44c3e257dc96087978e4c763c1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:16:21 -0400 Subject: [PATCH 0642/2061] tracing: fix ref count in splice pages The pages allocated for the splice binary buffer did not initialize the ref count correctly. This caused pages not to be freed and causes a drastic memory leak. Thanks to logdev I was able to trace the tracer to find where the leak was. [ Impact: stop memory leak when using splice ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 5d704a41f83..9058240c85c 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3531,6 +3531,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, if (!ref) break; + ref->ref = 1; ref->buffer = info->tr->buffer; ref->page = ring_buffer_alloc_read_page(ref->buffer); if (!ref->page) { From 93459c6cb9816c52200993d29dd18cea1daee335 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:23:13 -0400 Subject: [PATCH 0643/2061] tracing: only add splice page if entries exist The splice code allocates a page even when the ring buffer is empty. It detects the ring buffer being empty when it it fails to copy anything from the ring buffer into the page. This patch adds a check to see if there is anything in the ring buffer before allocating a page. Thanks to logdev for letting me trace the tracer to find this. [ Impact: speed up due to removing unnecessary allocation ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 9058240c85c..0aeb3b93414 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3508,7 +3508,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, .spd_release = buffer_spd_release, }; struct buffer_ref *ref; - int size, i; + int entries, size, i; size_t ret; if (*ppos & (PAGE_SIZE - 1)) { @@ -3523,7 +3523,9 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, len &= PAGE_MASK; } - for (i = 0; i < PIPE_BUFFERS && len; i++, len -= PAGE_SIZE) { + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); + + for (i = 0; i < PIPE_BUFFERS && len && entries; i++, len -= PAGE_SIZE) { struct page *page; int r; @@ -3564,6 +3566,8 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, spd.partial[i].private = (unsigned long)ref; spd.nr_pages++; *ppos += PAGE_SIZE; + + entries = ring_buffer_entries_cpu(info->tr->buffer, info->cpu); } spd.nr_pages = i; From f2957f1f196b0217644a17c1379855a118a37d72 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 00:26:30 -0400 Subject: [PATCH 0644/2061] tracing: have splice only copy full pages Splice works with pages, it is much more effecient to use an entire page than to copy bits over several pages. Using logdev to trace the internals of the splice mechanism, I was able to see that splice can be very aggressive. When tracing is occurring, and the reader caught up to the writer, and the writer is on the reader page, the reader will copy what is there into the splice page. Splice may iterate over several pages and if the writer is still writing to the page, the reader will keep copying bits to new pages to pass to userspace. This patch changes it to only pass data to userspace if the page is full (the writer has left the page). This has a small side effect that splice can not read a partial page, and must wait for the page to fill. This should not be an issue. If tracing has stopped, then a use of "read" will still read all of the page. [ Impact: better performance for ring buffer splice code ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 0aeb3b93414..f5427e0fc98 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3542,7 +3542,7 @@ tracing_buffers_splice_read(struct file *file, loff_t *ppos, } r = ring_buffer_read_page(ref->buffer, &ref->page, - len, info->cpu, 0); + len, info->cpu, 1); if (r < 0) { ring_buffer_free_read_page(ref->buffer, ref->page); From 7d7d2b803159d4edeb051b0e5efbc1a8d9ef1c67 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 27 Apr 2009 12:37:49 -0400 Subject: [PATCH 0645/2061] ring-buffer: fix printk output The warning output in trace_recursive_lock uses %d for a long when it should be %ld. [ Impact: fix compile warning ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9692f100ec1..f4cc59040eb 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1491,7 +1491,7 @@ static int trace_recursive_lock(void) /* Disable all tracing before we do anything else */ tracing_off_permanent(); - printk_once(KERN_WARNING "Tracing recursion: depth[%d]:" + printk_once(KERN_WARNING "Tracing recursion: depth[%ld]:" "HC[%lu]:SC[%lu]:NMI[%lu]\n", current->trace_recursion, hardirq_count() >> HARDIRQ_SHIFT, From aee6a166a5401dcfcb17fcdc055e5edf2a4f4042 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:17 +0200 Subject: [PATCH 0646/2061] x86: beautify vmlinux_32.lds.S Beautify vmlinux_32.lds.S: - Use tabs for indent - Located curly braces like in C code - Rearranged a few comments To see actual differences use "git diff -b" which ignore 'whitespace' changes. The beautification is done to prepare a unification of the _32 and _64 variants of the linker scripts. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-1-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux_32.lds.S | 374 ++++++++++++++++--------------- 1 file changed, 199 insertions(+), 175 deletions(-) diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 62ad500d55f..fffa45a1036 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -22,196 +22,220 @@ ENTRY(phys_startup_32) jiffies = jiffies_64; PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ + note PT_NOTE FLAGS(0); /* ___ */ } SECTIONS { - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - *(.text.head) - } :text = 0x9090 + /* Text and read-only data */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); /* not really needed, already page aligned */ - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } :text = 0x9090 + /* read-only */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 - NOTES :text :note + NOTES :text :note - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 - RODATA + RODATA - /* writeable */ - . = ALIGN(PAGE_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - DATA_DATA - CONSTRUCTORS + /* writeable */ + . = ALIGN(PAGE_SIZE); + /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS } :data - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - _edata = .; /* End of data section */ - } - - . = ALIGN(THREAD_SIZE); /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* will be freed after init */ - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - SECURITY_INIT - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* .exit.text is discard at runtime, not link time, to deal with references - from .altinstructions and .eh_frame */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); - __brk_base = . ; - . += 64 * 1024 ; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = . ; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; } - STABS_DEBUG + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } - DWARF_DEBUG + . = ALIGN(32); + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ + . = ALIGN(32); + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + + /* End of data section */ + _edata = .; + } + + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } + + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + /* might get freed after init */ + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + } + /* will be freed after init + * Following ALIGN() is required to make sure no other data falls on the + * same page where __smp_alt_end is pointing as that page might be freed + * after boot. Always make sure that ALIGN() directive is present after + * the section which contains __smp_alt_end. + */ + . = ALIGN(PAGE_SIZE); + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + . = ALIGN(4); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + + . = ALIGN(4); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#if defined(CONFIG_BLK_DEV_INITRD) + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + + PERCPU(PAGE_SIZE) + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + + /* BSS */ + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } + + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = . ; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.discard) + } + + STABS_DEBUG + DWARF_DEBUG } /* From 17ce265d6a1789eae5eb739a3bb7fcffdb3e87c5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:18 +0200 Subject: [PATCH 0647/2061] x86, vmlinux.lds: unify header/footer Merge everything except PHDRS and SECTIONS into vmlinux.lds.S. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 77 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------- arch/x86/kernel/vmlinux_64.lds.S | 42 ----------------- 3 files changed, 77 insertions(+), 79 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 849ee611f01..d113642c134 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -1,5 +1,82 @@ +/* + * ld script for the x86 kernel + * + * Historic 32-bit version written by Martin Mares + * + * Modernisation and unification done by Sam Ravnborg + * + * + * Don't define absolute symbols until and unless you know that symbol + * value is should remain constant even if kernel image is relocated + * at run time. Absolute symbols are not relocated. If symbol value should + * change if kernel is relocated, make the symbol section relative and + * put it inside the section definition. + */ + +#ifdef CONFIG_X86_32 +#define LOAD_OFFSET __PAGE_OFFSET +#else +#define LOAD_OFFSET __START_KERNEL_map +#endif + +#include +#include +#include +#include +#include +#include + +#undef i386 /* in case the preprocessor is a 32bit one */ + +OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) + +#ifdef CONFIG_X86_32 +OUTPUT_ARCH(i386) +ENTRY(phys_startup_32) +jiffies = jiffies_64; +#else +OUTPUT_ARCH(i386:x86-64) +ENTRY(phys_startup_64) +jiffies_64 = jiffies; +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + + +#ifdef CONFIG_X86_32 +ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") +#else +/* + * Per-cpu symbols which need to be offset from __per_cpu_load + * for the boot processor. + */ +#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load +INIT_PER_CPU(gdt_page); +INIT_PER_CPU(irq_stack_union); + +/* + * Build-time check on the image size: + */ +ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), + "kernel image bigger than KERNEL_IMAGE_SIZE") + +#ifdef CONFIG_SMP +ASSERT((per_cpu__irq_stack_union == 0), + "irq_stack_union is not at start of per-cpu area"); +#endif + +#endif /* CONFIG_X86_32 */ + +#ifdef CONFIG_KEXEC +#include + +ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, + "kexec control code size is too big") +#endif + diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index fffa45a1036..4c985fcd9ab 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,26 +1,3 @@ -/* ld script to make i386 Linux kernel - * Written by Martin Mares ; - * - * Don't define absolute symbols until and unless you know that symbol - * value is should remain constant even if kernel image is relocated - * at run time. Absolute symbols are not relocated. If symbol value should - * change if kernel is relocated, make the symbol section relative and - * put it inside the section definition. - */ - -#define LOAD_OFFSET __PAGE_OFFSET - -#include -#include -#include -#include -#include - -OUTPUT_FORMAT("elf32-i386", "elf32-i386", "elf32-i386") -OUTPUT_ARCH(i386) -ENTRY(phys_startup_32) -jiffies = jiffies_64; - PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -237,17 +214,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Build-time check on the image size: - */ -ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_KEXEC -/* Link time checks */ -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6d5a5b05eaa..7f1cc3d5fef 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,19 +1,3 @@ -/* ld script to make x86-64 Linux kernel - * Written by Martin Mares ; - */ - -#define LOAD_OFFSET __START_KERNEL_map - -#include -#include -#include - -#undef i386 /* in case the preprocessor is a 32bit one */ - -OUTPUT_FORMAT("elf64-x86-64", "elf64-x86-64", "elf64-x86-64") -OUTPUT_ARCH(i386:x86-64) -ENTRY(phys_startup_64) -jiffies_64 = jiffies; PHDRS { text PT_LOAD FLAGS(5); /* R_E */ data PT_LOAD FLAGS(7); /* RWE */ @@ -308,29 +292,3 @@ SECTIONS STABS_DEBUG DWARF_DEBUG } - -/* - * Per-cpu symbols which need to be offset from __per_cpu_load - * for the boot processor. - */ -#define INIT_PER_CPU(x) init_per_cpu__##x = per_cpu__##x + __per_cpu_load -INIT_PER_CPU(gdt_page); -INIT_PER_CPU(irq_stack_union); - -/* - * Build-time check on the image size: - */ -ASSERT((_end - _text <= KERNEL_IMAGE_SIZE), - "kernel image bigger than KERNEL_IMAGE_SIZE") - -#ifdef CONFIG_SMP -ASSERT((per_cpu__irq_stack_union == 0), - "irq_stack_union is not at start of per-cpu area"); -#endif - -#ifdef CONFIG_KEXEC -#include - -ASSERT(kexec_control_code_size <= KEXEC_CONTROL_CODE_MAX_SIZE, - "kexec control code size is too big") -#endif From afb8095a7eab32e5760613fa73d2f80a39cc45bf Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:19 +0200 Subject: [PATCH 0648/2061] x86, vmlinux.lds: unify PHDRS PHDRS are not equal for the two - so use ifdefs to cover up for that. On the assumption that they may become equal the ifdef is inside the PHDRS definiton. [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-3-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 13 +++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 5 ----- arch/x86/kernel/vmlinux_64.lds.S | 11 ----------- 3 files changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index d113642c134..1a1b303a427 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -40,6 +40,19 @@ ENTRY(phys_startup_64) jiffies_64 = jiffies; #endif +PHDRS { + text PT_LOAD FLAGS(5); /* R_E */ + data PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_X86_64 + user PT_LOAD FLAGS(7); /* RWE */ + data.init PT_LOAD FLAGS(7); /* RWE */ +#ifdef CONFIG_SMP + percpu PT_LOAD FLAGS(7); /* RWE */ +#endif + data.init2 PT_LOAD FLAGS(7); /* RWE */ +#endif + note PT_NOTE FLAGS(0); /* ___ */ +} #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4c985fcd9ab..4fd40dc5017 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 7f1cc3d5fef..6e7cbee0e87 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,14 +1,3 @@ -PHDRS { - text PT_LOAD FLAGS(5); /* R_E */ - data PT_LOAD FLAGS(7); /* RWE */ - user PT_LOAD FLAGS(7); /* RWE */ - data.init PT_LOAD FLAGS(7); /* RWE */ -#ifdef CONFIG_SMP - percpu PT_LOAD FLAGS(7); /* RWE */ -#endif - data.init2 PT_LOAD FLAGS(7); /* RWE */ - note PT_NOTE FLAGS(0); /* ___ */ -} SECTIONS { . = __START_KERNEL; From 444e0ae4831f99ba25062d9a5ccb7117c62841a0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:20 +0200 Subject: [PATCH 0649/2061] x86, vmlinux.lds: unify start/end of SECTIONS [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-4-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 14 ++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 9 --------- arch/x86/kernel/vmlinux_64.lds.S | 9 --------- 3 files changed, 14 insertions(+), 18 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1a1b303a427..845776fe529 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -54,12 +54,26 @@ PHDRS { note PT_NOTE FLAGS(0); /* ___ */ } +SECTIONS +{ +#ifdef CONFIG_X86_32 + . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; + phys_startup_32 = startup_32 - LOAD_OFFSET; +#else + . = __START_KERNEL; + phys_startup_64 = startup_64 - LOAD_OFFSET; +#endif + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else # include "vmlinux_64.lds.S" #endif + STABS_DEBUG + DWARF_DEBUG +} + #ifdef CONFIG_X86_32 ASSERT((_end - LOAD_OFFSET <= KERNEL_IMAGE_SIZE), diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 4fd40dc5017..3d3d49c31b0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = LOAD_OFFSET + LOAD_PHYSICAL_ADDR; - phys_startup_32 = startup_32 - LOAD_OFFSET; - /* Text and read-only data */ .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { _text = .; @@ -205,7 +200,3 @@ SECTIONS *(.exitcall.exit) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 6e7cbee0e87..2d7fa2016c3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,8 +1,3 @@ -SECTIONS -{ - . = __START_KERNEL; - phys_startup_64 = startup_64 - LOAD_OFFSET; - /* Text and read-only data */ .text : AT(ADDR(.text) - LOAD_OFFSET) { _text = .; @@ -277,7 +272,3 @@ SECTIONS *(.eh_frame) *(.discard) } - - STABS_DEBUG - DWARF_DEBUG -} From dfc20895d944cfa81d8ff00809b68ecb8f72cbb0 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:21 +0200 Subject: [PATCH 0650/2061] x86, vmlinux.lds: unify .text output sections 32 bit x86 had a dedicated .text.head output section, whereas 64 bit had it all in a single output section. In the unified version the dedicated .text.head output section was kept to have full control over the head code. 32 bit: - Moved definition of _stext to the linker script. The definition is located _after_ .text.page_aligned as this is what 32 bit did before. The ALIGN(8) was introduced so we hit the exact same address (on the tested config) before and after the move. I assume that it is a bug that _stext did not cover the .text.page_aligned section - if this is true it can be fixed in a follow-up patch (and the ugly ALIGN() can be dropped). [ Impact: 64-bit: cleanup, 32-bit: use the 64-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-5-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/head_32.S | 7 ------- arch/x86/kernel/vmlinux.lds.S | 31 +++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 24 ------------------------ arch/x86/kernel/vmlinux_64.lds.S | 20 -------------------- 4 files changed, 31 insertions(+), 51 deletions(-) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index 30683883e0c..dc5ed4bdd88 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -608,13 +608,6 @@ ignore_int: ENTRY(initial_code) .long i386_start_kernel -.section .text -/* - * Real beginning of normal "text" segment - */ -ENTRY(stext) -ENTRY(_stext) - /* * BSS section */ diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 845776fe529..a7c88bb4365 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -64,6 +64,37 @@ SECTIONS phys_startup_64 = startup_64 - LOAD_OFFSET; #endif + /* Text and read-only data */ + + /* bootstrapping code */ + .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { + _text = .; + *(.text.head) + } :text = 0x9090 + + /* The rest of the text */ + .text : AT(ADDR(.text) - LOAD_OFFSET) { +#ifdef CONFIG_X86_32 + /* not really needed, already page aligned */ + . = ALIGN(PAGE_SIZE); + *(.text.page_aligned) +#endif + . = ALIGN(8); + _stext = .; + TEXT_TEXT + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + /* End of text section */ + _etext = .; + } :text = 0x9090 + + NOTES :text :note + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 3d3d49c31b0..854009288ec 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,27 +1,3 @@ - /* Text and read-only data */ - .text.head : AT(ADDR(.text.head) - LOAD_OFFSET) { - _text = .; - *(.text.head) - } :text = 0x9090 - - /* read-only */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - /* not really needed, already page aligned */ - . = ALIGN(PAGE_SIZE); - *(.text.page_aligned) - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 2d7fa2016c3..b5d43670d80 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,23 +1,3 @@ - /* Text and read-only data */ - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; - /* First the code that has to be first for bootstrapping */ - *(.text.head) - _stext = .; - /* Then the rest */ - TEXT_TEXT - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - /* End of text section */ - _etext = .; - } :text = 0x9090 - - NOTES :text :note - /* Exception table */ . = ALIGN(16); __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { From 448bc3ab0d03e77fee8e4264de0d001fc87bc164 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:22 +0200 Subject: [PATCH 0651/2061] x86, vmlinux.lds: unify exception table [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-6-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 10 ++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 10 ---------- arch/x86/kernel/vmlinux_64.lds.S | 10 ---------- 3 files changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index a7c88bb4365..67164f6f092 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -94,6 +94,16 @@ SECTIONS NOTES :text :note + /* Exception table */ + . = ALIGN(16); + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } :text = 0x9090 + + RODATA + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 854009288ec..920cc6989cc 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* writeable */ . = ALIGN(PAGE_SIZE); /* Data */ diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index b5d43670d80..641f3f991a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,13 +1,3 @@ - /* Exception table */ - . = ALIGN(16); - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } :text = 0x9090 - - RODATA - /* Align data segment to page size boundary */ . = ALIGN(PAGE_SIZE); /* Data */ From 1f6397bac55040cd520d9eaf299e155a7aa01d5f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:23 +0200 Subject: [PATCH 0652/2061] x86, vmlinux.lds: unify data output sections For 64 bit the following functional changes are introduced: - .data.page_aligned has moved - .data.cacheline_aligned has moved - .data.read_mostly has moved - ALIGN() moved out of output section for .data.cacheline_aligned - ALIGN() moved out of output section for .data.page_aligned Notice that 32 bit and 64 bit has different location of _edata. .data_nosave is 32 bit only as 64 bit is special due to PERCPU. [ Impact: 32-bit: cleanup, 64-bit: use 32-bit linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-7-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 55 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 37 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 28 ---------------- 3 files changed, 55 insertions(+), 65 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 67164f6f092..067bdb012da 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -104,6 +104,61 @@ SECTIONS RODATA + /* Data */ + . = ALIGN(PAGE_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { + DATA_DATA + CONSTRUCTORS + +#ifdef CONFIG_X86_64 + /* End of data section */ + _edata = .; +#endif + } :data + +#ifdef CONFIG_X86_32 + /* 32 bit has nosave before _edata */ + . = ALIGN(PAGE_SIZE); + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } +#endif + + . = ALIGN(PAGE_SIZE); + .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { + *(.data.page_aligned) + *(.data.idt) + } + +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(PAGE_SIZE); + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); +#endif + .data.cacheline_aligned : + AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { + *(.data.cacheline_aligned) + } + + /* rarely changed data like cpu maps */ +#ifdef CONFIG_X86_32 + . = ALIGN(32); +#else + . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); +#endif + .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { + *(.data.read_mostly) + +#ifdef CONFIG_X86_32 + /* End of data section */ + _edata = .; +#endif + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 920cc6989cc..8ade84687b2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,40 +1,3 @@ - /* writeable */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - } :data - - . = ALIGN(PAGE_SIZE); - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } - - . = ALIGN(PAGE_SIZE); - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - *(.data.page_aligned) - *(.data.idt) - } - - . = ALIGN(32); - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - *(.data.cacheline_aligned) - } - - /* rarely changed data like cpu maps */ - . = ALIGN(32); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - - /* End of data section */ - _edata = .; - } - /* init_task */ . = ALIGN(THREAD_SIZE); .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 641f3f991a0..826270147b5 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,26 +1,3 @@ - /* Align data segment to page size boundary */ - . = ALIGN(PAGE_SIZE); - /* Data */ - .data : AT(ADDR(.data) - LOAD_OFFSET) { - DATA_DATA - CONSTRUCTORS - /* End of data section */ - _edata = .; - } :data - - - .data.cacheline_aligned : - AT(ADDR(.data.cacheline_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - *(.data.cacheline_aligned) - } - - . = ALIGN(CONFIG_X86_INTERNODE_CACHE_BYTES); - .data.read_mostly : AT(ADDR(.data.read_mostly) - LOAD_OFFSET) { - *(.data.read_mostly) - } - #define VSYSCALL_ADDR (-10*1024*1024) #define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ SIZEOF(.data.read_mostly) + 4095) & ~(4095)) @@ -95,11 +72,6 @@ *(.data.init_task) } :data.init - .data.page_aligned : AT(ADDR(.data.page_aligned) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - } - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { /* might get freed after init */ . = ALIGN(PAGE_SIZE); From ff6f87e1626e10beef675084c9b5384a9477e3d5 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:24 +0200 Subject: [PATCH 0653/2061] x86, vmlinux.lds: move vsyscall output sections [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-8-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 71 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_64.lds.S | 68 ------------------------------ 2 files changed, 71 insertions(+), 68 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 067bdb012da..b3106c2a037 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -159,6 +159,77 @@ SECTIONS #endif } +#ifdef CONFIG_X86_64 + +#define VSYSCALL_ADDR (-10*1024*1024) +#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) +#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ + SIZEOF(.data.read_mostly) + 4095) & ~(4095)) + +#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) +#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) + +#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) +#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) + + . = VSYSCALL_ADDR; + .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { + *(.vsyscall_0) + } :user + + __vsyscall_0 = VSYSCALL_VIRT_ADDR; + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { + *(.vsyscall_fn) + } + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { + *(.vsyscall_gtod_data) + } + + vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); + .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { + *(.vsyscall_clock) + } + vsyscall_clock = VVIRT(.vsyscall_clock); + + + .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { + *(.vsyscall_1) + } + .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { + *(.vsyscall_2) + } + + .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { + *(.vgetcpu_mode) + } + vgetcpu_mode = VVIRT(.vgetcpu_mode); + + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); + .jiffies : AT(VLOAD(.jiffies)) { + *(.jiffies) + } + jiffies = VVIRT(.jiffies); + + .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { + *(.vsyscall_3) + } + + . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; + +#undef VSYSCALL_ADDR +#undef VSYSCALL_PHYS_ADDR +#undef VSYSCALL_VIRT_ADDR +#undef VLOAD_OFFSET +#undef VLOAD +#undef VVIRT_OFFSET +#undef VVIRT + +#endif /* CONFIG_X86_64 */ #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 826270147b5..013aa0e1dd3 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,71 +1,3 @@ -#define VSYSCALL_ADDR (-10*1024*1024) -#define VSYSCALL_PHYS_ADDR ((LOADADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) -#define VSYSCALL_VIRT_ADDR ((ADDR(.data.read_mostly) + \ - SIZEOF(.data.read_mostly) + 4095) & ~(4095)) - -#define VLOAD_OFFSET (VSYSCALL_ADDR - VSYSCALL_PHYS_ADDR) -#define VLOAD(x) (ADDR(x) - VLOAD_OFFSET) - -#define VVIRT_OFFSET (VSYSCALL_ADDR - VSYSCALL_VIRT_ADDR) -#define VVIRT(x) (ADDR(x) - VVIRT_OFFSET) - - . = VSYSCALL_ADDR; - .vsyscall_0 : AT(VSYSCALL_PHYS_ADDR) { - *(.vsyscall_0) - } :user - - __vsyscall_0 = VSYSCALL_VIRT_ADDR; - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_fn : AT(VLOAD(.vsyscall_fn)) { - *(.vsyscall_fn) - } - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .vsyscall_gtod_data : AT(VLOAD(.vsyscall_gtod_data)) { - *(.vsyscall_gtod_data) - } - - vsyscall_gtod_data = VVIRT(.vsyscall_gtod_data); - .vsyscall_clock : AT(VLOAD(.vsyscall_clock)) { - *(.vsyscall_clock) - } - vsyscall_clock = VVIRT(.vsyscall_clock); - - - .vsyscall_1 ADDR(.vsyscall_0) + 1024: AT(VLOAD(.vsyscall_1)) { - *(.vsyscall_1) - } - .vsyscall_2 ADDR(.vsyscall_0) + 2048: AT(VLOAD(.vsyscall_2)) { - *(.vsyscall_2) - } - - .vgetcpu_mode : AT(VLOAD(.vgetcpu_mode)) { - *(.vgetcpu_mode) - } - vgetcpu_mode = VVIRT(.vgetcpu_mode); - - . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); - .jiffies : AT(VLOAD(.jiffies)) { - *(.jiffies) - } - jiffies = VVIRT(.jiffies); - - .vsyscall_3 ADDR(.vsyscall_0) + 3072: AT(VLOAD(.vsyscall_3)) { - *(.vsyscall_3) - } - - . = VSYSCALL_VIRT_ADDR + PAGE_SIZE; - -#undef VSYSCALL_ADDR -#undef VSYSCALL_PHYS_ADDR -#undef VSYSCALL_VIRT_ADDR -#undef VLOAD_OFFSET -#undef VLOAD -#undef VVIRT_OFFSET -#undef VVIRT - /* init_task */ .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { . = ALIGN(THREAD_SIZE); From e58bdaa8f810332e5c1760ce496b01e07d51642c Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:25 +0200 Subject: [PATCH 0654/2061] x86, vmlinux.lds: unify first part of initdata 32-bit: - Move definition of __init_begin outside output_section because it covers more than one section - Move ALIGN() for end-of-section inside .smp_locks output section. Same effect but the intent is better documented that we need both start and end aligned. 64-bit: - Move ALIGN() outside output section in .init.setup - Deleted unused __smp_alt_* symbols None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-9-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 61 ++++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 60 ------------------------------- arch/x86/kernel/vmlinux_64.lds.S | 59 ------------------------------ 3 files changed, 61 insertions(+), 119 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index b3106c2a037..8b203c4ced9 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -231,6 +231,67 @@ SECTIONS #endif /* CONFIG_X86_64 */ + /* init_task */ + . = ALIGN(THREAD_SIZE); + .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { + *(.data.init_task) + } +#ifdef CONFIG_X86_64 + :data.init +#endif + + /* + * smp_locks might be freed after init + * start/end must be page aligned + */ + . = ALIGN(PAGE_SIZE); + .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { + __smp_locks = .; + *(.smp_locks) + __smp_locks_end = .; + . = ALIGN(PAGE_SIZE); + } + + /* Init code and data - will be freed after init */ + . = ALIGN(PAGE_SIZE); + __init_begin = .; /* paired with __init_end */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { + INIT_DATA + } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { + __x86_cpu_dev_start = .; + *(.x86_cpu_dev.init) + __x86_cpu_dev_end = .; + } + + SECURITY_INIT + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 8ade84687b2..d8ba5394af0 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,63 +1,3 @@ - /* init_task */ - . = ALIGN(THREAD_SIZE); - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - *(.data.init_task) - } - - . = ALIGN(PAGE_SIZE); - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - } - /* will be freed after init - * Following ALIGN() is required to make sure no other data falls on the - * same page where __smp_alt_end is pointing as that page might be freed - * after boot. Always make sure that ALIGN() directive is present after - * the section which contains __smp_alt_end. - */ - . = ALIGN(PAGE_SIZE); - - /* Init code and data - will be freed after init */ - . = ALIGN(PAGE_SIZE); - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - INIT_DATA - } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(4); .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { __alt_instructions = .; diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 013aa0e1dd3..0e8054e0c5c 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,62 +1,3 @@ - /* init_task */ - .data.init_task : AT(ADDR(.data.init_task) - LOAD_OFFSET) { - . = ALIGN(THREAD_SIZE); - *(.data.init_task) - } :data.init - - .smp_locks : AT(ADDR(.smp_locks) - LOAD_OFFSET) { - /* might get freed after init */ - . = ALIGN(PAGE_SIZE); - __smp_alt_begin = .; - __smp_locks = .; - *(.smp_locks) - __smp_locks_end = .; - . = ALIGN(PAGE_SIZE); - __smp_alt_end = .; - } - - /* Init code and data */ - . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { - __initdata_begin = .; - INIT_DATA - __initdata_end = .; - } - - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - . = ALIGN(16); - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - .x86_cpu_dev.init : AT(ADDR(.x86_cpu_dev.init) - LOAD_OFFSET) { - __x86_cpu_dev_start = .; - *(.x86_cpu_dev.init) - __x86_cpu_dev_end = .; - } - - SECURITY_INIT - . = ALIGN(8); .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { __parainstructions = .; From ae61836289a415351caa524d328110aaeae100d4 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:26 +0200 Subject: [PATCH 0655/2061] x86, vmlinux.lds: unify parainstructions 32 bit: - increase alignment from 4 to 8 for .parainstructions - increase alignment from 4 to 8 for .altinstructions 64 bit: - move ALIGN() outside output section for .altinstructions None of the above should result in any functional change. [ Impact: refactor and unify linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-10-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 18 ++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 18 ------------------ arch/x86/kernel/vmlinux_64.lds.S | 18 ------------------ 3 files changed, 18 insertions(+), 36 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 8b203c4ced9..c8dd71ecb56 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -291,6 +291,24 @@ SECTIONS SECURITY_INIT + . = ALIGN(8); + .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { + __parainstructions = .; + *(.parainstructions) + __parainstructions_end = .; + } + + . = ALIGN(8); + .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { + *(.altinstr_replacement) + } + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index d8ba5394af0..5df9647bb5d 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(4); - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - - . = ALIGN(4); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 0e8054e0c5c..9ef70966985 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,21 +1,3 @@ - . = ALIGN(8); - .parainstructions : AT(ADDR(.parainstructions) - LOAD_OFFSET) { - __parainstructions = .; - *(.parainstructions) - __parainstructions_end = .; - } - - .altinstructions : AT(ADDR(.altinstructions) - LOAD_OFFSET) { - . = ALIGN(8); - __alt_instructions = .; - *(.altinstructions) - __alt_instructions_end = .; - } - - .altinstr_replacement : AT(ADDR(.altinstr_replacement) - LOAD_OFFSET) { - *(.altinstr_replacement) - } - /* * .exit.text is discard at runtime, not link time, to deal with * references from .altinstructions and .eh_frame From bf6a57418d5445c98047edbec022c9e54d1526e6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:27 +0200 Subject: [PATCH 0656/2061] x86, vmlinux.lds: unify .exit.* and .init.ramfs [ Impact: cleanup ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-11-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 20 ++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 21 --------------------- arch/x86/kernel/vmlinux_64.lds.S | 21 --------------------- 3 files changed, 20 insertions(+), 42 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index c8dd71ecb56..1ab62a5fa1a 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -309,6 +309,26 @@ SECTIONS *(.altinstr_replacement) } + /* + * .exit.text is discard at runtime, not link time, to deal with + * references from .altinstructions and .eh_frame + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { + EXIT_TEXT + } + + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { + EXIT_DATA + } + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 5df9647bb5d..36c8fbd3c76 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#if defined(CONFIG_BLK_DEV_INITRD) - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - PERCPU(PAGE_SIZE) . = ALIGN(PAGE_SIZE); diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 9ef70966985..1aa53622333 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,24 +1,3 @@ - /* - * .exit.text is discard at runtime, not link time, to deal with - * references from .altinstructions and .eh_frame - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { - EXIT_TEXT - } - - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { - EXIT_DATA - } - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - #ifdef CONFIG_SMP /* * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the From 9d16e78318f174fd4b07916a93e41749d5199267 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:28 +0200 Subject: [PATCH 0657/2061] x86, vmlinux.lds: unify percpu 32 bit: - move __init_end outside the .bss output section It really did not belong in there [ Impact: 64-bit: cleanup, 32-bit: refactor linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-12-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 30 ++++++++++++++++++++++++++++++ arch/x86/kernel/vmlinux_32.lds.S | 6 ------ arch/x86/kernel/vmlinux_64.lds.S | 26 -------------------------- 3 files changed, 30 insertions(+), 32 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ab62a5fa1a..1ea2b8571e1 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -330,6 +330,36 @@ SECTIONS } #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_SMP) + /* + * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the + * output PHDR, so the next output section - __data_nosave - should + * start another section data.init2. Also, pda should be at the head of + * percpu area. Preallocate it and define the percpu offset symbol + * so that it can be accessed as a percpu variable. + */ + . = ALIGN(PAGE_SIZE); + PERCPU_VADDR(0, :percpu) +#else + PERCPU(PAGE_SIZE) +#endif + + . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ + __init_end = .; + +#ifdef CONFIG_X86_64 + .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { + . = ALIGN(PAGE_SIZE); + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + } :data.init2 + /* use another section data.init2, see PERCPU_VADDR() above */ +#endif + + #ifdef CONFIG_X86_32 # include "vmlinux_32.lds.S" #else diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S index 36c8fbd3c76..d23ee2c15c2 100644 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ b/arch/x86/kernel/vmlinux_32.lds.S @@ -1,11 +1,5 @@ - PERCPU(PAGE_SIZE) - - . = ALIGN(PAGE_SIZE); - /* freed after init ends here */ - /* BSS */ .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; __bss_start = .; *(.bss.page_aligned) *(.bss) diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S index 1aa53622333..a53936696a0 100644 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ b/arch/x86/kernel/vmlinux_64.lds.S @@ -1,29 +1,3 @@ -#ifdef CONFIG_SMP - /* - * percpu offsets are zero-based on SMP. PERCPU_VADDR() changes the - * output PHDR, so the next output section - __data_nosave - should - * start another section data.init2. Also, pda should be at the head of - * percpu area. Preallocate it and define the percpu offset symbol - * so that it can be accessed as a percpu variable. - */ - . = ALIGN(PAGE_SIZE); - PERCPU_VADDR(0, :percpu) -#else - PERCPU(PAGE_SIZE) -#endif - - . = ALIGN(PAGE_SIZE); - __init_end = .; - - .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - } :data.init2 - /* use another section data.init2, see PERCPU_VADDR() above */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { . = ALIGN(PAGE_SIZE); __bss_start = .; /* BSS */ From 091e52c3551d3031343df24b573b770b4c6c72b6 Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Wed, 29 Apr 2009 09:47:29 +0200 Subject: [PATCH 0658/2061] x86, vmlinux.lds: unify remaining parts 32 bit: - explicit page align .bss - move ALING() out of .brk output section - discard *(.eh_frame) 64 bit: - move ALIGN() out of .bss output section - move ALIGN() out of .brk output section - use a dedicated section to define _end [ Impact: unify and fix section alignments in linker script ] Signed-off-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-13-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 32 +++++++++++++++++++++++++++----- arch/x86/kernel/vmlinux_32.lds.S | 26 -------------------------- arch/x86/kernel/vmlinux_64.lds.S | 24 ------------------------ 3 files changed, 27 insertions(+), 55 deletions(-) delete mode 100644 arch/x86/kernel/vmlinux_32.lds.S delete mode 100644 arch/x86/kernel/vmlinux_64.lds.S diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 1ea2b8571e1..ef3e4f1042b 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -359,12 +359,34 @@ SECTIONS /* use another section data.init2, see PERCPU_VADDR() above */ #endif + /* BSS */ + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __bss_start = .; + *(.bss.page_aligned) + *(.bss) + . = ALIGN(4); + __bss_stop = .; + } -#ifdef CONFIG_X86_32 -# include "vmlinux_32.lds.S" -#else -# include "vmlinux_64.lds.S" -#endif + . = ALIGN(PAGE_SIZE); + .brk : AT(ADDR(.brk) - LOAD_OFFSET) { + __brk_base = .; + . += 64 * 1024; /* 64k alignment slop space */ + *(.brk_reservation) /* areas brk users have reserved */ + __brk_limit = .; + } + + .end : AT(ADDR(.end) - LOAD_OFFSET) { + _end = .; + } + + /* Sections to be discarded */ + /DISCARD/ : { + *(.exitcall.exit) + *(.eh_frame) + *(.discard) + } STABS_DEBUG DWARF_DEBUG diff --git a/arch/x86/kernel/vmlinux_32.lds.S b/arch/x86/kernel/vmlinux_32.lds.S deleted file mode 100644 index d23ee2c15c2..00000000000 --- a/arch/x86/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,26 +0,0 @@ - /* BSS */ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __bss_start = .; - *(.bss.page_aligned) - *(.bss) - . = ALIGN(4); - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - .end : AT(ADDR(.end) - LOAD_OFFSET) { - _end = . ; - } - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.discard) - } diff --git a/arch/x86/kernel/vmlinux_64.lds.S b/arch/x86/kernel/vmlinux_64.lds.S deleted file mode 100644 index a53936696a0..00000000000 --- a/arch/x86/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,24 +0,0 @@ - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - __bss_stop = .; - } - - .brk : AT(ADDR(.brk) - LOAD_OFFSET) { - . = ALIGN(PAGE_SIZE); - __brk_base = .; - . += 64 * 1024; /* 64k alignment slop space */ - *(.brk_reservation) /* areas brk users have reserved */ - __brk_limit = .; - } - - _end = . ; - - /* Sections to be discarded */ - /DISCARD/ : { - *(.exitcall.exit) - *(.eh_frame) - *(.discard) - } From 91fd7fe809bdf4d8aa56559d17b9f25a1a6fe732 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 10:58:38 +0200 Subject: [PATCH 0659/2061] x86, vmlinux.lds: add copyright Acked-by: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <1240991249-27117-2-git-send-email-sam@ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index ef3e4f1042b..0bdbaa57969 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -3,7 +3,8 @@ * * Historic 32-bit version written by Martin Mares * - * Modernisation and unification done by Sam Ravnborg + * Modernisation, unification and other changes and fixes: + * Copyright (C) 2007-2009 Sam Ravnborg * * * Don't define absolute symbols until and unless you know that symbol From 0492e1bb8fe7d122901c9f3af75e537d4129712e Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:49 +0100 Subject: [PATCH 0660/2061] tracing: x86, mmiotrace: code consistency/legibility improvement kmmio_probe being *p and kmmio_fault_page being sometimes *f and sometimes *p is not helpful. [ Impact: cleanup ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-3-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 34 +++++++++++++++++----------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 4f115e00486..869181a917d 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -97,13 +97,13 @@ static struct kmmio_probe *get_kmmio_probe(unsigned long addr) static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) { struct list_head *head; - struct kmmio_fault_page *p; + struct kmmio_fault_page *f; page &= PAGE_MASK; head = kmmio_page_list(page); - list_for_each_entry_rcu(p, head, list) { - if (p->page == page) - return p; + list_for_each_entry_rcu(f, head, list) { + if (f->page == page) + return f; } return NULL; } @@ -439,12 +439,12 @@ static void rcu_free_kmmio_fault_pages(struct rcu_head *head) head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; - while (p) { - struct kmmio_fault_page *next = p->release_next; - BUG_ON(p->count); - kfree(p); - p = next; + struct kmmio_fault_page *f = dr->release_list; + while (f) { + struct kmmio_fault_page *next = f->release_next; + BUG_ON(f->count); + kfree(f); + f = next; } kfree(dr); } @@ -453,19 +453,19 @@ static void remove_kmmio_fault_pages(struct rcu_head *head) { struct kmmio_delayed_release *dr = container_of(head, struct kmmio_delayed_release, rcu); - struct kmmio_fault_page *p = dr->release_list; + struct kmmio_fault_page *f = dr->release_list; struct kmmio_fault_page **prevp = &dr->release_list; unsigned long flags; spin_lock_irqsave(&kmmio_lock, flags); - while (p) { - if (!p->count) { - list_del_rcu(&p->list); - prevp = &p->release_next; + while (f) { + if (!f->count) { + list_del_rcu(&f->list); + prevp = &f->release_next; } else { - *prevp = p->release_next; + *prevp = f->release_next; } - p = p->release_next; + f = f->release_next; } spin_unlock_irqrestore(&kmmio_lock, flags); From 46e91d00b1165b14b484aa33800e1bba0794ae1a Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:50 +0100 Subject: [PATCH 0661/2061] tracing: x86, mmiotrace: refactor clearing/restore of page presence * change function names to clear_* from set_*: in reality we only clear and restore page presence, and never unconditionally set present. Using clear_*({true, false}, ...) is therefore more honest than set_*({false, true}, ...) * upgrade presence storage to pteval_t: doing user-space tracing will require saving and manipulation of the _PAGE_PROTNONE bit, in addition to the existing _PAGE_PRESENT changes, and having multiple bools stored and passed around does not seem optimal [ Impact: refactor, clean up mmiotrace code ] Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-4-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 43 ++++++++++++++++++++++--------------------- 1 file changed, 22 insertions(+), 21 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index 869181a917d..a769d1a2d93 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -32,7 +32,7 @@ struct kmmio_fault_page { struct list_head list; struct kmmio_fault_page *release_next; unsigned long page; /* location of the fault page */ - bool old_presence; /* page presence prior to arming */ + pteval_t old_presence; /* page presence prior to arming */ bool armed; /* @@ -108,49 +108,51 @@ static struct kmmio_fault_page *get_kmmio_fault_page(unsigned long page) return NULL; } -static void set_pmd_presence(pmd_t *pmd, bool present, bool *old) +static void clear_pmd_presence(pmd_t *pmd, bool clear, pmdval_t *old) { pmdval_t v = pmd_val(*pmd); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pmd(pmd, __pmd(v)); } -static void set_pte_presence(pte_t *pte, bool present, bool *old) +static void clear_pte_presence(pte_t *pte, bool clear, pteval_t *old) { pteval_t v = pte_val(*pte); - *old = !!(v & _PAGE_PRESENT); - v &= ~_PAGE_PRESENT; - if (present) - v |= _PAGE_PRESENT; + if (clear) { + *old = v & _PAGE_PRESENT; + v &= ~_PAGE_PRESENT; + } else /* presume this has been called with clear==true previously */ + v |= *old; set_pte_atomic(pte, __pte(v)); } -static int set_page_presence(unsigned long addr, bool present, bool *old) +static int clear_page_presence(struct kmmio_fault_page *f, bool clear) { unsigned int level; - pte_t *pte = lookup_address(addr, &level); + pte_t *pte = lookup_address(f->page, &level); if (!pte) { - pr_err("kmmio: no pte for page 0x%08lx\n", addr); + pr_err("kmmio: no pte for page 0x%08lx\n", f->page); return -1; } switch (level) { case PG_LEVEL_2M: - set_pmd_presence((pmd_t *)pte, present, old); + clear_pmd_presence((pmd_t *)pte, clear, &f->old_presence); break; case PG_LEVEL_4K: - set_pte_presence(pte, present, old); + clear_pte_presence(pte, clear, &f->old_presence); break; default: pr_err("kmmio: unexpected page level 0x%x.\n", level); return -1; } - __flush_tlb_one(addr); + __flush_tlb_one(f->page); return 0; } @@ -171,9 +173,9 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) WARN_ONCE(f->armed, KERN_ERR "kmmio page already armed.\n"); if (f->armed) { pr_warning("kmmio double-arm: page 0x%08lx, ref %d, old %d\n", - f->page, f->count, f->old_presence); + f->page, f->count, !!f->old_presence); } - ret = set_page_presence(f->page, false, &f->old_presence); + ret = clear_page_presence(f, true); WARN_ONCE(ret < 0, KERN_ERR "kmmio arming 0x%08lx failed.\n", f->page); f->armed = true; return ret; @@ -182,8 +184,7 @@ static int arm_kmmio_fault_page(struct kmmio_fault_page *f) /** Restore the given page to saved presence state. */ static void disarm_kmmio_fault_page(struct kmmio_fault_page *f) { - bool tmp; - int ret = set_page_presence(f->page, f->old_presence, &tmp); + int ret = clear_page_presence(f, false); WARN_ONCE(ret < 0, KERN_ERR "kmmio disarming 0x%08lx failed.\n", f->page); f->armed = false; From 0f9a623dd6c9b5b4dd00c232f29525bfc7a8ecf2 Mon Sep 17 00:00:00 2001 From: Stuart Bennett Date: Tue, 28 Apr 2009 20:17:51 +0100 Subject: [PATCH 0662/2061] tracing: x86, mmiotrace: only register for die notifier when tracer active Follow up to afcfe024aebd74b0984a41af9a34e009cf5badaf in Linus' tree ("x86: mmiotrace: quieten spurious warning message") Signed-off-by: Stuart Bennett Acked-by: Pekka Paalanen Cc: Steven Rostedt LKML-Reference: <1240946271-7083-5-git-send-email-stuart@freedesktop.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/kmmio.c | 27 ++++++++++++++++++++++----- arch/x86/mm/mmio-mod.c | 2 ++ include/linux/mmiotrace.h | 2 ++ 3 files changed, 26 insertions(+), 5 deletions(-) diff --git a/arch/x86/mm/kmmio.c b/arch/x86/mm/kmmio.c index a769d1a2d93..256ce643b0b 100644 --- a/arch/x86/mm/kmmio.c +++ b/arch/x86/mm/kmmio.c @@ -311,7 +311,12 @@ static int post_kmmio_handler(unsigned long condition, struct pt_regs *regs) struct kmmio_context *ctx = &get_cpu_var(kmmio_ctx); if (!ctx->active) { - pr_debug("kmmio: spurious debug trap on CPU %d.\n", + /* + * debug traps without an active context are due to either + * something external causing them (f.e. using a debugger while + * mmio tracing enabled), or erroneous behaviour + */ + pr_warning("kmmio: unexpected debug trap on CPU %d.\n", smp_processor_id()); goto out; } @@ -529,8 +534,8 @@ void unregister_kmmio_probe(struct kmmio_probe *p) } EXPORT_SYMBOL(unregister_kmmio_probe); -static int kmmio_die_notifier(struct notifier_block *nb, unsigned long val, - void *args) +static int +kmmio_die_notifier(struct notifier_block *nb, unsigned long val, void *args) { struct die_args *arg = args; @@ -545,11 +550,23 @@ static struct notifier_block nb_die = { .notifier_call = kmmio_die_notifier }; -static int __init init_kmmio(void) +int kmmio_init(void) { int i; + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) INIT_LIST_HEAD(&kmmio_page_table[i]); + return register_die_notifier(&nb_die); } -fs_initcall(init_kmmio); /* should be before device_initcall() */ + +void kmmio_cleanup(void) +{ + int i; + + unregister_die_notifier(&nb_die); + for (i = 0; i < KMMIO_PAGE_TABLE_SIZE; i++) { + WARN_ONCE(!list_empty(&kmmio_page_table[i]), + KERN_ERR "kmmio_page_table not empty at cleanup, any further tracing will leak memory.\n"); + } +} diff --git a/arch/x86/mm/mmio-mod.c b/arch/x86/mm/mmio-mod.c index c9342ed8b40..132772a8ec5 100644 --- a/arch/x86/mm/mmio-mod.c +++ b/arch/x86/mm/mmio-mod.c @@ -451,6 +451,7 @@ void enable_mmiotrace(void) if (nommiotrace) pr_info(NAME "MMIO tracing disabled.\n"); + kmmio_init(); enter_uniprocessor(); spin_lock_irq(&trace_lock); atomic_inc(&mmiotrace_enabled); @@ -473,6 +474,7 @@ void disable_mmiotrace(void) clear_trace_list(); /* guarantees: no more kmmio callbacks */ leave_uniprocessor(); + kmmio_cleanup(); pr_info(NAME "disabled.\n"); out: mutex_unlock(&mmiotrace_mutex); diff --git a/include/linux/mmiotrace.h b/include/linux/mmiotrace.h index 3d1b7bde128..97491f78b08 100644 --- a/include/linux/mmiotrace.h +++ b/include/linux/mmiotrace.h @@ -30,6 +30,8 @@ extern unsigned int kmmio_count; extern int register_kmmio_probe(struct kmmio_probe *p); extern void unregister_kmmio_probe(struct kmmio_probe *p); +extern int kmmio_init(void); +extern void kmmio_cleanup(void); #ifdef CONFIG_MMIOTRACE /* kmmio is active by some kmmio_probes? */ From fd0731944333db6e9e91b6954c6ef95f4b71ab04 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 12:56:58 +0200 Subject: [PATCH 0663/2061] x86, vmlinux.lds: fix relocatable symbols __init_begin/_end symbols should be inside sections as well, otherwise the relocatable kernel gets confused when freeing init sections in the wrong place. [ Impact: fix bootup crash ] Cc: Sam Ravnborg Cc: Tim Abbott Cc: Linus Torvalds LKML-Reference: <20090429105056.GA28720@uranus.ravnborg.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/vmlinux.lds.S | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vmlinux.lds.S b/arch/x86/kernel/vmlinux.lds.S index 0bdbaa57969..4c85b2e2bb6 100644 --- a/arch/x86/kernel/vmlinux.lds.S +++ b/arch/x86/kernel/vmlinux.lds.S @@ -255,8 +255,8 @@ SECTIONS /* Init code and data - will be freed after init */ . = ALIGN(PAGE_SIZE); - __init_begin = .; /* paired with __init_end */ .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; /* paired with __init_end */ _sinittext = .; INIT_TEXT _einittext = .; @@ -346,8 +346,11 @@ SECTIONS #endif . = ALIGN(PAGE_SIZE); + /* freed after init ends here */ - __init_end = .; + .init.end : AT(ADDR(.init.end) - LOAD_OFFSET) { + __init_end = .; + } #ifdef CONFIG_X86_64 .data_nosave : AT(ADDR(.data_nosave) - LOAD_OFFSET) { From 30e673b230f9d556eb81ef68a7b1a08c8b3b142c Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:47 -0500 Subject: [PATCH 0664/2061] tracing/filters: move preds into event_filter object Create a new event_filter object, and move the pred-related members out of the call and subsystem objects and into the filter object - the details of the filter implementation don't need to be exposed in the call and subsystem in any case, and it will also help make the new parser implementation a little cleaner. [ Impact: refactor trace-filter code to prepare for new features ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905887.6416.119.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 4 +- kernel/trace/trace.h | 10 ++- kernel/trace/trace_events.c | 3 +- kernel/trace/trace_events_filter.c | 107 ++++++++++++++++++----------- 4 files changed, 76 insertions(+), 48 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 78a9ba24cbf..46a27f2695a 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -101,8 +101,8 @@ struct ftrace_event_call { int (*show_format)(struct trace_seq *s); int (*define_fields)(void); struct list_head fields; - int n_preds; - struct filter_pred **preds; + int filter_active; + void *filter; void *mod; #ifdef CONFIG_EVENT_PROFILE diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7d55bcf50e4..1fb7d6ccadf 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -731,12 +731,16 @@ struct ftrace_event_field { int size; }; +struct event_filter { + int n_preds; + struct filter_pred **preds; +}; + struct event_subsystem { struct list_head list; const char *name; struct dentry *entry; - int n_preds; - struct filter_pred **preds; + void *filter; }; struct filter_pred; @@ -774,7 +778,7 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, struct ring_buffer *buffer, struct ring_buffer_event *event) { - if (unlikely(call->n_preds) && !filter_match_preds(call, rec)) { + if (unlikely(call->filter_active) && !filter_match_preds(call, rec)) { ring_buffer_discard_commit(buffer, event); return 1; } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index be4d3a437c1..1cd1f37373d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -757,8 +757,7 @@ event_subsystem_dir(const char *name, struct dentry *d_events) list_add(&system->list, &event_subsystems); - system->preds = NULL; - system->n_preds = 0; + system->filter = NULL; entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 65418288f95..1e861eca3d0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -93,11 +93,12 @@ static int filter_pred_none(struct filter_pred *pred, void *event) /* return 1 if event matches, 0 otherwise (discard) */ int filter_match_preds(struct ftrace_event_call *call, void *rec) { + struct event_filter *filter = call->filter; int i, matched, and_failed = 0; struct filter_pred *pred; - for (i = 0; i < call->n_preds; i++) { - pred = call->preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; if (and_failed && !pred->or) continue; matched = pred->fn(pred, rec); @@ -115,20 +116,20 @@ int filter_match_preds(struct ftrace_event_call *call, void *rec) } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct filter_pred **preds, int n_preds, +static void __filter_print_preds(struct event_filter *filter, struct trace_seq *s) { - char *field_name; struct filter_pred *pred; + char *field_name; int i; - if (!n_preds) { + if (!filter || !filter->n_preds) { trace_seq_printf(s, "none\n"); return; } - for (i = 0; i < n_preds; i++) { - pred = preds[i]; + for (i = 0; i < filter->n_preds; i++) { + pred = filter->preds[i]; field_name = pred->field_name; if (i) trace_seq_printf(s, pred->or ? "|| " : "&& "); @@ -144,7 +145,7 @@ static void __filter_print_preds(struct filter_pred **preds, int n_preds, void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(call->preds, call->n_preds, s); + __filter_print_preds(call->filter, s); mutex_unlock(&filter_mutex); } @@ -152,7 +153,7 @@ void filter_print_subsystem_preds(struct event_subsystem *system, struct trace_seq *s) { mutex_lock(&filter_mutex); - __filter_print_preds(system->preds, system->n_preds, s); + __filter_print_preds(system->filter, s); mutex_unlock(&filter_mutex); } @@ -200,12 +201,14 @@ static int filter_set_pred(struct filter_pred *dest, static void __filter_disable_preds(struct ftrace_event_call *call) { + struct event_filter *filter = call->filter; int i; - call->n_preds = 0; + call->filter_active = 0; + filter->n_preds = 0; for (i = 0; i < MAX_FILTER_PRED; i++) - call->preds[i]->fn = filter_pred_none; + filter->preds[i]->fn = filter_pred_none; } void filter_disable_preds(struct ftrace_event_call *call) @@ -217,32 +220,39 @@ void filter_disable_preds(struct ftrace_event_call *call) int init_preds(struct ftrace_event_call *call) { + struct event_filter *filter; struct filter_pred *pred; int i; - call->n_preds = 0; - - call->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!call->preds) + filter = call->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!call->filter) return -ENOMEM; + call->filter_active = 0; + filter->n_preds = 0; + + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); + if (!filter->preds) + goto oom; + for (i = 0; i < MAX_FILTER_PRED; i++) { pred = kzalloc(sizeof(*pred), GFP_KERNEL); if (!pred) goto oom; pred->fn = filter_pred_none; - call->preds[i] = pred; + filter->preds[i] = pred; } return 0; oom: for (i = 0; i < MAX_FILTER_PRED; i++) { - if (call->preds[i]) - filter_free_pred(call->preds[i]); + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); } - kfree(call->preds); - call->preds = NULL; + kfree(filter->preds); + kfree(call->filter); + call->filter = NULL; return -ENOMEM; } @@ -250,15 +260,16 @@ EXPORT_SYMBOL_GPL(init_preds); static void __filter_free_subsystem_preds(struct event_subsystem *system) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (system->n_preds) { - for (i = 0; i < system->n_preds; i++) - filter_free_pred(system->preds[i]); - kfree(system->preds); - system->preds = NULL; - system->n_preds = 0; + if (filter && filter->n_preds) { + for (i = 0; i < filter->n_preds; i++) + filter_free_pred(filter->preds[i]); + kfree(filter->preds); + kfree(filter); + system->filter = NULL; } list_for_each_entry(call, &ftrace_events, list) { @@ -281,21 +292,23 @@ static int filter_add_pred_fn(struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { + struct event_filter *filter = call->filter; int idx, err; - if (call->n_preds && !pred->compound) + if (filter->n_preds && !pred->compound) __filter_disable_preds(call); - if (call->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) return -ENOSPC; - idx = call->n_preds; - filter_clear_pred(call->preds[idx]); - err = filter_set_pred(call->preds[idx], pred, fn); + idx = filter->n_preds; + filter_clear_pred(filter->preds[idx]); + err = filter_set_pred(filter->preds[idx], pred, fn); if (err) return err; - call->n_preds++; + filter->n_preds++; + call->filter_active = 1; return 0; } @@ -366,29 +379,41 @@ int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) int filter_add_subsystem_pred(struct event_subsystem *system, struct filter_pred *pred) { + struct event_filter *filter = system->filter; struct ftrace_event_call *call; mutex_lock(&filter_mutex); - if (system->n_preds && !pred->compound) + if (filter && filter->n_preds && !pred->compound) { __filter_free_subsystem_preds(system); + filter = NULL; + } - if (!system->n_preds) { - system->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), + if (!filter) { + system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); + if (!system->filter) { + mutex_unlock(&filter_mutex); + return -ENOMEM; + } + filter = system->filter; + filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!system->preds) { + + if (!filter->preds) { + kfree(system->filter); + system->filter = NULL; mutex_unlock(&filter_mutex); return -ENOMEM; } } - if (system->n_preds == MAX_FILTER_PRED) { + if (filter->n_preds == MAX_FILTER_PRED) { mutex_unlock(&filter_mutex); return -ENOSPC; } - system->preds[system->n_preds] = pred; - system->n_preds++; + filter->preds[filter->n_preds] = pred; + filter->n_preds++; list_for_each_entry(call, &ftrace_events, list) { int err; @@ -401,8 +426,8 @@ int filter_add_subsystem_pred(struct event_subsystem *system, err = __filter_add_pred(call, pred); if (err == -ENOMEM) { - system->preds[system->n_preds] = NULL; - system->n_preds--; + filter->preds[filter->n_preds] = NULL; + filter->n_preds--; mutex_unlock(&filter_mutex); return err; } From a118e4d1402f1349fe3d953493e4168a300a752d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:53 -0500 Subject: [PATCH 0665/2061] tracing/filters: distinguish between signed and unsigned fields The new filter comparison ops need to be able to distinguish between signed and unsigned field types, so add an is_signed flag/param to the event field struct/trace_define_fields(). Also define a simple macro, is_signed_type() to determine the signedness at compile time, used in the trace macros. If the is_signed_type() macro won't work with a specific type, a new slightly modified version of TRACE_FIELD() called TRACE_FIELD_SIGN(), allows the signedness to be set explicitly. [ Impact: extend trace-filter code for new feature ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905893.6416.120.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 7 ++++--- include/trace/ftrace.h | 16 ++++++++-------- kernel/trace/trace.h | 1 + kernel/trace/trace_event_types.h | 4 ++-- kernel/trace/trace_events.c | 3 ++- kernel/trace/trace_export.c | 29 ++++++++++++++++++++++------- 6 files changed, 39 insertions(+), 21 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 46a27f2695a..e61a7403f3d 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -122,8 +122,9 @@ extern int filter_current_check_discard(struct ftrace_event_call *call, struct ring_buffer_event *event); extern int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size); + char *name, int offset, int size, int is_signed); +#define is_signed_type(type) (((type)(-1)) < 0) /* * The double __builtin_constant_p is because gcc will give us an error @@ -144,10 +145,10 @@ do { \ __trace_printk(ip, fmt, ##args); \ } while (0) -#define __common_field(type, item) \ +#define __common_field(type, item, is_signed) \ ret = trace_define_field(event_call, #type, "common_" #item, \ offsetof(typeof(field.ent), item), \ - sizeof(field.ent.item)); \ + sizeof(field.ent.item), is_signed); \ if (ret) \ return ret; diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 1e681142f1d..edb02bc9f8f 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -225,7 +225,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __field(type, item) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -234,7 +234,7 @@ ftrace_format_##call(struct trace_seq *s) \ BUILD_BUG_ON(len > MAX_FILTER_STR_VAL); \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ if (ret) \ return ret; @@ -242,7 +242,7 @@ ftrace_format_##call(struct trace_seq *s) \ #define __string(item, src) \ ret = trace_define_field(event_call, "__str_loc", #item, \ offsetof(typeof(field), __str_loc_##item), \ - sizeof(field.__str_loc_##item)); + sizeof(field.__str_loc_##item), 0); #undef TRACE_EVENT #define TRACE_EVENT(call, proto, args, tstruct, func, print) \ @@ -253,11 +253,11 @@ ftrace_define_fields_##call(void) \ struct ftrace_event_call *event_call = &event_##call; \ int ret; \ \ - __common_field(int, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(int, type, 1); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 1fb7d6ccadf..866d0108fd2 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -729,6 +729,7 @@ struct ftrace_event_field { char *type; int offset; int size; + int is_signed; }; struct event_filter { diff --git a/kernel/trace/trace_event_types.h b/kernel/trace/trace_event_types.h index cfcecc4fd86..5e32e375134 100644 --- a/kernel/trace/trace_event_types.h +++ b/kernel/trace/trace_event_types.h @@ -141,8 +141,8 @@ TRACE_EVENT_FORMAT(hw_branch, TRACE_HW_BRANCHES, hw_branch_entry, ignore, TRACE_EVENT_FORMAT(power, TRACE_POWER, trace_power, ignore, TRACE_STRUCT( - TRACE_FIELD(ktime_t, state_data.stamp, stamp) - TRACE_FIELD(ktime_t, state_data.end, end) + TRACE_FIELD_SIGN(ktime_t, state_data.stamp, stamp, 1) + TRACE_FIELD_SIGN(ktime_t, state_data.end, end, 1) TRACE_FIELD(int, state_data.type, type) TRACE_FIELD(int, state_data.state, state) ), diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 1cd1f37373d..bbbea747937 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -26,7 +26,7 @@ static DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); int trace_define_field(struct ftrace_event_call *call, char *type, - char *name, int offset, int size) + char *name, int offset, int size, int is_signed) { struct ftrace_event_field *field; @@ -44,6 +44,7 @@ int trace_define_field(struct ftrace_event_call *call, char *type, field->offset = offset; field->size = size; + field->is_signed = is_signed; list_add(&field->link, &call->fields); return 0; diff --git a/kernel/trace/trace_export.c b/kernel/trace/trace_export.c index 0cb1a142c74..d06cf898dc8 100644 --- a/kernel/trace/trace_export.c +++ b/kernel/trace/trace_export.c @@ -50,6 +50,9 @@ extern void __bad_type_size(void); if (!ret) \ return 0; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) #undef TP_RAW_FMT #define TP_RAW_FMT(args...) args @@ -98,6 +101,10 @@ ftrace_format_##call(struct trace_seq *s) \ #define TRACE_FIELD(type, item, assign)\ entry->item = assign; +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + TRACE_FIELD(type, item, assign) + #undef TP_CMD #define TP_CMD(cmd...) cmd @@ -149,7 +156,7 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD(type, item, assign) \ ret = trace_define_field(event_call, #type, #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), is_signed_type(type)); \ if (ret) \ return ret; @@ -157,7 +164,15 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ #define TRACE_FIELD_SPECIAL(type, item, len, cmd) \ ret = trace_define_field(event_call, #type "[" #len "]", #item, \ offsetof(typeof(field), item), \ - sizeof(field.item)); \ + sizeof(field.item), 0); \ + if (ret) \ + return ret; + +#undef TRACE_FIELD_SIGN +#define TRACE_FIELD_SIGN(type, item, assign, is_signed) \ + ret = trace_define_field(event_call, #type, #item, \ + offsetof(typeof(field), item), \ + sizeof(field.item), is_signed); \ if (ret) \ return ret; @@ -173,11 +188,11 @@ ftrace_define_fields_##call(void) \ struct args field; \ int ret; \ \ - __common_field(unsigned char, type); \ - __common_field(unsigned char, flags); \ - __common_field(unsigned char, preempt_count); \ - __common_field(int, pid); \ - __common_field(int, tgid); \ + __common_field(unsigned char, type, 0); \ + __common_field(unsigned char, flags, 0); \ + __common_field(unsigned char, preempt_count, 0); \ + __common_field(int, pid, 1); \ + __common_field(int, tgid, 1); \ \ tstruct; \ \ From 8b3725621074040d380664964ffbc40610aef8c6 Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Tue, 28 Apr 2009 03:04:59 -0500 Subject: [PATCH 0666/2061] tracing/filters: a better event parser Replace the current event parser hack with a better one. Filters are no longer specified predicate by predicate, but all at once and can use parens and any of the following operators: numeric fields: ==, !=, <, <=, >, >= string fields: ==, != predicates can be combined with the logical operators: &&, || examples: "common_preempt_count > 4" > filter "((sig >= 10 && sig < 15) || sig == 17) && comm != bash" > filter If there was an error, the erroneous string along with an error message can be seen by looking at the filter e.g.: ((sig >= 10 && sig < 15) || dsig == 17) && comm != bash ^ parse_error: Field not found Currently the caret for an error always appears at the beginning of the filter; a real position should be used, but the error message should be useful even without it. To clear a filter, '0' can be written to the filter file. Filters can also be set or cleared for a complete subsystem by writing the same filter as would be written to an individual event to the filter file at the root of the subsytem. Note however, that if any event in the subsystem lacks a field specified in the filter being set, the set will fail and all filters in the subsytem are automatically cleared. This change from the previous version was made because using only the fields that happen to exist for a given event would most likely result in a meaningless filter. Because the logical operators are now implemented as predicates, the maximum number of predicates in a filter was increased from 8 to 16. [ Impact: add new, extended trace-filter implementation ] Signed-off-by: Tom Zanussi Acked-by: Steven Rostedt Cc: fweisbec@gmail.com Cc: Li Zefan LKML-Reference: <1240905899.6416.121.camel@tropicana> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 2 +- kernel/trace/trace.h | 66 +- kernel/trace/trace_events.c | 90 +-- kernel/trace/trace_events_filter.c | 1044 +++++++++++++++++++++------- 4 files changed, 898 insertions(+), 304 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index e61a7403f3d..5fff40c9ff5 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -112,7 +112,7 @@ struct ftrace_event_call { #endif }; -#define MAX_FILTER_PRED 8 +#define MAX_FILTER_PRED 32 #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 866d0108fd2..7736fe8c1b7 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -735,6 +735,7 @@ struct ftrace_event_field { struct event_filter { int n_preds; struct filter_pred **preds; + char *filter_string; }; struct event_subsystem { @@ -746,7 +747,8 @@ struct event_subsystem { struct filter_pred; -typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event); +typedef int (*filter_pred_fn_t) (struct filter_pred *pred, void *event, + int val1, int val2); struct filter_pred { filter_pred_fn_t fn; @@ -756,23 +758,18 @@ struct filter_pred { char *field_name; int offset; int not; - int or; - int compound; - int clear; + int op; + int pop_n; }; -extern void filter_free_pred(struct filter_pred *pred); -extern void filter_print_preds(struct ftrace_event_call *call, +extern void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s); -extern int filter_parse(char **pbuf, struct filter_pred *pred); -extern int filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred); -extern void filter_disable_preds(struct ftrace_event_call *call); -extern void filter_free_subsystem_preds(struct event_subsystem *system); -extern void filter_print_subsystem_preds(struct event_subsystem *system, +extern int apply_event_filter(struct ftrace_event_call *call, + char *filter_string); +extern int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string); +extern void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s); -extern int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred); static inline int filter_check_discard(struct ftrace_event_call *call, void *rec, @@ -787,6 +784,47 @@ filter_check_discard(struct ftrace_event_call *call, void *rec, return 0; } +#define DEFINE_COMPARISON_PRED(type) \ +static int filter_pred_##type(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + type *addr = (type *)(event + pred->offset); \ + type val = (type)pred->val; \ + int match = 0; \ + \ + switch (pred->op) { \ + case OP_LT: \ + match = (*addr < val); \ + break; \ + case OP_LE: \ + match = (*addr <= val); \ + break; \ + case OP_GT: \ + match = (*addr > val); \ + break; \ + case OP_GE: \ + match = (*addr >= val); \ + break; \ + default: \ + break; \ + } \ + \ + return match; \ +} + +#define DEFINE_EQUALITY_PRED(size) \ +static int filter_pred_##size(struct filter_pred *pred, void *event, \ + int val1, int val2) \ +{ \ + u##size *addr = (u##size *)(event + pred->offset); \ + u##size val = (u##size)pred->val; \ + int match; \ + \ + match = (val == *addr) ^ pred->not; \ + \ + return match; \ +} + extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index bbbea747937..f789ca540fe 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -492,7 +492,7 @@ event_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_preds(call, s); + print_event_filter(call, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -505,40 +505,26 @@ event_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct ftrace_event_call *call = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; + } + buf[cnt] = '\0'; + + err = apply_event_filter(call, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } - - if (pred->clear) { - filter_disable_preds(call); - filter_free_pred(pred); - return cnt; - } - - err = filter_add_pred(call, pred); - if (err < 0) { - filter_free_pred(pred); - return err; - } - - filter_free_pred(pred); *ppos += cnt; @@ -562,7 +548,7 @@ subsystem_filter_read(struct file *filp, char __user *ubuf, size_t cnt, trace_seq_init(s); - filter_print_subsystem_preds(system, s); + print_subsystem_event_filter(system, s); r = simple_read_from_buffer(ubuf, cnt, ppos, s->buffer, s->len); kfree(s); @@ -575,38 +561,26 @@ subsystem_filter_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) { struct event_subsystem *system = filp->private_data; - char buf[64], *pbuf = buf; - struct filter_pred *pred; + char *buf; int err; - if (cnt >= sizeof(buf)) + if (cnt >= PAGE_SIZE) return -EINVAL; - if (copy_from_user(&buf, ubuf, cnt)) - return -EFAULT; - buf[cnt] = '\0'; - - pred = kzalloc(sizeof(*pred), GFP_KERNEL); - if (!pred) + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) return -ENOMEM; - err = filter_parse(&pbuf, pred); - if (err < 0) { - filter_free_pred(pred); - return err; + if (copy_from_user(buf, ubuf, cnt)) { + free_page((unsigned long) buf); + return -EFAULT; } + buf[cnt] = '\0'; - if (pred->clear) { - filter_free_subsystem_preds(system); - filter_free_pred(pred); - return cnt; - } - - err = filter_add_subsystem_pred(system, pred); - if (err < 0) { - filter_free_pred(pred); + err = apply_subsystem_event_filter(system, buf); + free_page((unsigned long) buf); + if (err < 0) return err; - } *ppos += cnt; @@ -760,11 +734,21 @@ event_subsystem_dir(const char *name, struct dentry *d_events) system->filter = NULL; + system->filter = kzalloc(sizeof(struct event_filter), GFP_KERNEL); + if (!system->filter) { + pr_warning("Could not allocate filter for subsystem " + "'%s'\n", name); + return system->entry; + } + entry = debugfs_create_file("filter", 0644, system->entry, system, &ftrace_subsystem_filter_fops); - if (!entry) + if (!entry) { + kfree(system->filter); + system->filter = NULL; pr_warning("Could not create debugfs " "'%s/filter' entry\n", name); + } return system->entry; } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 1e861eca3d0..f49486687ee 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -29,51 +29,130 @@ static DEFINE_MUTEX(filter_mutex); -static int filter_pred_64(struct filter_pred *pred, void *event) +enum filter_op_ids { - u64 *addr = (u64 *)(event + pred->offset); - u64 val = (u64)pred->val; - int match; + OP_OR, + OP_AND, + OP_NE, + OP_EQ, + OP_LT, + OP_LE, + OP_GT, + OP_GE, + OP_NONE, + OP_OPEN_PAREN, +}; - match = (val == *addr) ^ pred->not; +struct filter_op { + int id; + char *string; + int precedence; +}; - return match; +static struct filter_op filter_ops[] = { + { OP_OR, "||", 1 }, + { OP_AND, "&&", 2 }, + { OP_NE, "!=", 4 }, + { OP_EQ, "==", 4 }, + { OP_LT, "<", 5 }, + { OP_LE, "<=", 5 }, + { OP_GT, ">", 5 }, + { OP_GE, ">=", 5 }, + { OP_NONE, "OP_NONE", 0 }, + { OP_OPEN_PAREN, "(", 0 }, +}; + +enum { + FILT_ERR_NONE, + FILT_ERR_INVALID_OP, + FILT_ERR_UNBALANCED_PAREN, + FILT_ERR_TOO_MANY_OPERANDS, + FILT_ERR_OPERAND_TOO_LONG, + FILT_ERR_FIELD_NOT_FOUND, + FILT_ERR_ILLEGAL_FIELD_OP, + FILT_ERR_ILLEGAL_INTVAL, + FILT_ERR_BAD_SUBSYS_FILTER, + FILT_ERR_TOO_MANY_PREDS, + FILT_ERR_MISSING_FIELD, + FILT_ERR_INVALID_FILTER, +}; + +static char *err_text[] = { + "No error", + "Invalid operator", + "Unbalanced parens", + "Too many operands", + "Operand too long", + "Field not found", + "Illegal operation for field type", + "Illegal integer value", + "Couldn't find or set field in one of a subsystem's events", + "Too many terms in predicate expression", + "Missing field name and/or value", + "Meaningless filter expression", +}; + +struct opstack_op { + int op; + struct list_head list; +}; + +struct postfix_elt { + int op; + char *operand; + struct list_head list; +}; + +struct filter_parse_state { + struct filter_op *ops; + struct list_head opstack; + struct list_head postfix; + int lasterr; + int lasterr_pos; + + struct { + char *string; + unsigned int cnt; + unsigned int tail; + } infix; + + struct { + char string[MAX_FILTER_STR_VAL]; + int pos; + unsigned int tail; + } operand; +}; + +DEFINE_COMPARISON_PRED(s64); +DEFINE_COMPARISON_PRED(u64); +DEFINE_COMPARISON_PRED(s32); +DEFINE_COMPARISON_PRED(u32); +DEFINE_COMPARISON_PRED(s16); +DEFINE_COMPARISON_PRED(u16); +DEFINE_COMPARISON_PRED(s8); +DEFINE_COMPARISON_PRED(u8); + +DEFINE_EQUALITY_PRED(64); +DEFINE_EQUALITY_PRED(32); +DEFINE_EQUALITY_PRED(16); +DEFINE_EQUALITY_PRED(8); + +static int filter_pred_and(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) +{ + return val1 && val2; } -static int filter_pred_32(struct filter_pred *pred, void *event) +static int filter_pred_or(struct filter_pred *pred __attribute((unused)), + void *event __attribute((unused)), + int val1, int val2) { - u32 *addr = (u32 *)(event + pred->offset); - u32 val = (u32)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; + return val1 || val2; } -static int filter_pred_16(struct filter_pred *pred, void *event) -{ - u16 *addr = (u16 *)(event + pred->offset); - u16 val = (u16)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_8(struct filter_pred *pred, void *event) -{ - u8 *addr = (u8 *)(event + pred->offset); - u8 val = (u8)pred->val; - int match; - - match = (val == *addr) ^ pred->not; - - return match; -} - -static int filter_pred_string(struct filter_pred *pred, void *event) +static int filter_pred_string(struct filter_pred *pred, void *event, + int val1, int val2) { char *addr = (char *)(event + pred->offset); int cmp, match; @@ -85,7 +164,8 @@ static int filter_pred_string(struct filter_pred *pred, void *event) return match; } -static int filter_pred_none(struct filter_pred *pred, void *event) +static int filter_pred_none(struct filter_pred *pred, void *event, + int val1, int val2) { return 0; } @@ -94,66 +174,119 @@ static int filter_pred_none(struct filter_pred *pred, void *event) int filter_match_preds(struct ftrace_event_call *call, void *rec) { struct event_filter *filter = call->filter; - int i, matched, and_failed = 0; + int match, top = 0, val1 = 0, val2 = 0; + int stack[MAX_FILTER_PRED]; struct filter_pred *pred; + int i; for (i = 0; i < filter->n_preds; i++) { pred = filter->preds[i]; - if (and_failed && !pred->or) + if (!pred->pop_n) { + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; continue; - matched = pred->fn(pred, rec); - if (!matched && !pred->or) { - and_failed = 1; - continue; - } else if (matched && pred->or) - return 1; + } + if (pred->pop_n > top) { + WARN_ON_ONCE(1); + return 0; + } + val1 = stack[--top]; + val2 = stack[--top]; + match = pred->fn(pred, rec, val1, val2); + stack[top++] = match; } - if (and_failed) - return 0; - - return 1; + return stack[--top]; } EXPORT_SYMBOL_GPL(filter_match_preds); -static void __filter_print_preds(struct event_filter *filter, - struct trace_seq *s) +static void parse_error(struct filter_parse_state *ps, int err, int pos) { - struct filter_pred *pred; - char *field_name; - int i; - - if (!filter || !filter->n_preds) { - trace_seq_printf(s, "none\n"); - return; - } - - for (i = 0; i < filter->n_preds; i++) { - pred = filter->preds[i]; - field_name = pred->field_name; - if (i) - trace_seq_printf(s, pred->or ? "|| " : "&& "); - trace_seq_printf(s, "%s ", field_name); - trace_seq_printf(s, pred->not ? "!= " : "== "); - if (pred->str_len) - trace_seq_printf(s, "%s\n", pred->str_val); - else - trace_seq_printf(s, "%llu\n", pred->val); - } + ps->lasterr = err; + ps->lasterr_pos = pos; } -void filter_print_preds(struct ftrace_event_call *call, struct trace_seq *s) +static void remove_filter_string(struct event_filter *filter) { + kfree(filter->filter_string); + filter->filter_string = NULL; +} + +static int replace_filter_string(struct event_filter *filter, + char *filter_string) +{ + kfree(filter->filter_string); + filter->filter_string = kstrdup(filter_string, GFP_KERNEL); + if (!filter->filter_string) + return -ENOMEM; + + return 0; +} + +static int append_filter_string(struct event_filter *filter, + char *string) +{ + int newlen; + char *new_filter_string; + + BUG_ON(!filter->filter_string); + newlen = strlen(filter->filter_string) + strlen(string) + 1; + new_filter_string = kmalloc(newlen, GFP_KERNEL); + if (!new_filter_string) + return -ENOMEM; + + strcpy(new_filter_string, filter->filter_string); + strcat(new_filter_string, string); + kfree(filter->filter_string); + filter->filter_string = new_filter_string; + + return 0; +} + +static void append_filter_err(struct filter_parse_state *ps, + struct event_filter *filter) +{ + int pos = ps->lasterr_pos; + char *buf, *pbuf; + + buf = (char *)__get_free_page(GFP_TEMPORARY); + if (!buf) + return; + + append_filter_string(filter, "\n"); + memset(buf, ' ', PAGE_SIZE); + if (pos > PAGE_SIZE - 128) + pos = 0; + buf[pos] = '^'; + pbuf = &buf[pos] + 1; + + sprintf(pbuf, "\nparse_error: %s\n", err_text[ps->lasterr]); + append_filter_string(filter, buf); + free_page((unsigned long) buf); +} + +void print_event_filter(struct ftrace_event_call *call, struct trace_seq *s) +{ + struct event_filter *filter = call->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(call->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } -void filter_print_subsystem_preds(struct event_subsystem *system, +void print_subsystem_event_filter(struct event_subsystem *system, struct trace_seq *s) { + struct event_filter *filter = system->filter; + mutex_lock(&filter_mutex); - __filter_print_preds(system->filter, s); + if (filter->filter_string) + trace_seq_printf(s, "%s\n", filter->filter_string); + else + trace_seq_printf(s, "none\n"); mutex_unlock(&filter_mutex); } @@ -170,7 +303,7 @@ find_event_field(struct ftrace_event_call *call, char *name) return NULL; } -void filter_free_pred(struct filter_pred *pred) +static void filter_free_pred(struct filter_pred *pred) { if (!pred) return; @@ -191,15 +324,17 @@ static int filter_set_pred(struct filter_pred *dest, filter_pred_fn_t fn) { *dest = *src; - dest->field_name = kstrdup(src->field_name, GFP_KERNEL); - if (!dest->field_name) - return -ENOMEM; + if (src->field_name) { + dest->field_name = kstrdup(src->field_name, GFP_KERNEL); + if (!dest->field_name) + return -ENOMEM; + } dest->fn = fn; return 0; } -static void __filter_disable_preds(struct ftrace_event_call *call) +static void filter_disable_preds(struct ftrace_event_call *call) { struct event_filter *filter = call->filter; int i; @@ -211,13 +346,6 @@ static void __filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } -void filter_disable_preds(struct ftrace_event_call *call) -{ - mutex_lock(&filter_mutex); - __filter_disable_preds(call); - mutex_unlock(&filter_mutex); -} - int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -258,48 +386,43 @@ oom: } EXPORT_SYMBOL_GPL(init_preds); -static void __filter_free_subsystem_preds(struct event_subsystem *system) +static void filter_free_subsystem_preds(struct event_subsystem *system) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; int i; - if (filter && filter->n_preds) { + if (filter->n_preds) { for (i = 0; i < filter->n_preds; i++) filter_free_pred(filter->preds[i]); kfree(filter->preds); - kfree(filter); - system->filter = NULL; + filter->preds = NULL; + filter->n_preds = 0; } list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; - if (!strcmp(call->system, system->name)) - __filter_disable_preds(call); + if (!strcmp(call->system, system->name)) { + filter_disable_preds(call); + remove_filter_string(call->filter); + } } } -void filter_free_subsystem_preds(struct event_subsystem *system) -{ - mutex_lock(&filter_mutex); - __filter_free_subsystem_preds(system); - mutex_unlock(&filter_mutex); -} - -static int filter_add_pred_fn(struct ftrace_event_call *call, +static int filter_add_pred_fn(struct filter_parse_state *ps, + struct ftrace_event_call *call, struct filter_pred *pred, filter_pred_fn_t fn) { struct event_filter *filter = call->filter; int idx, err; - if (filter->n_preds && !pred->compound) - __filter_disable_preds(call); - - if (filter->n_preds == MAX_FILTER_PRED) + if (filter->n_preds == MAX_FILTER_PRED) { + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; + } idx = filter->n_preds; filter_clear_pred(filter->preds[idx]); @@ -321,94 +444,132 @@ static int is_string_field(const char *type) return 0; } -static int __filter_add_pred(struct ftrace_event_call *call, - struct filter_pred *pred) +static int is_legal_op(struct ftrace_event_field *field, int op) +{ + if (is_string_field(field->type) && (op != OP_EQ && op != OP_NE)) + return 0; + + return 1; +} + +static filter_pred_fn_t select_comparison_fn(int op, int field_size, + int field_is_signed) +{ + filter_pred_fn_t fn = NULL; + + switch (field_size) { + case 8: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_64; + else if (field_is_signed) + fn = filter_pred_s64; + else + fn = filter_pred_u64; + break; + case 4: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_32; + else if (field_is_signed) + fn = filter_pred_s32; + else + fn = filter_pred_u32; + break; + case 2: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_16; + else if (field_is_signed) + fn = filter_pred_s16; + else + fn = filter_pred_u16; + break; + case 1: + if (op == OP_EQ || op == OP_NE) + fn = filter_pred_8; + else if (field_is_signed) + fn = filter_pred_s8; + else + fn = filter_pred_u8; + break; + } + + return fn; +} + +static int filter_add_pred(struct filter_parse_state *ps, + struct ftrace_event_call *call, + struct filter_pred *pred) { struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; - field = find_event_field(call, pred->field_name); - if (!field) - return -EINVAL; - pred->fn = filter_pred_none; + + if (pred->op == OP_AND) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_and); + } else if (pred->op == OP_OR) { + pred->pop_n = 2; + return filter_add_pred_fn(ps, call, pred, filter_pred_or); + } + + field = find_event_field(call, pred->field_name); + if (!field) { + parse_error(ps, FILT_ERR_FIELD_NOT_FOUND, 0); + return -EINVAL; + } + pred->offset = field->offset; + if (!is_legal_op(field, pred->op)) { + parse_error(ps, FILT_ERR_ILLEGAL_FIELD_OP, 0); + return -EINVAL; + } + if (is_string_field(field->type)) { fn = filter_pred_string; pred->str_len = field->size; - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + return filter_add_pred_fn(ps, call, pred, fn); } else { - if (strict_strtoull(pred->str_val, 0, &val)) + if (strict_strtoull(pred->str_val, 0, &val)) { + parse_error(ps, FILT_ERR_ILLEGAL_INTVAL, 0); return -EINVAL; + } pred->val = val; } - switch (field->size) { - case 8: - fn = filter_pred_64; - break; - case 4: - fn = filter_pred_32; - break; - case 2: - fn = filter_pred_16; - break; - case 1: - fn = filter_pred_8; - break; - default: + fn = select_comparison_fn(pred->op, field->size, field->is_signed); + if (!fn) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); return -EINVAL; } - return filter_add_pred_fn(call, pred, fn); + if (pred->op == OP_NE) + pred->not = 1; + + return filter_add_pred_fn(ps, call, pred, fn); } -int filter_add_pred(struct ftrace_event_call *call, struct filter_pred *pred) -{ - int err; - - mutex_lock(&filter_mutex); - err = __filter_add_pred(call, pred); - mutex_unlock(&filter_mutex); - - return err; -} - -int filter_add_subsystem_pred(struct event_subsystem *system, - struct filter_pred *pred) +static int filter_add_subsystem_pred(struct filter_parse_state *ps, + struct event_subsystem *system, + struct filter_pred *pred, + char *filter_string) { struct event_filter *filter = system->filter; struct ftrace_event_call *call; - mutex_lock(&filter_mutex); - - if (filter && filter->n_preds && !pred->compound) { - __filter_free_subsystem_preds(system); - filter = NULL; - } - - if (!filter) { - system->filter = kzalloc(sizeof(*filter), GFP_KERNEL); - if (!system->filter) { - mutex_unlock(&filter_mutex); - return -ENOMEM; - } - filter = system->filter; + if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), GFP_KERNEL); - if (!filter->preds) { - kfree(system->filter); - system->filter = NULL; - mutex_unlock(&filter_mutex); + if (!filter->preds) return -ENOMEM; - } } if (filter->n_preds == MAX_FILTER_PRED) { - mutex_unlock(&filter_mutex); + parse_error(ps, FILT_ERR_TOO_MANY_PREDS, 0); return -ENOSPC; } @@ -424,97 +585,508 @@ int filter_add_subsystem_pred(struct event_subsystem *system, if (strcmp(call->system, system->name)) continue; - err = __filter_add_pred(call, pred); - if (err == -ENOMEM) { - filter->preds[filter->n_preds] = NULL; - filter->n_preds--; - mutex_unlock(&filter_mutex); + err = filter_add_pred(ps, call, pred); + if (err) { + filter_free_subsystem_preds(system); + parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); return err; } + replace_filter_string(call->filter, filter_string); } - mutex_unlock(&filter_mutex); - return 0; } -/* - * The filter format can be - * - 0, which means remove all filter preds - * - [||/&&] ==/!= - */ -int filter_parse(char **pbuf, struct filter_pred *pred) +static void parse_init(struct filter_parse_state *ps, + struct filter_op *ops, + char *infix_string) { - char *tok, *val_str = NULL; - int tok_n = 0; + memset(ps, '\0', sizeof(*ps)); - while ((tok = strsep(pbuf, " \n"))) { - if (tok_n == 0) { - if (!strcmp(tok, "0")) { - pred->clear = 1; - return 0; - } else if (!strcmp(tok, "&&")) { - pred->or = 0; - pred->compound = 1; - } else if (!strcmp(tok, "||")) { - pred->or = 1; - pred->compound = 1; - } else - pred->field_name = tok; - tok_n = 1; - continue; - } - if (tok_n == 1) { - if (!pred->field_name) - pred->field_name = tok; - else if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - tok_n = 2; - continue; - } - if (tok_n == 2) { - if (pred->compound) { - if (!strcmp(tok, "!=")) - pred->not = 1; - else if (!strcmp(tok, "==")) - pred->not = 0; - else { - pred->field_name = NULL; - return -EINVAL; - } - } else { - val_str = tok; - break; /* done */ - } - tok_n = 3; - continue; - } - if (tok_n == 3) { - val_str = tok; - break; /* done */ + ps->infix.string = infix_string; + ps->infix.cnt = strlen(infix_string); + ps->ops = ops; + + INIT_LIST_HEAD(&ps->opstack); + INIT_LIST_HEAD(&ps->postfix); +} + +static char infix_next(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + + return ps->infix.string[ps->infix.tail++]; +} + +static char infix_peek(struct filter_parse_state *ps) +{ + if (ps->infix.tail == strlen(ps->infix.string)) + return 0; + + return ps->infix.string[ps->infix.tail]; +} + +static void infix_advance(struct filter_parse_state *ps) +{ + ps->infix.cnt--; + ps->infix.tail++; +} + +static inline int is_precedence_lower(struct filter_parse_state *ps, + int a, int b) +{ + return ps->ops[a].precedence < ps->ops[b].precedence; +} + +static inline int is_op_char(struct filter_parse_state *ps, char c) +{ + int i; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (ps->ops[i].string[0] == c) + return 1; + } + + return 0; +} + +static int infix_get_op(struct filter_parse_state *ps, char firstc) +{ + char nextc = infix_peek(ps); + char opstr[3]; + int i; + + opstr[0] = firstc; + opstr[1] = nextc; + opstr[2] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) { + infix_advance(ps); + return ps->ops[i].id; } } - if (!val_str || !strlen(val_str) - || strlen(val_str) >= MAX_FILTER_STR_VAL) { - pred->field_name = NULL; + opstr[1] = '\0'; + + for (i = 0; strcmp(ps->ops[i].string, "OP_NONE"); i++) { + if (!strcmp(opstr, ps->ops[i].string)) + return ps->ops[i].id; + } + + return OP_NONE; +} + +static inline void clear_operand_string(struct filter_parse_state *ps) +{ + memset(ps->operand.string, '\0', MAX_FILTER_STR_VAL); + ps->operand.tail = 0; +} + +static inline int append_operand_char(struct filter_parse_state *ps, char c) +{ + if (ps->operand.tail == MAX_FILTER_STR_VAL) + return -EINVAL; + + ps->operand.string[ps->operand.tail++] = c; + + return 0; +} + +static int filter_opstack_push(struct filter_parse_state *ps, int op) +{ + struct opstack_op *opstack_op; + + opstack_op = kmalloc(sizeof(*opstack_op), GFP_KERNEL); + if (!opstack_op) + return -ENOMEM; + + opstack_op->op = op; + list_add(&opstack_op->list, &ps->opstack); + + return 0; +} + +static int filter_opstack_empty(struct filter_parse_state *ps) +{ + return list_empty(&ps->opstack); +} + +static int filter_opstack_top(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + + return opstack_op->op; +} + +static int filter_opstack_pop(struct filter_parse_state *ps) +{ + struct opstack_op *opstack_op; + int op; + + if (filter_opstack_empty(ps)) + return OP_NONE; + + opstack_op = list_first_entry(&ps->opstack, struct opstack_op, list); + op = opstack_op->op; + list_del(&opstack_op->list); + + kfree(opstack_op); + + return op; +} + +static void filter_opstack_clear(struct filter_parse_state *ps) +{ + while (!filter_opstack_empty(ps)) + filter_opstack_pop(ps); +} + +static char *curr_operand(struct filter_parse_state *ps) +{ + return ps->operand.string; +} + +static int postfix_append_operand(struct filter_parse_state *ps, char *operand) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = OP_NONE; + elt->operand = kstrdup(operand, GFP_KERNEL); + if (!elt->operand) { + kfree(elt); + return -ENOMEM; + } + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static int postfix_append_op(struct filter_parse_state *ps, int op) +{ + struct postfix_elt *elt; + + elt = kmalloc(sizeof(*elt), GFP_KERNEL); + if (!elt) + return -ENOMEM; + + elt->op = op; + elt->operand = NULL; + + list_add_tail(&elt->list, &ps->postfix); + + return 0; +} + +static void postfix_clear(struct filter_parse_state *ps) +{ + struct postfix_elt *elt; + + while (!list_empty(&ps->postfix)) { + elt = list_first_entry(&ps->postfix, struct postfix_elt, list); + kfree(elt->operand); + list_del(&elt->list); + } +} + +static int filter_parse(struct filter_parse_state *ps) +{ + int op, top_op; + char ch; + + while ((ch = infix_next(ps))) { + if (isspace(ch)) + continue; + + if (is_op_char(ps, ch)) { + op = infix_get_op(ps, ch); + if (op == OP_NONE) { + parse_error(ps, FILT_ERR_INVALID_OP, 0); + return -EINVAL; + } + + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_top(ps); + if (!is_precedence_lower(ps, top_op, op)) { + top_op = filter_opstack_pop(ps); + postfix_append_op(ps, top_op); + continue; + } + break; + } + + filter_opstack_push(ps, op); + continue; + } + + if (ch == '(') { + filter_opstack_push(ps, OP_OPEN_PAREN); + continue; + } + + if (ch == ')') { + if (strlen(curr_operand(ps))) { + postfix_append_operand(ps, curr_operand(ps)); + clear_operand_string(ps); + } + + top_op = filter_opstack_pop(ps); + while (top_op != OP_NONE) { + if (top_op == OP_OPEN_PAREN) + break; + postfix_append_op(ps, top_op); + top_op = filter_opstack_pop(ps); + } + if (top_op == OP_NONE) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + continue; + } + if (append_operand_char(ps, ch)) { + parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); + return -EINVAL; + } + } + + if (strlen(curr_operand(ps))) + postfix_append_operand(ps, curr_operand(ps)); + + while (!filter_opstack_empty(ps)) { + top_op = filter_opstack_pop(ps); + if (top_op == OP_NONE) + break; + if (top_op == OP_OPEN_PAREN) { + parse_error(ps, FILT_ERR_UNBALANCED_PAREN, 0); + return -EINVAL; + } + postfix_append_op(ps, top_op); + } + + return 0; +} + +static struct filter_pred *create_pred(int op, char *operand1, char *operand2) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->field_name = kstrdup(operand1, GFP_KERNEL); + if (!pred->field_name) { + kfree(pred); + return NULL; + } + + strcpy(pred->str_val, operand2); + pred->str_len = strlen(operand2); + + pred->op = op; + + return pred; +} + +static struct filter_pred *create_logical_pred(int op) +{ + struct filter_pred *pred; + + pred = kzalloc(sizeof(*pred), GFP_KERNEL); + if (!pred) + return NULL; + + pred->op = op; + + return pred; +} + +static int check_preds(struct filter_parse_state *ps) +{ + int n_normal_preds = 0, n_logical_preds = 0; + struct postfix_elt *elt; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) + continue; + + if (elt->op == OP_AND || elt->op == OP_OR) { + n_logical_preds++; + continue; + } + n_normal_preds++; + } + + if (!n_normal_preds || n_logical_preds >= n_normal_preds) { + parse_error(ps, FILT_ERR_INVALID_FILTER, 0); return -EINVAL; } - strcpy(pred->str_val, val_str); - pred->str_len = strlen(val_str); + return 0; +} - pred->field_name = kstrdup(pred->field_name, GFP_KERNEL); - if (!pred->field_name) - return -ENOMEM; +static int replace_preds(struct event_subsystem *system, + struct ftrace_event_call *call, + struct filter_parse_state *ps, + char *filter_string) +{ + char *operand1 = NULL, *operand2 = NULL; + struct filter_pred *pred; + struct postfix_elt *elt; + int err; + + err = check_preds(ps); + if (err) + return err; + + list_for_each_entry(elt, &ps->postfix, list) { + if (elt->op == OP_NONE) { + if (!operand1) + operand1 = elt->operand; + else if (!operand2) + operand2 = elt->operand; + else { + parse_error(ps, FILT_ERR_TOO_MANY_OPERANDS, 0); + return -EINVAL; + } + continue; + } + + if (elt->op == OP_AND || elt->op == OP_OR) { + pred = create_logical_pred(elt->op); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, + pred, filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + continue; + } + + if (!operand1 || !operand2) { + parse_error(ps, FILT_ERR_MISSING_FIELD, 0); + return -EINVAL; + } + + pred = create_pred(elt->op, operand1, operand2); + if (call) { + err = filter_add_pred(ps, call, pred); + filter_free_pred(pred); + } else + err = filter_add_subsystem_pred(ps, system, pred, + filter_string); + if (err) + return err; + + operand1 = operand2 = NULL; + } return 0; } +int apply_event_filter(struct ftrace_event_call *call, char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_disable_preds(call); + remove_filter_string(call->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_disable_preds(call); + replace_filter_string(call->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, call->filter); + goto out; + } + + err = replace_preds(NULL, call, ps, filter_string); + if (err) + append_filter_err(ps, call->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} + +int apply_subsystem_event_filter(struct event_subsystem *system, + char *filter_string) +{ + int err; + + struct filter_parse_state *ps; + + mutex_lock(&filter_mutex); + + if (!strcmp(strstrip(filter_string), "0")) { + filter_free_subsystem_preds(system); + remove_filter_string(system->filter); + mutex_unlock(&filter_mutex); + return 0; + } + + ps = kzalloc(sizeof(*ps), GFP_KERNEL); + if (!ps) + return -ENOMEM; + + filter_free_subsystem_preds(system); + replace_filter_string(system->filter, filter_string); + + parse_init(ps, filter_ops, filter_string); + err = filter_parse(ps); + if (err) { + append_filter_err(ps, system->filter); + goto out; + } + + err = replace_preds(system, NULL, ps, filter_string); + if (err) + append_filter_err(ps, system->filter); + +out: + filter_opstack_clear(ps); + postfix_clear(ps); + kfree(ps); + + mutex_unlock(&filter_mutex); + + return err; +} From a0e39ed378fb6ba916522764cd508fa7d42ad495 Mon Sep 17 00:00:00 2001 From: Heiko Carstens Date: Wed, 29 Apr 2009 13:51:39 +0200 Subject: [PATCH 0667/2061] tracing: fix build failure on s390 "tracing: create automated trace defines" causes this compile error on s390, as reported by Sachin Sant against linux-next: kernel/built-in.o: In function `__do_softirq': (.text+0x1c680): undefined reference to `__tracepoint_softirq_entry' This happens because the definitions of the softirq tracepoints were moved from kernel/softirq.c to kernel/irq/handle.c. Since s390 doesn't support generic hardirqs handle.c doesn't get compiled and the definitions are missing. So move the tracepoints to softirq.c again. [ Impact: fix build failure on s390 ] Reported-by: Sachin Sant Signed-off-by: Heiko Carstens Cc: Steven Rostedt Cc: fweisbec@gmail.com LKML-Reference: <20090429135139.5fac79b8@osiris.boeblingen.de.ibm.com> Signed-off-by: Ingo Molnar --- kernel/irq/handle.c | 2 -- kernel/softirq.c | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 37c63633e78..e68bb5aebe0 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -18,8 +18,6 @@ #include #include #include - -#define CREATE_TRACE_POINTS #include #include "internals.h" diff --git a/kernel/softirq.c b/kernel/softirq.c index 7ab9dfd8d08..d4ba347a872 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -24,6 +24,8 @@ #include #include #include + +#define CREATE_TRACE_POINTS #include #include From 50fa610a3b6ba7cf91d7a92229177dfaff2b81a1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Tue, 28 Apr 2009 15:01:38 +0100 Subject: [PATCH 0668/2061] sched: Document memory barriers implied by sleep/wake-up primitives Add a section to the memory barriers document to note the implied memory barriers of sleep primitives (set_current_state() and wrappers) and wake-up primitives (wake_up() and co.). Also extend the in-code comments on the wake_up() functions to note these implied barriers. [ Impact: add documentation ] Signed-off-by: David Howells Cc: Oleg Nesterov Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <20090428140138.1192.94723.stgit@warthog.procyon.org.uk> Signed-off-by: Ingo Molnar --- Documentation/memory-barriers.txt | 129 +++++++++++++++++++++++++++++- kernel/sched.c | 23 ++++++ 2 files changed, 151 insertions(+), 1 deletion(-) diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt index f5b7127f54a..7f5809eddee 100644 --- a/Documentation/memory-barriers.txt +++ b/Documentation/memory-barriers.txt @@ -31,6 +31,7 @@ Contents: - Locking functions. - Interrupt disabling functions. + - Sleep and wake-up functions. - Miscellaneous functions. (*) Inter-CPU locking barrier effects. @@ -1217,6 +1218,132 @@ barriers are required in such a situation, they must be provided from some other means. +SLEEP AND WAKE-UP FUNCTIONS +--------------------------- + +Sleeping and waking on an event flagged in global data can be viewed as an +interaction between two pieces of data: the task state of the task waiting for +the event and the global data used to indicate the event. To make sure that +these appear to happen in the right order, the primitives to begin the process +of going to sleep, and the primitives to initiate a wake up imply certain +barriers. + +Firstly, the sleeper normally follows something like this sequence of events: + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (event_indicated) + break; + schedule(); + } + +A general memory barrier is interpolated automatically by set_current_state() +after it has altered the task state: + + CPU 1 + =============================== + set_current_state(); + set_mb(); + STORE current->state + + LOAD event_indicated + +set_current_state() may be wrapped by: + + prepare_to_wait(); + prepare_to_wait_exclusive(); + +which therefore also imply a general memory barrier after setting the state. +The whole sequence above is available in various canned forms, all of which +interpolate the memory barrier in the right place: + + wait_event(); + wait_event_interruptible(); + wait_event_interruptible_exclusive(); + wait_event_interruptible_timeout(); + wait_event_killable(); + wait_event_timeout(); + wait_on_bit(); + wait_on_bit_lock(); + + +Secondly, code that performs a wake up normally follows something like this: + + event_indicated = 1; + wake_up(&event_wait_queue); + +or: + + event_indicated = 1; + wake_up_process(event_daemon); + +A write memory barrier is implied by wake_up() and co. if and only if they wake +something up. The barrier occurs before the task state is cleared, and so sits +between the STORE to indicate the event and the STORE to set TASK_RUNNING: + + CPU 1 CPU 2 + =============================== =============================== + set_current_state(); STORE event_indicated + set_mb(); wake_up(); + STORE current->state + STORE current->state + LOAD event_indicated + +The available waker functions include: + + complete(); + wake_up(); + wake_up_all(); + wake_up_bit(); + wake_up_interruptible(); + wake_up_interruptible_all(); + wake_up_interruptible_nr(); + wake_up_interruptible_poll(); + wake_up_interruptible_sync(); + wake_up_interruptible_sync_poll(); + wake_up_locked(); + wake_up_locked_poll(); + wake_up_nr(); + wake_up_poll(); + wake_up_process(); + + +[!] Note that the memory barriers implied by the sleeper and the waker do _not_ +order multiple stores before the wake-up with respect to loads of those stored +values after the sleeper has called set_current_state(). For instance, if the +sleeper does: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) + break; + __set_current_state(TASK_RUNNING); + do_something(my_data); + +and the waker does: + + my_data = value; + event_indicated = 1; + wake_up(&event_wait_queue); + +there's no guarantee that the change to event_indicated will be perceived by +the sleeper as coming after the change to my_data. In such a circumstance, the +code on both sides must interpolate its own memory barriers between the +separate data accesses. Thus the above sleeper ought to do: + + set_current_state(TASK_INTERRUPTIBLE); + if (event_indicated) { + smp_rmb(); + do_something(my_data); + } + +and the waker should do: + + my_data = value; + smp_wmb(); + event_indicated = 1; + wake_up(&event_wait_queue); + + MISCELLANEOUS FUNCTIONS ----------------------- @@ -1366,7 +1493,7 @@ WHERE ARE MEMORY BARRIERS NEEDED? Under normal operation, memory operation reordering is generally not going to be a problem as a single-threaded linear piece of code will still appear to -work correctly, even if it's in an SMP kernel. There are, however, three +work correctly, even if it's in an SMP kernel. There are, however, four circumstances in which reordering definitely _could_ be a problem: (*) Interprocessor interaction. diff --git a/kernel/sched.c b/kernel/sched.c index b902e587a3a..fd0c2cee3f3 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -2458,6 +2458,17 @@ out: return success; } +/** + * wake_up_process - Wake up a specific process + * @p: The process to be woken up. + * + * Attempt to wake up the nominated process and move it to the set of runnable + * processes. Returns 1 if the process was woken up, 0 if it was already + * running. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. + */ int wake_up_process(struct task_struct *p) { return try_to_wake_up(p, TASK_ALL, 0); @@ -5241,6 +5252,9 @@ void __wake_up_common(wait_queue_head_t *q, unsigned int mode, * @mode: which threads * @nr_exclusive: how many wake-one or wake-many threads to wake up * @key: is directly passed to the wakeup function + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5279,6 +5293,9 @@ void __wake_up_locked_key(wait_queue_head_t *q, unsigned int mode, void *key) * with each other. This can prevent needless bouncing between CPUs. * * On UP it can prevent extra preemption. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void __wake_up_sync_key(wait_queue_head_t *q, unsigned int mode, int nr_exclusive, void *key) @@ -5315,6 +5332,9 @@ EXPORT_SYMBOL_GPL(__wake_up_sync); /* For internal use only */ * awakened in the same order in which they were queued. * * See also complete_all(), wait_for_completion() and related routines. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete(struct completion *x) { @@ -5332,6 +5352,9 @@ EXPORT_SYMBOL(complete); * @x: holds the state of this particular completion * * This will wake up all threads waiting on this particular completion event. + * + * It may be assumed that this function implies a write memory barrier before + * changing the task state if and only if any tasks are woken up. */ void complete_all(struct completion *x) { From da1a776be1ac7f78bb30ececbec4c1383163b079 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:58 +0200 Subject: [PATCH 0669/2061] perf_counter, x86: remove X86_FEATURE_ARCH_PERFMON flag for AMD cpus X86_FEATURE_ARCH_PERFMON is an Intel hardware feature that does not work on AMD CPUs. The flag is now only used in Intel specific code (especially initialization). [ Impact: refactor code ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-2-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 4 ---- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index fd69c514ca2..7e4a459daa6 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -420,10 +420,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) if (c->x86 >= 6) set_cpu_cap(c, X86_FEATURE_FXSAVE_LEAK); - /* Enable Performance counter for K7 and later */ - if (c->x86 > 6 && c->x86 <= 0x11) - set_cpu_cap(c, X86_FEATURE_ARCH_PERFMON); - if (!c->x86_model_id[0]) { switch (c->x86) { case 0xf: diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 0fcbaab83f9..7d0f81dcb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -949,6 +949,9 @@ static struct pmc_x86_ops *pmc_intel_init(void) unsigned int unused; unsigned int ebx; + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + return NULL; + /* * Check whether the Architectural PerfMon supports * Branch Misses Retired Event or not. @@ -987,9 +990,6 @@ static struct pmc_x86_ops *pmc_amd_init(void) void __init init_hw_perf_counters(void) { - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: pmc_ops = pmc_intel_init(); From 829b42dd395c5801f6ae87da87ecbdcfd5ef1a6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:46:59 +0200 Subject: [PATCH 0670/2061] perf_counter, x86: declare perf_max_counters only for CONFIG_PERF_COUNTERS This is only needed for CONFIG_PERF_COUNTERS enabled. [ Impact: cleanup ] Signed-off-by: Robert Richter Acked-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241002046-8832-3-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 98143288530..be10b3ffe32 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -512,12 +512,13 @@ struct perf_cpu_context { int recursion[4]; }; +#ifdef CONFIG_PERF_COUNTERS + /* * Set by architecture code: */ extern int perf_max_counters; -#ifdef CONFIG_PERF_COUNTERS extern const struct hw_perf_counter_ops * hw_perf_counter_init(struct perf_counter *counter); From 4138960a9251a265002b5cf07e671a49f8495381 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:00 +0200 Subject: [PATCH 0671/2061] perf_counter, x86: add default path to cpu detection This quits hw counter initialization immediately if no cpu is detected. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-4-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7d0f81dcb52..d6d6529349d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -997,6 +997,8 @@ void __init init_hw_perf_counters(void) case X86_VENDOR_AMD: pmc_ops = pmc_amd_init(); break; + default: + return; } if (!pmc_ops) return; From 4295ee62660b13ddb87d41539f49b239e6e7d56f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:01 +0200 Subject: [PATCH 0672/2061] perf_counter, x86: rework pmc_amd_save_disable_all() and pmc_amd_restore_all() MSR reads and writes are expensive. This patch adds checks to avoid its usage where possible. [ Impact: micro-optimization on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-5-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d6d6529349d..75a090394b6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,11 +334,13 @@ static u64 pmc_amd_save_disable_all(void) for (idx = 0; idx < nr_counters_generic; idx++) { u64 val; + if (!test_bit(idx, cpuc->active_mask)) + continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) { - val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) + continue; + val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } return enabled; @@ -372,13 +374,15 @@ static void pmc_amd_restore_all(u64 ctrl) return; for (idx = 0; idx < nr_counters_generic; idx++) { - if (test_bit(idx, cpuc->active_mask)) { - u64 val; + u64 val; - rdmsrl(MSR_K7_EVNTSEL0 + idx, val); - val |= ARCH_PERFMON_EVENTSEL0_ENABLE; - wrmsrl(MSR_K7_EVNTSEL0 + idx, val); - } + if (!test_bit(idx, cpuc->active_mask)) + continue; + rdmsrl(MSR_K7_EVNTSEL0 + idx, val); + if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) + continue; + val |= ARCH_PERFMON_EVENTSEL0_ENABLE; + wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } } From 527e26af3741a2168986d8b82653ffe173891324 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:02 +0200 Subject: [PATCH 0673/2061] perf_counter, x86: protect per-cpu variables with compile barriers only Per-cpu variables needn't to be protected with cpu barriers (smp_wmb()). Protection is only needed for preemption on the same cpu (rescheduling or the nmi handler). This can be done using a compiler barrier only. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-6-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75a090394b6..ad663d5ad2d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -673,7 +673,7 @@ try_generic: /* * Make it visible before enabling the hw: */ - smp_wmb(); + barrier(); __hw_perf_counter_set_period(counter, hwc, idx); __pmc_generic_enable(counter, hwc, idx); @@ -745,7 +745,7 @@ static void pmc_generic_disable(struct perf_counter *counter) * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: */ - smp_wmb(); + barrier(); /* * Drain the remaining delta count out of a counter From 4aeb0b4239bb3b67ed402cb9cef3e000c892cadf Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:03 +0200 Subject: [PATCH 0674/2061] perfcounters: rename struct hw_perf_counter_ops into struct pmu This patch renames struct hw_perf_counter_ops into struct pmu. It introduces a structure to describe a cpu specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-7-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 25 ++++++----- arch/x86/kernel/cpu/perf_counter.c | 37 ++++++++-------- include/linux/perf_counter.h | 9 ++-- kernel/perf_counter.c | 68 ++++++++++++++---------------- 4 files changed, 66 insertions(+), 73 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bd76d0fa2c3..d9bbe5efc64 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -256,7 +256,7 @@ static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) return 0; } -static void power_perf_read(struct perf_counter *counter) +static void power_pmu_read(struct perf_counter *counter) { long val, delta, prev; @@ -405,7 +405,7 @@ void hw_perf_restore(u64 disable) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx && counter->hw.idx != hwc_index[i] + 1) { - power_perf_read(counter); + power_pmu_read(counter); write_pmc(counter->hw.idx, 0); counter->hw.idx = 0; } @@ -477,7 +477,7 @@ static void counter_sched_in(struct perf_counter *counter, int cpu) counter->oncpu = cpu; counter->tstamp_running += counter->ctx->time - counter->tstamp_stopped; if (is_software_counter(counter)) - counter->hw_ops->enable(counter); + counter->pmu->enable(counter); } /* @@ -533,7 +533,7 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, * re-enable the PMU in order to get hw_perf_restore to do the * actual work of reconfiguring the PMU. */ -static int power_perf_enable(struct perf_counter *counter) +static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; @@ -573,7 +573,7 @@ static int power_perf_enable(struct perf_counter *counter) /* * Remove a counter from the PMU. */ -static void power_perf_disable(struct perf_counter *counter) +static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; @@ -583,7 +583,7 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_save(flags); pmudis = hw_perf_save_disable(); - power_perf_read(counter); + power_pmu_read(counter); cpuhw = &__get_cpu_var(cpu_hw_counters); for (i = 0; i < cpuhw->n_counters; ++i) { @@ -607,10 +607,10 @@ static void power_perf_disable(struct perf_counter *counter) local_irq_restore(flags); } -struct hw_perf_counter_ops power_perf_ops = { - .enable = power_perf_enable, - .disable = power_perf_disable, - .read = power_perf_read +struct pmu power_pmu = { + .enable = power_pmu_enable, + .disable = power_pmu_disable, + .read = power_pmu_read, }; /* Number of perf_counters counting hardware events */ @@ -631,8 +631,7 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { unsigned long ev; struct perf_counter *ctrs[MAX_HWCOUNTERS]; @@ -705,7 +704,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &power_perf_ops; + return &power_pmu; } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ad663d5ad2d..95de980c74a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -515,8 +515,8 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__pmc_generic_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) +__x86_pmu_disable(struct perf_counter *counter, + struct hw_perf_counter *hwc, unsigned int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -591,8 +591,8 @@ __pmc_fixed_enable(struct perf_counter *counter, } static void -__pmc_generic_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +__x86_pmu_enable(struct perf_counter *counter, + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_enable(counter, hwc, idx); @@ -626,7 +626,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) /* * Find a PMC slot for the freshly enabled / scheduled in counter: */ -static int pmc_generic_enable(struct perf_counter *counter) +static int x86_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; @@ -667,7 +667,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; /* @@ -676,7 +676,7 @@ try_generic: barrier(); __hw_perf_counter_set_period(counter, hwc, idx); - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); return 0; } @@ -731,13 +731,13 @@ void perf_counter_print_debug(void) local_irq_enable(); } -static void pmc_generic_disable(struct perf_counter *counter) +static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; - __pmc_generic_disable(counter, hwc, idx); + __x86_pmu_disable(counter, hwc, idx); clear_bit(idx, cpuc->used); cpuc->counters[idx] = NULL; @@ -767,7 +767,7 @@ static void perf_save_and_restart(struct perf_counter *counter) __hw_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __pmc_generic_enable(counter, hwc, idx); + __x86_pmu_enable(counter, hwc, idx); } /* @@ -805,7 +805,7 @@ again: perf_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __pmc_generic_disable(counter, &counter->hw, bit); + __x86_pmu_disable(counter, &counter->hw, bit); } hw_perf_ack_status(ack); @@ -1034,19 +1034,18 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void pmc_generic_read(struct perf_counter *counter) +static void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } -static const struct hw_perf_counter_ops x86_perf_counter_ops = { - .enable = pmc_generic_enable, - .disable = pmc_generic_disable, - .read = pmc_generic_read, +static const struct pmu pmu = { + .enable = x86_pmu_enable, + .disable = x86_pmu_disable, + .read = x86_pmu_read, }; -const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { int err; @@ -1054,7 +1053,7 @@ hw_perf_counter_init(struct perf_counter *counter) if (err) return ERR_PTR(err); - return &x86_perf_counter_ops; + return &pmu; } /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index be10b3ffe32..c3db52dc876 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -334,9 +334,9 @@ struct hw_perf_counter { struct perf_counter; /** - * struct hw_perf_counter_ops - performance counter hw ops + * struct pmu - generic performance monitoring unit */ -struct hw_perf_counter_ops { +struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); @@ -381,7 +381,7 @@ struct perf_counter { struct list_head sibling_list; int nr_siblings; struct perf_counter *group_leader; - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; enum perf_counter_active_state state; enum perf_counter_active_state prev_state; @@ -519,8 +519,7 @@ struct perf_cpu_context { */ extern int perf_max_counters; -extern const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter); +extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 09396098dd0..582108addef 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -52,8 +52,7 @@ static DEFINE_MUTEX(perf_resource_mutex); /* * Architecture provided APIs - weak aliases: */ -extern __weak const struct hw_perf_counter_ops * -hw_perf_counter_init(struct perf_counter *counter) +extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { return NULL; } @@ -124,7 +123,7 @@ counter_sched_out(struct perf_counter *counter, counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_stopped = ctx->time; - counter->hw_ops->disable(counter); + counter->pmu->disable(counter); counter->oncpu = -1; if (!is_software_counter(counter)) @@ -417,7 +416,7 @@ counter_sched_in(struct perf_counter *counter, */ smp_wmb(); - if (counter->hw_ops->enable(counter)) { + if (counter->pmu->enable(counter)) { counter->state = PERF_COUNTER_STATE_INACTIVE; counter->oncpu = -1; return -EAGAIN; @@ -1096,7 +1095,7 @@ static void __read(void *info) local_irq_save(flags); if (ctx->is_active) update_context_time(ctx); - counter->hw_ops->read(counter); + counter->pmu->read(counter); update_counter_times(counter); local_irq_restore(flags); } @@ -1922,7 +1921,7 @@ static void perf_counter_output(struct perf_counter *counter, leader = counter->group_leader; list_for_each_entry(sub, &leader->sibling_list, list_entry) { if (sub != counter) - sub->hw_ops->read(sub); + sub->pmu->read(sub); group_entry.event = sub->hw_event.config; group_entry.counter = atomic64_read(&sub->count); @@ -2264,7 +2263,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) struct pt_regs *regs; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->hw_ops->read(counter); + counter->pmu->read(counter); regs = get_irq_regs(); /* @@ -2410,7 +2409,7 @@ static void perf_swcounter_disable(struct perf_counter *counter) perf_swcounter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_generic = { +static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, @@ -2460,7 +2459,7 @@ static void cpu_clock_perf_counter_read(struct perf_counter *counter) cpu_clock_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_clock = { +static const struct pmu perf_ops_cpu_clock = { .enable = cpu_clock_perf_counter_enable, .disable = cpu_clock_perf_counter_disable, .read = cpu_clock_perf_counter_read, @@ -2522,7 +2521,7 @@ static void task_clock_perf_counter_read(struct perf_counter *counter) task_clock_perf_counter_update(counter, time); } -static const struct hw_perf_counter_ops perf_ops_task_clock = { +static const struct pmu perf_ops_task_clock = { .enable = task_clock_perf_counter_enable, .disable = task_clock_perf_counter_disable, .read = task_clock_perf_counter_read, @@ -2574,7 +2573,7 @@ static void cpu_migrations_perf_counter_disable(struct perf_counter *counter) cpu_migrations_perf_counter_update(counter); } -static const struct hw_perf_counter_ops perf_ops_cpu_migrations = { +static const struct pmu perf_ops_cpu_migrations = { .enable = cpu_migrations_perf_counter_enable, .disable = cpu_migrations_perf_counter_disable, .read = cpu_migrations_perf_counter_read, @@ -2600,8 +2599,7 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) ftrace_profile_disable(perf_event_id(&counter->hw_event)); } -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { int event_id = perf_event_id(&counter->hw_event); int ret; @@ -2616,18 +2614,16 @@ tp_perf_counter_init(struct perf_counter *counter) return &perf_ops_generic; } #else -static const struct hw_perf_counter_ops * -tp_perf_counter_init(struct perf_counter *counter) +static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { return NULL; } #endif -static const struct hw_perf_counter_ops * -sw_perf_counter_init(struct perf_counter *counter) +static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { struct perf_counter_hw_event *hw_event = &counter->hw_event; - const struct hw_perf_counter_ops *hw_ops = NULL; + const struct pmu *pmu = NULL; struct hw_perf_counter *hwc = &counter->hw; /* @@ -2639,7 +2635,7 @@ sw_perf_counter_init(struct perf_counter *counter) */ switch (perf_event_id(&counter->hw_event)) { case PERF_COUNT_CPU_CLOCK: - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2650,9 +2646,9 @@ sw_perf_counter_init(struct perf_counter *counter) * use the cpu_clock counter instead. */ if (counter->ctx->task) - hw_ops = &perf_ops_task_clock; + pmu = &perf_ops_task_clock; else - hw_ops = &perf_ops_cpu_clock; + pmu = &perf_ops_cpu_clock; if (hw_event->irq_period && hw_event->irq_period < 10000) hw_event->irq_period = 10000; @@ -2661,18 +2657,18 @@ sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_PAGE_FAULTS_MIN: case PERF_COUNT_PAGE_FAULTS_MAJ: case PERF_COUNT_CONTEXT_SWITCHES: - hw_ops = &perf_ops_generic; + pmu = &perf_ops_generic; break; case PERF_COUNT_CPU_MIGRATIONS: if (!counter->hw_event.exclude_kernel) - hw_ops = &perf_ops_cpu_migrations; + pmu = &perf_ops_cpu_migrations; break; } - if (hw_ops) + if (pmu) hwc->irq_period = hw_event->irq_period; - return hw_ops; + return pmu; } /* @@ -2685,7 +2681,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, struct perf_counter *group_leader, gfp_t gfpflags) { - const struct hw_perf_counter_ops *hw_ops; + const struct pmu *pmu; struct perf_counter *counter; long err; @@ -2713,46 +2709,46 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; - counter->hw_ops = NULL; + counter->pmu = NULL; counter->ctx = ctx; counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) counter->state = PERF_COUNTER_STATE_OFF; - hw_ops = NULL; + pmu = NULL; if (perf_event_raw(hw_event)) { - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); goto done; } switch (perf_event_type(hw_event)) { case PERF_TYPE_HARDWARE: - hw_ops = hw_perf_counter_init(counter); + pmu = hw_perf_counter_init(counter); break; case PERF_TYPE_SOFTWARE: - hw_ops = sw_perf_counter_init(counter); + pmu = sw_perf_counter_init(counter); break; case PERF_TYPE_TRACEPOINT: - hw_ops = tp_perf_counter_init(counter); + pmu = tp_perf_counter_init(counter); break; } done: err = 0; - if (!hw_ops) + if (!pmu) err = -EINVAL; - else if (IS_ERR(hw_ops)) - err = PTR_ERR(hw_ops); + else if (IS_ERR(pmu)) + err = PTR_ERR(pmu); if (err) { kfree(counter); return ERR_PTR(err); } - counter->hw_ops = hw_ops; + counter->pmu = pmu; if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); From 5f4ec28ffe77c840354cce1820a3436106e9e0f1 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:04 +0200 Subject: [PATCH 0675/2061] perf_counter, x86: rename struct pmc_x86_ops into struct x86_pmu This patch renames struct pmc_x86_ops into struct x86_pmu. It introduces a structure to describe an x86 model specific pmu (performance monitoring unit). It may contain ops and data. The new name of the structure fits better, is shorter, and thus better to handle. Where it was appropriate, names of function and variable have been changed too. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-8-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 135 +++++++++++++++-------------- 1 file changed, 68 insertions(+), 67 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 95de980c74a..808a1a11346 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,9 +44,9 @@ struct cpu_hw_counters { }; /* - * struct pmc_x86_ops - performance counter x86 ops + * struct x86_pmu - generic x86 pmu */ -struct pmc_x86_ops { +struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -60,7 +60,7 @@ struct pmc_x86_ops { int max_events; }; -static struct pmc_x86_ops *pmc_ops __read_mostly; +static struct x86_pmu *x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -82,12 +82,12 @@ static const u64 intel_perfmon_event_map[] = [PERF_COUNT_BUS_CYCLES] = 0x013c, }; -static u64 pmc_intel_event_map(int event) +static u64 intel_pmu_event_map(int event) { return intel_perfmon_event_map[event]; } -static u64 pmc_intel_raw_event(u64 event) +static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL @@ -114,12 +114,12 @@ static const u64 amd_perfmon_event_map[] = [PERF_COUNT_BRANCH_MISSES] = 0x00c5, }; -static u64 pmc_amd_event_map(int event) +static u64 amd_pmu_event_map(int event) { return amd_perfmon_event_map[event]; } -static u64 pmc_amd_raw_event(u64 event) +static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(pmc_ops->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(pmc_ops->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(pmc_ops->eventsel + i); + release_evntsel_nmi(x86_pmu->eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(pmc_ops->perfctr + i); + release_perfctr_nmi(x86_pmu->perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(pmc_ops->perfctr + i); - release_evntsel_nmi(pmc_ops->eventsel + i); + release_perfctr_nmi(x86_pmu->perfctr + i); + release_evntsel_nmi(x86_pmu->eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -293,14 +293,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= pmc_ops->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= pmc_ops->max_events) + if (perf_event_id(hw_event) >= x86_pmu->max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= pmc_ops->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -308,7 +308,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 pmc_intel_save_disable_all(void) +static u64 intel_pmu_save_disable_all(void) { u64 ctrl; @@ -318,7 +318,7 @@ static u64 pmc_intel_save_disable_all(void) return ctrl; } -static u64 pmc_amd_save_disable_all(void) +static u64 amd_pmu_save_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int enabled, idx; @@ -327,7 +327,8 @@ static u64 pmc_amd_save_disable_all(void) cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the - * counters proper, so that pcm_amd_enable() does the right thing. + * counters proper, so that amd_pmu_enable_counter() does the + * right thing. */ barrier(); @@ -351,19 +352,19 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->save_disable_all(); + return x86_pmu->save_disable_all(); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void pmc_intel_restore_all(u64 ctrl) +static void intel_pmu_restore_all(u64 ctrl) { wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); } -static void pmc_amd_restore_all(u64 ctrl) +static void amd_pmu_restore_all(u64 ctrl) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; @@ -391,14 +392,14 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->restore_all(ctrl); + x86_pmu->restore_all(ctrl); } /* * Exported because of ACPI idle */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 pmc_intel_get_status(u64 mask) +static u64 intel_pmu_get_status(u64 mask) { u64 status; @@ -407,7 +408,7 @@ static u64 pmc_intel_get_status(u64 mask) return status; } -static u64 pmc_amd_get_status(u64 mask) +static u64 amd_pmu_get_status(u64 mask) { u64 status = 0; int idx; @@ -432,15 +433,15 @@ static u64 hw_perf_get_status(u64 mask) if (unlikely(!perf_counters_initialized)) return 0; - return pmc_ops->get_status(mask); + return x86_pmu->get_status(mask); } -static void pmc_intel_ack_status(u64 ack) +static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void pmc_amd_ack_status(u64 ack) +static void amd_pmu_ack_status(u64 ack) { } @@ -449,16 +450,16 @@ static void hw_perf_ack_status(u64 ack) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->ack_status(ack); + x86_pmu->ack_status(ack); } -static void pmc_intel_enable(int idx, u64 config) +static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void pmc_amd_enable(int idx, u64 config) +static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -474,15 +475,15 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->enable(idx, config); + x86_pmu->enable(idx, config); } -static void pmc_intel_disable(int idx, u64 config) +static void intel_pmu_disable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); } -static void pmc_amd_disable(int idx, u64 config) +static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); @@ -496,7 +497,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - pmc_ops->disable(idx, config); + x86_pmu->disable(idx, config); } static inline void @@ -613,11 +614,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == pmc_ops->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -661,8 +662,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = pmc_ops->eventsel; - hwc->counter_base = pmc_ops->perfctr; + hwc->config_base = x86_pmu->eventsel; + hwc->counter_base = x86_pmu->perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -710,8 +711,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(pmc_ops->eventsel + idx, pmc_ctrl); - rdmsrl(pmc_ops->perfctr + idx, pmc_count); + rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu->perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -918,35 +919,35 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { .priority = 1 }; -static struct pmc_x86_ops pmc_intel_ops = { - .save_disable_all = pmc_intel_save_disable_all, - .restore_all = pmc_intel_restore_all, - .get_status = pmc_intel_get_status, - .ack_status = pmc_intel_ack_status, - .enable = pmc_intel_enable, - .disable = pmc_intel_disable, +static struct x86_pmu intel_pmu = { + .save_disable_all = intel_pmu_save_disable_all, + .restore_all = intel_pmu_restore_all, + .get_status = intel_pmu_get_status, + .ack_status = intel_pmu_ack_status, + .enable = intel_pmu_enable_counter, + .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, .perfctr = MSR_ARCH_PERFMON_PERFCTR0, - .event_map = pmc_intel_event_map, - .raw_event = pmc_intel_raw_event, + .event_map = intel_pmu_event_map, + .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), }; -static struct pmc_x86_ops pmc_amd_ops = { - .save_disable_all = pmc_amd_save_disable_all, - .restore_all = pmc_amd_restore_all, - .get_status = pmc_amd_get_status, - .ack_status = pmc_amd_ack_status, - .enable = pmc_amd_enable, - .disable = pmc_amd_disable, +static struct x86_pmu amd_pmu = { + .save_disable_all = amd_pmu_save_disable_all, + .restore_all = amd_pmu_restore_all, + .get_status = amd_pmu_get_status, + .ack_status = amd_pmu_ack_status, + .enable = amd_pmu_enable_counter, + .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, .perfctr = MSR_K7_PERFCTR0, - .event_map = pmc_amd_event_map, - .raw_event = pmc_amd_raw_event, + .event_map = amd_pmu_event_map, + .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct pmc_x86_ops *pmc_intel_init(void) +static struct x86_pmu *intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -977,10 +978,10 @@ static struct pmc_x86_ops *pmc_intel_init(void) nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &pmc_intel_ops; + return &intel_pmu; } -static struct pmc_x86_ops *pmc_amd_init(void) +static struct x86_pmu *amd_pmu_init(void) { nr_counters_generic = 4; nr_counters_fixed = 0; @@ -989,22 +990,22 @@ static struct pmc_x86_ops *pmc_amd_init(void) pr_info("AMD Performance Monitoring support detected.\n"); - return &pmc_amd_ops; + return &amd_pmu; } void __init init_hw_perf_counters(void) { switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - pmc_ops = pmc_intel_init(); + x86_pmu = intel_pmu_init(); break; case X86_VENDOR_AMD: - pmc_ops = pmc_amd_init(); + x86_pmu = amd_pmu_init(); break; default: return; } - if (!pmc_ops) + if (!x86_pmu) return; pr_info("... num counters: %d\n", nr_counters_generic); From 39d81eab2374d71b2d9c82f66258a1a4f57ddd2e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:05 +0200 Subject: [PATCH 0676/2061] perf_counter, x86: make interrupt handler model specific This separates the perfcounter interrupt handler for AMD and Intel cpus. The AMD interrupt handler implementation is a follow-on patch. [ Impact: refactor and clean up code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-9-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 808a1a11346..9d90de0bd0b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -4,6 +4,7 @@ * Copyright(C) 2008 Thomas Gleixner * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar * Copyright(C) 2009 Jaswinder Singh Rajput + * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter * * For licencing details see kernel-base/COPYING */ @@ -47,6 +48,7 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); u64 (*get_status)(u64); @@ -241,6 +243,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; + /* disable temporarily */ + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + return -ENOSYS; + if (unlikely(!perf_counters_initialized)) return -EINVAL; @@ -780,7 +786,7 @@ static void perf_save_and_restart(struct perf_counter *counter) * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: */ -static int __smp_perf_counter_interrupt(struct pt_regs *regs, int nmi) +static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { int bit, cpu = smp_processor_id(); u64 ack, status; @@ -827,6 +833,8 @@ out: return ret; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } + void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; @@ -851,7 +859,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - __smp_perf_counter_interrupt(regs, 0); + x86_pmu->handle_irq(regs, 0); irq_exit(); } @@ -908,7 +916,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = __smp_perf_counter_interrupt(regs, 1); + ret = x86_pmu->handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -920,6 +928,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, .get_status = intel_pmu_get_status, @@ -934,6 +943,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, .get_status = amd_pmu_get_status, From b7f8859a8ed1937e2139c17b84878f1d413fa659 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:06 +0200 Subject: [PATCH 0677/2061] perf_counter, x86: remove get_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-10-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 39 ++++-------------------------- 1 file changed, 5 insertions(+), 34 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9d90de0bd0b..d0bb02919c6 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - u64 (*get_status)(u64); void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); @@ -405,43 +404,17 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(u64 mask) { u64 status; + if (unlikely(!perf_counters_initialized)) + return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; } -static u64 amd_pmu_get_status(u64 mask) -{ - u64 status = 0; - int idx; - - for (idx = 0; idx < nr_counters_generic; idx++) { - s64 val; - - if (!(mask & (1 << idx))) - continue; - - rdmsrl(MSR_K7_PERFCTR0 + idx, val); - val <<= (64 - counter_value_bits); - if (val >= 0) - status |= (1 << idx); - } - - return status; -} - -static u64 hw_perf_get_status(u64 mask) -{ - if (unlikely(!perf_counters_initialized)) - return 0; - - return x86_pmu->get_status(mask); -} - static void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); @@ -795,7 +768,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = hw_perf_save_disable(); - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) goto out; @@ -820,7 +793,7 @@ again: /* * Repeat if there is more work to be done: */ - status = hw_perf_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(cpuc->throttle_ctrl); if (status) goto again; out: @@ -931,7 +904,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .get_status = intel_pmu_get_status, .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, @@ -946,7 +918,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .get_status = amd_pmu_get_status, .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, From dee5d9067ca78b317538fd67930be4e09a83dbc5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:07 +0200 Subject: [PATCH 0678/2061] perf_counter, x86: remove ack_status() from struct x86_pmu This function is Intel only and not necessary for AMD cpus. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-11-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 19 ++----------------- 1 file changed, 2 insertions(+), 17 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d0bb02919c6..6bbdc16cc69 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -51,7 +51,6 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*ack_status)(u64); void (*enable)(int, u64); void (*disable)(int, u64); unsigned eventsel; @@ -415,23 +414,11 @@ static inline u64 intel_pmu_get_status(u64 mask) return status; } -static void intel_pmu_ack_status(u64 ack) +static inline void intel_pmu_ack_status(u64 ack) { wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void amd_pmu_ack_status(u64 ack) -{ -} - -static void hw_perf_ack_status(u64 ack) -{ - if (unlikely(!perf_counters_initialized)) - return; - - x86_pmu->ack_status(ack); -} - static void intel_pmu_enable_counter(int idx, u64 config) { wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, @@ -788,7 +775,7 @@ again: __x86_pmu_disable(counter, &counter->hw, bit); } - hw_perf_ack_status(ack); + intel_pmu_ack_status(ack); /* * Repeat if there is more work to be done: @@ -904,7 +891,6 @@ static struct x86_pmu intel_pmu = { .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, - .ack_status = intel_pmu_ack_status, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -918,7 +904,6 @@ static struct x86_pmu amd_pmu = { .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, - .ack_status = amd_pmu_ack_status, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, From 26816c287e13eedc67bc4ed0cd40c138314b7c7d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:08 +0200 Subject: [PATCH 0679/2061] perf_counter, x86: rename __hw_perf_counter_set_period into x86_perf_counter_set_period [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-12-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6bbdc16cc69..fa6541d781b 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -498,7 +498,7 @@ static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); * To be called with the counter disabled in hw: */ static void -__hw_perf_counter_set_period(struct perf_counter *counter, +x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); @@ -642,7 +642,7 @@ try_generic: */ barrier(); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); return 0; @@ -731,7 +731,7 @@ static void perf_save_and_restart(struct perf_counter *counter) int idx = hwc->idx; x86_perf_counter_update(counter, hwc, idx); - __hw_perf_counter_set_period(counter, hwc, idx); + x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) __x86_pmu_enable(counter, hwc, idx); From 55de0f2e57994b525324bf0d04d242d9358a2417 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:09 +0200 Subject: [PATCH 0680/2061] perf_counter, x86: rename intel only functions [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-13-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fa6541d781b..5a52d73ccfa 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -725,7 +725,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Save and restart an expired counter. Called by NMI contexts, * so it has to be careful about preempting normal counter ops: */ -static void perf_save_and_restart(struct perf_counter *counter) +static void intel_pmu_save_and_restart(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; int idx = hwc->idx; @@ -753,7 +753,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); int ret = 0; - cpuc->throttle_ctrl = hw_perf_save_disable(); + cpuc->throttle_ctrl = intel_pmu_save_disable_all(); status = intel_pmu_get_status(cpuc->throttle_ctrl); if (!status) @@ -770,7 +770,7 @@ again: if (!counter) continue; - perf_save_and_restart(counter); + intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) __x86_pmu_disable(counter, &counter->hw, bit); } @@ -788,7 +788,7 @@ out: * Restore - do not reenable when global enable is off or throttled: */ if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - hw_perf_restore(cpuc->throttle_ctrl); + intel_pmu_restore_all(cpuc->throttle_ctrl); return ret; } From 72eae04d3a3075c26d39e1e685acfc8e8c29db64 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:10 +0200 Subject: [PATCH 0681/2061] perf_counter, x86: modify initialization of struct x86_pmu This patch adds an error handler and changes initialization of struct x86_pmu. No functional changes. Needed for follow-on patches. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-14-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 27 ++++++++++++++++----------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a52d73ccfa..7c72a942363 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -913,7 +913,7 @@ static struct x86_pmu amd_pmu = { .max_events = ARRAY_SIZE(amd_perfmon_event_map), }; -static struct x86_pmu *intel_pmu_init(void) +static int intel_pmu_init(void) { union cpuid10_edx edx; union cpuid10_eax eax; @@ -921,7 +921,7 @@ static struct x86_pmu *intel_pmu_init(void) unsigned int ebx; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return NULL; + return -ENODEV; /* * Check whether the Architectural PerfMon supports @@ -929,49 +929,54 @@ static struct x86_pmu *intel_pmu_init(void) */ cpuid(10, &eax.full, &ebx, &unused, &edx.full); if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) - return NULL; + return -ENODEV; intel_perfmon_version = eax.split.version_id; if (intel_perfmon_version < 2) - return NULL; + return -ENODEV; pr_info("Intel Performance Monitoring support detected.\n"); pr_info("... version: %d\n", intel_perfmon_version); pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); + x86_pmu = &intel_pmu; + nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; counter_value_mask = (1ULL << eax.split.bit_width) - 1; - return &intel_pmu; + return 0; } -static struct x86_pmu *amd_pmu_init(void) +static int amd_pmu_init(void) { + x86_pmu = &amd_pmu; + nr_counters_generic = 4; nr_counters_fixed = 0; counter_value_mask = 0x0000FFFFFFFFFFFFULL; counter_value_bits = 48; pr_info("AMD Performance Monitoring support detected.\n"); - - return &amd_pmu; + return 0; } void __init init_hw_perf_counters(void) { + int err; + switch (boot_cpu_data.x86_vendor) { case X86_VENDOR_INTEL: - x86_pmu = intel_pmu_init(); + err = intel_pmu_init(); break; case X86_VENDOR_AMD: - x86_pmu = amd_pmu_init(); + err = amd_pmu_init(); break; default: return; } - if (!x86_pmu) + if (err != 0) return; pr_info("... num counters: %d\n", nr_counters_generic); From 4a06bd8508f65ad1dd5cd2046b85694813fa36a2 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:11 +0200 Subject: [PATCH 0682/2061] perf_counter, x86: make x86_pmu data a static struct Instead of using a pointer to reference to the x86 pmu we now have one single data structure that is initialized at the beginning. This saves the pointer access when using this memory. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-15-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 50 +++++++++++++++--------------- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7c72a942363..68597d76338 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -60,7 +60,7 @@ struct x86_pmu { int max_events; }; -static struct x86_pmu *x86_pmu __read_mostly; +static struct x86_pmu x86_pmu __read_mostly; static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, @@ -184,12 +184,12 @@ static bool reserve_pmc_hardware(void) disable_lapic_nmi_watchdog(); for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_perfctr_nmi(x86_pmu->perfctr + i)) + if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } for (i = 0; i < nr_counters_generic; i++) { - if (!reserve_evntsel_nmi(x86_pmu->eventsel + i)) + if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -197,13 +197,13 @@ static bool reserve_pmc_hardware(void) eventsel_fail: for (i--; i >= 0; i--) - release_evntsel_nmi(x86_pmu->eventsel + i); + release_evntsel_nmi(x86_pmu.eventsel + i); i = nr_counters_generic; perfctr_fail: for (i--; i >= 0; i--) - release_perfctr_nmi(x86_pmu->perfctr + i); + release_perfctr_nmi(x86_pmu.perfctr + i); if (nmi_watchdog == NMI_LOCAL_APIC) enable_lapic_nmi_watchdog(); @@ -216,8 +216,8 @@ static void release_pmc_hardware(void) int i; for (i = 0; i < nr_counters_generic; i++) { - release_perfctr_nmi(x86_pmu->perfctr + i); - release_evntsel_nmi(x86_pmu->eventsel + i); + release_perfctr_nmi(x86_pmu.perfctr + i); + release_evntsel_nmi(x86_pmu.eventsel + i); } if (nmi_watchdog == NMI_LOCAL_APIC) @@ -297,14 +297,14 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * Raw event type provide the config in the event structure */ if (perf_event_raw(hw_event)) { - hwc->config |= x86_pmu->raw_event(perf_event_config(hw_event)); + hwc->config |= x86_pmu.raw_event(perf_event_config(hw_event)); } else { - if (perf_event_id(hw_event) >= x86_pmu->max_events) + if (perf_event_id(hw_event) >= x86_pmu.max_events) return -EINVAL; /* * The generic map: */ - hwc->config |= x86_pmu->event_map(perf_event_id(hw_event)); + hwc->config |= x86_pmu.event_map(perf_event_id(hw_event)); } counter->destroy = hw_perf_counter_destroy; @@ -356,7 +356,7 @@ u64 hw_perf_save_disable(void) if (unlikely(!perf_counters_initialized)) return 0; - return x86_pmu->save_disable_all(); + return x86_pmu.save_disable_all(); } /* * Exported because of ACPI idle @@ -396,7 +396,7 @@ void hw_perf_restore(u64 ctrl) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->restore_all(ctrl); + x86_pmu.restore_all(ctrl); } /* * Exported because of ACPI idle @@ -441,7 +441,7 @@ static void hw_perf_enable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->enable(idx, config); + x86_pmu.enable(idx, config); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -463,7 +463,7 @@ static void hw_perf_disable(int idx, u64 config) if (unlikely(!perf_counters_initialized)) return; - x86_pmu->disable(idx, config); + x86_pmu.disable(idx, config); } static inline void @@ -580,11 +580,11 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) event = hwc->config & ARCH_PERFMON_EVENT_MASK; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_INSTRUCTIONS))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) return X86_PMC_IDX_FIXED_INSTRUCTIONS; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_CPU_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_CPU_CYCLES))) return X86_PMC_IDX_FIXED_CPU_CYCLES; - if (unlikely(event == x86_pmu->event_map(PERF_COUNT_BUS_CYCLES))) + if (unlikely(event == x86_pmu.event_map(PERF_COUNT_BUS_CYCLES))) return X86_PMC_IDX_FIXED_BUS_CYCLES; return -1; @@ -628,8 +628,8 @@ try_generic: set_bit(idx, cpuc->used); hwc->idx = idx; } - hwc->config_base = x86_pmu->eventsel; - hwc->counter_base = x86_pmu->perfctr; + hwc->config_base = x86_pmu.eventsel; + hwc->counter_base = x86_pmu.perfctr; } perf_counters_lapic_init(hwc->nmi); @@ -677,8 +677,8 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); for (idx = 0; idx < nr_counters_generic; idx++) { - rdmsrl(x86_pmu->eventsel + idx, pmc_ctrl); - rdmsrl(x86_pmu->perfctr + idx, pmc_count); + rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); + rdmsrl(x86_pmu.perfctr + idx, pmc_count); prev_left = per_cpu(prev_left[idx], cpu); @@ -819,7 +819,7 @@ void smp_perf_counter_interrupt(struct pt_regs *regs) irq_enter(); apic_write(APIC_LVTPC, LOCAL_PERF_VECTOR); ack_APIC_irq(); - x86_pmu->handle_irq(regs, 0); + x86_pmu.handle_irq(regs, 0); irq_exit(); } @@ -876,7 +876,7 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu->handle_irq(regs, 1); + ret = x86_pmu.handle_irq(regs, 1); return ret ? NOTIFY_STOP : NOTIFY_OK; } @@ -940,7 +940,7 @@ static int intel_pmu_init(void) pr_info("... bit width: %d\n", eax.split.bit_width); pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = &intel_pmu; + x86_pmu = intel_pmu; nr_counters_generic = eax.split.num_counters; nr_counters_fixed = edx.split.num_counters_fixed; @@ -951,7 +951,7 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { - x86_pmu = &amd_pmu; + x86_pmu = amd_pmu; nr_counters_generic = 4; nr_counters_fixed = 0; From 0933e5c6a680ba8d8d786a6f7fa377b7ec0d1e49 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:12 +0200 Subject: [PATCH 0683/2061] perf_counter, x86: move counter parameters to struct x86_pmu [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-16-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 80 ++++++++++++++---------------- 1 file changed, 37 insertions(+), 43 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 68597d76338..75dbb1f0900 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -24,16 +24,7 @@ #include static bool perf_counters_initialized __read_mostly; - -/* - * Number of (generic) HW counters: - */ -static int nr_counters_generic __read_mostly; static u64 perf_counter_mask __read_mostly; -static u64 counter_value_mask __read_mostly; -static int counter_value_bits __read_mostly; - -static int nr_counters_fixed __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; @@ -58,6 +49,10 @@ struct x86_pmu { u64 (*event_map)(int); u64 (*raw_event)(u64); int max_events; + int num_counters; + int num_counters_fixed; + int counter_bits; + u64 counter_mask; }; static struct x86_pmu x86_pmu __read_mostly; @@ -183,12 +178,12 @@ static bool reserve_pmc_hardware(void) if (nmi_watchdog == NMI_LOCAL_APIC) disable_lapic_nmi_watchdog(); - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_perfctr_nmi(x86_pmu.perfctr + i)) goto perfctr_fail; } - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { if (!reserve_evntsel_nmi(x86_pmu.eventsel + i)) goto eventsel_fail; } @@ -199,7 +194,7 @@ eventsel_fail: for (i--; i >= 0; i--) release_evntsel_nmi(x86_pmu.eventsel + i); - i = nr_counters_generic; + i = x86_pmu.num_counters; perfctr_fail: for (i--; i >= 0; i--) @@ -215,7 +210,7 @@ static void release_pmc_hardware(void) { int i; - for (i = 0; i < nr_counters_generic; i++) { + for (i = 0; i < x86_pmu.num_counters; i++) { release_perfctr_nmi(x86_pmu.perfctr + i); release_evntsel_nmi(x86_pmu.eventsel + i); } @@ -336,7 +331,7 @@ static u64 amd_pmu_save_disable_all(void) */ barrier(); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -378,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) if (!ctrl) return; - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; if (!test_bit(idx, cpuc->active_mask)) @@ -527,7 +522,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, atomic64_set(&hwc->prev_count, (u64)-left); err = checking_wrmsrl(hwc->counter_base + idx, - (u64)(-left) & counter_value_mask); + (u64)(-left) & x86_pmu.counter_mask); } static inline void @@ -621,8 +616,9 @@ static int x86_pmu_enable(struct perf_counter *counter) /* Try to get the previous generic counter again */ if (test_and_set_bit(idx, cpuc->used)) { try_generic: - idx = find_first_zero_bit(cpuc->used, nr_counters_generic); - if (idx == nr_counters_generic) + idx = find_first_zero_bit(cpuc->used, + x86_pmu.num_counters); + if (idx == x86_pmu.num_counters) return -EAGAIN; set_bit(idx, cpuc->used); @@ -654,7 +650,7 @@ void perf_counter_print_debug(void) struct cpu_hw_counters *cpuc; int cpu, idx; - if (!nr_counters_generic) + if (!x86_pmu.num_counters) return; local_irq_disable(); @@ -676,7 +672,7 @@ void perf_counter_print_debug(void) } pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); - for (idx = 0; idx < nr_counters_generic; idx++) { + for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); rdmsrl(x86_pmu.perfctr + idx, pmc_count); @@ -689,7 +685,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: gen-PMC%d left: %016llx\n", cpu, idx, prev_left); } - for (idx = 0; idx < nr_counters_fixed; idx++) { + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { rdmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, pmc_count); pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", @@ -911,6 +907,9 @@ static struct x86_pmu amd_pmu = { .event_map = amd_pmu_event_map, .raw_event = amd_pmu_raw_event, .max_events = ARRAY_SIZE(amd_perfmon_event_map), + .num_counters = 4, + .counter_bits = 48, + .counter_mask = (1ULL << 48) - 1, }; static int intel_pmu_init(void) @@ -941,10 +940,10 @@ static int intel_pmu_init(void) pr_info("... mask length: %d\n", eax.split.mask_length); x86_pmu = intel_pmu; - - nr_counters_generic = eax.split.num_counters; - nr_counters_fixed = edx.split.num_counters_fixed; - counter_value_mask = (1ULL << eax.split.bit_width) - 1; + x86_pmu.num_counters = eax.split.num_counters; + x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + x86_pmu.counter_bits = eax.split.bit_width; + x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; return 0; } @@ -952,12 +951,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - - nr_counters_generic = 4; - nr_counters_fixed = 0; - counter_value_mask = 0x0000FFFFFFFFFFFFULL; - counter_value_bits = 48; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -979,25 +972,26 @@ void __init init_hw_perf_counters(void) if (err != 0) return; - pr_info("... num counters: %d\n", nr_counters_generic); - if (nr_counters_generic > X86_PMC_MAX_GENERIC) { - nr_counters_generic = X86_PMC_MAX_GENERIC; + pr_info("... num counters: %d\n", x86_pmu.num_counters); + if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { + x86_pmu.num_counters = X86_PMC_MAX_GENERIC; WARN(1, KERN_ERR "hw perf counters %d > max(%d), clipping!", - nr_counters_generic, X86_PMC_MAX_GENERIC); + x86_pmu.num_counters, X86_PMC_MAX_GENERIC); } - perf_counter_mask = (1 << nr_counters_generic) - 1; - perf_max_counters = nr_counters_generic; + perf_counter_mask = (1 << x86_pmu.num_counters) - 1; + perf_max_counters = x86_pmu.num_counters; - pr_info("... value mask: %016Lx\n", counter_value_mask); + pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); - if (nr_counters_fixed > X86_PMC_MAX_FIXED) { - nr_counters_fixed = X86_PMC_MAX_FIXED; + if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { + x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; WARN(1, KERN_ERR "hw perf counters fixed %d > max(%d), clipping!", - nr_counters_fixed, X86_PMC_MAX_FIXED); + x86_pmu.num_counters_fixed, X86_PMC_MAX_FIXED); } - pr_info("... fixed counters: %d\n", nr_counters_fixed); + pr_info("... fixed counters: %d\n", x86_pmu.num_counters_fixed); - perf_counter_mask |= ((1LL << nr_counters_fixed)-1) << X86_PMC_IDX_FIXED; + perf_counter_mask |= + ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); perf_counters_initialized = true; From faa28ae018ed004a22aa4a7704e04ccdde4a941e Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:13 +0200 Subject: [PATCH 0684/2061] perf_counter, x86: make pmu version generic This makes the use of the version variable generic. Also, some debug messages have been generalized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-17-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 75dbb1f0900..15d2c03e16f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -39,6 +39,8 @@ struct cpu_hw_counters { * struct x86_pmu - generic x86 pmu */ struct x86_pmu { + const char *name; + int version; int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); @@ -61,8 +63,6 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = { .enabled = 1, }; -static __read_mostly int intel_perfmon_version; - /* * Intel PerfMon v3. Used on Core2 and later. */ @@ -658,7 +658,7 @@ void perf_counter_print_debug(void) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (intel_perfmon_version >= 2) { + if (x86_pmu.version >= 2) { rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); rdmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, overflow); @@ -884,6 +884,7 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { }; static struct x86_pmu intel_pmu = { + .name = "Intel", .handle_irq = intel_pmu_handle_irq, .save_disable_all = intel_pmu_save_disable_all, .restore_all = intel_pmu_restore_all, @@ -897,6 +898,7 @@ static struct x86_pmu intel_pmu = { }; static struct x86_pmu amd_pmu = { + .name = "AMD", .handle_irq = amd_pmu_handle_irq, .save_disable_all = amd_pmu_save_disable_all, .restore_all = amd_pmu_restore_all, @@ -918,6 +920,7 @@ static int intel_pmu_init(void) union cpuid10_eax eax; unsigned int unused; unsigned int ebx; + int version; if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return -ENODEV; @@ -930,16 +933,12 @@ static int intel_pmu_init(void) if (eax.split.mask_length <= ARCH_PERFMON_BRANCH_MISSES_RETIRED) return -ENODEV; - intel_perfmon_version = eax.split.version_id; - if (intel_perfmon_version < 2) + version = eax.split.version_id; + if (version < 2) return -ENODEV; - pr_info("Intel Performance Monitoring support detected.\n"); - pr_info("... version: %d\n", intel_perfmon_version); - pr_info("... bit width: %d\n", eax.split.bit_width); - pr_info("... mask length: %d\n", eax.split.mask_length); - x86_pmu = intel_pmu; + x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; x86_pmu.counter_bits = eax.split.bit_width; @@ -951,7 +950,6 @@ static int intel_pmu_init(void) static int amd_pmu_init(void) { x86_pmu = amd_pmu; - pr_info("AMD Performance Monitoring support detected.\n"); return 0; } @@ -972,6 +970,10 @@ void __init init_hw_perf_counters(void) if (err != 0) return; + pr_info("%s Performance Monitoring support detected.\n", x86_pmu.name); + pr_info("... version: %d\n", x86_pmu.version); + pr_info("... bit width: %d\n", x86_pmu.counter_bits); + pr_info("... num counters: %d\n", x86_pmu.num_counters); if (x86_pmu.num_counters > X86_PMC_MAX_GENERIC) { x86_pmu.num_counters = X86_PMC_MAX_GENERIC; From bb775fc2d1dcd1aa6eafde37a8289ba2d80783aa Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:14 +0200 Subject: [PATCH 0685/2061] perf_counter, x86: make x86_pmu_read() static inline [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-18-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 15d2c03e16f..3f3ae477a7d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1002,7 +1002,7 @@ void __init init_hw_perf_counters(void) register_die_notifier(&perf_counter_nmi_notifier); } -static void x86_pmu_read(struct perf_counter *counter) +static inline void x86_pmu_read(struct perf_counter *counter) { x86_perf_counter_update(counter, &counter->hw, counter->hw.idx); } From 93904966934193204ad08e951f806d5631c29eb3 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:15 +0200 Subject: [PATCH 0686/2061] perf_counter, x86: rename cpuc->active_mask This is to have a consistent naming scheme with cpuc->used. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-19-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3f3ae477a7d..9ec51a662db 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -29,9 +29,9 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; - unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; int enabled; }; @@ -334,7 +334,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -376,7 +376,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active_mask)) + if (!test_bit(idx, cpuc->active)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -424,7 +424,7 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active_mask); + set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -448,7 +448,7 @@ static void amd_pmu_disable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - clear_bit(idx, cpuc->active_mask); + clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } From 095342389e2ed8deed07b3076f990260ce3c7c9f Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:16 +0200 Subject: [PATCH 0687/2061] perf_counter, x86: generic use of cpuc->active cpuc->active will now be used to indicate an enabled counter which implies also valid pointers of cpuc->counters[]. In contrast, cpuc->used only locks the counter, but it can be still uninitialized. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-20-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 9ec51a662db..f7fd4a35515 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -424,7 +424,6 @@ static void amd_pmu_enable_counter(int idx, u64 config) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - set_bit(idx, cpuc->active); if (cpuc->enabled) config |= ARCH_PERFMON_EVENTSEL0_ENABLE; @@ -446,9 +445,6 @@ static void intel_pmu_disable_counter(int idx, u64 config) static void amd_pmu_disable_counter(int idx, u64 config) { - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - clear_bit(idx, cpuc->active); wrmsrl(MSR_K7_EVNTSEL0 + idx, config); } @@ -633,10 +629,7 @@ try_generic: __x86_pmu_disable(counter, hwc, idx); cpuc->counters[idx] = counter; - /* - * Make it visible before enabling the hw: - */ - barrier(); + set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); __x86_pmu_enable(counter, hwc, idx); @@ -700,10 +693,13 @@ static void x86_pmu_disable(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; unsigned int idx = hwc->idx; + /* + * Must be done before we disable, otherwise the nmi handler + * could reenable again: + */ + clear_bit(idx, cpuc->active); __x86_pmu_disable(counter, hwc, idx); - clear_bit(idx, cpuc->used); - cpuc->counters[idx] = NULL; /* * Make sure the cleared pointer becomes visible before we * (potentially) free the counter: @@ -715,6 +711,8 @@ static void x86_pmu_disable(struct perf_counter *counter) * that we are disabling: */ x86_perf_counter_update(counter, hwc, idx); + cpuc->counters[idx] = NULL; + clear_bit(idx, cpuc->used); } /* @@ -763,7 +761,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!counter) + if (!test_bit(bit, cpuc->active)) continue; intel_pmu_save_and_restart(counter); From 6f00cada07bb5da7f751929d3173494dcc5446cc Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:17 +0200 Subject: [PATCH 0688/2061] perf_counter, x86: consistent use of type int for counter index The type of counter index is sometimes implemented as unsigned int. This patch changes this to have a consistent usage of int. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-21-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++---- include/linux/perf_counter.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7fd4a35515..d8beebeb270 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -459,7 +459,7 @@ static void hw_perf_disable(int idx, u64 config) static inline void __pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -474,7 +474,7 @@ __pmc_fixed_disable(struct perf_counter *counter, static inline void __x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int idx) + struct hw_perf_counter *hwc, int idx) { if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) __pmc_fixed_disable(counter, hwc, idx); @@ -523,7 +523,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, static inline void __pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, unsigned int __idx) + struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -691,7 +691,7 @@ static void x86_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); struct hw_perf_counter *hwc = &counter->hw; - unsigned int idx = hwc->idx; + int idx = hwc->idx; /* * Must be done before we disable, otherwise the nmi handler diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c3db52dc876..41aed427005 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -318,7 +318,7 @@ struct hw_perf_counter { unsigned long config_base; unsigned long counter_base; int nmi; - unsigned int idx; + int idx; }; union { /* software */ atomic64_t count; From 7c90cc45f89af4dd4617f97d452740ad95b800d5 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:18 +0200 Subject: [PATCH 0689/2061] perf_counter, x86: rework counter enable functions There is vendor specific code in generic x86 code, and there is vendor specific code that could be generic. This patch introduces x86_pmu_enable_counter() for x86 generic code. Fixed counter code for Intel is moved to Intel only functions. In the end, checks and calls via function pointers were reduced to the necessary. Also, the internal function i/f changed. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-22-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 52 ++++++++++++++---------------- 1 file changed, 24 insertions(+), 28 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d8beebeb270..ae55933ce79 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -44,7 +44,7 @@ struct x86_pmu { int (*handle_irq)(struct pt_regs *, int); u64 (*save_disable_all)(void); void (*restore_all)(u64); - void (*enable)(int, u64); + void (*enable)(struct hw_perf_counter *, int); void (*disable)(int, u64); unsigned eventsel; unsigned perfctr; @@ -414,28 +414,15 @@ static inline void intel_pmu_ack_status(u64 ack) wrmsrl(MSR_CORE_PERF_GLOBAL_OVF_CTRL, ack); } -static void intel_pmu_enable_counter(int idx, u64 config) +static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, - config | ARCH_PERFMON_EVENTSEL0_ENABLE); -} + int err; -static void amd_pmu_enable_counter(int idx, u64 config) -{ - struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - - if (cpuc->enabled) - config |= ARCH_PERFMON_EVENTSEL0_ENABLE; - - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); -} - -static void hw_perf_enable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.enable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } static void intel_pmu_disable_counter(int idx, u64 config) @@ -522,8 +509,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, } static inline void -__pmc_fixed_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, bits, mask; @@ -548,14 +534,24 @@ __pmc_fixed_enable(struct perf_counter *counter, err = checking_wrmsrl(hwc->config_base, ctrl_val); } -static void -__x86_pmu_enable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_enable(counter, hwc, idx); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_enable_fixed(hwc, idx); + return; + } + + x86_pmu_enable_counter(hwc, idx); +} + +static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + + if (cpuc->enabled) + x86_pmu_enable_counter(hwc, idx); else - hw_perf_enable(idx, hwc->config); + amd_pmu_disable_counter(idx, hwc->config); } static int @@ -632,7 +628,7 @@ try_generic: set_bit(idx, cpuc->active); x86_perf_counter_set_period(counter, hwc, idx); - __x86_pmu_enable(counter, hwc, idx); + x86_pmu.enable(hwc, idx); return 0; } @@ -728,7 +724,7 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) x86_perf_counter_set_period(counter, hwc, idx); if (counter->state == PERF_COUNTER_STATE_ACTIVE) - __x86_pmu_enable(counter, hwc, idx); + intel_pmu_enable_counter(hwc, idx); } /* From d43698918bd46c71d494555fb92195fbea1fcb6c Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:19 +0200 Subject: [PATCH 0690/2061] perf_counter, x86: rework counter disable functions As for the enable function, this patch reworks the disable functions and introduces x86_pmu_disable_counter(). The internal function i/f in struct x86_pmu changed too. [ Impact: refactor and generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-23-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 48 ++++++++++++++---------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ae55933ce79..df9012bbd21 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -45,7 +45,7 @@ struct x86_pmu { u64 (*save_disable_all)(void); void (*restore_all)(u64); void (*enable)(struct hw_perf_counter *, int); - void (*disable)(int, u64); + void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; unsigned perfctr; u64 (*event_map)(int); @@ -425,28 +425,19 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } -static void intel_pmu_disable_counter(int idx, u64 config) +static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - wrmsrl(MSR_ARCH_PERFMON_EVENTSEL0 + idx, config); -} + int err; -static void amd_pmu_disable_counter(int idx, u64 config) -{ - wrmsrl(MSR_K7_EVNTSEL0 + idx, config); - -} - -static void hw_perf_disable(int idx, u64 config) -{ if (unlikely(!perf_counters_initialized)) return; - x86_pmu.disable(idx, config); + err = checking_wrmsrl(hwc->config_base + idx, + hwc->config); } static inline void -__pmc_fixed_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int __idx) +intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx) { int idx = __idx - X86_PMC_IDX_FIXED; u64 ctrl_val, mask; @@ -460,13 +451,20 @@ __pmc_fixed_disable(struct perf_counter *counter, } static inline void -__x86_pmu_disable(struct perf_counter *counter, - struct hw_perf_counter *hwc, int idx) +intel_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { - if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) - __pmc_fixed_disable(counter, hwc, idx); - else - hw_perf_disable(idx, hwc->config); + if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) { + intel_pmu_disable_fixed(hwc, idx); + return; + } + + x86_pmu_disable_counter(hwc, idx); +} + +static inline void +amd_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) +{ + x86_pmu_disable_counter(hwc, idx); } static DEFINE_PER_CPU(u64, prev_left[X86_PMC_IDX_MAX]); @@ -551,7 +549,7 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) if (cpuc->enabled) x86_pmu_enable_counter(hwc, idx); else - amd_pmu_disable_counter(idx, hwc->config); + x86_pmu_disable_counter(hwc, idx); } static int @@ -622,7 +620,7 @@ try_generic: perf_counters_lapic_init(hwc->nmi); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; set_bit(idx, cpuc->active); @@ -694,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * could reenable again: */ clear_bit(idx, cpuc->active); - __x86_pmu_disable(counter, hwc, idx); + x86_pmu.disable(hwc, idx); /* * Make sure the cleared pointer becomes visible before we @@ -762,7 +760,7 @@ again: intel_pmu_save_and_restart(counter); if (perf_counter_overflow(counter, nmi, regs, 0)) - __x86_pmu_disable(counter, &counter->hw, bit); + intel_pmu_disable_counter(&counter->hw, bit); } intel_pmu_ack_status(ack); From 85cf9dba92152bb4edec118b2f4f0be1ae7fdcab Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:20 +0200 Subject: [PATCH 0691/2061] perf_counter, x86: change and remove pmu initialization checks Some functions are only called if the pmu was proper initialized. That initalization checks can be removed. The way to check initialization changed too. Now, the pointer to the interrupt handler is checked. If it exists the pmu is initialized. This also removes a static variable and uses struct x86_pmu as only data source for the check. [ Impact: simplify code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-24-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 34 ++++++++++++------------------ 1 file changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index df9012bbd21..2d3681bbb52 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -23,7 +23,6 @@ #include #include -static bool perf_counters_initialized __read_mostly; static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { @@ -227,6 +226,11 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) } } +static inline int x86_pmu_initialized(void) +{ + return x86_pmu.handle_irq != NULL; +} + /* * Setup the hardware configuration for a given hw_event_type */ @@ -240,8 +244,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) return -ENOSYS; - if (unlikely(!perf_counters_initialized)) - return -EINVAL; + if (!x86_pmu_initialized()) + return -ENODEV; err = 0; if (atomic_inc_not_zero(&num_counters)) { @@ -348,9 +352,8 @@ static u64 amd_pmu_save_disable_all(void) u64 hw_perf_save_disable(void) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return 0; - return x86_pmu.save_disable_all(); } /* @@ -388,9 +391,8 @@ static void amd_pmu_restore_all(u64 ctrl) void hw_perf_restore(u64 ctrl) { - if (unlikely(!perf_counters_initialized)) + if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); } /* @@ -402,8 +404,6 @@ static inline u64 intel_pmu_get_status(u64 mask) { u64 status; - if (unlikely(!perf_counters_initialized)) - return 0; rdmsrl(MSR_CORE_PERF_GLOBAL_STATUS, status); return status; @@ -417,10 +417,6 @@ static inline void intel_pmu_ack_status(u64 ack) static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE); } @@ -428,10 +424,6 @@ static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx) static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx) { int err; - - if (unlikely(!perf_counters_initialized)) - return; - err = checking_wrmsrl(hwc->config_base + idx, hwc->config); } @@ -787,10 +779,10 @@ void perf_counter_unthrottle(void) { struct cpu_hw_counters *cpuc; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) + if (!x86_pmu_initialized()) return; - if (unlikely(!perf_counters_initialized)) + if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) return; cpuc = &__get_cpu_var(cpu_hw_counters); @@ -829,8 +821,9 @@ void perf_counters_lapic_init(int nmi) { u32 apic_val; - if (!perf_counters_initialized) + if (!x86_pmu_initialized()) return; + /* * Enable the performance counter vector in the APIC LVT: */ @@ -988,7 +981,6 @@ void __init init_hw_perf_counters(void) ((1LL << x86_pmu.num_counters_fixed)-1) << X86_PMC_IDX_FIXED; pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_initialized = true; perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); From a29aa8a7ff93e4196d558036928597e68337dd8d Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:21 +0200 Subject: [PATCH 0692/2061] perf_counter, x86: implement the interrupt handler for AMD cpus This patch implements the interrupt handler for AMD performance counters. In difference to the Intel pmu, there is no single status register and also there are no fixed counters. This makes the handler very different and it is useful to make the handler vendor specific. To check if a counter is overflowed the upper bit of the counter is checked. Only counters where the active bit is set are checked. With this patch throttling is enabled for AMD performance counters. This patch also reenables Linux performance counters on AMD cpus. [ Impact: re-enable perfcounters on AMD CPUs ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-25-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 45 ++++++++++++++++++++++++------ 1 file changed, 37 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 2d3681bbb52..f4d59d4cf3f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -240,10 +240,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) struct hw_perf_counter *hwc = &counter->hw; int err; - /* disable temporarily */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) - return -ENOSYS; - if (!x86_pmu_initialized()) return -ENODEV; @@ -773,7 +769,43 @@ out: return ret; } -static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { return 0; } +static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) +{ + int cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + u64 val; + int handled = 0; + struct perf_counter *counter; + struct hw_perf_counter *hwc; + int idx; + + ++cpuc->interrupts; + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + if (!test_bit(idx, cpuc->active)) + continue; + counter = cpuc->counters[idx]; + hwc = &counter->hw; + x86_perf_counter_update(counter, hwc, idx); + val = atomic64_read(&hwc->prev_count); + if (val & (1ULL << (x86_pmu.counter_bits - 1))) + continue; + /* counter overflow */ + x86_perf_counter_set_period(counter, hwc, idx); + handled = 1; + inc_irq_stat(apic_perf_irqs); + if (perf_counter_overflow(counter, nmi, regs, 0)) + amd_pmu_disable_counter(hwc, idx); + else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + /* + * do not reenable when throttled, but reload + * the register + */ + amd_pmu_disable_counter(hwc, idx); + else if (counter->state == PERF_COUNTER_STATE_ACTIVE) + amd_pmu_enable_counter(hwc, idx); + } + return handled; +} void perf_counter_unthrottle(void) { @@ -782,9 +814,6 @@ void perf_counter_unthrottle(void) if (!x86_pmu_initialized()) return; - if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) - return; - cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) From 4b7bfd0d276da3a006d37e85d3cf900d7a14ae2a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:22 +0200 Subject: [PATCH 0693/2061] perf_counter, x86: return raw count with x86_perf_counter_update() To check on AMD cpus if a counter overflows, the upper bit of the raw counter value must be checked. This value is already internally available in x86_perf_counter_update(). Now, the value is returned so that it can be used directly to check for overflows. [ Impact: micro-optimization ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-26-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f4d59d4cf3f..a8a53abd706 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -132,7 +132,7 @@ static u64 amd_pmu_raw_event(u64 event) * Can only be executed on the CPU where the counter is active. * Returns the delta events processed. */ -static void +static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { @@ -165,6 +165,8 @@ again: atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); + + return new_raw_count; } static atomic_t num_counters; @@ -785,8 +787,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; - x86_perf_counter_update(counter, hwc, idx); - val = atomic64_read(&hwc->prev_count); + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; /* counter overflow */ From c619b8ffb1cec6a431687a35695dc6fd292a79e6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:23 +0200 Subject: [PATCH 0694/2061] perf_counter, x86: introduce max_period variable In x86 pmus the allowed counter period to programm differs. This introduces a max_period value and allows the generic implementation for all models to check the max period. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-27-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a8a53abd706..4b8715b34f8 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -54,6 +54,7 @@ struct x86_pmu { int num_counters_fixed; int counter_bits; u64 counter_mask; + u64 max_period; }; static struct x86_pmu x86_pmu __read_mostly; @@ -279,14 +280,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; hwc->irq_period = hw_event->irq_period; - /* - * Intel PMCs cannot be accessed sanely above 32 bit width, - * so we install an artificial 1<<31 period regardless of - * the generic counter period: - */ - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > 0x7FFFFFFF) - hwc->irq_period = 0x7FFFFFFF; + if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) + hwc->irq_period = x86_pmu.max_period; atomic64_set(&hwc->period_left, hwc->irq_period); @@ -910,6 +905,12 @@ static struct x86_pmu intel_pmu = { .event_map = intel_pmu_event_map, .raw_event = intel_pmu_raw_event, .max_events = ARRAY_SIZE(intel_perfmon_event_map), + /* + * Intel PMCs cannot be accessed sanely above 32 bit width, + * so we install an artificial 1<<31 period regardless of + * the generic counter period: + */ + .max_period = (1ULL << 31) - 1, }; static struct x86_pmu amd_pmu = { @@ -927,6 +928,8 @@ static struct x86_pmu amd_pmu = { .num_counters = 4, .counter_bits = 48, .counter_mask = (1ULL << 48) - 1, + /* use highest bit to detect overflow */ + .max_period = (1ULL << 47) - 1, }; static int intel_pmu_init(void) @@ -999,6 +1002,7 @@ void __init init_hw_perf_counters(void) perf_max_counters = x86_pmu.num_counters; pr_info("... value mask: %016Lx\n", x86_pmu.counter_mask); + pr_info("... max period: %016Lx\n", x86_pmu.max_period); if (x86_pmu.num_counters_fixed > X86_PMC_MAX_FIXED) { x86_pmu.num_counters_fixed = X86_PMC_MAX_FIXED; From ef7b3e09ffdcd5200aea9523f6b56d331d1c4fc0 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:24 +0200 Subject: [PATCH 0695/2061] perf_counter, x86: remove vendor check in fixed_mode_idx() The function fixed_mode_idx() is used generically. Now it checks the num_counters_fixed value instead of the vendor to decide if fixed counters are present. [ Impact: generalize code ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 4b8715b34f8..d1c8036dcbd 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -542,7 +542,7 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) { unsigned int event; - if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) + if (!x86_pmu.num_counters_fixed) return -1; if (unlikely(hwc->nmi)) From 19d84dab55a383d75c885b5c1a618f5ead96f2f6 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:25 +0200 Subject: [PATCH 0696/2061] perf_counter, x86: remove unused function argument in intel_pmu_get_status() The mask argument is unused and thus can be removed. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-29-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d1c8036dcbd..856b0b85219 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -393,7 +393,7 @@ void hw_perf_restore(u64 ctrl) */ EXPORT_SYMBOL_GPL(hw_perf_restore); -static inline u64 intel_pmu_get_status(u64 mask) +static inline u64 intel_pmu_get_status(void) { u64 status; @@ -728,7 +728,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) cpuc->throttle_ctrl = intel_pmu_save_disable_all(); - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (!status) goto out; @@ -753,7 +753,7 @@ again: /* * Repeat if there is more work to be done: */ - status = intel_pmu_get_status(cpuc->throttle_ctrl); + status = intel_pmu_get_status(); if (status) goto again; out: From 38105f0234d4795c77c7c6845916caf3a395c451 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 12:47:26 +0200 Subject: [PATCH 0697/2061] perf_counter: update 'perf top' documentation The documentation about the perf-top build was outdated after perfstat has been implemented. This updates it. [ Impact: update documentation ] Signed-off-by: Robert Richter Cc: Paul Mackerras Acked-by: Peter Zijlstra LKML-Reference: <1241002046-8832-30-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-top.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 6a276d2b2bb..8d28864a20c 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -3,7 +3,7 @@ Build with: - cc -O6 -Wall -c -o kerneltop.o kerneltop.c -lrt + make -C Documentation/perf_counter/ Sample output: From 98144511427c192e4249ff66a3f9debc55c59411 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 29 Apr 2009 14:52:50 +0200 Subject: [PATCH 0698/2061] perf_counter: add/update copyrights Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 +++++---- kernel/perf_counter.c | 6 +++--- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 856b0b85219..47e563bfd4c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -1,10 +1,11 @@ /* * Performance counter x86 architecture code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * Copyright(C) 2009 Jaswinder Singh Rajput - * Copyright(C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2009 Jaswinder Singh Rajput + * Copyright (C) 2009 Advanced Micro Devices, Inc., Robert Richter + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licencing details see kernel-base/COPYING */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 582108addef..a95a171e608 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1,9 +1,9 @@ /* * Performance counter core code * - * Copyright(C) 2008 Thomas Gleixner - * Copyright(C) 2008 Red Hat, Inc., Ingo Molnar - * + * Copyright (C) 2008 Thomas Gleixner + * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar + * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra * * For licensing details see kernel-base/COPYING */ From ab7ef2e50a557af92f4f90689f51fadadafc16b2 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Wed, 29 Apr 2009 22:38:51 +1000 Subject: [PATCH 0699/2061] perf_counter: powerpc: allow use of limited-function counters POWER5+ and POWER6 have two hardware counters with limited functionality: PMC5 counts instructions completed in run state and PMC6 counts cycles in run state. (Run state is the state when a hardware RUN bit is 1; the idle task clears RUN while waiting for work to do and sets it when there is work to do.) These counters can't be written to by the kernel, can't generate interrupts, and don't obey the freeze conditions. That means we can only use them for per-task counters (where we know we'll always be in run state; we can't put a per-task counter on an idle task), and only if we don't want interrupts and we do want to count in all processor modes. Obviously some counters can't go on a limited hardware counter, but there are also situations where we can only put a counter on a limited hardware counter - if there are already counters on that exclude some processor modes and we want to put on a per-task cycle or instruction counter that doesn't exclude any processor mode, it could go on if it can use a limited hardware counter. To keep track of these constraints, this adds a flags argument to the processor-specific get_alternatives() functions, with three bits defined: one to say that we can accept alternative event codes that go on limited counters, one to say we only want alternatives on limited counters, and one to say that this is a per-task counter and therefore events that are gated by run state are equivalent to those that aren't (e.g. a "cycles" event is equivalent to a "cycles in run state" event). These flags are computed for each counter and stored in the counter->hw.counter_base field (slightly wonky name for what it does, but it was an existing unused field). Since the limited counters don't freeze when we freeze the other counters, we need some special handling to avoid getting skew between things counted on the limited counters and those counted on normal counters. To minimize this skew, if we are using any limited counters, we read PMC5 and PMC6 immediately after setting and clearing the freeze bit. This is done in a single asm in the new write_mmcr0() function. The code here is specific to PMC5 and PMC6 being the limited hardware counters. Being more general (e.g. having a bitmap of limited hardware counter numbers) would have meant more complex code to read the limited counters when freezing and unfreezing the normal counters, with conditional branches, which would have increased the skew. Since it isn't necessary for the code to be more general at this stage, it isn't. This also extends the back-ends for POWER5+ and POWER6 to be able to handle up to 6 counters rather than the 4 they previously handled. Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Robert Richter LKML-Reference: <18936.19035.163066.892208@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 13 +- arch/powerpc/kernel/perf_counter.c | 297 +++++++++++++++++++++--- arch/powerpc/kernel/power4-pmu.c | 3 +- arch/powerpc/kernel/power5+-pmu.c | 117 ++++++++-- arch/powerpc/kernel/power5-pmu.c | 3 +- arch/powerpc/kernel/power6-pmu.c | 119 ++++++++-- arch/powerpc/kernel/ppc970-pmu.c | 3 +- 7 files changed, 479 insertions(+), 76 deletions(-) diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 9d7ff6d7fb5..56d66c38143 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -12,6 +12,7 @@ #define MAX_HWCOUNTERS 8 #define MAX_EVENT_ALTERNATIVES 8 +#define MAX_LIMITED_HWCOUNTERS 2 /* * This struct provides the constants and functions needed to @@ -25,14 +26,24 @@ struct power_pmu { int (*compute_mmcr)(unsigned int events[], int n_ev, unsigned int hwc[], u64 mmcr[]); int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); - int (*get_alternatives)(unsigned int event, unsigned int alt[]); + int (*get_alternatives)(unsigned int event, unsigned int flags, + unsigned int alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); + int (*limited_pmc_event)(unsigned int event); + int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ int n_generic; int *generic_events; }; extern struct power_pmu *ppmu; +/* + * Values for flags to get_alternatives() + */ +#define PPMU_LIMITED_PMC_OK 1 /* can put this on a limited PMC */ +#define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ +#define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ + /* * The power_pmu.get_constraint function returns a 64-bit value and * a 64-bit mask that express the constraints between this event and diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index d9bbe5efc64..15cdc8e6722 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -23,10 +23,14 @@ struct cpu_hw_counters { int n_percpu; int disabled; int n_added; + int n_limited; + u8 pmcs_enabled; struct perf_counter *counter[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; + unsigned int flags[MAX_HWCOUNTERS]; u64 mmcr[3]; - u8 pmcs_enabled; + struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; + u8 limited_hwidx[MAX_LIMITED_HWCOUNTERS]; }; DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters); @@ -127,7 +131,8 @@ static void write_pmc(int idx, unsigned long val) * and see if any combination of alternative codes is feasible. * The feasible set is returned in event[]. */ -static int power_check_constraints(unsigned int event[], int n_ev) +static int power_check_constraints(unsigned int event[], unsigned int cflags[], + int n_ev) { u64 mask, value, nv; unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; @@ -144,11 +149,15 @@ static int power_check_constraints(unsigned int event[], int n_ev) /* First see if the events will go on as-is */ for (i = 0; i < n_ev; ++i) { - alternatives[i][0] = event[i]; + if ((cflags[i] & PPMU_LIMITED_PMC_REQD) + && !ppmu->limited_pmc_event(event[i])) { + ppmu->get_alternatives(event[i], cflags[i], + alternatives[i]); + event[i] = alternatives[i][0]; + } if (ppmu->get_constraint(event[i], &amasks[i][0], &avalues[i][0])) return -1; - choice[i] = 0; } value = mask = 0; for (i = 0; i < n_ev; ++i) { @@ -166,7 +175,9 @@ static int power_check_constraints(unsigned int event[], int n_ev) if (!ppmu->get_alternatives) return -1; for (i = 0; i < n_ev; ++i) { - n_alt[i] = ppmu->get_alternatives(event[i], alternatives[i]); + choice[i] = 0; + n_alt[i] = ppmu->get_alternatives(event[i], cflags[i], + alternatives[i]); for (j = 1; j < n_alt[i]; ++j) ppmu->get_constraint(alternatives[i][j], &amasks[i][j], &avalues[i][j]); @@ -231,28 +242,41 @@ static int power_check_constraints(unsigned int event[], int n_ev) * exclude_{user,kernel,hv} with each other and any previously * added counters. */ -static int check_excludes(struct perf_counter **ctrs, int n_prev, int n_new) +static int check_excludes(struct perf_counter **ctrs, unsigned int cflags[], + int n_prev, int n_new) { - int eu, ek, eh; - int i, n; + int eu = 0, ek = 0, eh = 0; + int i, n, first; struct perf_counter *counter; n = n_prev + n_new; if (n <= 1) return 0; - eu = ctrs[0]->hw_event.exclude_user; - ek = ctrs[0]->hw_event.exclude_kernel; - eh = ctrs[0]->hw_event.exclude_hv; - if (n_prev == 0) - n_prev = 1; - for (i = n_prev; i < n; ++i) { + first = 1; + for (i = 0; i < n; ++i) { + if (cflags[i] & PPMU_LIMITED_PMC_OK) { + cflags[i] &= ~PPMU_LIMITED_PMC_REQD; + continue; + } counter = ctrs[i]; - if (counter->hw_event.exclude_user != eu || - counter->hw_event.exclude_kernel != ek || - counter->hw_event.exclude_hv != eh) + if (first) { + eu = counter->hw_event.exclude_user; + ek = counter->hw_event.exclude_kernel; + eh = counter->hw_event.exclude_hv; + first = 0; + } else if (counter->hw_event.exclude_user != eu || + counter->hw_event.exclude_kernel != ek || + counter->hw_event.exclude_hv != eh) { return -EAGAIN; + } } + + if (eu || ek || eh) + for (i = 0; i < n; ++i) + if (cflags[i] & PPMU_LIMITED_PMC_OK) + cflags[i] |= PPMU_LIMITED_PMC_REQD; + return 0; } @@ -279,6 +303,85 @@ static void power_pmu_read(struct perf_counter *counter) atomic64_sub(delta, &counter->hw.period_left); } +/* + * On some machines, PMC5 and PMC6 can't be written, don't respect + * the freeze conditions, and don't generate interrupts. This tells + * us if `counter' is using such a PMC. + */ +static int is_limited_pmc(int pmcnum) +{ + return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6); +} + +static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_counter *counter; + u64 val, prev, delta; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + counter = cpuhw->limited_counter[i]; + if (!counter->hw.idx) + continue; + val = (counter->hw.idx == 5) ? pmc5 : pmc6; + prev = atomic64_read(&counter->hw.prev_count); + counter->hw.idx = 0; + delta = (val - prev) & 0xfffffffful; + atomic64_add(delta, &counter->count); + } +} + +static void thaw_limited_counters(struct cpu_hw_counters *cpuhw, + unsigned long pmc5, unsigned long pmc6) +{ + struct perf_counter *counter; + u64 val; + int i; + + for (i = 0; i < cpuhw->n_limited; ++i) { + counter = cpuhw->limited_counter[i]; + counter->hw.idx = cpuhw->limited_hwidx[i]; + val = (counter->hw.idx == 5) ? pmc5 : pmc6; + atomic64_set(&counter->hw.prev_count, val); + perf_counter_update_userpage(counter); + } +} + +/* + * Since limited counters don't respect the freeze conditions, we + * have to read them immediately after freezing or unfreezing the + * other counters. We try to keep the values from the limited + * counters as consistent as possible by keeping the delay (in + * cycles and instructions) between freezing/unfreezing and reading + * the limited counters as small and consistent as possible. + * Therefore, if any limited counters are in use, we read them + * both, and always in the same order, to minimize variability, + * and do it inside the same asm that writes MMCR0. + */ +static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) +{ + unsigned long pmc5, pmc6; + + if (!cpuhw->n_limited) { + mtspr(SPRN_MMCR0, mmcr0); + return; + } + + /* + * Write MMCR0, then read PMC5 and PMC6 immediately. + */ + asm volatile("mtspr %3,%2; mfspr %0,%4; mfspr %1,%5" + : "=&r" (pmc5), "=&r" (pmc6) + : "r" (mmcr0), "i" (SPRN_MMCR0), + "i" (SPRN_PMC5), "i" (SPRN_PMC6)); + + if (mmcr0 & MMCR0_FC) + freeze_limited_counters(cpuhw, pmc5, pmc6); + else + thaw_limited_counters(cpuhw, pmc5, pmc6); +} + /* * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. @@ -321,7 +424,7 @@ u64 hw_perf_save_disable(void) * executed and the PMU has frozen the counters * before we return. */ - mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) | MMCR0_FC); + write_mmcr0(cpuhw, mfspr(SPRN_MMCR0) | MMCR0_FC); mb(); } local_irq_restore(flags); @@ -342,6 +445,8 @@ void hw_perf_restore(u64 disable) unsigned long val; s64 left; unsigned int hwc_index[MAX_HWCOUNTERS]; + int n_lim; + int idx; if (disable) return; @@ -414,10 +519,18 @@ void hw_perf_restore(u64 disable) /* * Initialize the PMCs for all the new and moved counters. */ + cpuhw->n_limited = n_lim = 0; for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; if (counter->hw.idx) continue; + idx = hwc_index[i] + 1; + if (is_limited_pmc(idx)) { + cpuhw->limited_counter[n_lim] = counter; + cpuhw->limited_hwidx[n_lim] = idx; + ++n_lim; + continue; + } val = 0; if (counter->hw_event.irq_period) { left = atomic64_read(&counter->hw.period_left); @@ -425,15 +538,16 @@ void hw_perf_restore(u64 disable) val = 0x80000000L - left; } atomic64_set(&counter->hw.prev_count, val); - counter->hw.idx = hwc_index[i] + 1; - write_pmc(counter->hw.idx, val); + counter->hw.idx = idx; + write_pmc(idx, val); perf_counter_update_userpage(counter); } + cpuhw->n_limited = n_lim; cpuhw->mmcr[0] |= MMCR0_PMXE | MMCR0_FCECE; out_enable: mb(); - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); /* * Enable instruction sampling if necessary @@ -448,7 +562,8 @@ void hw_perf_restore(u64 disable) } static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], unsigned int *events) + struct perf_counter *ctrs[], unsigned int *events, + unsigned int *flags) { int n = 0; struct perf_counter *counter; @@ -457,6 +572,7 @@ static int collect_events(struct perf_counter *group, int max_count, if (n >= max_count) return -1; ctrs[n] = group; + flags[n] = group->hw.counter_base; events[n++] = group->hw.config; } list_for_each_entry(counter, &group->sibling_list, list_entry) { @@ -465,6 +581,7 @@ static int collect_events(struct perf_counter *group, int max_count, if (n >= max_count) return -1; ctrs[n] = counter; + flags[n] = counter->hw.counter_base; events[n++] = counter->hw.config; } } @@ -497,12 +614,14 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, cpuhw = &__get_cpu_var(cpu_hw_counters); n0 = cpuhw->n_counters; n = collect_events(group_leader, ppmu->n_counter - n0, - &cpuhw->counter[n0], &cpuhw->events[n0]); + &cpuhw->counter[n0], &cpuhw->events[n0], + &cpuhw->flags[n0]); if (n < 0) return -EAGAIN; - if (check_excludes(cpuhw->counter, n0, n)) + if (check_excludes(cpuhw->counter, cpuhw->flags, n0, n)) return -EAGAIN; - if (power_check_constraints(cpuhw->events, n + n0)) + i = power_check_constraints(cpuhw->events, cpuhw->flags, n + n0); + if (i < 0) return -EAGAIN; cpuhw->n_counters = n0 + n; cpuhw->n_added += n; @@ -554,9 +673,10 @@ static int power_pmu_enable(struct perf_counter *counter) goto out; cpuhw->counter[n0] = counter; cpuhw->events[n0] = counter->hw.config; - if (check_excludes(cpuhw->counter, n0, 1)) + cpuhw->flags[n0] = counter->hw.counter_base; + if (check_excludes(cpuhw->counter, cpuhw->flags, n0, 1)) goto out; - if (power_check_constraints(cpuhw->events, n0 + 1)) + if (power_check_constraints(cpuhw->events, cpuhw->flags, n0 + 1)) goto out; counter->hw.config = cpuhw->events[n0]; @@ -592,12 +712,24 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->counter[i-1] = cpuhw->counter[i]; --cpuhw->n_counters; ppmu->disable_pmc(counter->hw.idx - 1, cpuhw->mmcr); - write_pmc(counter->hw.idx, 0); - counter->hw.idx = 0; + if (counter->hw.idx) { + write_pmc(counter->hw.idx, 0); + counter->hw.idx = 0; + } perf_counter_update_userpage(counter); break; } } + for (i = 0; i < cpuhw->n_limited; ++i) + if (counter == cpuhw->limited_counter[i]) + break; + if (i < cpuhw->n_limited) { + while (++i < cpuhw->n_limited) { + cpuhw->limited_counter[i-1] = cpuhw->limited_counter[i]; + cpuhw->limited_hwidx[i-1] = cpuhw->limited_hwidx[i]; + } + --cpuhw->n_limited; + } if (cpuhw->n_counters == 0) { /* disable exceptions if no counters are running */ cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); @@ -613,6 +745,61 @@ struct pmu power_pmu = { .read = power_pmu_read, }; +/* + * Return 1 if we might be able to put counter on a limited PMC, + * or 0 if not. + * A counter can only go on a limited PMC if it counts something + * that a limited PMC can count, doesn't require interrupts, and + * doesn't exclude any processor mode. + */ +static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, + unsigned int flags) +{ + int n; + unsigned int alt[MAX_EVENT_ALTERNATIVES]; + + if (counter->hw_event.exclude_user + || counter->hw_event.exclude_kernel + || counter->hw_event.exclude_hv + || counter->hw_event.irq_period) + return 0; + + if (ppmu->limited_pmc_event(ev)) + return 1; + + /* + * The requested event isn't on a limited PMC already; + * see if any alternative code goes on a limited PMC. + */ + if (!ppmu->get_alternatives) + return 0; + + flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; + n = ppmu->get_alternatives(ev, flags, alt); + if (n) + return alt[0]; + + return 0; +} + +/* + * Find an alternative event that goes on a normal PMC, if possible, + * and return the event code, or 0 if there is no such alternative. + * (Note: event code 0 is "don't count" on all machines.) + */ +static unsigned long normal_pmc_alternative(unsigned long ev, + unsigned long flags) +{ + unsigned int alt[MAX_EVENT_ALTERNATIVES]; + int n; + + flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); + n = ppmu->get_alternatives(ev, flags, alt); + if (!n) + return 0; + return alt[0]; +} + /* Number of perf_counters counting hardware events */ static atomic_t num_counters; /* Used to avoid races in calling reserve/release_pmc_hardware */ @@ -633,9 +820,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { - unsigned long ev; + unsigned long ev, flags; struct perf_counter *ctrs[MAX_HWCOUNTERS]; unsigned int events[MAX_HWCOUNTERS]; + unsigned int cflags[MAX_HWCOUNTERS]; int n; int err; @@ -661,7 +849,36 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) */ if (!firmware_has_feature(FW_FEATURE_LPAR)) counter->hw_event.exclude_hv = 0; - + + /* + * If this is a per-task counter, then we can use + * PM_RUN_* events interchangeably with their non RUN_* + * equivalents, e.g. PM_RUN_CYC instead of PM_CYC. + * XXX we should check if the task is an idle task. + */ + flags = 0; + if (counter->ctx->task) + flags |= PPMU_ONLY_COUNT_RUN; + + /* + * If this machine has limited counters, check whether this + * event could go on a limited counter. + */ + if (ppmu->limited_pmc5_6) { + if (can_go_on_limited_pmc(counter, ev, flags)) { + flags |= PPMU_LIMITED_PMC_OK; + } else if (ppmu->limited_pmc_event(ev)) { + /* + * The requested event is on a limited PMC, + * but we can't use a limited PMC; see if any + * alternative goes on a normal PMC. + */ + ev = normal_pmc_alternative(ev, flags); + if (!ev) + return ERR_PTR(-EINVAL); + } + } + /* * If this is in a group, check if it can go on with all the * other hardware counters in the group. We assume the counter @@ -670,18 +887,20 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) n = 0; if (counter->group_leader != counter) { n = collect_events(counter->group_leader, ppmu->n_counter - 1, - ctrs, events); + ctrs, events, cflags); if (n < 0) return ERR_PTR(-EINVAL); } events[n] = ev; ctrs[n] = counter; - if (check_excludes(ctrs, n, 1)) + cflags[n] = flags; + if (check_excludes(ctrs, cflags, n, 1)) return ERR_PTR(-EINVAL); - if (power_check_constraints(events, n + 1)) + if (power_check_constraints(events, cflags, n + 1)) return ERR_PTR(-EINVAL); counter->hw.config = events[n]; + counter->hw.counter_base = cflags[n]; atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); /* @@ -763,6 +982,10 @@ static void perf_counter_interrupt(struct pt_regs *regs) int found = 0; int nmi; + if (cpuhw->n_limited) + freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), + mfspr(SPRN_PMC6)); + /* * If interrupts were soft-disabled when this PMU interrupt * occurred, treat it as an NMI. @@ -775,6 +998,8 @@ static void perf_counter_interrupt(struct pt_regs *regs) for (i = 0; i < cpuhw->n_counters; ++i) { counter = cpuhw->counter[i]; + if (is_limited_pmc(counter->hw.idx)) + continue; val = read_pmc(counter->hw.idx); if ((int)val < 0) { /* counter has overflowed */ @@ -791,6 +1016,8 @@ static void perf_counter_interrupt(struct pt_regs *regs) */ if (!found) { for (i = 0; i < ppmu->n_counter; ++i) { + if (is_limited_pmc(i + 1)) + continue; val = read_pmc(i + 1); if ((int)val < 0) write_pmc(i + 1, 0); @@ -804,7 +1031,7 @@ static void perf_counter_interrupt(struct pt_regs *regs) * XXX might want to use MSR.PM to keep the counters frozen until * we get back out of this interrupt. */ - mtspr(SPRN_MMCR0, cpuhw->mmcr[0]); + write_mmcr0(cpuhw, cpuhw->mmcr[0]); if (nmi) nmi_exit(); diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 1407b19ab61..744a2756958 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -320,7 +320,8 @@ static unsigned int ppc_inst_cmpl[] = { 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 }; -static int p4_get_alternatives(unsigned int event, unsigned int alt[]) +static int p4_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, na; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 1222c8ea3c2..8154eaa2404 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -78,8 +78,8 @@ * Layout of constraint bits: * 6666555555555544444444443333333333222222222211111111110000000000 * 3210987654321098765432109876543210987654321098765432109876543210 - * [ ><><>< ><> <><>[ > < >< >< >< ><><><><> - * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P4P3P2P1 + * [ ><><>< ><> <><>[ > < >< >< >< ><><><><><><> + * NC G0G1G2 G3 T0T1 UC B0 B1 B2 B3 P6P5P4P3P2P1 * * NC - number of counters * 51: NC error 0x0008_0000_0000_0000 @@ -105,18 +105,18 @@ * 30: IDU|GRS events needed 0x00_4000_0000 * * B0 - * 20-23: Byte 0 event source 0x00f0_0000 + * 24-27: Byte 0 event source 0x0f00_0000 * Encoding as for the event code * * B1, B2, B3 - * 16-19, 12-15, 8-11: Byte 1, 2, 3 event sources + * 20-23, 16-19, 12-15: Byte 1, 2, 3 event sources * - * P4 - * 7: P1 error 0x80 - * 6-7: Count of events needing PMC4 + * P6 + * 11: P6 error 0x800 + * 10-11: Count of events needing PMC6 * - * P1..P3 - * 0-6: Count of events needing PMC1..PMC3 + * P1..P5 + * 0-9: Count of events needing PMC1..PMC5 */ static const int grsel_shift[8] = { @@ -143,11 +143,13 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 6) return -1; sh = (pmc - 1) * 2; mask |= 2 << sh; value |= 1 << sh; + if (pmc >= 5 && !(event == 0x500009 || event == 0x600005)) + return -1; } if (event & PM_BUSEVENT_MSK) { unit = (event >> PM_UNIT_SH) & PM_UNIT_MSK; @@ -173,16 +175,26 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) value |= (u64)((event >> PM_GRS_SH) & fmask) << sh; } /* Set byte lane select field */ - mask |= 0xfULL << (20 - 4 * byte); - value |= (u64)unit << (20 - 4 * byte); + mask |= 0xfULL << (24 - 4 * byte); + value |= (u64)unit << (24 - 4 * byte); + } + if (pmc < 5) { + /* need a counter from PMC1-4 set */ + mask |= 0x8000000000000ull; + value |= 0x1000000000000ull; } - mask |= 0x8000000000000ull; - value |= 0x1000000000000ull; *maskp = mask; *valp = value; return 0; } +static int power5p_limited_pmc_event(unsigned int event) +{ + int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + + return pmc == 5 || pmc == 6; +} + #define MAX_ALT 3 /* at most 3 alternatives for any event */ static const unsigned int event_alternatives[][MAX_ALT] = { @@ -193,6 +205,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { { 0x410c7, 0x441084 }, /* PM_THRD_L2MISS_BOTH_CYC */ { 0x800c4, 0xc20e0 }, /* PM_DTLB_MISS */ { 0xc50c6, 0xc60e0 }, /* PM_MRK_DTLB_MISS */ + { 0x100005, 0x600005 }, /* PM_RUN_CYC */ { 0x100009, 0x200009 }, /* PM_INST_CMPL */ { 0x200015, 0x300015 }, /* PM_LSU_LMQ_SRQ_EMPTY_CYC */ { 0x300009, 0x400009 }, /* PM_INST_DISP */ @@ -260,24 +273,85 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5p_get_alternatives(unsigned int event, unsigned int alt[]) +static int power5p_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, ae, nalt = 1; + int nlim; alt[0] = event; nalt = 1; + nlim = power5p_limited_pmc_event(event); i = find_alternative(event); if (i >= 0) { for (j = 0; j < MAX_ALT; ++j) { ae = event_alternatives[i][j]; if (ae && ae != event) alt[nalt++] = ae; + nlim += power5p_limited_pmc_event(ae); } } else { ae = find_alternative_bdecode(event); if (ae > 0) alt[nalt++] = ae; } + + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, + * so PM_CYC is equivalent to PM_RUN_CYC + * and PM_INST_CMPL === PM_RUN_INST_CMPL. + * This doesn't include alternatives that don't provide + * any extra flexibility in assigning PMCs (e.g. + * 0x100005 for PM_RUN_CYC vs. 0xf for PM_CYC). + * Note that even with these additional alternatives + * we never end up with more than 3 alternatives for any event. + */ + j = nalt; + for (i = 0; i < nalt; ++i) { + switch (alt[i]) { + case 0xf: /* PM_CYC */ + alt[j++] = 0x600005; /* PM_RUN_CYC */ + ++nlim; + break; + case 0x600005: /* PM_RUN_CYC */ + alt[j++] = 0xf; + break; + case 0x100009: /* PM_INST_CMPL */ + alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ + ++nlim; + break; + case 0x500009: /* PM_RUN_INST_CMPL */ + alt[j++] = 0x100009; /* PM_INST_CMPL */ + alt[j++] = 0x200009; + break; + } + } + nalt = j; + } + + if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { + /* remove the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (!power5p_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { + /* remove all but the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (power5p_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } + return nalt; } @@ -390,7 +464,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, unsigned char unituse[16]; int ttmuse; - if (n_ev > 4) + if (n_ev > 6) return -1; /* First pass to count resource use */ @@ -399,7 +473,7 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, for (i = 0; i < n_ev; ++i) { pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 6) return -1; if (pmc_inuse & (1 << (pmc - 1))) return -1; @@ -488,13 +562,16 @@ static int power5p_compute_mmcr(unsigned int event[], int n_ev, if (pmc >= 4) return -1; pmc_inuse |= 1 << pmc; - } else { + } else if (pmc <= 4) { /* Direct event */ --pmc; if (isbus && (byte & 2) && (psel == 8 || psel == 0x10 || psel == 0x28)) /* add events on higher-numbered bus */ mmcr1 |= 1ull << (MMCR1_PMC1_ADDER_SEL_SH - pmc); + } else { + /* Instructions or run cycles on PMC5/6 */ + --pmc; } if (isbus && unit == PM_GRS) { bit = psel & 7; @@ -538,7 +615,7 @@ static int power5p_generic_events[] = { }; struct power_pmu power5p_pmu = { - .n_counter = 4, + .n_counter = 6, .max_alternatives = MAX_ALT, .add_fields = 0x7000000000055ull, .test_adder = 0x3000040000000ull, @@ -548,4 +625,6 @@ struct power_pmu power5p_pmu = { .disable_pmc = power5p_disable_pmc, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, + .limited_pmc5_6 = 1, + .limited_pmc_event = power5p_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 116c4bb1809..6e667dc8647 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -269,7 +269,8 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5_get_alternatives(unsigned int event, unsigned int alt[]) +static int power5_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { int i, j, ae, nalt = 1; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index fce1fc290a1..d44049f0ae2 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -182,7 +182,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, unsigned int ttmset = 0; unsigned int pmc_inuse = 0; - if (n_ev > 4) + if (n_ev > 6) return -1; for (i = 0; i < n_ev; ++i) { pmc = (event[i] >> PM_PMC_SH) & PM_PMC_MSK; @@ -202,6 +202,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, for (pmc = 0; pmc < 4; ++pmc) if (!(pmc_inuse & (1 << pmc))) break; + if (pmc >= 4) + return -1; pmc_inuse |= 1 << pmc; } hwc[i] = pmc; @@ -240,7 +242,8 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, } if (power6_marked_instr_event(event[i])) mmcra |= MMCRA_SAMPLE_ENABLE; - mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); + if (pmc < 4) + mmcr1 |= (u64)psel << MMCR1_PMCSEL_SH(pmc); } mmcr[0] = 0; if (pmc_inuse & 1) @@ -256,19 +259,20 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, * Layout of constraint bits: * * 0-1 add field: number of uses of PMC1 (max 1) - * 2-3, 4-5, 6-7: ditto for PMC2, 3, 4 - * 8-10 select field: nest (subunit) event selector + * 2-3, 4-5, 6-7, 8-9, 10-11: ditto for PMC2, 3, 4, 5, 6 + * 12-15 add field: number of uses of PMC1-4 (max 4) * 16-19 select field: unit on byte 0 of event bus * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 + * 32-34 select field: nest (subunit) event selector */ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) { - int pmc, byte, sh; - unsigned int mask = 0, value = 0; + int pmc, byte, sh, subunit; + u64 mask = 0, value = 0; pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; if (pmc) { - if (pmc > 4) + if (pmc > 4 && !(event == 0x500009 || event == 0x600005)) return -1; sh = (pmc - 1) * 2; mask |= 2 << sh; @@ -276,26 +280,38 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) } if (event & PM_BUSEVENT_MSK) { byte = (event >> PM_BYTE_SH) & PM_BYTE_MSK; - sh = byte * 4; + sh = byte * 4 + (16 - PM_UNIT_SH); mask |= PM_UNIT_MSKS << sh; - value |= (event & PM_UNIT_MSKS) << sh; + value |= (u64)(event & PM_UNIT_MSKS) << sh; if ((event & PM_UNIT_MSKS) == (5 << PM_UNIT_SH)) { - mask |= PM_SUBUNIT_MSKS; - value |= event & PM_SUBUNIT_MSKS; + subunit = (event >> PM_SUBUNIT_SH) & PM_SUBUNIT_MSK; + mask |= (u64)PM_SUBUNIT_MSK << 32; + value |= (u64)subunit << 32; } } + if (pmc <= 4) { + mask |= 0x8000; /* add field for count of PMC1-4 uses */ + value |= 0x1000; + } *maskp = mask; *valp = value; return 0; } +static int p6_limited_pmc_event(unsigned int event) +{ + int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; + + return pmc == 5 || pmc == 6; +} + #define MAX_ALT 4 /* at most 4 alternatives for any event */ static const unsigned int event_alternatives[][MAX_ALT] = { { 0x0130e8, 0x2000f6, 0x3000fc }, /* PM_PTEG_RELOAD_VALID */ { 0x080080, 0x10000d, 0x30000c, 0x4000f0 }, /* PM_LD_MISS_L1 */ { 0x080088, 0x200054, 0x3000f0 }, /* PM_ST_MISS_L1 */ - { 0x10000a, 0x2000f4 }, /* PM_RUN_CYC */ + { 0x10000a, 0x2000f4, 0x600005 }, /* PM_RUN_CYC */ { 0x10000b, 0x2000f5 }, /* PM_RUN_COUNT */ { 0x10000e, 0x400010 }, /* PM_PURR */ { 0x100010, 0x4000f8 }, /* PM_FLUSH */ @@ -340,13 +356,15 @@ static int find_alternatives_list(unsigned int event) return -1; } -static int p6_get_alternatives(unsigned int event, unsigned int alt[]) +static int p6_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { - int i, j; + int i, j, nlim; unsigned int aevent, psel, pmc; unsigned int nalt = 1; alt[0] = event; + nlim = p6_limited_pmc_event(event); /* check the alternatives table */ i = find_alternatives_list(event); @@ -358,6 +376,7 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[]) break; if (aevent != event) alt[nalt++] = aevent; + nlim += p6_limited_pmc_event(aevent); } } else { @@ -375,13 +394,75 @@ static int p6_get_alternatives(unsigned int event, unsigned int alt[]) ((pmc > 2? pmc - 2: pmc + 2) << PM_PMC_SH); } + if (flags & PPMU_ONLY_COUNT_RUN) { + /* + * We're only counting in RUN state, + * so PM_CYC is equivalent to PM_RUN_CYC, + * PM_INST_CMPL === PM_RUN_INST_CMPL, PM_PURR === PM_RUN_PURR. + * This doesn't include alternatives that don't provide + * any extra flexibility in assigning PMCs (e.g. + * 0x10000a for PM_RUN_CYC vs. 0x1e for PM_CYC). + * Note that even with these additional alternatives + * we never end up with more than 4 alternatives for any event. + */ + j = nalt; + for (i = 0; i < nalt; ++i) { + switch (alt[i]) { + case 0x1e: /* PM_CYC */ + alt[j++] = 0x600005; /* PM_RUN_CYC */ + ++nlim; + break; + case 0x10000a: /* PM_RUN_CYC */ + alt[j++] = 0x1e; /* PM_CYC */ + break; + case 2: /* PM_INST_CMPL */ + alt[j++] = 0x500009; /* PM_RUN_INST_CMPL */ + ++nlim; + break; + case 0x500009: /* PM_RUN_INST_CMPL */ + alt[j++] = 2; /* PM_INST_CMPL */ + break; + case 0x10000e: /* PM_PURR */ + alt[j++] = 0x4000f4; /* PM_RUN_PURR */ + break; + case 0x4000f4: /* PM_RUN_PURR */ + alt[j++] = 0x10000e; /* PM_PURR */ + break; + } + } + nalt = j; + } + + if (!(flags & PPMU_LIMITED_PMC_OK) && nlim) { + /* remove the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (!p6_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } else if ((flags & PPMU_LIMITED_PMC_REQD) && nlim < nalt) { + /* remove all but the limited PMC events */ + j = 0; + for (i = 0; i < nalt; ++i) { + if (p6_limited_pmc_event(alt[i])) { + alt[j] = alt[i]; + ++j; + } + } + nalt = j; + } + return nalt; } static void p6_disable_pmc(unsigned int pmc, u64 mmcr[]) { /* Set PMCxSEL to 0 to disable PMCx */ - mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); + if (pmc <= 3) + mmcr[1] &= ~(0xffUL << MMCR1_PMCSEL_SH(pmc)); } static int power6_generic_events[] = { @@ -394,14 +475,16 @@ static int power6_generic_events[] = { }; struct power_pmu power6_pmu = { - .n_counter = 4, + .n_counter = 6, .max_alternatives = MAX_ALT, - .add_fields = 0x55, - .test_adder = 0, + .add_fields = 0x1555, + .test_adder = 0x3000, .compute_mmcr = p6_compute_mmcr, .get_constraint = p6_get_constraint, .get_alternatives = p6_get_alternatives, .disable_pmc = p6_disable_pmc, .n_generic = ARRAY_SIZE(power6_generic_events), .generic_events = power6_generic_events, + .limited_pmc5_6 = 1, + .limited_pmc_event = p6_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index aed8ccd7c07..af2d1884058 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -243,7 +243,8 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p970_get_alternatives(unsigned int event, unsigned int alt[]) +static int p970_get_alternatives(unsigned int event, unsigned int flags, + unsigned int alt[]) { alt[0] = event; From b1fca26631f76a5e8b18435a43f5d82b8734da4b Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 23 Mar 2009 18:22:09 +0100 Subject: [PATCH 0700/2061] mutex: add atomic_dec_and_mutex_lock() Much like the atomic_dec_and_lock() function in which we take an hold a spin_lock if we drop the atomic to 0 this function takes and holds the mutex if we dec the atomic to 0. Signed-off-by: Eric Paris Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Orig-LKML-Reference: <20090323172417.410913479@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 3069ec7e0ab..93054fc3635 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -151,4 +151,27 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} + #endif From 0c8b946e3ebb3846103486420ea7430a4b5e5b1b Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 15 Apr 2009 17:48:18 +0200 Subject: [PATCH 0701/2061] vsprintf: introduce %pf format specifier A printf format specifier which would allow us to print a pure function name has been suggested by Andrew Morton a couple of months ago. The current %pF is very convenient to print a function symbol, but often we only want to print the name of the function, without its asm offset. That's what %pf does in this patch. The lowecase f has been chosen for its intuitive meaning of a 'weak kind of %pF'. The support for this new format would be welcome by the tracing code where the need to print pure function names is often needed. This is also true for other parts of the kernel: $ git-grep -E "kallsyms_lookup\(.+?\)" arch/blackfin/kernel/traps.c: symname = kallsyms_lookup(address, &symsize, &offset, &modname, namebuf); arch/powerpc/xmon/xmon.c: name = kallsyms_lookup(pc, &size, &offset, NULL, tmpstr); arch/sh/kernel/cpu/sh5/unwind.c: sym = kallsyms_lookup(pc, NULL, &offset, NULL, namebuf); arch/x86/kernel/ftrace.c: kallsyms_lookup((unsigned long) syscall, NULL, NULL, NULL, str); kernel/kprobes.c: sym = kallsyms_lookup((unsigned long)p->addr, NULL, kernel/lockdep.c: return kallsyms_lookup((unsigned long)key, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup((unsigned long)rec->ops->func, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, NULL, str); kernel/trace/ftrace.c: kallsyms_lookup(rec->ip, NULL, NULL, &modname, str); kernel/trace/ftrace.c: kallsyms_lookup(*ptr, NULL, NULL, NULL, str); kernel/trace/trace_functions.c: kallsyms_lookup(ip, NULL, NULL, NULL, str); kernel/trace/trace_output.c: kallsyms_lookup(address, NULL, NULL, NULL, str); Changes in v2: - Add the explanation of the %pf role for vsnprintf() and bstr_printf() - Change the comments by dropping the "asm offset" notion and only define the %pf against the actual function offset notion. Signed-off-by: Frederic Weisbecker Acked-by: Mike Frysinger Cc: Linus Torvalds Cc: Zhaolei Cc: Tom Zanussi Cc: Li Zefan Cc: Andrew Morton Cc: Steven Rostedt LKML-Reference: <20090415154817.GC5989@nowhere> Signed-off-by: Ingo Molnar --- lib/vsprintf.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/lib/vsprintf.c b/lib/vsprintf.c index b56f6d039d2..756ccafa9ce 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -575,12 +575,15 @@ static char *string(char *buf, char *end, char *s, struct printf_spec spec) } static char *symbol_string(char *buf, char *end, void *ptr, - struct printf_spec spec) + struct printf_spec spec, char ext) { unsigned long value = (unsigned long) ptr; #ifdef CONFIG_KALLSYMS char sym[KSYM_SYMBOL_LEN]; - sprint_symbol(sym, value); + if (ext != 'f') + sprint_symbol(sym, value); + else + kallsyms_lookup(value, NULL, NULL, NULL, sym); return string(buf, end, sym, spec); #else spec.field_width = 2*sizeof(void *); @@ -692,7 +695,8 @@ static char *ip4_addr_string(char *buf, char *end, u8 *addr, * * Right now we handle: * - * - 'F' For symbolic function descriptor pointers + * - 'F' For symbolic function descriptor pointers with offset + * - 'f' For simple symbolic function names without offset * - 'S' For symbolic direct pointers * - 'R' For a struct resource pointer, it prints the range of * addresses (not the name nor the flags) @@ -715,10 +719,11 @@ static char *pointer(const char *fmt, char *buf, char *end, void *ptr, switch (*fmt) { case 'F': + case 'f': ptr = dereference_function_descriptor(ptr); /* Fallthrough */ case 'S': - return symbol_string(buf, end, ptr, spec); + return symbol_string(buf, end, ptr, spec, *fmt); case 'R': return resource_string(buf, end, ptr, spec); case 'm': @@ -954,7 +959,8 @@ qualifier: * * This function follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * * The return value is the number of characters which would @@ -1412,7 +1418,8 @@ EXPORT_SYMBOL_GPL(vbin_printf); * * The format follows C99 vsnprintf, but has some extensions: * %pS output the name of a text symbol - * %pF output the name of a function pointer + * %pF output the name of a function pointer with its offset + * %pf output the name of a function pointer without its offset * %pR output the address range in a struct resource * %n is ignored * From 43f6201a22dbf1c5abe1cab96b49bd56fa9df8f4 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 29 Apr 2009 16:55:56 +0200 Subject: [PATCH 0702/2061] perf_counter, x86: rename bitmasks to ->used_mask and ->active_mask Standardize on explicitly mentioning '_mask' in fields that are not plain flags but masks. This avoids typos like: if (cpuc->used) (which could easily slip through review unnoticed), while if a typo looks like this: if (cpuc->used_mask) it might get noticed during review. [ Impact: cleanup ] Signed-off-by: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1241016956-24648-1-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 47e563bfd4c..fc06f4d3264 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -28,8 +28,8 @@ static u64 perf_counter_mask __read_mostly; struct cpu_hw_counters { struct perf_counter *counters[X86_PMC_IDX_MAX]; - unsigned long used[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; - unsigned long active[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; + unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; u64 throttle_ctrl; int enabled; @@ -332,7 +332,7 @@ static u64 amd_pmu_save_disable_all(void) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (!(val & ARCH_PERFMON_EVENTSEL0_ENABLE)) @@ -373,7 +373,7 @@ static void amd_pmu_restore_all(u64 ctrl) for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; rdmsrl(MSR_K7_EVNTSEL0 + idx, val); if (val & ARCH_PERFMON_EVENTSEL0_ENABLE) @@ -576,7 +576,7 @@ static int x86_pmu_enable(struct perf_counter *counter) * Try to get the fixed counter, if that is already taken * then try to get a generic counter: */ - if (test_and_set_bit(idx, cpuc->used)) + if (test_and_set_bit(idx, cpuc->used_mask)) goto try_generic; hwc->config_base = MSR_ARCH_PERFMON_FIXED_CTR_CTRL; @@ -590,14 +590,14 @@ static int x86_pmu_enable(struct perf_counter *counter) } else { idx = hwc->idx; /* Try to get the previous generic counter again */ - if (test_and_set_bit(idx, cpuc->used)) { + if (test_and_set_bit(idx, cpuc->used_mask)) { try_generic: - idx = find_first_zero_bit(cpuc->used, + idx = find_first_zero_bit(cpuc->used_mask, x86_pmu.num_counters); if (idx == x86_pmu.num_counters) return -EAGAIN; - set_bit(idx, cpuc->used); + set_bit(idx, cpuc->used_mask); hwc->idx = idx; } hwc->config_base = x86_pmu.eventsel; @@ -609,7 +609,7 @@ try_generic: x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; - set_bit(idx, cpuc->active); + set_bit(idx, cpuc->active_mask); x86_perf_counter_set_period(counter, hwc, idx); x86_pmu.enable(hwc, idx); @@ -643,7 +643,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: overflow: %016llx\n", cpu, overflow); pr_info("CPU#%d: fixed: %016llx\n", cpu, fixed); } - pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used); + pr_info("CPU#%d: used: %016llx\n", cpu, *(u64 *)cpuc->used_mask); for (idx = 0; idx < x86_pmu.num_counters; idx++) { rdmsrl(x86_pmu.eventsel + idx, pmc_ctrl); @@ -677,7 +677,7 @@ static void x86_pmu_disable(struct perf_counter *counter) * Must be done before we disable, otherwise the nmi handler * could reenable again: */ - clear_bit(idx, cpuc->active); + clear_bit(idx, cpuc->active_mask); x86_pmu.disable(hwc, idx); /* @@ -692,7 +692,7 @@ static void x86_pmu_disable(struct perf_counter *counter) */ x86_perf_counter_update(counter, hwc, idx); cpuc->counters[idx] = NULL; - clear_bit(idx, cpuc->used); + clear_bit(idx, cpuc->used_mask); } /* @@ -741,7 +741,7 @@ again: struct perf_counter *counter = cpuc->counters[bit]; clear_bit(bit, (unsigned long *) &status); - if (!test_bit(bit, cpuc->active)) + if (!test_bit(bit, cpuc->active_mask)) continue; intel_pmu_save_and_restart(counter); @@ -779,7 +779,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { - if (!test_bit(idx, cpuc->active)) + if (!test_bit(idx, cpuc->active_mask)) continue; counter = cpuc->counters[idx]; hwc = &counter->hw; From 23b94b967f118bef941369238f33c8140be46539 Mon Sep 17 00:00:00 2001 From: Luis Henriques Date: Wed, 29 Apr 2009 21:54:51 +0100 Subject: [PATCH 0703/2061] locking, rtmutex.c: Documentation cleanup Two minor updates on functions documentation: - Updated documentation for function rt_mutex_unlock(), which contained an incorrect name - Removed extra '*' from comment in function rt_mutex_destroy() [ Impact: cleanup ] Signed-off-by: Luis Henriques Cc: Steven Rostedt LKML-Reference: <20090429205451.GA23154@hades.domain.com> Signed-off-by: Ingo Molnar --- kernel/rtmutex.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/rtmutex.c b/kernel/rtmutex.c index 69d9cb921ff..013882e8349 100644 --- a/kernel/rtmutex.c +++ b/kernel/rtmutex.c @@ -864,9 +864,9 @@ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock, EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); /** - * rt_mutex_lock_interruptible_ktime - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller + * rt_mutex_timed_lock - lock a rt_mutex interruptible + * the timeout structure is provided + * by the caller * * @lock: the rt_mutex to be locked * @timeout: timeout structure or NULL (no timeout) @@ -913,7 +913,7 @@ void __sched rt_mutex_unlock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/*** +/** * rt_mutex_destroy - mark a mutex unusable * @lock: the mutex to be destroyed * From 88c48db9788862d0290831d081bc3c64e13b592f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 29 Apr 2009 14:00:25 -0400 Subject: [PATCH 0704/2061] SELinux: drop secondary_ops->sysctl We are still calling secondary_ops->sysctl even though the capabilities module does not define a sysctl operation. Signed-off-by: Eric Paris Acked-by: Stephen Smalley Signed-off-by: James Morris --- security/selinux/hooks.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index ba808ef6bab..dd19ba81201 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -1980,10 +1980,6 @@ static int selinux_sysctl(ctl_table *table, int op) u32 tsid, sid; int rc; - rc = secondary_ops->sysctl(table, op); - if (rc) - return rc; - sid = current_sid(); rc = selinux_sysctl_get_sid(table, (op == 0001) ? From 3bcac0263f0b45e67a64034ebcb69eb9abb742f4 Mon Sep 17 00:00:00 2001 From: David Howells Date: Wed, 29 Apr 2009 13:45:05 +0100 Subject: [PATCH 0705/2061] SELinux: Don't flush inherited SIGKILL during execve() Don't flush inherited SIGKILL during execve() in SELinux's post cred commit hook. This isn't really a security problem: if the SIGKILL came before the credentials were changed, then we were right to receive it at the time, and should honour it; if it came after the creds were changed, then we definitely should honour it; and in any case, all that will happen is that the process will be scrapped before it ever returns to userspace. Signed-off-by: David Howells Signed-off-by: Oleg Nesterov Signed-off-by: James Morris --- include/linux/sched.h | 1 + kernel/signal.c | 11 ++++++++--- security/selinux/hooks.c | 9 +++++---- 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 1d19c025f9d..d3b787c7aef 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1875,6 +1875,7 @@ extern void sched_dead(struct task_struct *p); extern void proc_caches_init(void); extern void flush_signals(struct task_struct *); +extern void __flush_signals(struct task_struct *); extern void ignore_signals(struct task_struct *); extern void flush_signal_handlers(struct task_struct *, int force_default); extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info); diff --git a/kernel/signal.c b/kernel/signal.c index 1c8814481a1..f93efec14ff 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -238,14 +238,19 @@ void flush_sigqueue(struct sigpending *queue) /* * Flush all pending signals for a task. */ +void __flush_signals(struct task_struct *t) +{ + clear_tsk_thread_flag(t, TIF_SIGPENDING); + flush_sigqueue(&t->pending); + flush_sigqueue(&t->signal->shared_pending); +} + void flush_signals(struct task_struct *t) { unsigned long flags; spin_lock_irqsave(&t->sighand->siglock, flags); - clear_tsk_thread_flag(t, TIF_SIGPENDING); - flush_sigqueue(&t->pending); - flush_sigqueue(&t->signal->shared_pending); + __flush_signals(t); spin_unlock_irqrestore(&t->sighand->siglock, flags); } diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index dd19ba81201..5a345115036 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2394,11 +2394,12 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) memset(&itimer, 0, sizeof itimer); for (i = 0; i < 3; i++) do_setitimer(i, &itimer, NULL); - flush_signals(current); spin_lock_irq(¤t->sighand->siglock); - flush_signal_handlers(current, 1); - sigemptyset(¤t->blocked); - recalc_sigpending(); + if (!(current->signal->flags & SIGNAL_GROUP_EXIT)) { + __flush_signals(current); + flush_signal_handlers(current, 1); + sigemptyset(¤t->blocked); + } spin_unlock_irq(¤t->sighand->siglock); } From ecd6de3c88e8cbcad175b2eab48ba05c2014f7b6 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 29 Apr 2009 16:02:24 +0200 Subject: [PATCH 0706/2061] selinux: selinux_bprm_committed_creds() should wake up ->real_parent, not ->parent. We shouldn't worry about the tracer if current is ptraced, exec() must not succeed if the tracer has no rights to trace this task after cred changing. But we should notify ->real_parent which is, well, real parent. Also, we don't need _irq to take tasklist, and we don't need parent's ->siglock to wake_up_interruptible(real_parent->signal->wait_chldexit). Since we hold tasklist, real_parent->signal must be stable. Otherwise spin_lock(siglock) is not safe too and can't help anyway. Signed-off-by: Oleg Nesterov Signed-off-by: James Morris --- security/selinux/hooks.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index 5a345115036..39046ddd90a 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -2371,10 +2371,8 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) { const struct task_security_struct *tsec = current_security(); struct itimerval itimer; - struct sighand_struct *psig; u32 osid, sid; int rc, i; - unsigned long flags; osid = tsec->osid; sid = tsec->sid; @@ -2405,12 +2403,9 @@ static void selinux_bprm_committed_creds(struct linux_binprm *bprm) /* Wake up the parent if it is waiting so that it can recheck * wait permission to the new task SID. */ - read_lock_irq(&tasklist_lock); - psig = current->parent->sighand; - spin_lock_irqsave(&psig->siglock, flags); - wake_up_interruptible(¤t->parent->signal->wait_chldexit); - spin_unlock_irqrestore(&psig->siglock, flags); - read_unlock_irq(&tasklist_lock); + read_lock(&tasklist_lock); + wake_up_interruptible(¤t->real_parent->signal->wait_chldexit); + read_unlock(&tasklist_lock); } /* superblock security operations */ From e6be3a25861429166f945499c7ee616875bc3db9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 12:56:37 +0900 Subject: [PATCH 0707/2061] sh: pass through ioremap() for non-mmu processors. All 32-bit SuperH processors currently go through __ioremap_mode() and check for IO_TRAPPED and directly mapped segments. With this patch we simplify the MMU less case with a pass through version of __ioremap_mode() which just returns the physical address. The effects of this is change are: - fix non-MMU ioremap() of high address hardware blocks (sh7203 CMT) - make sure IO_TRAPPED is not selected Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/boards/Kconfig | 6 +++--- arch/sh/include/asm/io.h | 12 ++++++------ 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig index dcc1af8a2cf..99e2d47f79f 100644 --- a/arch/sh/boards/Kconfig +++ b/arch/sh/boards/Kconfig @@ -121,7 +121,7 @@ config SH_RTS7751R2D bool "RTS7751R2D" depends on CPU_SUBTYPE_SH7751R select SYS_SUPPORTS_PCI - select IO_TRAPPED + select IO_TRAPPED if MMU help Select RTS7751R2D if configuring for a Renesas Technology Sales SH-Graphics board. @@ -145,13 +145,13 @@ config SH_HIGHLANDER bool "Highlander" depends on CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 select SYS_SUPPORTS_PCI - select IO_TRAPPED + select IO_TRAPPED if MMU config SH_SH7785LCR bool "SH7785LCR" depends on CPU_SUBTYPE_SH7785 select SYS_SUPPORTS_PCI - select IO_TRAPPED + select IO_TRAPPED if MMU config SH_SH7785LCR_29BIT_PHYSMAPS bool "SH7785LCR 29bit physmaps" diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index 0454f8d6805..ef217344afc 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -228,12 +228,6 @@ void __iounmap(void __iomem *addr); unsigned long onchip_remap(unsigned long addr, unsigned long size, const char *name); extern void onchip_unmap(unsigned long vaddr); -#else -#define __ioremap(offset, size, flags) ((void __iomem *)(offset)) -#define __iounmap(addr) do { } while (0) -#define onchip_remap(addr, size, name) (addr) -#define onchip_unmap(addr) do { } while (0) -#endif /* CONFIG_MMU */ static inline void __iomem * __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags) @@ -268,6 +262,12 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags) return __ioremap(offset, size, flags); } +#else +#define onchip_remap(addr, size, name) (addr) +#define onchip_unmap(addr) do { } while (0) +#define __ioremap_mode(offset, size, flags) ((void __iomem *)(offset)) +#define __iounmap(addr) do { } while (0) +#endif /* CONFIG_MMU */ #define ioremap(offset, size) \ __ioremap_mode((offset), (size), 0) From 3014f47460ecfb13d4169daae51f26a20bacfa17 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 29 Apr 2009 14:50:37 +0000 Subject: [PATCH 0708/2061] clocksource: sh_cmt 16-bit fixes This patch contains various fixes for 16-bit cmt hardware. With this applied periodic clockevents work fine on sh7203. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index bf3e4c11fd3..4ff1508e5ab 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -158,16 +158,18 @@ static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate) pr_err("sh_cmt: cannot enable clock \"%s\"\n", cfg->clk); return ret; } - *rate = clk_get_rate(p->clk) / 8; /* make sure channel is disabled */ sh_cmt_start_stop_ch(p, 0); /* configure channel, periodic mode and maximum timeout */ - if (p->width == 16) - sh_cmt_write(p, CMCSR, 0); - else + if (p->width == 16) { + *rate = clk_get_rate(p->clk) / 512; + sh_cmt_write(p, CMCSR, 0x43); + } else { + *rate = clk_get_rate(p->clk) / 8; sh_cmt_write(p, CMCSR, 0x01a4); + } sh_cmt_write(p, CMCOR, 0xffffffff); sh_cmt_write(p, CMCNT, 0); @@ -615,7 +617,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) if (resource_size(res) == 6) { p->width = 16; p->overflow_bit = 0x80; - p->clear_bits = ~0xc0; + p->clear_bits = ~0x80; } else { p->width = 32; p->overflow_bit = 0x8000; From 698aa99da5f5e2b4c666fd21ab77306f0225b8f5 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 04:08:18 +0000 Subject: [PATCH 0709/2061] sh: sh2/sh2a 16-bit CMT platform data This patch adds 16-bit cmt platform data for the following cpus: - sh7619 (2 channels) - sh7203/sh7263 (2 channels) - sh7206 (2 channels) Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh2/setup-sh7619.c | 84 ++++++++++++++++++++++++++ arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 84 ++++++++++++++++++++++++++ arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 84 ++++++++++++++++++++++++++ 3 files changed, 252 insertions(+) diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index 0e32d8e448c..d70c263eb07 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include enum { UNUSED = 0, @@ -109,9 +111,75 @@ static struct platform_device eth_device = { .resource = eth_resources, }; +static struct sh_cmt_config cmt0_platform_data = { + .name = "CMT0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt0_resources[] = { + [0] = { + .name = "CMT0", + .start = 0xf84a0072, + .end = 0xf84a0077, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 86, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt0_device = { + .name = "sh_cmt", + .id = 0, + .dev = { + .platform_data = &cmt0_platform_data, + }, + .resource = cmt0_resources, + .num_resources = ARRAY_SIZE(cmt0_resources), +}; + +static struct sh_cmt_config cmt1_platform_data = { + .name = "CMT1", + .channel_offset = 0x08, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt1_resources[] = { + [0] = { + .name = "CMT1", + .start = 0xf84a0078, + .end = 0xf84a007d, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 87, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt1_device = { + .name = "sh_cmt", + .id = 1, + .dev = { + .platform_data = &cmt1_platform_data, + }, + .resource = cmt1_resources, + .num_resources = ARRAY_SIZE(cmt1_resources), +}; + static struct platform_device *sh7619_devices[] __initdata = { &sci_device, ð_device, + &cmt0_device, + &cmt1_device, }; static int __init sh7619_devices_setup(void) @@ -125,3 +193,19 @@ void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); } + +static struct platform_device *sh7619_early_devices[] __initdata = { + &cmt0_device, + &cmt1_device, +}; + +#define STBCR3 0xf80a0000 + +void __init plat_early_device_setup(void) +{ + /* enable CMT clock */ + __raw_writeb(__raw_readb(STBCR3) & ~0x10, STBCR3); + + early_platform_add_devices(sh7619_early_devices, + ARRAY_SIZE(sh7619_early_devices)); +} diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 820dfb2e865..0836acee228 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -11,6 +11,8 @@ #include #include #include +#include +#include enum { UNUSED = 0, @@ -205,6 +207,70 @@ static struct platform_device sci_device = { }, }; +static struct sh_cmt_config cmt0_platform_data = { + .name = "CMT0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt0_resources[] = { + [0] = { + .name = "CMT0", + .start = 0xfffec002, + .end = 0xfffec007, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 142, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt0_device = { + .name = "sh_cmt", + .id = 0, + .dev = { + .platform_data = &cmt0_platform_data, + }, + .resource = cmt0_resources, + .num_resources = ARRAY_SIZE(cmt0_resources), +}; + +static struct sh_cmt_config cmt1_platform_data = { + .name = "CMT1", + .channel_offset = 0x08, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt1_resources[] = { + [0] = { + .name = "CMT1", + .start = 0xfffec008, + .end = 0xfffec00d, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 143, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt1_device = { + .name = "sh_cmt", + .id = 1, + .dev = { + .platform_data = &cmt1_platform_data, + }, + .resource = cmt1_resources, + .num_resources = ARRAY_SIZE(cmt1_resources), +}; + static struct resource rtc_resources[] = { [0] = { .start = 0xffff2000, @@ -227,6 +293,8 @@ static struct platform_device rtc_device = { static struct platform_device *sh7203_devices[] __initdata = { &sci_device, + &cmt0_device, + &cmt1_device, &rtc_device, }; @@ -241,3 +309,19 @@ void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); } + +static struct platform_device *sh7203_early_devices[] __initdata = { + &cmt0_device, + &cmt1_device, +}; + +#define STBCR4 0xfffe040c + +void __init plat_early_device_setup(void) +{ + /* enable CMT clock */ + __raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4); + + early_platform_add_devices(sh7203_early_devices, + ARRAY_SIZE(sh7203_early_devices)); +} diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index c46a8355726..f9606e3d251 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include enum { UNUSED = 0, @@ -165,8 +167,74 @@ static struct platform_device sci_device = { }, }; +static struct sh_cmt_config cmt0_platform_data = { + .name = "CMT0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt0_resources[] = { + [0] = { + .name = "CMT0", + .start = 0xfffec002, + .end = 0xfffec007, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 140, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt0_device = { + .name = "sh_cmt", + .id = 0, + .dev = { + .platform_data = &cmt0_platform_data, + }, + .resource = cmt0_resources, + .num_resources = ARRAY_SIZE(cmt0_resources), +}; + +static struct sh_cmt_config cmt1_platform_data = { + .name = "CMT1", + .channel_offset = 0x08, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 0, /* disabled due to code generation issues */ +}; + +static struct resource cmt1_resources[] = { + [0] = { + .name = "CMT1", + .start = 0xfffec008, + .end = 0xfffec00d, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 144, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt1_device = { + .name = "sh_cmt", + .id = 1, + .dev = { + .platform_data = &cmt1_platform_data, + }, + .resource = cmt1_resources, + .num_resources = ARRAY_SIZE(cmt1_resources), +}; + static struct platform_device *sh7206_devices[] __initdata = { &sci_device, + &cmt0_device, + &cmt1_device, }; static int __init sh7206_devices_setup(void) @@ -180,3 +248,19 @@ void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); } + +static struct platform_device *sh7206_early_devices[] __initdata = { + &cmt0_device, + &cmt1_device, +}; + +#define STBCR4 0xfffe040c + +void __init plat_early_device_setup(void) +{ + /* enable CMT clock */ + __raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4); + + early_platform_add_devices(sh7206_early_devices, + ARRAY_SIZE(sh7206_early_devices)); +} From f425752fc66acf1d4e47970ea704ed7d31c14173 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 04:09:26 +0000 Subject: [PATCH 0710/2061] sh: remove old CMT driver This patch removes the old CMT driver (CONFIG_SH_CMT/timer-cmt.c) As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_CMT and configure timer channel using platform data. If multiple CMT channels are enabled using platform data, use the earlytimer parameter on the kernel command line to select channel. For instance, use "earlytimer=sh_cmt.0" to select the first channel. To verify which timer is being used, look at printouts or the timer irq count in /proc/interrupts. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 18 +-- arch/sh/include/asm/timer.h | 2 +- arch/sh/kernel/timers/Makefile | 1 - arch/sh/kernel/timers/timer-cmt.c | 188 ------------------------------ arch/sh/kernel/timers/timer.c | 3 - 5 files changed, 6 insertions(+), 206 deletions(-) delete mode 100644 arch/sh/kernel/timers/timer-cmt.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 9db02ced57a..7e96adea960 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -462,22 +462,14 @@ config SH_TMU help This enables the use of the TMU as the system timer. -config SH_CMT - bool "CMT timer support" - depends on SYS_SUPPORTS_CMT && CPU_SH2 - default y - help - This enables the use of the CMT as the system timer. - -# -# Support for the new-style CMT driver. This will replace SH_CMT -# once its other dependencies are merged. -# config SH_TIMER_CMT - bool "CMT clockevents driver" - depends on SYS_SUPPORTS_CMT && !SH_CMT + bool "CMT timer driver" + depends on SYS_SUPPORTS_CMT + default y select GENERIC_CLOCKEVENTS select GENERIC_TIME + help + This enables build of the CMT timer driver. config SH_MTU2 bool "MTU2 timer support" diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index 4c3b66e30af..8f80a55c04a 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -23,7 +23,7 @@ struct sys_timer { #define TICK_SIZE (tick_nsec / 1000) -extern struct sys_timer tmu_timer, cmt_timer, mtu2_timer; +extern struct sys_timer tmu_timer, mtu2_timer; extern struct sys_timer *sys_timer; #ifndef CONFIG_GENERIC_TIME diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile index 0b7f8577193..1e9a104c2ae 100644 --- a/arch/sh/kernel/timers/Makefile +++ b/arch/sh/kernel/timers/Makefile @@ -6,6 +6,5 @@ obj-y := timer.o obj-$(CONFIG_SH_TMU) += timer-tmu.o obj-$(CONFIG_SH_MTU2) += timer-mtu2.o -obj-$(CONFIG_SH_CMT) += timer-cmt.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += timer-broadcast.o diff --git a/arch/sh/kernel/timers/timer-cmt.c b/arch/sh/kernel/timers/timer-cmt.c deleted file mode 100644 index 9aa348658ae..00000000000 --- a/arch/sh/kernel/timers/timer-cmt.c +++ /dev/null @@ -1,188 +0,0 @@ -/* - * arch/sh/kernel/timers/timer-cmt.c - CMT Timer Support - * - * Copyright (C) 2005 Yoshinori Sato - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(CONFIG_CPU_SUBTYPE_SH7619) -#define CMT_CMSTR 0xf84a0070 -#define CMT_CMCSR_0 0xf84a0072 -#define CMT_CMCNT_0 0xf84a0074 -#define CMT_CMCOR_0 0xf84a0076 -#define CMT_CMCSR_1 0xf84a0078 -#define CMT_CMCNT_1 0xf84a007a -#define CMT_CMCOR_1 0xf84a007c - -#define STBCR3 0xf80a0000 -#define cmt_clock_enable() do { ctrl_outb(ctrl_inb(STBCR3) & ~0x10, STBCR3); } while(0) -#define CMT_CMCSR_INIT 0x0040 -#define CMT_CMCSR_CALIB 0x0000 -#elif defined(CONFIG_CPU_SUBTYPE_SH7203) || \ - defined(CONFIG_CPU_SUBTYPE_SH7206) || \ - defined(CONFIG_CPU_SUBTYPE_SH7263) -#define CMT_CMSTR 0xfffec000 -#define CMT_CMCSR_0 0xfffec002 -#define CMT_CMCNT_0 0xfffec004 -#define CMT_CMCOR_0 0xfffec006 - -#define STBCR4 0xfffe040c -#define cmt_clock_enable() do { ctrl_outb(ctrl_inb(STBCR4) & ~0x04, STBCR4); } while(0) -#define CMT_CMCSR_INIT 0x0040 -#define CMT_CMCSR_CALIB 0x0000 -#else -#error "Unknown CPU SUBTYPE" -#endif - -static unsigned long cmt_timer_get_offset(void) -{ - int count; - static unsigned short count_p = 0xffff; /* for the first call after boot */ - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have IRQs turned off. - */ - unsigned long jiffies_t; - - /* timer count may underflow right here */ - count = ctrl_inw(CMT_CMCOR_0); - count -= ctrl_inw(CMT_CMCNT_0); - - jiffies_t = jiffies; - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there is one kind of problem that must be avoided here: - * 1. the timer counter underflows - */ - - if (jiffies_t == jiffies_p) { - if (count > count_p) { - /* the nutcase */ - if (ctrl_inw(CMT_CMCSR_0) & 0x80) { /* Check CMF bit */ - count -= LATCH; - } else { - printk("%s (): hardware timer problem?\n", - __func__); - } - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - -static irqreturn_t cmt_timer_interrupt(int irq, void *dev_id) -{ - unsigned long timer_status; - - /* Clear CMF bit */ - timer_status = ctrl_inw(CMT_CMCSR_0); - timer_status &= ~0x80; - ctrl_outw(timer_status, CMT_CMCSR_0); - - handle_timer_tick(); - - return IRQ_HANDLED; -} - -static struct irqaction cmt_irq = { - .name = "timer", - .handler = cmt_timer_interrupt, - .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, -}; - -static void cmt_clk_init(struct clk *clk) -{ - u8 divisor = CMT_CMCSR_INIT & 0x3; - ctrl_inw(CMT_CMCSR_0); - ctrl_outw(CMT_CMCSR_INIT, CMT_CMCSR_0); - clk->parent = clk_get(NULL, "module_clk"); - clk->rate = clk->parent->rate / (8 << (divisor << 1)); -} - -static void cmt_clk_recalc(struct clk *clk) -{ - u8 divisor = ctrl_inw(CMT_CMCSR_0) & 0x3; - clk->rate = clk->parent->rate / (8 << (divisor << 1)); -} - -static struct clk_ops cmt_clk_ops = { - .init = cmt_clk_init, - .recalc = cmt_clk_recalc, -}; - -static struct clk cmt0_clk = { - .name = "cmt0_clk", - .ops = &cmt_clk_ops, -}; - -static int cmt_timer_start(void) -{ - ctrl_outw(ctrl_inw(CMT_CMSTR) | 0x01, CMT_CMSTR); - return 0; -} - -static int cmt_timer_stop(void) -{ - ctrl_outw(ctrl_inw(CMT_CMSTR) & ~0x01, CMT_CMSTR); - return 0; -} - -static int cmt_timer_init(void) -{ - unsigned long interval; - - cmt_clock_enable(); - - setup_irq(CONFIG_SH_TIMER_IRQ, &cmt_irq); - - cmt0_clk.parent = clk_get(NULL, "module_clk"); - - cmt_timer_stop(); - - interval = cmt0_clk.parent->rate / 8 / HZ; - printk(KERN_INFO "Interval = %ld\n", interval); - - ctrl_outw(interval, CMT_CMCOR_0); - - clk_register(&cmt0_clk); - clk_enable(&cmt0_clk); - - cmt_timer_start(); - - return 0; -} - -static struct sys_timer_ops cmt_timer_ops = { - .init = cmt_timer_init, - .start = cmt_timer_start, - .stop = cmt_timer_stop, -#ifndef CONFIG_GENERIC_TIME - .get_offset = cmt_timer_get_offset, -#endif -}; - -struct sys_timer cmt_timer = { - .name = "cmt", - .ops = &cmt_timer_ops, -}; diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c index 4e7e747d1b6..f3bd1413d56 100644 --- a/arch/sh/kernel/timers/timer.c +++ b/arch/sh/kernel/timers/timer.c @@ -19,9 +19,6 @@ static struct sys_timer *sys_timers[] = { #endif #ifdef CONFIG_SH_MTU2 &mtu2_timer, -#endif -#ifdef CONFIG_SH_CMT - &cmt_timer, #endif NULL, }; From c5dd016cdf0a040e1de0b691e274fbfe642b2cdc Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 30 Apr 2009 09:48:16 +1000 Subject: [PATCH 0711/2061] perf_counter: update copyright notice This adds my name to the list of copyright holders on the core perf_counter.c, since I have contributed a significant amount of the code in there. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Robert Richter LKML-Reference: <18936.59200.888049.746658@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a95a171e608..75f2b6c8239 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -4,6 +4,7 @@ * Copyright (C) 2008 Thomas Gleixner * Copyright (C) 2008-2009 Red Hat, Inc., Ingo Molnar * Copyright (C) 2008-2009 Red Hat, Inc., Peter Zijlstra + * Copyright © 2009 Paul Mackerras, IBM Corp. * * For licensing details see kernel-base/COPYING */ From a511e3f968c462a55ef58697257f5347c73d306e Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 29 Apr 2009 15:59:58 -0700 Subject: [PATCH 0712/2061] mutex: add atomic_dec_and_mutex_lock(), fix include/linux/mutex.h:136: warning: 'mutex_lock' declared inline after being called include/linux/mutex.h:136: warning: previous declaration of 'mutex_lock' was here uninline it. [ Impact: clean up and uninline, address compiler warning ] Signed-off-by: Andrew Morton Cc: Al Viro Cc: Christoph Hellwig Cc: Eric Paris Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <200904292318.n3TNIsi6028340@imap1.linux-foundation.org> Signed-off-by: Ingo Molnar --- include/linux/mutex.h | 24 +----------------------- kernel/mutex.c | 25 ++++++++++++++++++++++++- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 93054fc3635..878cab4f5fc 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -150,28 +150,6 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); - -/** - * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 - * @cnt: the atomic which we are to dec - * @lock: the mutex to return holding if we dec to 0 - * - * return true and hold lock if we dec to 0, return false otherwise - */ -static inline int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) -{ - /* dec if we can't possibly hit 0 */ - if (atomic_add_unless(cnt, -1, 1)) - return 0; - /* we might hit 0, so take the lock */ - mutex_lock(lock); - if (!atomic_dec_and_test(cnt)) { - /* when we actually did the dec, we didn't hit 0 */ - mutex_unlock(lock); - return 0; - } - /* we hit 0, and we hold the lock */ - return 1; -} +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); #endif diff --git a/kernel/mutex.c b/kernel/mutex.c index 507cf2b5e9f..e2d25e9e62a 100644 --- a/kernel/mutex.c +++ b/kernel/mutex.c @@ -471,5 +471,28 @@ int __sched mutex_trylock(struct mutex *lock) return ret; } - EXPORT_SYMBOL(mutex_trylock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); From 2b72394e4089643f11669d9610907a1442fe044a Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:49 +0300 Subject: [PATCH 0713/2061] x86: move max_pfn_mapped and max_low_pfn_mapped to setup.c This patch moves the max_pfn_mapped and max_low_pfn_mapped global variables to kernel/setup.c where they're initialized. [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923649.1982.21.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 8 ++++++++ arch/x86/mm/init_32.c | 4 +--- arch/x86/mm/init_64.c | 8 -------- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..0d77e56e821 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -112,6 +112,14 @@ #define ARCH_SETUP #endif +/* + * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. + * The direct mapping extends to max_pfn_mapped, so that we can directly access + * apertures, ACPI and other tables without having to play with fixmaps. + */ +unsigned long max_low_pfn_mapped; +unsigned long max_pfn_mapped; + RESERVE_BRK(dmi_alloc, 65536); unsigned int boot_cpu_id __read_mostly; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 2b27120665b..a640a7f0490 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -49,11 +49,9 @@ #include #include #include +#include #include -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index a4e7846efb1..1016ea01593 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -50,14 +50,6 @@ #include #include -/* - * end_pfn only includes RAM, while max_pfn_mapped includes all e820 entries. - * The direct mapping extends to max_pfn_mapped, so that we can directly access - * apertures, ACPI and other tables without having to play with fixmaps. - */ -unsigned long max_low_pfn_mapped; -unsigned long max_pfn_mapped; - static unsigned long dma_reserve __initdata; DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); From 9518e0e4350a5ea8ca200ce320b28d6284a7b0ce Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Tue, 28 Apr 2009 16:00:50 +0300 Subject: [PATCH 0714/2061] x86: move per-cpu mmu_gathers to mm/init.c [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1240923650.1982.22.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init.c | 3 +++ arch/x86/mm/init_32.c | 1 - arch/x86/mm/init_64.c | 2 -- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index fedde5359a0..4d67c33a2e1 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -9,6 +9,9 @@ #include #include #include +#include + +DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long __initdata e820_table_start; unsigned long __meminitdata e820_table_end; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index a640a7f0490..fef1d90d4f1 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -52,7 +52,6 @@ #include #include -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); unsigned long highstart_pfn, highend_pfn; static noinline int do_test_wp_bit(void); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 1016ea01593..6a1a573e20f 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -52,8 +52,6 @@ static unsigned long dma_reserve __initdata; -DEFINE_PER_CPU(struct mmu_gather, mmu_gathers); - static int __init parse_direct_gbpages_off(char *arg) { direct_gbpages = 0; From ba9c22f2c01cf5c88beed5a6b9e07d42e10bd358 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Mon, 20 Apr 2009 22:22:22 -0700 Subject: [PATCH 0715/2061] futex: remove FUTEX_REQUEUE_PI (non CMP) The new requeue PI futex op codes were modeled after the existing FUTEX_REQUEUE and FUTEX_CMP_REQUEUE calls. I was unaware at the time that FUTEX_REQUEUE was only around for compatibility reasons and shouldn't be used in new code. Ulrich Drepper elaborates on this in his Futexes are Tricky paper: http://people.redhat.com/drepper/futex.pdf. The deprecated call doesn't catch changes to the futex corresponding to the destination futex which can lead to deadlock. Therefor, I feel it best to remove FUTEX_REQUEUE_PI and leave only FUTEX_CMP_REQUEUE_PI as there are not yet any existing users of the API. This patch does change the OP code value of FUTEX_CMP_REQUEUE_PI to 12 from 13. Since my test case is the only known user of this API, I felt this was the right thing to do, rather than leave a hole in the enumeration. I chose to continue using the _CMP_ modifier in the OP code to make it explicit to the user that the test is being done. Builds, boots, and ran several hundred iterations requeue_pi.c. Signed-off-by: Darren Hart LKML-Reference: <49ED580E.1050502@us.ibm.com> Signed-off-by: Thomas Gleixner --- include/linux/futex.h | 4 +--- kernel/futex.c | 6 +----- 2 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/futex.h b/include/linux/futex.h index b05519ca9e5..34956c8fdeb 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -24,8 +24,7 @@ union ktime; #define FUTEX_WAIT_BITSET 9 #define FUTEX_WAKE_BITSET 10 #define FUTEX_WAIT_REQUEUE_PI 11 -#define FUTEX_REQUEUE_PI 12 -#define FUTEX_CMP_REQUEUE_PI 13 +#define FUTEX_CMP_REQUEUE_PI 12 #define FUTEX_PRIVATE_FLAG 128 #define FUTEX_CLOCK_REALTIME 256 @@ -43,7 +42,6 @@ union ktime; #define FUTEX_WAKE_BITSET_PRIVATE (FUTEX_WAKE_BITS | FUTEX_PRIVATE_FLAG) #define FUTEX_WAIT_REQUEUE_PI_PRIVATE (FUTEX_WAIT_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) -#define FUTEX_REQUEUE_PI_PRIVATE (FUTEX_REQUEUE_PI | FUTEX_PRIVATE_FLAG) #define FUTEX_CMP_REQUEUE_PI_PRIVATE (FUTEX_CMP_REQUEUE_PI | \ FUTEX_PRIVATE_FLAG) diff --git a/kernel/futex.c b/kernel/futex.c index 6d2daa46f9f..aec8bf89bf4 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2555,9 +2555,6 @@ long do_futex(u32 __user *uaddr, int op, u32 val, ktime_t *timeout, ret = futex_wait_requeue_pi(uaddr, fshared, val, timeout, val3, clockrt, uaddr2); break; - case FUTEX_REQUEUE_PI: - ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, NULL, 1); - break; case FUTEX_CMP_REQUEUE_PI: ret = futex_requeue(uaddr, fshared, uaddr2, val, val2, &val3, 1); @@ -2596,8 +2593,7 @@ SYSCALL_DEFINE6(futex, u32 __user *, uaddr, int, op, u32, val, * number of waiters to wake in 'utime' if cmd == FUTEX_WAKE_OP. */ if (cmd == FUTEX_REQUEUE || cmd == FUTEX_CMP_REQUEUE || - cmd == FUTEX_REQUEUE_PI || cmd == FUTEX_CMP_REQUEUE_PI || - cmd == FUTEX_WAKE_OP) + cmd == FUTEX_CMP_REQUEUE_PI || cmd == FUTEX_WAKE_OP) val2 = (u32) (unsigned long) utime; return do_futex(uaddr, op, val, tp, uaddr2, val2, val3); From 83c4832683bc8ebcd1687b3c0bf3ba1ab253dd4f Mon Sep 17 00:00:00 2001 From: Sam Ravnborg Date: Thu, 30 Apr 2009 12:03:16 +0200 Subject: [PATCH 0716/2061] x86: boot/compressed/vmlinux.lds.S: fix build of bzImage with 64 bit compiler Jesper reported that he saw following build issue: > ld:arch/x86/boot/compressed/vmlinux.lds:9: syntax error > make[2]: *** [arch/x86/boot/compressed/vmlinux] Error 1 > make[1]: *** [arch/x86/boot/compressed/vmlinux] Error 2 > make: *** [bzImage] Error 2 CPP defines the symbol "i386" to "1". Undefine this to fix it. [ Impact: build fix with certain tool chains ] Reported-by: Jesper Dangaard Brouer Signed-off-by: Sam Ravnborg Cc: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/boot/compressed/vmlinux.lds.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index ffcb19134bf..0d26c92d3c7 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -1,5 +1,7 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) +#undef i386 + #ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) From aac3f3c2c41ce49a6dbb98d9145265c00a964dc2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2009 13:52:19 +0200 Subject: [PATCH 0717/2061] perf_counter tools: add perf-report to the Makefile Build it explicitly until it's a proper builtin command. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/Makefile | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 543ccf28ac4..877cf5dedb5 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -228,7 +228,7 @@ COMPAT_CFLAGS = COMPAT_OBJS = LIB_H = LIB_OBJS = -PROGRAMS = +PROGRAMS = perf-report SCRIPT_PERL = SCRIPT_SH = TEST_PROGRAMS = @@ -808,6 +808,10 @@ clean: $(RM) $(htmldocs).tar.gz $(manpages).tar.gz $(RM) PERF-VERSION-FILE PERF-CFLAGS PERF-BUILD-OPTIONS +# temporary hack: +perf-report: perf-report.cc ../../include/linux/perf_counter.h Makefile + g++ -g -O2 -Wall -lrt -o $@ $< + .PHONY: all install clean strip .PHONY: shell_compatibility_test please_set_SHELL_PATH_to_a_more_modern_shell .PHONY: .FORCE-PERF-VERSION-FILE TAGS tags cscope .FORCE-PERF-CFLAGS From 66cf782996f3d57d3cc199f0a2d47a54e2aa5991 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2009 13:53:33 +0200 Subject: [PATCH 0718/2061] perf_counter tools: perf stat: make -l default-on Turn on scaling display by default - this is less confusing. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 112b94ed329..1fde12762ca 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -171,7 +171,7 @@ static unsigned int page_size; static int zero; -static int scale; +static int scale = 1; static const unsigned int default_count[] = { 1000000, From bad760089c1ef7fe525c0f268a4078b9cb483903 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 30 Apr 2009 14:14:37 +0200 Subject: [PATCH 0719/2061] perf_counter tools: fix infinite loop in perf-report on zeroed event records Bail out early if a record has zero size - we have no chance to make reliable progress in that case. Print out the offset where this happens, and print the number of bytes we missed out on. Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf-report.cc | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc index 1727317352b..933a0754453 100644 --- a/Documentation/perf_counter/perf-report.cc +++ b/Documentation/perf_counter/perf-report.cc @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -226,7 +227,7 @@ void load_kallsyms(void) while (!feof(file)) { uint64_t start; char c; - char sym[1024]; + char sym[1024000]; if (getline(&line, &n, file) < 0) break; @@ -416,12 +417,23 @@ more: if (head + event->header.size >= page_size * mmap_window) { unsigned long shift = page_size * (head / page_size); + int ret; + + ret = munmap(buf, page_size * mmap_window); + assert(ret == 0); - munmap(buf, page_size * mmap_window); offset += shift; head -= shift; goto remap; } + + + if (!event->header.size) { + fprintf(stderr, "zero-sized event at file offset %ld\n", offset + head); + fprintf(stderr, "skipping %ld bytes of events.\n", stat.st_size - offset - head); + goto done; + } + head += event->header.size; if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { @@ -458,6 +470,8 @@ more: if (offset + head < stat.st_size) goto more; +done: + close(input); std::map::iterator hi = hist.begin(); From 03682411b1ccd38cbde2e9a6ab43884ff34fbefc Mon Sep 17 00:00:00 2001 From: Anton Vorontsov Date: Thu, 30 Apr 2009 18:38:01 +0200 Subject: [PATCH 0720/2061] alim15x3: Remove historical hacks, re-enable init_hwif for PowerPC Some time ago we had to disable init_hwif callback for PowerPC builds. That was because of a historical IRQ overwrite in the driver, which was causing IDE malfunction on the MPC8610HPCD PowerPC boards. It's unclear whether this overwrite is still useful, but it is proven to cause a bit of harm, and today some PowerPC targets (Xilinx ML510, as reported by Roderick Colenbrander) need the init_hwif, so we have to re-enable it and remove the overwrite. Reported-by: Roderick Colenbrander Suggested-by: Bartlomiej Zolnierkiewicz Cc: Benjamin Herrenschmidt Signed-off-by: Anton Vorontsov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/alim15x3.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/drivers/ide/alim15x3.c b/drivers/ide/alim15x3.c index 537da1cde16..e59b6dee9ae 100644 --- a/drivers/ide/alim15x3.c +++ b/drivers/ide/alim15x3.c @@ -402,27 +402,23 @@ static u8 ali_cable_detect(ide_hwif_t *hwif) return cbl; } -#if !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC) +#ifndef CONFIG_SPARC64 /** * init_hwif_ali15x3 - Initialize the ALI IDE x86 stuff * @hwif: interface to configure * * Obtain the IRQ tables for an ALi based IDE solution on the PC * class platforms. This part of the code isn't applicable to the - * Sparc and PowerPC systems. + * Sparc systems. */ static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif) { - struct pci_dev *dev = to_pci_dev(hwif->dev); u8 ideic, inmir; s8 irq_routing_table[] = { -1, 9, 3, 10, 4, 5, 7, 6, 1, 11, 0, 12, 0, 14, 0, 15 }; int irq = -1; - if (dev->device == PCI_DEVICE_ID_AL_M5229) - hwif->irq = hwif->channel ? 15 : 14; - if (isa_dev) { /* * read IDE interface control @@ -455,7 +451,7 @@ static void __devinit init_hwif_ali15x3 (ide_hwif_t *hwif) } #else #define init_hwif_ali15x3 NULL -#endif /* !defined(CONFIG_SPARC64) && !defined(CONFIG_PPC) */ +#endif /* CONFIG_SPARC64 */ /** * init_dma_ali15x3 - set up DMA on ALi15x3 From 30b4ae8a4498543863501f707879b7220b649602 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:01 +0000 Subject: [PATCH 0721/2061] signals: split do_tkill Split out the code from do_tkill to make it reusable by the follow up patch which implements sys_rt_tgsigqueueinfo Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov --- kernel/signal.c | 30 ++++++++++++++++++------------ 1 file changed, 18 insertions(+), 12 deletions(-) diff --git a/kernel/signal.c b/kernel/signal.c index d8034737db4..56d27acad87 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2278,24 +2278,17 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) return kill_something_info(sig, &info, pid); } -static int do_tkill(pid_t tgid, pid_t pid, int sig) +static int +do_send_specific(pid_t tgid, pid_t pid, int sig, struct siginfo *info) { - int error; - struct siginfo info; struct task_struct *p; unsigned long flags; - - error = -ESRCH; - info.si_signo = sig; - info.si_errno = 0; - info.si_code = SI_TKILL; - info.si_pid = task_tgid_vnr(current); - info.si_uid = current_uid(); + int error = -ESRCH; rcu_read_lock(); p = find_task_by_vpid(pid); if (p && (tgid <= 0 || task_tgid_vnr(p) == tgid)) { - error = check_kill_permission(sig, &info, p); + error = check_kill_permission(sig, info, p); /* * The null signal is a permissions and process existence * probe. No signal is actually delivered. @@ -2305,7 +2298,7 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) * signal is private anyway. */ if (!error && sig && lock_task_sighand(p, &flags)) { - error = specific_send_sig_info(sig, &info, p); + error = specific_send_sig_info(sig, info, p); unlock_task_sighand(p, &flags); } } @@ -2314,6 +2307,19 @@ static int do_tkill(pid_t tgid, pid_t pid, int sig) return error; } +static int do_tkill(pid_t tgid, pid_t pid, int sig) +{ + struct siginfo info; + + info.si_signo = sig; + info.si_errno = 0; + info.si_code = SI_TKILL; + info.si_pid = task_tgid_vnr(current); + info.si_uid = current_uid(); + + return do_send_specific(tgid, pid, sig, &info); +} + /** * sys_tgkill - send signal to one specific thread * @tgid: the thread group ID of the thread From 62ab4505e3efaf67784f84059e0fb9cedb1728ea Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:06 +0000 Subject: [PATCH 0722/2061] signals: implement sys_rt_tgsigqueueinfo sys_kill has the per thread counterpart sys_tgkill. sigqueueinfo is missing a thread directed counterpart. Such an interface is important for migrating applications from other OSes which have the per thread delivery implemented. Signed-off-by: Thomas Gleixner Reviewed-by: Oleg Nesterov Acked-by: Roland McGrath Acked-by: Ulrich Drepper --- include/linux/compat.h | 2 ++ include/linux/signal.h | 2 ++ kernel/compat.c | 11 +++++++++++ kernel/signal.c | 26 ++++++++++++++++++++++++++ 4 files changed, 41 insertions(+) diff --git a/include/linux/compat.h b/include/linux/compat.h index f2ded21f9a3..af931ee43dd 100644 --- a/include/linux/compat.h +++ b/include/linux/compat.h @@ -222,6 +222,8 @@ int copy_siginfo_from_user32(siginfo_t *to, struct compat_siginfo __user *from); int copy_siginfo_to_user32(struct compat_siginfo __user *to, siginfo_t *from); int get_compat_sigevent(struct sigevent *event, const struct compat_sigevent __user *u_event); +long compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo); static inline int compat_timeval_compare(struct compat_timeval *lhs, struct compat_timeval *rhs) diff --git a/include/linux/signal.h b/include/linux/signal.h index 84f997f8aa5..c7552836bd9 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -235,6 +235,8 @@ static inline int valid_signal(unsigned long sig) extern int next_signal(struct sigpending *pending, sigset_t *mask); extern int group_send_sig_info(int sig, struct siginfo *info, struct task_struct *p); extern int __group_send_sig_info(int, struct siginfo *, struct task_struct *); +extern long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, + siginfo_t *info); extern long do_sigpending(void __user *, unsigned long); extern int sigprocmask(int, sigset_t *, sigset_t *); extern int show_unhandled_signals; diff --git a/kernel/compat.c b/kernel/compat.c index 42d56544460..f6c204f07ea 100644 --- a/kernel/compat.c +++ b/kernel/compat.c @@ -882,6 +882,17 @@ compat_sys_rt_sigtimedwait (compat_sigset_t __user *uthese, } +asmlinkage long +compat_sys_rt_tgsigqueueinfo(compat_pid_t tgid, compat_pid_t pid, int sig, + struct compat_siginfo __user *uinfo) +{ + siginfo_t info; + + if (copy_siginfo_from_user32(&info, uinfo)) + return -EFAULT; + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + #ifdef __ARCH_WANT_COMPAT_SYS_TIME /* compat_time_t is a 32 bit "long" and needs to get converted. */ diff --git a/kernel/signal.c b/kernel/signal.c index 56d27acad87..f79b3b9f837 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -2369,6 +2369,32 @@ SYSCALL_DEFINE3(rt_sigqueueinfo, pid_t, pid, int, sig, return kill_proc_info(sig, &info, pid); } +long do_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig, siginfo_t *info) +{ + /* This is only valid for single tasks */ + if (pid <= 0 || tgid <= 0) + return -EINVAL; + + /* Not even root can pretend to send signals from the kernel. + Nor can they impersonate a kill(), which adds source info. */ + if (info->si_code >= 0) + return -EPERM; + info->si_signo = sig; + + return do_send_specific(tgid, pid, sig, info); +} + +SYSCALL_DEFINE4(rt_tgsigqueueinfo, pid_t, tgid, pid_t, pid, int, sig, + siginfo_t __user *, uinfo) +{ + siginfo_t info; + + if (copy_from_user(&info, uinfo, sizeof(siginfo_t))) + return -EFAULT; + + return do_rt_tgsigqueueinfo(tgid, pid, sig, &info); +} + int do_sigaction(int sig, struct k_sigaction *act, struct k_sigaction *oact) { struct task_struct *t = current; From 12d161147f828192b5bcc33166f468a827832767 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 4 Apr 2009 21:01:10 +0000 Subject: [PATCH 0723/2061] x86: hookup sys_rt_tgsigqueueinfo Make the new sys_rt_tgsigqueueinfo available for x86. Signed-off-by: Thomas Gleixner --- arch/x86/ia32/ia32entry.S | 1 + arch/x86/include/asm/unistd_32.h | 1 + arch/x86/include/asm/unistd_64.h | 2 ++ arch/x86/kernel/syscall_table_32.S | 1 + 4 files changed, 5 insertions(+) diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index a505202086e..dcef387ddc3 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -830,4 +830,5 @@ ia32_sys_call_table: .quad sys_inotify_init1 .quad compat_sys_preadv .quad compat_sys_pwritev + .quad compat_sys_rt_tgsigqueueinfo /* 335 */ ia32_syscall_end: diff --git a/arch/x86/include/asm/unistd_32.h b/arch/x86/include/asm/unistd_32.h index 6e72d74cf8d..708dae61262 100644 --- a/arch/x86/include/asm/unistd_32.h +++ b/arch/x86/include/asm/unistd_32.h @@ -340,6 +340,7 @@ #define __NR_inotify_init1 332 #define __NR_preadv 333 #define __NR_pwritev 334 +#define __NR_rt_tgsigqueueinfo 335 #ifdef __KERNEL__ diff --git a/arch/x86/include/asm/unistd_64.h b/arch/x86/include/asm/unistd_64.h index f8182946232..4e2b0540440 100644 --- a/arch/x86/include/asm/unistd_64.h +++ b/arch/x86/include/asm/unistd_64.h @@ -657,6 +657,8 @@ __SYSCALL(__NR_inotify_init1, sys_inotify_init1) __SYSCALL(__NR_preadv, sys_preadv) #define __NR_pwritev 296 __SYSCALL(__NR_pwritev, sys_pwritev) +#define __NR_rt_tgsigqueueinfo 297 +__SYSCALL(__NR_rt_tgsigqueueinfo, sys_rt_tgsigqueueinfo) #ifndef __NO_STUBS diff --git a/arch/x86/kernel/syscall_table_32.S b/arch/x86/kernel/syscall_table_32.S index ff5c8736b49..734f92c02dd 100644 --- a/arch/x86/kernel/syscall_table_32.S +++ b/arch/x86/kernel/syscall_table_32.S @@ -334,3 +334,4 @@ ENTRY(sys_call_table) .long sys_inotify_init1 .long sys_preadv .long sys_pwritev + .long sys_rt_tgsigqueueinfo /* 335 */ From bf293c17b26b8854241df08b9b63f7270cbde012 Mon Sep 17 00:00:00 2001 From: Remis Lima Baima Date: Thu, 30 Apr 2009 18:36:23 +0200 Subject: [PATCH 0724/2061] x86: added 'ifndef _ASM_X86_IOMAP_H' to iomap.h iomap.h misses the include guards. [ Impact: cleanup ] Signed-off-by: Remis Lima Baima Signed-off-by: Arnd Bergmann LKML-Reference: <200904301836.23885.arnd@arndb.de> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/iomap.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index 86af26091d6..0e9fe1d9d97 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -1,3 +1,6 @@ +#ifndef _ASM_X86_IOMAP_H +#define _ASM_X86_IOMAP_H + /* * Copyright © 2008 Ingo Molnar * @@ -31,3 +34,5 @@ iomap_atomic_prot_pfn(unsigned long pfn, enum km_type type, pgprot_t prot); void iounmap_atomic(void *kvaddr, enum km_type type); + +#endif /* _ASM_X86_IOMAP_H */ From 78a3d9d5654a7fd99cf8b2ab06b9497b9c7aad64 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 29 Apr 2009 18:01:23 +0200 Subject: [PATCH 0725/2061] do_wait: do take security_task_wait() into account I was never able to understand what should we actually do when security_task_wait() fails, but the current code doesn't look right. If ->task_wait() returns the error, we update *notask_error correctly. But then we either reap the child (despite the fact this was forbidden) or clear *notask_error (and hide the securiy policy problems). This patch assumes that "stolen by ptrace" doesn't matter. If selinux denies the child we should ignore it but make sure we report -EACCESS instead of -ECHLD if there are no other eligible children. Signed-off-by: Oleg Nesterov Acked-by: Roland McGrath Signed-off-by: James Morris --- kernel/exit.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/exit.c b/kernel/exit.c index 167e1e3ad7c..d2e8239ea18 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -1582,6 +1582,7 @@ static int wait_consider_task(struct task_struct *parent, int ptrace, */ if (*notask_error) *notask_error = ret; + return 0; } if (likely(!ptrace) && unlikely(p->ptrace)) { From c33a0bc4e41ef169d6e807d8abb9502544b518e5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:16 +0200 Subject: [PATCH 0726/2061] perf_counter: fix race in perf_output_* When two (or more) contexts output to the same buffer, it is possible to observe half written output. Suppose we have CPU0 doing perf_counter_mmap(), CPU1 doing perf_counter_overflow(). If CPU1 does a wakeup and exposes head to user-space, then CPU2 can observe the data CPU0 is still writing. [ Impact: fix occasionally corrupted profiling records ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.007821627@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 +- kernel/perf_counter.c | 130 +++++++++++++++++++++++++++-------- 2 files changed, 105 insertions(+), 30 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 41aed427005..f776851f8c4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,10 +358,13 @@ struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ - atomic_t wakeup; /* POLL_ for wakeups */ + atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ atomic_t events; /* event limit */ + atomic_t wakeup_head; /* completed head */ + atomic_t lock; /* concurrent writes */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 75f2b6c8239..8660ae57953 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1279,14 +1279,12 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) { struct perf_counter *counter = file->private_data; struct perf_mmap_data *data; - unsigned int events; + unsigned int events = POLL_HUP; rcu_read_lock(); data = rcu_dereference(counter->data); if (data) - events = atomic_xchg(&data->wakeup, 0); - else - events = POLL_HUP; + events = atomic_xchg(&data->poll, 0); rcu_read_unlock(); poll_wait(file, &counter->waitq, wait); @@ -1568,22 +1566,6 @@ static const struct file_operations perf_fops = { void perf_counter_wakeup(struct perf_counter *counter) { - struct perf_mmap_data *data; - - rcu_read_lock(); - data = rcu_dereference(counter->data); - if (data) { - atomic_set(&data->wakeup, POLL_IN); - /* - * Ensure all data writes are issued before updating the - * user-space data head information. The matching rmb() - * will be in userspace after reading this value. - */ - smp_wmb(); - data->user_page->data_head = atomic_read(&data->head); - } - rcu_read_unlock(); - wake_up_all(&counter->waitq); if (counter->pending_kill) { @@ -1721,10 +1703,14 @@ struct perf_output_handle { int wakeup; int nmi; int overflow; + int locked; + unsigned long flags; }; -static inline void __perf_output_wakeup(struct perf_output_handle *handle) +static void perf_output_wakeup(struct perf_output_handle *handle) { + atomic_set(&handle->data->poll, POLL_IN); + if (handle->nmi) { handle->counter->pending_wakeup = 1; perf_pending_queue(&handle->counter->pending, @@ -1733,6 +1719,86 @@ static inline void __perf_output_wakeup(struct perf_output_handle *handle) perf_counter_wakeup(handle->counter); } +/* + * Curious locking construct. + * + * We need to ensure a later event doesn't publish a head when a former + * event isn't done writing. However since we need to deal with NMIs we + * cannot fully serialize things. + * + * What we do is serialize between CPUs so we only have to deal with NMI + * nesting on a single CPU. + * + * We only publish the head (and generate a wakeup) when the outer-most + * event completes. + */ +static void perf_output_lock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int cpu; + + handle->locked = 0; + + local_irq_save(handle->flags); + cpu = smp_processor_id(); + + if (in_nmi() && atomic_read(&data->lock) == cpu) + return; + + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + handle->locked = 1; +} + +static void perf_output_unlock(struct perf_output_handle *handle) +{ + struct perf_mmap_data *data = handle->data; + int head, cpu; + + if (handle->wakeup) + data->wakeup_head = data->head; + + if (!handle->locked) + goto out; + +again: + /* + * The xchg implies a full barrier that ensures all writes are done + * before we publish the new head, matched by a rmb() in userspace when + * reading this position. + */ + while ((head = atomic_xchg(&data->wakeup_head, 0))) { + data->user_page->data_head = head; + handle->wakeup = 1; + } + + /* + * NMI can happen here, which means we can miss a wakeup_head update. + */ + + cpu = atomic_xchg(&data->lock, 0); + WARN_ON_ONCE(cpu != smp_processor_id()); + + /* + * Therefore we have to validate we did not indeed do so. + */ + if (unlikely(atomic_read(&data->wakeup_head))) { + /* + * Since we had it locked, we can lock it again. + */ + while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + cpu_relax(); + + goto again; + } + + if (handle->wakeup) + perf_output_wakeup(handle); +out: + local_irq_restore(handle->flags); +} + static int perf_output_begin(struct perf_output_handle *handle, struct perf_counter *counter, unsigned int size, int nmi, int overflow) @@ -1745,6 +1811,7 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data) goto out; + handle->data = data; handle->counter = counter; handle->nmi = nmi; handle->overflow = overflow; @@ -1752,12 +1819,13 @@ static int perf_output_begin(struct perf_output_handle *handle, if (!data->nr_pages) goto fail; + perf_output_lock(handle); + do { offset = head = atomic_read(&data->head); head += size; } while (atomic_cmpxchg(&data->head, offset, head) != offset); - handle->data = data; handle->offset = offset; handle->head = head; handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); @@ -1765,7 +1833,7 @@ static int perf_output_begin(struct perf_output_handle *handle, return 0; fail: - __perf_output_wakeup(handle); + perf_output_wakeup(handle); out: rcu_read_unlock(); @@ -1809,16 +1877,20 @@ static void perf_output_copy(struct perf_output_handle *handle, static void perf_output_end(struct perf_output_handle *handle) { - int wakeup_events = handle->counter->hw_event.wakeup_events; + struct perf_counter *counter = handle->counter; + struct perf_mmap_data *data = handle->data; + + int wakeup_events = counter->hw_event.wakeup_events; if (handle->overflow && wakeup_events) { - int events = atomic_inc_return(&handle->data->events); + int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { - atomic_sub(wakeup_events, &handle->data->events); - __perf_output_wakeup(handle); + atomic_sub(wakeup_events, &data->events); + handle->wakeup = 1; } - } else if (handle->wakeup) - __perf_output_wakeup(handle); + } + + perf_output_unlock(handle); rcu_read_unlock(); } From 63a809a2dc53b91268dd915bbcbd425063893676 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:17 +0200 Subject: [PATCH 0727/2061] perf_counter: fix nmi-watchdog interaction When we don't have any perf-counters active, don't act like we know what the NMI is for. [ Impact: fix hard hang with nmi_watchdog=2 ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.109867793@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index fc06f4d3264..d4c0cc9d326 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -871,6 +871,9 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; + if (!atomic_read(&num_counters)) + return NOTIFY_DONE; + switch (cmd) { case DIE_NMI: case DIE_NMI_IPI: From 585e3374d9d29376c2c37d821c8b7637dd48ca95 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:18 +0200 Subject: [PATCH 0728/2061] perf_counter: tool: handle 0-length data files Avoid perf-report barfing on 0-length data files. [ Impact: fix perf-report SIGBUS ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.196245693@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf-report.cc | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc index 933a0754453..911d7f3e7a6 100644 --- a/Documentation/perf_counter/perf-report.cc +++ b/Documentation/perf_counter/perf-report.cc @@ -402,6 +402,11 @@ int main(int argc, char *argv[]) exit(-1); } + if (!stat.st_size) { + fprintf(stderr, "zero-sized file, nothing to do!\n"); + exit(0); + } + load_kallsyms(); remap: From e5791a808ae91a9e7e1b65ea9b8de0f96a043d88 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 1 May 2009 12:23:19 +0200 Subject: [PATCH 0729/2061] perf_counter: documentation update Update the documentation to reflect the current state of affairs [ Impact: documentation update ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090501102533.296727903@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/design.txt | 264 +++++++++++++++++++++----- 1 file changed, 215 insertions(+), 49 deletions(-) diff --git a/Documentation/perf_counter/design.txt b/Documentation/perf_counter/design.txt index aaf105c02fb..9930c4bddc6 100644 --- a/Documentation/perf_counter/design.txt +++ b/Documentation/perf_counter/design.txt @@ -34,41 +34,47 @@ can be poll()ed. When creating a new counter fd, 'perf_counter_hw_event' is: -/* - * Event to monitor via a performance monitoring counter: - */ struct perf_counter_hw_event { - __u64 event_config; + /* + * The MSB of the config word signifies if the rest contains cpu + * specific (raw) counter configuration data, if unset, the next + * 7 bits are an event type and the rest of the bits are the event + * identifier. + */ + __u64 config; - __u64 irq_period; - __u64 record_type; - __u64 read_format; + __u64 irq_period; + __u32 record_type; + __u32 read_format; - __u64 disabled : 1, /* off by default */ - nmi : 1, /* NMI sampling */ - inherit : 1, /* children inherit it */ - pinned : 1, /* must always be on PMU */ - exclusive : 1, /* only group on PMU */ - exclude_user : 1, /* don't count user */ - exclude_kernel : 1, /* ditto kernel */ - exclude_hv : 1, /* ditto hypervisor */ - exclude_idle : 1, /* don't count when idle */ + __u64 disabled : 1, /* off by default */ + nmi : 1, /* NMI sampling */ + inherit : 1, /* children inherit it */ + pinned : 1, /* must always be on PMU */ + exclusive : 1, /* only group on PMU */ + exclude_user : 1, /* don't count user */ + exclude_kernel : 1, /* ditto kernel */ + exclude_hv : 1, /* ditto hypervisor */ + exclude_idle : 1, /* don't count when idle */ + mmap : 1, /* include mmap data */ + munmap : 1, /* include munmap data */ + comm : 1, /* include comm data */ - __reserved_1 : 55; + __reserved_1 : 52; - __u32 extra_config_len; + __u32 extra_config_len; + __u32 wakeup_events; /* wakeup every n events */ - __u32 __reserved_4; - __u64 __reserved_2; - __u64 __reserved_3; + __u64 __reserved_2; + __u64 __reserved_3; }; -The 'event_config' field specifies what the counter should count. It +The 'config' field specifies what the counter should count. It is divided into 3 bit-fields: -raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 -type: 7 bits (next most significant) 0x7f00_0000_0000_0000 -event_id: 56 bits (least significant) 0x00ff_0000_0000_0000 +raw_type: 1 bit (most significant bit) 0x8000_0000_0000_0000 +type: 7 bits (next most significant) 0x7f00_0000_0000_0000 +event_id: 56 bits (least significant) 0x00ff_ffff_ffff_ffff If 'raw_type' is 1, then the counter will count a hardware event specified by the remaining 63 bits of event_config. The encoding is @@ -134,41 +140,56 @@ enum sw_event_ids { PERF_COUNT_PAGE_FAULTS_MAJ = 6, }; +Counters of the type PERF_TYPE_TRACEPOINT are available when the ftrace event +tracer is available, and event_id values can be obtained from +/debug/tracing/events/*/*/id + + Counters come in two flavours: counting counters and sampling counters. A "counting" counter is one that is used for counting the number of events that occur, and is characterised by having -irq_period = 0 and record_type = PERF_RECORD_SIMPLE. A read() on a -counting counter simply returns the current value of the counter as -an 8-byte number. +irq_period = 0. + + +A read() on a counter returns the current value of the counter and possible +additional values as specified by 'read_format', each value is a u64 (8 bytes) +in size. + +/* + * Bits that can be set in hw_event.read_format to request that + * reads on the counter should return the indicated quantities, + * in increasing order of bit value, after the counter value. + */ +enum perf_counter_read_format { + PERF_FORMAT_TOTAL_TIME_ENABLED = 1, + PERF_FORMAT_TOTAL_TIME_RUNNING = 2, +}; + +Using these additional values one can establish the overcommit ratio for a +particular counter allowing one to take the round-robin scheduling effect +into account. + A "sampling" counter is one that is set up to generate an interrupt every N events, where N is given by 'irq_period'. A sampling counter -has irq_period > 0 and record_type != PERF_RECORD_SIMPLE. The -record_type controls what data is recorded on each interrupt, and the -available values are currently: +has irq_period > 0. The record_type controls what data is recorded on each +interrupt: /* - * IRQ-notification data record type: + * Bits that can be set in hw_event.record_type to request information + * in the overflow packets. */ -enum perf_counter_record_type { - PERF_RECORD_SIMPLE = 0, - PERF_RECORD_IRQ = 1, - PERF_RECORD_GROUP = 2, +enum perf_counter_record_format { + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, }; -A record_type value of PERF_RECORD_IRQ will record the instruction -pointer (IP) at which the interrupt occurred. A record_type value of -PERF_RECORD_GROUP will record the event_config and counter value of -all of the other counters in the group, and should only be used on a -group leader (see below). Currently these two values are mutually -exclusive, but record_type will become a bit-mask in future and -support other values. - -A sampling counter has an event queue, into which an event is placed -on each interrupt. A read() on a sampling counter will read the next -event from the event queue. If the queue is empty, the read() will -either block or return an EAGAIN error, depending on whether the fd -has been set to non-blocking mode or not. +Such (and other) events will be recorded in a ring-buffer, which is +available to user-space using mmap() (see below). The 'disabled' bit specifies whether the counter starts out disabled or enabled. If it is initially disabled, it can be enabled by ioctl @@ -206,6 +227,13 @@ The 'exclude_user', 'exclude_kernel' and 'exclude_hv' bits provide a way to request that counting of events be restricted to times when the CPU is in user, kernel and/or hypervisor mode. +The 'mmap' and 'munmap' bits allow recording of PROT_EXEC mmap/munmap +operations, these can be used to relate userspace IP addresses to actual +code, even after the mapping (or even the whole process) is gone, +these events are recorded in the ring-buffer (see below). + +The 'comm' bit allows tracking of process comm data on process creation. +This too is recorded in the ring-buffer (see below). The 'pid' parameter to the perf_counter_open() system call allows the counter to be specific to a task: @@ -250,6 +278,138 @@ can be meaningfully compared, added, divided (to get ratios), etc., with each other, since they have counted events for the same set of executed instructions. + +Like stated, asynchronous events, like counter overflow or PROT_EXEC mmap +tracking are logged into a ring-buffer. This ring-buffer is created and +accessed through mmap(). + +The mmap size should be 1+2^n pages, where the first page is a meta-data page +(struct perf_counter_mmap_page) that contains various bits of information such +as where the ring-buffer head is. + +/* + * Structure of the page that can be mapped via mmap + */ +struct perf_counter_mmap_page { + __u32 version; /* version number of this structure */ + __u32 compat_version; /* lowest version this is compat with */ + + /* + * Bits needed to read the hw counters in user-space. + * + * u32 seq; + * s64 count; + * + * do { + * seq = pc->lock; + * + * barrier() + * if (pc->index) { + * count = pmc_read(pc->index - 1); + * count += pc->offset; + * } else + * goto regular_read; + * + * barrier(); + * } while (pc->lock != seq); + * + * NOTE: for obvious reason this only works on self-monitoring + * processes. + */ + __u32 lock; /* seqlock for synchronization */ + __u32 index; /* hardware counter identifier */ + __s64 offset; /* add to hardware counter value */ + + /* + * Control data for the mmap() data buffer. + * + * User-space reading this value should issue an rmb(), on SMP capable + * platforms, after reading this value -- see perf_counter_wakeup(). + */ + __u32 data_head; /* head in the data section */ +}; + +NOTE: the hw-counter userspace bits are arch specific and are currently only + implemented on powerpc. + +The following 2^n pages are the ring-buffer which contains events of the form: + +#define PERF_EVENT_MISC_KERNEL (1 << 0) +#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_OVERFLOW (1 << 2) + +struct perf_event_header { + __u32 type; + __u16 misc; + __u16 size; +}; + +enum perf_event_type { + + /* + * The MMAP events record the PROT_EXEC mappings so that we can + * correlate userspace IPs to code. They have the following structure: + * + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; + * }; + */ + PERF_EVENT_MMAP = 1, + PERF_EVENT_MUNMAP = 2, + + /* + * struct { + * struct perf_event_header header; + * + * u32 pid, tid; + * char comm[]; + * }; + */ + PERF_EVENT_COMM = 3, + + /* + * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field + * will be PERF_RECORD_* + * + * struct { + * struct perf_event_header header; + * + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR + * + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * }; + */ +}; + +NOTE: PERF_RECORD_CALLCHAIN is arch specific and currently only implemented + on x86. + +Notification of new events is possible through poll()/select()/epoll() and +fcntl() managing signals. + +Normally a notification is generated for every page filled, however one can +additionally set perf_counter_hw_event.wakeup_events to generate one every +so many counter overflow events. + +Future work will include a splice() interface to the ring-buffer. + + Counters can be enabled and disabled in two ways: via ioctl and via prctl. When a counter is disabled, it doesn't count or generate events but does continue to exist and maintain its count value. @@ -269,6 +429,12 @@ group other than the leader only affects that counter - disabling an non-leader stops that counter from counting but doesn't affect any other counter. +Additionally, non-inherited overflow counters can use + + ioctl(fd, PERF_COUNTER_IOC_REFRESH, nr); + +to enable a counter for 'nr' events, after which it gets disabled again. + A process can enable or disable all the counter groups that are attached to it, using prctl: From 56afb0f8823650f53a5f0e96d69a282e8892c61b Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:36 -0400 Subject: [PATCH 0730/2061] kerneldoc, tracing: make kernel-doc understand TRACE_EVENT() macro (take #2) Add support to kernel-doc for tracepoint comments above TRACE_EVENT() macro definitions. Paves the way for tracepoint docbook. [ Impact: extend DocBook infrastructure ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: Signed-off-by: Ingo Molnar --- scripts/kernel-doc | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/scripts/kernel-doc b/scripts/kernel-doc index 0f11870116d..2b53a55fbec 100755 --- a/scripts/kernel-doc +++ b/scripts/kernel-doc @@ -1827,6 +1827,25 @@ sub reset_state { $state = 0; } +sub tracepoint_munge($) { + my $file = shift; + my $tracepointname = 0; + my $tracepointargs = 0; + + if($prototype =~ m/TRACE_EVENT\((.*?),/) { + $tracepointname = $1; + } + if($prototype =~ m/TP_PROTO\((.*?)\)/) { + $tracepointargs = $1; + } + if (($tracepointname eq 0) || ($tracepointargs eq 0)) { + print STDERR "Warning(${file}:$.): Unrecognized tracepoint format: \n". + "$prototype\n"; + } else { + $prototype = "static inline void trace_$tracepointname($tracepointargs)"; + } +} + sub syscall_munge() { my $void = 0; @@ -1881,6 +1900,9 @@ sub process_state3_function($$) { if ($prototype =~ /SYSCALL_DEFINE/) { syscall_munge(); } + if ($prototype =~ /TRACE_EVENT/) { + tracepoint_munge($file); + } dump_function($prototype, $file); reset_state(); } From a76f8c6da1e48fd4ef025f42c736389532ff30ba Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:42 -0400 Subject: [PATCH 0731/2061] tracing: add new tracepoints docbook Add tracepoint docbook. This will help us document and understand what tracepoints are in the kernel. Since there are multiple macros, and files that contain tracepoints. [ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <84160b6bd94aff02455da7e12bad054d34c579a0.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/Makefile | 3 +- Documentation/DocBook/tracepoint.tmpl | 84 +++++++++++++++++++++++++++ 2 files changed, 86 insertions(+), 1 deletion(-) create mode 100644 Documentation/DocBook/tracepoint.tmpl diff --git a/Documentation/DocBook/Makefile b/Documentation/DocBook/Makefile index 8918a32c6b3..4c8f4d6e114 100644 --- a/Documentation/DocBook/Makefile +++ b/Documentation/DocBook/Makefile @@ -13,7 +13,8 @@ DOCBOOKS := z8530book.xml mcabook.xml device-drivers.xml \ gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ genericirq.xml s390-drivers.xml uio-howto.xml scsi.xml \ mac80211.xml debugobjects.xml sh.xml regulator.xml \ - alsa-driver-api.xml writing-an-alsa-driver.xml + alsa-driver-api.xml writing-an-alsa-driver.xml \ + tracepoint.xml ### # The build process is as follows (targets): diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl new file mode 100644 index 00000000000..70891bc6849 --- /dev/null +++ b/Documentation/DocBook/tracepoint.tmpl @@ -0,0 +1,84 @@ + + + + + + The Linux Kernel Tracepoint API + + + + Jason + Baron + +
+ jbaron@redhat.com +
+
+
+
+ + + + This documentation is free software; you can redistribute + it and/or modify it under the terms of the GNU General Public + License as published by the Free Software Foundation; either + version 2 of the License, or (at your option) any later + version. + + + + This program is distributed in the hope that it will be + useful, but WITHOUT ANY WARRANTY; without even the implied + warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + See the GNU General Public License for more details. + + + + You should have received a copy of the GNU General Public + License along with this program; if not, write to the Free + Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, + MA 02111-1307 USA + + + + For more details see the file COPYING in the source + distribution of Linux. + + +
+ + + + Introduction + + Tracepoints are static probe points that are located in strategic points + throughout the kernel. 'Probes' register/unregister with tracepoints + via a callback mechanism. The 'probes' are strictly typed functions that + are passed a unique set of parameters defined by each tracepoint. + + + + From this simple callback mechanism, 'probes' can be used to profile, debug, + and understand kernel behavior. There are a number of tools that provide a + framework for using 'probes'. These tools include Systemtap, ftrace, and + LTTng. + + + + Tracepoints are defined in a number of header files via various macros. Thus, + the purpose of this document is to provide a clear accounting of the available + tracepoints. The intention is to understand not only what tracepoints are + available but also to understand where future tracepoints might be added. + + + + The API presented has functions of the form: + trace_tracepointname(function parameters). These are the + tracepoints callbacks that are found throughout the code. Registering and + unregistering probes with these callback sites is covered in the + Documentation/trace/* directory. + + + +
From 9ee1983c9aa18f12388ef660d0c76a23dc112959 Mon Sep 17 00:00:00 2001 From: Jason Baron Date: Thu, 30 Apr 2009 13:29:47 -0400 Subject: [PATCH 0732/2061] tracing: add irq tracepoint documentation Document irqs for the newly created docbook. [ Impact: add documentation ] Signed-off-by: Jason Baron Acked-by: Randy Dunlap Cc: akpm@linux-foundation.org Cc: rostedt@goodmis.org Cc: fweisbec@gmail.com Cc: mathieu.desnoyers@polymtl.ca Cc: wcohen@redhat.com LKML-Reference: <73ff42be3420157667ec548e9b0e409c3cfad05f.1241107197.git.jbaron@redhat.com> Signed-off-by: Ingo Molnar --- Documentation/DocBook/tracepoint.tmpl | 5 +++ include/trace/events/irq.h | 46 ++++++++++++++++++++++++--- 2 files changed, 47 insertions(+), 4 deletions(-) diff --git a/Documentation/DocBook/tracepoint.tmpl b/Documentation/DocBook/tracepoint.tmpl index 70891bc6849..b0756d0fd57 100644 --- a/Documentation/DocBook/tracepoint.tmpl +++ b/Documentation/DocBook/tracepoint.tmpl @@ -81,4 +81,9 @@ + + IRQ +!Iinclude/trace/events/irq.h + + diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index 76868646751..32a9f7ef432 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -7,8 +7,16 @@ #undef TRACE_SYSTEM #define TRACE_SYSTEM irq -/* - * Tracepoint for entry of interrupt handler: +/** + * irq_handler_entry - called immediately before the irq action handler + * @irq: irq number + * @action: pointer to struct irqaction + * + * The struct irqaction pointed to by @action contains various + * information about the handler, including the device name, + * @action->name, and the device id, @action->dev_id. When used in + * conjunction with the irq_handler_exit tracepoint, we can figure + * out irq handler latencies. */ TRACE_EVENT(irq_handler_entry, @@ -29,8 +37,16 @@ TRACE_EVENT(irq_handler_entry, TP_printk("irq=%d handler=%s", __entry->irq, __get_str(name)) ); -/* - * Tracepoint for return of an interrupt handler: +/** + * irq_handler_exit - called immediately after the irq action handler returns + * @irq: irq number + * @action: pointer to struct irqaction + * @ret: return value + * + * If the @ret value is set to IRQ_HANDLED, then we know that the corresponding + * @action->handler scuccessully handled this irq. Otherwise, the irq might be + * a shared irq line, or the irq was not handled successfully. Can be used in + * conjunction with the irq_handler_entry to understand irq handler latencies. */ TRACE_EVENT(irq_handler_exit, @@ -52,6 +68,17 @@ TRACE_EVENT(irq_handler_exit, __entry->irq, __entry->ret ? "handled" : "unhandled") ); +/** + * softirq_entry - called immediately before the softirq handler + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter, contains a pointer to the struct softirq_action + * which has a pointer to the action handler that is called. By subtracting + * the @vec pointer from the @h pointer, we can determine the softirq + * number. Also, when used in combination with the softirq_exit tracepoint + * we can determine the softirq latency. + */ TRACE_EVENT(softirq_entry, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), @@ -71,6 +98,17 @@ TRACE_EVENT(softirq_entry, TP_printk("softirq=%d action=%s", __entry->vec, __get_str(name)) ); +/** + * softirq_exit - called immediately after the softirq handler returns + * @h: pointer to struct softirq_action + * @vec: pointer to first struct softirq_action in softirq_vec array + * + * The @h parameter contains a pointer to the struct softirq_action + * that has handled the softirq. By subtracting the @vec pointer from + * the @h pointer, we can determine the softirq number. Also, when used in + * combination with the softirq_entry tracepoint we can determine the softirq + * latency. + */ TRACE_EVENT(softirq_exit, TP_PROTO(struct softirq_action *h, struct softirq_action *vec), From e0202f56a82cd1170c6f1c520db669431cf26ddc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 1 May 2009 16:51:44 +0200 Subject: [PATCH 0733/2061] perf_counter tools: fix x86 syscall numbers Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 4 ++-- Documentation/perf_counter/builtin-stat.c | 4 ++-- Documentation/perf_counter/builtin-top.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index 4a50abf843e..9cff266fb61 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -57,13 +57,13 @@ #define asmlinkage #ifdef __x86_64__ -#define __NR_perf_counter_open 295 +#define __NR_perf_counter_open 298 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __i386__ -#define __NR_perf_counter_open 333 +#define __NR_perf_counter_open 336 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 1fde12762ca..9fbc66173de 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -108,13 +108,13 @@ #define asmlinkage #ifdef __x86_64__ -#define __NR_perf_counter_open 295 +#define __NR_perf_counter_open 298 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __i386__ -#define __NR_perf_counter_open 333 +#define __NR_perf_counter_open 336 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 8d28864a20c..b6d989e7b19 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -89,13 +89,13 @@ #define asmlinkage #ifdef __x86_64__ -#define __NR_perf_counter_open 295 +#define __NR_perf_counter_open 298 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __i386__ -#define __NR_perf_counter_open 333 +#define __NR_perf_counter_open 336 #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif From 3666932bf2212a8fa77e344c5d946e86787bdbbe Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 17:37:51 +0200 Subject: [PATCH 0734/2061] perf_counter tools: remove build generated files These files are generated during the build process. No need to have them in the git repository. Signed-off-by: Thomas Gleixner --- Documentation/perf_counter/PERF-BUILD-OPTIONS | 4 ---- Documentation/perf_counter/PERF-CFLAGS | 1 - Documentation/perf_counter/PERF-VERSION-FILE | 1 - 3 files changed, 6 deletions(-) delete mode 100644 Documentation/perf_counter/PERF-BUILD-OPTIONS delete mode 100644 Documentation/perf_counter/PERF-CFLAGS delete mode 100644 Documentation/perf_counter/PERF-VERSION-FILE diff --git a/Documentation/perf_counter/PERF-BUILD-OPTIONS b/Documentation/perf_counter/PERF-BUILD-OPTIONS deleted file mode 100644 index 46d8d6ceb2f..00000000000 --- a/Documentation/perf_counter/PERF-BUILD-OPTIONS +++ /dev/null @@ -1,4 +0,0 @@ -SHELL_PATH='/bin/sh' -TAR='tar' -NO_CURL='' -NO_PERL='' diff --git a/Documentation/perf_counter/PERF-CFLAGS b/Documentation/perf_counter/PERF-CFLAGS deleted file mode 100644 index f24906ca688..00000000000 --- a/Documentation/perf_counter/PERF-CFLAGS +++ /dev/null @@ -1 +0,0 @@ --g -O2 -Wall -DSHA1_HEADER='' : /home/mingo/bin:libexec/perf-core:share/perf-core/templates:/home/mingo diff --git a/Documentation/perf_counter/PERF-VERSION-FILE b/Documentation/perf_counter/PERF-VERSION-FILE deleted file mode 100644 index 328e244c0c8..00000000000 --- a/Documentation/perf_counter/PERF-VERSION-FILE +++ /dev/null @@ -1 +0,0 @@ -PERF_VERSION = 0.0.1.PERF From 6eda5838bc5771578986429cde4a0870e1e5f5e1 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 18:29:57 +0200 Subject: [PATCH 0735/2061] perfcounter tools: move common defines ... to local header file No change, move of duplicated stuff only. Signed-off-by: Thomas Gleixner --- Documentation/perf_counter/Makefile | 1 + Documentation/perf_counter/builtin-record.c | 64 +-------------------- Documentation/perf_counter/builtin-stat.c | 59 +------------------ Documentation/perf_counter/builtin-top.c | 63 +------------------- Documentation/perf_counter/perf.h | 64 +++++++++++++++++++++ 5 files changed, 70 insertions(+), 181 deletions(-) create mode 100644 Documentation/perf_counter/perf.h diff --git a/Documentation/perf_counter/Makefile b/Documentation/perf_counter/Makefile index 877cf5dedb5..481e4c26cd4 100644 --- a/Documentation/perf_counter/Makefile +++ b/Documentation/perf_counter/Makefile @@ -287,6 +287,7 @@ export PERL_PATH LIB_FILE=libperf.a LIB_H += ../../include/linux/perf_counter.h +LIB_H += perf.h LIB_H += util/levenshtein.h LIB_H += util/parse-options.h LIB_H += util/quote.h diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index 9cff266fb61..59f1d87f41e 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -1,6 +1,7 @@ -#define _GNU_SOURCE +#include "util/util.h" + #include #include #include @@ -32,66 +33,7 @@ #include "../../include/linux/perf_counter.h" - -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) - -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -#ifdef __x86_64__ -#define __NR_perf_counter_open 298 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __i386__ -#define __NR_perf_counter_open 336 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#define rmb() asm volatile ("sync" ::: "memory") -#define cpu_relax() asm volatile ("" ::: "memory"); -#endif - -#define unlikely(x) __builtin_expect(!!(x), 0) -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 == &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -extern asmlinkage int sys_perf_counter_open( - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags); - -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) +#include "perf.h" static int nr_counters = 0; static __u64 event_id[MAX_COUNTERS] = { }; diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 9fbc66173de..6de38d25688 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -85,64 +85,7 @@ #include "../../include/linux/perf_counter.h" - -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -#ifdef __x86_64__ -#define __NR_perf_counter_open 298 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __i386__ -#define __NR_perf_counter_open 336 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#define rmb() asm volatile ("sync" ::: "memory") -#define cpu_relax() asm volatile ("" ::: "memory"); -#endif - -#define unlikely(x) __builtin_expect(!!(x), 0) -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 == &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -extern asmlinkage int sys_perf_counter_open( - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags); - -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) +#include "perf.h" static int system_wide = 0; diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index b6d989e7b19..cd6f61d7341 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -66,68 +66,7 @@ #include "../../include/linux/perf_counter.h" - -/* - * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all - * counters in the current task. - */ -#define PR_TASK_PERF_COUNTERS_DISABLE 31 -#define PR_TASK_PERF_COUNTERS_ENABLE 32 - -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) - -/* - * Pick up some kernel type conventions: - */ -#define __user -#define asmlinkage - -#ifdef __x86_64__ -#define __NR_perf_counter_open 298 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __i386__ -#define __NR_perf_counter_open 336 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __powerpc__ -#define __NR_perf_counter_open 319 -#define rmb() asm volatile ("sync" ::: "memory") -#define cpu_relax() asm volatile ("" ::: "memory"); -#endif - -#define unlikely(x) __builtin_expect(!!(x), 0) -#define min(x, y) ({ \ - typeof(x) _min1 = (x); \ - typeof(y) _min2 = (y); \ - (void) (&_min1 == &_min2); \ - _min1 < _min2 ? _min1 : _min2; }) - -asmlinkage int sys_perf_counter_open( - struct perf_counter_hw_event *hw_event_uptr __user, - pid_t pid, - int cpu, - int group_fd, - unsigned long flags) -{ - return syscall( - __NR_perf_counter_open, hw_event_uptr, pid, cpu, group_fd, flags); -} - -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 - -#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) +#include "perf.h" static int system_wide = 0; diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h new file mode 100644 index 00000000000..391fcc73148 --- /dev/null +++ b/Documentation/perf_counter/perf.h @@ -0,0 +1,64 @@ +#ifndef _PERF_PERF_H +#define _PERF_PERF_H + +/* + * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all + * counters in the current task. + */ +#define PR_TASK_PERF_COUNTERS_DISABLE 31 +#define PR_TASK_PERF_COUNTERS_ENABLE 32 + +#define rdclock() \ +({ \ + struct timespec ts; \ + \ + clock_gettime(CLOCK_MONOTONIC, &ts); \ + ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ +}) + +/* + * Pick up some kernel type conventions: + */ +#define __user +#define asmlinkage + +#ifdef __x86_64__ +#define __NR_perf_counter_open 298 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __i386__ +#define __NR_perf_counter_open 336 +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#define __NR_perf_counter_open 319 +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#define unlikely(x) __builtin_expect(!!(x), 0) +#define min(x, y) ({ \ + typeof(x) _min1 = (x); \ + typeof(y) _min2 = (y); \ + (void) (&_min1 == &_min2); \ + _min1 < _min2 ? _min1 : _min2; }) + +static inline int +sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, + pid_t pid, int cpu, int group_fd, + unsigned long flags) +{ + return syscall(__NR_perf_counter_open, hw_event_uptr, pid, cpu, + group_fd, flags); +} + +#define MAX_COUNTERS 64 +#define MAX_NR_CPUS 256 + +#define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) + +#endif From a92e70237c8abbd1c3241133bf72f2cd07c90eae Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 18:39:47 +0200 Subject: [PATCH 0736/2061] perfcounter tools: make rdclock an inline function Signed-off-by: Thomas Gleixner --- Documentation/perf_counter/perf.h | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 391fcc73148..fb142307228 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -8,13 +8,17 @@ #define PR_TASK_PERF_COUNTERS_DISABLE 31 #define PR_TASK_PERF_COUNTERS_ENABLE 32 -#define rdclock() \ -({ \ - struct timespec ts; \ - \ - clock_gettime(CLOCK_MONOTONIC, &ts); \ - ts.tv_sec * 1000000000ULL + ts.tv_nsec; \ -}) +#ifndef NSEC_PER_SEC +# define NSEC_PER_SEC 1000000000ULL +#endif + +static inline unsigned long long rdclock(void) +{ + struct timespec ts; + + clock_gettime(CLOCK_MONOTONIC, &ts); + return ts.tv_sec * 1000000000ULL + ts.tv_nsec; +} /* * Pick up some kernel type conventions: From 7bd5469cd938eec6a76b3135e6becd9b5e096e98 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 18:42:47 +0200 Subject: [PATCH 0737/2061] perfcounter tools: fix pointer mismatch Neither process_options nor execvp take an const **char as argument. Signed-off-by: Thomas Gleixner --- Documentation/perf_counter/builtin-record.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index 59f1d87f41e..3a3deb3fbbc 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -332,7 +332,7 @@ static void sigchld_handler(int sig) done = 1; } -int cmd_record(int argc, const char **argv) +int cmd_record(int argc, char **argv) { struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; From 4ba67c1d48aeedcc31630bb40b6179fc7d360f90 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Fri, 1 May 2009 18:48:06 +0200 Subject: [PATCH 0738/2061] perfcounter tools: get the syscall number from arch/*/include/asm/unistd.h Avoid further confusion during development Signed-off-by: Thomas Gleixner --- Documentation/perf_counter/perf.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index fb142307228..6fa3656399f 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -26,20 +26,14 @@ static inline unsigned long long rdclock(void) #define __user #define asmlinkage -#ifdef __x86_64__ -#define __NR_perf_counter_open 298 -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __i386__ -#define __NR_perf_counter_open 336 +#if defined(__x86_64__) || defined(__i386__) +#include "../../arch/x86/include/asm/unistd.h" #define rmb() asm volatile("lfence" ::: "memory") #define cpu_relax() asm volatile("rep; nop" ::: "memory"); #endif #ifdef __powerpc__ -#define __NR_perf_counter_open 319 +#include "../../arch/powerpc/include/asm/unistd.h" #define rmb() asm volatile ("sync" ::: "memory") #define cpu_relax() asm volatile ("" ::: "memory"); #endif From 75507efb1372b6acf1aa6bf00ebd49ce196fd994 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 12:58:36 -0400 Subject: [PATCH 0739/2061] ext4: Don't avoid using BLOCK_UNINIT block groups in mballoc By avoiding the use of not-yet-used block groups (i.e., block groups with the BLOCK_UNINIT flag), mballoc had a tendency to create large files with large non-contiguous gaps. In addition avoiding the use of new block groups had a tendency to push regular file data into the first block group in a flex_bg group, which slows down the speed of e2fsck pass 2, since it has a tendency to seek much more. For example: Before Patch After Patch Time in seconds Time in seconds Real / User/ Sys MB/s Real / User/ Sys MB/s Pass 1 8.52 / 2.21 / 0.46 20.43 8.84 / 4.97 / 1.11 19.68 Pass 2 21.16 / 1.02 / 1.86 11.30 6.54 / 1.77 / 1.78 36.39 Pass 3 0.01 / 0.00 / 0.00 139.00 0.01 / 0.01 / 0.00 128.90 Pass 4 0.16 / 0.15 / 0.00 0.00 0.17 / 0.17 / 0.00 0.00 Pass 5 2.52 / 1.99 / 0.09 0.79 2.31 / 1.78 / 0.06 0.86 Total 32.40 / 5.11 / 2.49 12.81 17.99 / 8.75 / 2.98 23.01 This was on a sample 80 gig root filesystem which was approximately 50% full. Note the improved e2fsck pass 2 performance, by over a factor of 3, due to a decreased number of seeks. (The total amount of I/O in pass 2 was unchanged; the layout of the directory blocks was simply much better from e2fsck's's perspective.) Other changes as a result of this patch on this sample filesystem: Before Patch After Patch # of non-contig files 762 779 # of non-contig directories 571 570 # of BLOCK_UNINIT bg's 307 293 # of INODE_UNINIT bg's 503 503 Out of 640 block groups, of which 333 were in use, this patch caused an extra 14 block groups to be utilized. The number of non-contiguous files did go up slightly, but when measured against the 99.9% of the files (603,154) which were contiguously allocated, this is pretty insignificant. Signed-off-by: "Theodore Ts'o" Signed-off-by: Andreas Dilger --- fs/ext4/mballoc.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c3af9e6b666..dbd47eac13e 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1728,7 +1728,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, unsigned free, fragments; unsigned i, bits; int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb)); - struct ext4_group_desc *desc; struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group); BUG_ON(cr < 0 || cr >= 4); @@ -1744,10 +1743,6 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac, switch (cr) { case 0: BUG_ON(ac->ac_2order == 0); - /* If this group is uninitialized, skip it initially */ - desc = ext4_get_group_desc(ac->ac_sb, group, NULL); - if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) - return 0; /* Avoid using the first bg of a flexgroup for data files */ if ((ac->ac_flags & EXT4_MB_HINT_DATA) && @@ -2067,9 +2062,7 @@ repeat: ac->ac_groups_scanned++; desc = ext4_get_group_desc(sb, group, NULL); - if (cr == 0 || (desc->bg_flags & - cpu_to_le16(EXT4_BG_BLOCK_UNINIT) && - ac->ac_2order != 0)) + if (cr == 0) ext4_mb_simple_scan_group(ac, &e4b); else if (cr == 1 && ac->ac_g_ex.fe_len == sbi->s_stripe) From d444c3c38189b3f18337a213855ac1c07af4e2d9 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:44:33 -0400 Subject: [PATCH 0740/2061] ext4: Move the ext4_i.h header file into ext4.h There is no longer a reason for a separate ext4_i.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 122 ++++++++++++++++++++++++++++++++++++++++- fs/ext4/ext4_i.h | 140 ----------------------------------------------- 2 files changed, 121 insertions(+), 141 deletions(-) delete mode 100644 fs/ext4/ext4_i.h diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 02ec44bf38e..ba57d669cb6 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -21,7 +21,10 @@ #include #include #include -#include "ext4_i.h" +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -46,6 +49,19 @@ #define ext4_debug(f, a...) do {} while (0) #endif +/* data type for block offset of block group */ +typedef int ext4_grpblk_t; + +/* data type for filesystem-wide blocks number */ +typedef unsigned long long ext4_fsblk_t; + +/* data type for file logical block number */ +typedef __u32 ext4_lblk_t; + +/* data type for block group number */ +typedef unsigned int ext4_group_t; + + /* prefer goal again. length */ #define EXT4_MB_HINT_MERGE 1 /* blocks already reserved */ @@ -515,6 +531,110 @@ do { \ #endif /* defined(__KERNEL__) || defined(__linux__) */ +/* + * storage for cached extent + */ +struct ext4_ext_cache { + ext4_fsblk_t ec_start; + ext4_lblk_t ec_block; + __u32 ec_len; /* must be 32bit to return holes */ + __u32 ec_type; +}; + +/* + * fourth extended file system inode data in memory + */ +struct ext4_inode_info { + __le32 i_data[15]; /* unconverted */ + __u32 i_flags; + ext4_fsblk_t i_file_acl; + __u32 i_dtime; + + /* + * i_block_group is the number of the block group which contains + * this file's inode. Constant across the lifetime of the inode, + * it is ued for making block allocation decisions - we try to + * place a file's data blocks near its inode block, and new inodes + * near to their parent directory's inode. + */ + ext4_group_t i_block_group; + __u32 i_state; /* Dynamic state flags for ext4 */ + + ext4_lblk_t i_dir_start_lookup; +#ifdef CONFIG_EXT4_FS_XATTR + /* + * Extended attributes can be read independently of the main file + * data. Taking i_mutex even when reading would cause contention + * between readers of EAs and writers of regular file data, so + * instead we synchronize on xattr_sem when reading or changing + * EAs. + */ + struct rw_semaphore xattr_sem; +#endif +#ifdef CONFIG_EXT4_FS_POSIX_ACL + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; +#endif + + struct list_head i_orphan; /* unlinked but open inodes */ + + /* + * i_disksize keeps track of what the inode size is ON DISK, not + * in memory. During truncate, i_size is set to the new size by + * the VFS prior to calling ext4_truncate(), but the filesystem won't + * set i_disksize to 0 until the truncate is actually under way. + * + * The intent is that i_disksize always represents the blocks which + * are used by this file. This allows recovery to restart truncate + * on orphans if we crash during truncate. We actually write i_disksize + * into the on-disk inode when writing inodes out, instead of i_size. + * + * The only time when i_disksize and i_size may be different is when + * a truncate is in progress. The only things which change i_disksize + * are ext4_get_block (growth) and ext4_truncate (shrinkth). + */ + loff_t i_disksize; + + /* + * i_data_sem is for serialising ext4_truncate() against + * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's + * data tree are chopped off during truncate. We can't do that in + * ext4 because whenever we perform intermediate commits during + * truncate, the inode and all the metadata blocks *must* be in a + * consistent state which allows truncation of the orphans to restart + * during recovery. Hence we must fix the get_block-vs-truncate race + * by other means, so we have i_data_sem. + */ + struct rw_semaphore i_data_sem; + struct inode vfs_inode; + struct jbd2_inode jinode; + + struct ext4_ext_cache i_cached_extent; + /* + * File creation time. Its function is same as that of + * struct timespec i_{a,c,m}time in the generic inode. + */ + struct timespec i_crtime; + + /* mballoc */ + struct list_head i_prealloc_list; + spinlock_t i_prealloc_lock; + + /* ialloc */ + ext4_group_t i_last_alloc_group; + + /* allocation reservation info for delalloc */ + unsigned int i_reserved_data_blocks; + unsigned int i_reserved_meta_blocks; + unsigned int i_allocated_meta_blocks; + unsigned short i_delalloc_reserved_flag; + + /* on-disk additional length */ + __u16 i_extra_isize; + + spinlock_t i_block_reservation_lock; +}; + /* * File system states */ diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h deleted file mode 100644 index 4ce2187123a..00000000000 --- a/fs/ext4/ext4_i.h +++ /dev/null @@ -1,140 +0,0 @@ -/* - * ext4_i.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_i.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_I -#define _EXT4_I - -#include -#include -#include -#include - -/* data type for block offset of block group */ -typedef int ext4_grpblk_t; - -/* data type for filesystem-wide blocks number */ -typedef unsigned long long ext4_fsblk_t; - -/* data type for file logical block number */ -typedef __u32 ext4_lblk_t; - -/* data type for block group number */ -typedef unsigned int ext4_group_t; - -/* - * storage for cached extent - */ -struct ext4_ext_cache { - ext4_fsblk_t ec_start; - ext4_lblk_t ec_block; - __u32 ec_len; /* must be 32bit to return holes */ - __u32 ec_type; -}; - -/* - * fourth extended file system inode data in memory - */ -struct ext4_inode_info { - __le32 i_data[15]; /* unconverted */ - __u32 i_flags; - ext4_fsblk_t i_file_acl; - __u32 i_dtime; - - /* - * i_block_group is the number of the block group which contains - * this file's inode. Constant across the lifetime of the inode, - * it is ued for making block allocation decisions - we try to - * place a file's data blocks near its inode block, and new inodes - * near to their parent directory's inode. - */ - ext4_group_t i_block_group; - __u32 i_state; /* Dynamic state flags for ext4 */ - - ext4_lblk_t i_dir_start_lookup; -#ifdef CONFIG_EXT4_FS_XATTR - /* - * Extended attributes can be read independently of the main file - * data. Taking i_mutex even when reading would cause contention - * between readers of EAs and writers of regular file data, so - * instead we synchronize on xattr_sem when reading or changing - * EAs. - */ - struct rw_semaphore xattr_sem; -#endif -#ifdef CONFIG_EXT4_FS_POSIX_ACL - struct posix_acl *i_acl; - struct posix_acl *i_default_acl; -#endif - - struct list_head i_orphan; /* unlinked but open inodes */ - - /* - * i_disksize keeps track of what the inode size is ON DISK, not - * in memory. During truncate, i_size is set to the new size by - * the VFS prior to calling ext4_truncate(), but the filesystem won't - * set i_disksize to 0 until the truncate is actually under way. - * - * The intent is that i_disksize always represents the blocks which - * are used by this file. This allows recovery to restart truncate - * on orphans if we crash during truncate. We actually write i_disksize - * into the on-disk inode when writing inodes out, instead of i_size. - * - * The only time when i_disksize and i_size may be different is when - * a truncate is in progress. The only things which change i_disksize - * are ext4_get_block (growth) and ext4_truncate (shrinkth). - */ - loff_t i_disksize; - - /* - * i_data_sem is for serialising ext4_truncate() against - * ext4_getblock(). In the 2.4 ext2 design, great chunks of inode's - * data tree are chopped off during truncate. We can't do that in - * ext4 because whenever we perform intermediate commits during - * truncate, the inode and all the metadata blocks *must* be in a - * consistent state which allows truncation of the orphans to restart - * during recovery. Hence we must fix the get_block-vs-truncate race - * by other means, so we have i_data_sem. - */ - struct rw_semaphore i_data_sem; - struct inode vfs_inode; - struct jbd2_inode jinode; - - struct ext4_ext_cache i_cached_extent; - /* - * File creation time. Its function is same as that of - * struct timespec i_{a,c,m}time in the generic inode. - */ - struct timespec i_crtime; - - /* mballoc */ - struct list_head i_prealloc_list; - spinlock_t i_prealloc_lock; - - /* ialloc */ - ext4_group_t i_last_alloc_group; - - /* allocation reservation info for delalloc */ - unsigned int i_reserved_data_blocks; - unsigned int i_reserved_meta_blocks; - unsigned int i_allocated_meta_blocks; - unsigned short i_delalloc_reserved_flag; - - /* on-disk additional length */ - __u16 i_extra_isize; - - spinlock_t i_block_reservation_lock; -}; - -#endif /* _EXT4_I */ From ca0faba0e8ac844dc0279825eb8db876b5962ea5 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 3 May 2009 16:33:44 -0400 Subject: [PATCH 0741/2061] ext4: Move the ext4_sb.h header file into ext4.h There is no longer a reason for a separate ext4_sb.h header file, so move it into ext4.h just to make life easier for developers to find the relevant data structures and typedefs. Should also speed up compiles slightly, too. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 144 ++++++++++++++++++++++++++++++++++++++-- fs/ext4/ext4_sb.h | 163 ---------------------------------------------- 2 files changed, 140 insertions(+), 167 deletions(-) delete mode 100644 fs/ext4/ext4_sb.h diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index ba57d669cb6..af3c906e705 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -25,6 +25,10 @@ #include #include #include +#include +#include +#include +#include /* * The fourth extended filesystem constants/structures @@ -195,9 +199,6 @@ struct flex_groups { #define EXT4_BG_BLOCK_UNINIT 0x0002 /* Block bitmap not in use */ #define EXT4_BG_INODE_ZEROED 0x0004 /* On-disk itable initialized to zero */ -#ifdef __KERNEL__ -#include "ext4_sb.h" -#endif /* * Macro-instructions used to manage group descriptors */ @@ -809,6 +810,136 @@ struct ext4_super_block { }; #ifdef __KERNEL__ +/* + * fourth extended-fs super-block data in memory + */ +struct ext4_sb_info { + unsigned long s_desc_size; /* Size of a group descriptor in bytes */ + unsigned long s_inodes_per_block;/* Number of inodes per block */ + unsigned long s_blocks_per_group;/* Number of blocks in a group */ + unsigned long s_inodes_per_group;/* Number of inodes in a group */ + unsigned long s_itb_per_group; /* Number of inode table blocks per group */ + unsigned long s_gdb_count; /* Number of group descriptor blocks */ + unsigned long s_desc_per_block; /* Number of group descriptors per block */ + ext4_group_t s_groups_count; /* Number of groups in the fs */ + unsigned long s_overhead_last; /* Last calculated overhead */ + unsigned long s_blocks_last; /* Last seen block count */ + loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ + struct buffer_head * s_sbh; /* Buffer containing the super block */ + struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ + struct buffer_head **s_group_desc; + unsigned long s_mount_opt; + ext4_fsblk_t s_sb_block; + uid_t s_resuid; + gid_t s_resgid; + unsigned short s_mount_state; + unsigned short s_pad; + int s_addr_per_block_bits; + int s_desc_per_block_bits; + int s_inode_size; + int s_first_ino; + unsigned int s_inode_readahead_blks; + spinlock_t s_next_gen_lock; + u32 s_next_generation; + u32 s_hash_seed[4]; + int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ + struct percpu_counter s_freeblocks_counter; + struct percpu_counter s_freeinodes_counter; + struct percpu_counter s_dirs_counter; + struct percpu_counter s_dirtyblocks_counter; + struct blockgroup_lock *s_blockgroup_lock; + struct proc_dir_entry *s_proc; + struct kobject s_kobj; + struct completion s_kobj_unregister; + + /* Journaling */ + struct inode *s_journal_inode; + struct journal_s *s_journal; + struct list_head s_orphan; + struct mutex s_orphan_lock; + struct mutex s_resize_lock; + unsigned long s_commit_interval; + u32 s_max_batch_time; + u32 s_min_batch_time; + struct block_device *journal_bdev; +#ifdef CONFIG_JBD2_DEBUG + struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ + wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ +#endif +#ifdef CONFIG_QUOTA + char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + int s_jquota_fmt; /* Format of quota to use */ +#endif + unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + +#ifdef EXTENTS_STATS + /* ext4 extents stats */ + unsigned long s_ext_min; + unsigned long s_ext_max; + unsigned long s_depth_max; + spinlock_t s_ext_stats_lock; + unsigned long s_ext_blocks; + unsigned long s_ext_extents; +#endif + + /* for buddy allocator */ + struct ext4_group_info ***s_group_info; + struct inode *s_buddy_cache; + long s_blocks_reserved; + spinlock_t s_reserve_lock; + spinlock_t s_md_lock; + tid_t s_last_transaction; + unsigned short *s_mb_offsets; + unsigned int *s_mb_maxs; + + /* tunables */ + unsigned long s_stripe; + unsigned int s_mb_stream_request; + unsigned int s_mb_max_to_scan; + unsigned int s_mb_min_to_scan; + unsigned int s_mb_stats; + unsigned int s_mb_order2_reqs; + unsigned int s_mb_group_prealloc; + /* where last allocation was done - for stream allocation */ + unsigned long s_mb_last_group; + unsigned long s_mb_last_start; + + /* history to debug policy */ + struct ext4_mb_history *s_mb_history; + int s_mb_history_cur; + int s_mb_history_max; + int s_mb_history_num; + spinlock_t s_mb_history_lock; + int s_mb_history_filter; + + /* stats for buddy allocator */ + spinlock_t s_mb_pa_lock; + atomic_t s_bal_reqs; /* number of reqs with len > 1 */ + atomic_t s_bal_success; /* we found long enough chunks */ + atomic_t s_bal_allocated; /* in blocks */ + atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_goals; /* goal hits */ + atomic_t s_bal_breaks; /* too long searches */ + atomic_t s_bal_2orders; /* 2^order hits */ + spinlock_t s_bal_lock; + unsigned long s_mb_buddies_generated; + unsigned long long s_mb_generation_time; + atomic_t s_mb_lost_chunks; + atomic_t s_mb_preallocated; + atomic_t s_mb_discarded; + + /* locality groups */ + struct ext4_locality_group *s_locality_groups; + + /* for write statistics */ + unsigned long s_sectors_written_start; + u64 s_kbytes_written; + + unsigned int s_log_groups_per_flex; + struct flex_groups *s_flex_groups; +}; + static inline struct ext4_sb_info *EXT4_SB(struct super_block *sb) { return sb->s_fs_info; @@ -824,7 +955,6 @@ static inline struct timespec ext4_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } - static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) { return ino == EXT4_ROOT_INO || @@ -833,6 +963,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } + +static inline spinlock_t * +sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) +{ + return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); +} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h deleted file mode 100644 index 2d36223d5f5..00000000000 --- a/fs/ext4/ext4_sb.h +++ /dev/null @@ -1,163 +0,0 @@ -/* - * ext4_sb.h - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/include/linux/minix_fs_sb.h - * - * Copyright (C) 1991, 1992 Linus Torvalds - */ - -#ifndef _EXT4_SB -#define _EXT4_SB - -#ifdef __KERNEL__ -#include -#include -#include -#include -#endif -#include - -/* - * fourth extended-fs super-block data in memory - */ -struct ext4_sb_info { - unsigned long s_desc_size; /* Size of a group descriptor in bytes */ - unsigned long s_inodes_per_block;/* Number of inodes per block */ - unsigned long s_blocks_per_group;/* Number of blocks in a group */ - unsigned long s_inodes_per_group;/* Number of inodes in a group */ - unsigned long s_itb_per_group; /* Number of inode table blocks per group */ - unsigned long s_gdb_count; /* Number of group descriptor blocks */ - unsigned long s_desc_per_block; /* Number of group descriptors per block */ - ext4_group_t s_groups_count; /* Number of groups in the fs */ - unsigned long s_overhead_last; /* Last calculated overhead */ - unsigned long s_blocks_last; /* Last seen block count */ - loff_t s_bitmap_maxbytes; /* max bytes for bitmap files */ - struct buffer_head * s_sbh; /* Buffer containing the super block */ - struct ext4_super_block *s_es; /* Pointer to the super block in the buffer */ - struct buffer_head **s_group_desc; - unsigned long s_mount_opt; - ext4_fsblk_t s_sb_block; - uid_t s_resuid; - gid_t s_resgid; - unsigned short s_mount_state; - unsigned short s_pad; - int s_addr_per_block_bits; - int s_desc_per_block_bits; - int s_inode_size; - int s_first_ino; - unsigned int s_inode_readahead_blks; - spinlock_t s_next_gen_lock; - u32 s_next_generation; - u32 s_hash_seed[4]; - int s_def_hash_version; - int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ - struct percpu_counter s_freeblocks_counter; - struct percpu_counter s_freeinodes_counter; - struct percpu_counter s_dirs_counter; - struct percpu_counter s_dirtyblocks_counter; - struct blockgroup_lock *s_blockgroup_lock; - struct proc_dir_entry *s_proc; - struct kobject s_kobj; - struct completion s_kobj_unregister; - - /* Journaling */ - struct inode *s_journal_inode; - struct journal_s *s_journal; - struct list_head s_orphan; - struct mutex s_orphan_lock; - struct mutex s_resize_lock; - unsigned long s_commit_interval; - u32 s_max_batch_time; - u32 s_min_batch_time; - struct block_device *journal_bdev; -#ifdef CONFIG_JBD2_DEBUG - struct timer_list turn_ro_timer; /* For turning read-only (crash simulation) */ - wait_queue_head_t ro_wait_queue; /* For people waiting for the fs to go read-only */ -#endif -#ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ - int s_jquota_fmt; /* Format of quota to use */ -#endif - unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ - -#ifdef EXTENTS_STATS - /* ext4 extents stats */ - unsigned long s_ext_min; - unsigned long s_ext_max; - unsigned long s_depth_max; - spinlock_t s_ext_stats_lock; - unsigned long s_ext_blocks; - unsigned long s_ext_extents; -#endif - - /* for buddy allocator */ - struct ext4_group_info ***s_group_info; - struct inode *s_buddy_cache; - long s_blocks_reserved; - spinlock_t s_reserve_lock; - spinlock_t s_md_lock; - tid_t s_last_transaction; - unsigned short *s_mb_offsets; - unsigned int *s_mb_maxs; - - /* tunables */ - unsigned long s_stripe; - unsigned int s_mb_stream_request; - unsigned int s_mb_max_to_scan; - unsigned int s_mb_min_to_scan; - unsigned int s_mb_stats; - unsigned int s_mb_order2_reqs; - unsigned int s_mb_group_prealloc; - /* where last allocation was done - for stream allocation */ - unsigned long s_mb_last_group; - unsigned long s_mb_last_start; - - /* history to debug policy */ - struct ext4_mb_history *s_mb_history; - int s_mb_history_cur; - int s_mb_history_max; - int s_mb_history_num; - spinlock_t s_mb_history_lock; - int s_mb_history_filter; - - /* stats for buddy allocator */ - spinlock_t s_mb_pa_lock; - atomic_t s_bal_reqs; /* number of reqs with len > 1 */ - atomic_t s_bal_success; /* we found long enough chunks */ - atomic_t s_bal_allocated; /* in blocks */ - atomic_t s_bal_ex_scanned; /* total extents scanned */ - atomic_t s_bal_goals; /* goal hits */ - atomic_t s_bal_breaks; /* too long searches */ - atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; - atomic_t s_mb_lost_chunks; - atomic_t s_mb_preallocated; - atomic_t s_mb_discarded; - - /* locality groups */ - struct ext4_locality_group *s_locality_groups; - - /* for write statistics */ - unsigned long s_sectors_written_start; - u64 s_kbytes_written; - - unsigned int s_log_groups_per_flex; - struct flex_groups *s_flex_groups; -}; - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} - -#endif /* _EXT4_SB */ From 596397b77c895d0fa3674f579c94ad5ea88ef01d Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 13:49:15 -0400 Subject: [PATCH 0742/2061] ext4: Move fs/ext4/namei.h into ext4.h The fs/ext4/namei.h header file had only a single function declaration, and should have never been a standalone file. Move it into ext4.h, where should have been from the beginning. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 1 + fs/ext4/namei.c | 1 - fs/ext4/namei.h | 8 -------- fs/ext4/super.c | 1 - 4 files changed, 1 insertion(+), 10 deletions(-) delete mode 100644 fs/ext4/namei.h diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index af3c906e705..d9c5251d082 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1594,6 +1594,7 @@ extern const struct file_operations ext4_file_operations; /* namei.c */ extern const struct inode_operations ext4_dir_inode_operations; extern const struct inode_operations ext4_special_inode_operations; +extern struct dentry *ext4_get_parent(struct dentry *child); /* symlink.c */ extern const struct inode_operations ext4_symlink_inode_operations; diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 8018e49a728..c9690b250e5 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -37,7 +37,6 @@ #include "ext4.h" #include "ext4_jbd2.h" -#include "namei.h" #include "xattr.h" #include "acl.h" diff --git a/fs/ext4/namei.h b/fs/ext4/namei.h deleted file mode 100644 index 5e4dfff36a0..00000000000 --- a/fs/ext4/namei.h +++ /dev/null @@ -1,8 +0,0 @@ -/* linux/fs/ext4/namei.h - * - * Copyright (C) 2005 Simtec Electronics - * Ben Dooks - * -*/ - -extern struct dentry *ext4_get_parent(struct dentry *child); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 1fbf0906ae2..d79e1c428b4 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "namei.h" #include "group.h" struct proc_dir_entry *ext4_proc_root; From bb23c20a851a5038b255a3c0d0aa56093c1da3f8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Fri, 1 May 2009 19:44:44 -0400 Subject: [PATCH 0743/2061] ext4: Move fs/ext4/group.h into ext4.h Move the function prototypes in group.h into ext4.h so they are all defined in one place. Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 1 - fs/ext4/ext4.h | 17 +++++++++++++++++ fs/ext4/group.h | 29 ----------------------------- fs/ext4/ialloc.c | 1 - fs/ext4/mballoc.h | 1 - fs/ext4/resize.c | 1 - fs/ext4/super.c | 1 - 7 files changed, 17 insertions(+), 34 deletions(-) delete mode 100644 fs/ext4/group.h diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index a5ba039850c..92f557d957d 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -19,7 +19,6 @@ #include #include "ext4.h" #include "ext4_jbd2.h" -#include "group.h" #include "mballoc.h" /* diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d9c5251d082..5973f3261b0 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1270,6 +1270,14 @@ extern struct ext4_group_desc * ext4_get_group_desc(struct super_block * sb, ext4_group_t block_group, struct buffer_head ** bh); extern int ext4_should_retry_alloc(struct super_block *sb, int *retries); +struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, + ext4_group_t block_group); +extern unsigned ext4_init_block_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +#define ext4_free_blocks_after_init(sb, group, desc) \ + ext4_init_block_bitmap(sb, NULL, group, desc) /* dir.c */ extern int ext4_check_dir_entry(const char *, struct inode *, @@ -1294,6 +1302,11 @@ extern struct inode * ext4_orphan_get(struct super_block *, unsigned long); extern unsigned long ext4_count_free_inodes(struct super_block *); extern unsigned long ext4_count_dirs(struct super_block *); extern void ext4_check_inodes_bitmap(struct super_block *); +extern unsigned ext4_init_inode_bitmap(struct super_block *sb, + struct buffer_head *bh, + ext4_group_t group, + struct ext4_group_desc *desc); +extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); /* mballoc.c */ extern long ext4_mb_stats; @@ -1417,6 +1430,10 @@ extern void ext4_used_dirs_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); extern void ext4_itable_unused_set(struct super_block *sb, struct ext4_group_desc *bg, __u32 count); +extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); +extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, + struct ext4_group_desc *gdp); static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { diff --git a/fs/ext4/group.h b/fs/ext4/group.h deleted file mode 100644 index c2c0a8d06d0..00000000000 --- a/fs/ext4/group.h +++ /dev/null @@ -1,29 +0,0 @@ -/* - * linux/fs/ext4/group.h - * - * Copyright (C) 2007 Cluster File Systems, Inc - * - * Author: Andreas Dilger - */ - -#ifndef _LINUX_EXT4_GROUP_H -#define _LINUX_EXT4_GROUP_H - -extern __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -extern int ext4_group_desc_csum_verify(struct ext4_sb_info *sbi, __u32 group, - struct ext4_group_desc *gdp); -struct buffer_head *ext4_read_block_bitmap(struct super_block *sb, - ext4_group_t block_group); -extern unsigned ext4_init_block_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -#define ext4_free_blocks_after_init(sb, group, desc) \ - ext4_init_block_bitmap(sb, NULL, group, desc) -extern unsigned ext4_init_inode_bitmap(struct super_block *sb, - struct buffer_head *bh, - ext4_group_t group, - struct ext4_group_desc *desc); -extern void mark_bitmap_end(int start_bit, int end_bit, char *bitmap); -#endif /* _LINUX_EXT4_GROUP_H */ diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 55ba419ca00..916d05c881c 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -27,7 +27,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" /* * ialloc.c contains the inodes allocation and deallocation routines diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index dd9e6cd5f6c..75e34f69215 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -23,7 +23,6 @@ #include #include "ext4_jbd2.h" #include "ext4.h" -#include "group.h" /* * with AGGRESSIVE_CHECK allocator runs consistency checks over diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index e8ded13b5cb..27eb289eea3 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -15,7 +15,6 @@ #include #include "ext4_jbd2.h" -#include "group.h" #define outside(b, first, last) ((b) < (first) || (b) >= (last)) #define inside(b, first, last) ((b) >= (first) && (b) < (last)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index d79e1c428b4..7903f20c807 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -46,7 +46,6 @@ #include "ext4_jbd2.h" #include "xattr.h" #include "acl.h" -#include "group.h" struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; From f40339031b04279c3fdde7ac5fe97db33b2a7694 Mon Sep 17 00:00:00 2001 From: Curt Wohlgemuth Date: Fri, 1 May 2009 20:27:20 -0400 Subject: [PATCH 0744/2061] ext4: Make the length of the mb_history file tunable In memory-constrained systems with many partitions, the ~68K for each partition for the mb_history buffer can be excessive. This patch adds a new mount option, mb_history_length, as well as a way of setting the default via a module parameter (or via a sysfs parameter in /sys/module/ext4/parameter/default_mb_history_length). If the mb_history_length is set to zero, the mb_history facility is disabled entirely. Signed-off-by: Curt Wohlgemuth Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 13 +++++++------ fs/ext4/super.c | 18 +++++++++++++++++- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index dbd47eac13e..df75855ae6f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2413,7 +2413,8 @@ static void ext4_mb_history_release(struct super_block *sb) if (sbi->s_proc != NULL) { remove_proc_entry("mb_groups", sbi->s_proc); - remove_proc_entry("mb_history", sbi->s_proc); + if (sbi->s_mb_history_max) + remove_proc_entry("mb_history", sbi->s_proc); } kfree(sbi->s_mb_history); } @@ -2424,17 +2425,17 @@ static void ext4_mb_history_init(struct super_block *sb) int i; if (sbi->s_proc != NULL) { - proc_create_data("mb_history", S_IRUGO, sbi->s_proc, - &ext4_mb_seq_history_fops, sb); + if (sbi->s_mb_history_max) + proc_create_data("mb_history", S_IRUGO, sbi->s_proc, + &ext4_mb_seq_history_fops, sb); proc_create_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_fops, sb); } - sbi->s_mb_history_max = 1000; sbi->s_mb_history_cur = 0; spin_lock_init(&sbi->s_mb_history_lock); i = sbi->s_mb_history_max * sizeof(struct ext4_mb_history); - sbi->s_mb_history = kzalloc(i, GFP_KERNEL); + sbi->s_mb_history = i ? kzalloc(i, GFP_KERNEL) : NULL; /* if we can't allocate history, then we simple won't use it */ } @@ -2444,7 +2445,7 @@ ext4_mb_store_history(struct ext4_allocation_context *ac) struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); struct ext4_mb_history h; - if (unlikely(sbi->s_mb_history == NULL)) + if (sbi->s_mb_history == NULL) return; if (!(ac->ac_op & sbi->s_mb_history_filter)) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7903f20c807..39223a52bc7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -47,6 +47,13 @@ #include "xattr.h" #include "acl.h" +static int default_mb_history_length = 1000; + +module_param_named(default_mb_history_length, default_mb_history_length, + int, 0644); +MODULE_PARM_DESC(default_mb_history_length, + "Default number of entries saved for mb_history"); + struct proc_dir_entry *ext4_proc_root; static struct kset *ext4_kset; @@ -1042,7 +1049,7 @@ enum { Opt_journal_update, Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit, Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback, - Opt_data_err_abort, Opt_data_err_ignore, + Opt_data_err_abort, Opt_data_err_ignore, Opt_mb_history_length, Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota, Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, @@ -1088,6 +1095,7 @@ static const match_table_t tokens = { {Opt_data_writeback, "data=writeback"}, {Opt_data_err_abort, "data_err=abort"}, {Opt_data_err_ignore, "data_err=ignore"}, + {Opt_mb_history_length, "mb_history_length=%u"}, {Opt_offusrjquota, "usrjquota="}, {Opt_usrjquota, "usrjquota=%s"}, {Opt_offgrpjquota, "grpjquota="}, @@ -1329,6 +1337,13 @@ static int parse_options(char *options, struct super_block *sb, case Opt_data_err_ignore: clear_opt(sbi->s_mount_opt, DATA_ERR_ABORT); break; + case Opt_mb_history_length: + if (match_int(&args[0], &option)) + return 0; + if (option < 0) + return 0; + sbi->s_mb_history_max = option; + break; #ifdef CONFIG_QUOTA case Opt_usrjquota: qtype = USRQUOTA; @@ -2345,6 +2360,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_mb_history_max = default_mb_history_length; set_opt(sbi->s_mount_opt, BARRIER); From abc8746eb91fb01e8d411896f80f7687c0d8372e Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 22:54:32 -0400 Subject: [PATCH 0745/2061] ext4: hook fiemap operation for directories Add fiemap callback for directories Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/namei.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index c9690b250e5..f2bc160463b 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -2534,6 +2534,7 @@ const struct inode_operations ext4_dir_inode_operations = { .removexattr = generic_removexattr, #endif .permission = ext4_permission, + .fiemap = ext4_fiemap, }; const struct inode_operations ext4_special_inode_operations = { From 19ba0559f9ce104171ab16706893ce01f03ef116 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 13 May 2009 18:12:05 -0400 Subject: [PATCH 0746/2061] vfs: Enable FS_IOC_FIEMAP and FIGETBSZ for all filetypes The fiemap and get_blk_size ioctls should be enabled even for directories. So move it outisde file_ioctl. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ioctl.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/ioctl.c b/fs/ioctl.c index 82d9c42b8ba..286f38dfc6c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -414,10 +414,6 @@ static int file_ioctl(struct file *filp, unsigned int cmd, switch (cmd) { case FIBMAP: return ioctl_fibmap(filp, p); - case FS_IOC_FIEMAP: - return ioctl_fiemap(filp, arg); - case FIGETBSZ: - return put_user(inode->i_sb->s_blocksize, p); case FIONREAD: return put_user(i_size_read(inode) - filp->f_pos, p); } @@ -557,6 +553,16 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd, error = ioctl_fsthaw(filp); break; + case FS_IOC_FIEMAP: + return ioctl_fiemap(filp, arg); + + case FIGETBSZ: + { + struct inode *inode = filp->f_path.dentry->d_inode; + int __user *p = (int __user *)arg; + return put_user(inode->i_sb->s_blocksize, p); + } + default: if (S_ISREG(filp->f_path.dentry->d_inode->i_mode)) error = file_ioctl(filp, cmd, arg); From c9877b205f6ce7943bb95281342f4001cc1c00ec Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Fri, 1 May 2009 23:32:06 -0400 Subject: [PATCH 0747/2061] ext4: fix for fiemap last-block test Carl Henrik Lunde reported and debugged this; the test for the last allocated block was comparing bytes to blocks in this test: if (logical + length - 1 == EXT_MAX_BLOCK || ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; so any extent which ended right at 4G was stopping the extent walk. Just replacing these values with the extent block & length should fix it. Also give blksize_bits a saner type, and reverse the order of the tests to make the more likely case tested first. Signed-off-by: Eric Sandeen Reported-by: Carl Henrik Lunde Tested-by: Carl Henrik Lunde Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index ea5c47608ce..5f7295287de 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3196,7 +3196,7 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, void *data) { struct fiemap_extent_info *fieinfo = data; - unsigned long blksize_bits = inode->i_sb->s_blocksize_bits; + unsigned char blksize_bits = inode->i_sb->s_blocksize_bits; __u64 logical; __u64 physical; __u64 length; @@ -3243,8 +3243,8 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ - if (logical + length - 1 == EXT_MAX_BLOCK || - ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK) + if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) flags |= FIEMAP_EXTENT_LAST; error = fiemap_fill_next_extent(fieinfo, logical, physical, From a25cbd045a2ffc42787d4dbcbb9c7118f5f42732 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 14:45:46 +0900 Subject: [PATCH 0748/2061] clocksource: setup mult_orig in clocksource_enable() Setup clocksource mult_orig in clocksource_enable(). Clocksource drivers can save power by using keeping the device clock disabled while the clocksource is unused. In practice this means that the enable() and disable() callbacks perform clk_enable() and clk_disable(). The enable() callback may also use clk_get_rate() to get the clock rate from the clock framework. This information can then be used to calculate the shift and mult variables. Currently the mult_orig variable is setup from mult at registration time only. This is conflicting with the above case since the clock is disabled and the mult variable is not yet calculated at the time of registration. Moving the mult_orig setup code to clocksource_enable() allows us to both handle the common case with no enable() callback and the mult-changed-after-enable() case. [ Impact: allow dynamic clock source usage ] Signed-off-by: Magnus Damm LKML-Reference: <20090501054546.8193.10688.sendpatchset@rx1.opensource.se> Signed-off-by: Thomas Gleixner --- include/linux/clocksource.h | 10 +++++++++- kernel/time/clocksource.c | 3 --- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/linux/clocksource.h b/include/linux/clocksource.h index 5a40d14daa9..c56457c8334 100644 --- a/include/linux/clocksource.h +++ b/include/linux/clocksource.h @@ -288,7 +288,15 @@ static inline cycle_t clocksource_read(struct clocksource *cs) */ static inline int clocksource_enable(struct clocksource *cs) { - return cs->enable ? cs->enable(cs) : 0; + int ret = 0; + + if (cs->enable) + ret = cs->enable(cs); + + /* save mult_orig on enable */ + cs->mult_orig = cs->mult; + + return ret; } /** diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index ecfd7b5187e..80189f6f1c5 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -402,9 +402,6 @@ int clocksource_register(struct clocksource *c) unsigned long flags; int ret; - /* save mult_orig on registration */ - c->mult_orig = c->mult; - spin_lock_irqsave(&clocksource_lock, flags); ret = clocksource_enqueue(c); if (!ret) From 7d27558c4138ac6b3684dea35c2f4379b940a7dd Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:26 -0700 Subject: [PATCH 0749/2061] timekeeping: create arch_gettimeoffset infrastructure Some arches don't supply their own clocksource. This is mainly the case in architectures that get their inter-tick times by reading the counter on their interval timer. Since these timers wrap every tick, they're not really useful as clocksources. Wrapping them to act like one is possible but not very efficient. So we provide a callout these arches can implement for use with the jiffies clocksource to provide finer then tick granular time. [ Impact: ease the migration to generic time keeping ] Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/time.h | 15 +++++++++++++++ kernel/time/timekeeping.c | 7 +++++++ 2 files changed, 22 insertions(+) diff --git a/include/linux/time.h b/include/linux/time.h index 242f62499bb..ea16c1a01d5 100644 --- a/include/linux/time.h +++ b/include/linux/time.h @@ -113,6 +113,21 @@ struct timespec current_kernel_time(void); #define CURRENT_TIME (current_kernel_time()) #define CURRENT_TIME_SEC ((struct timespec) { get_seconds(), 0 }) +/* Some architectures do not supply their own clocksource. + * This is mainly the case in architectures that get their + * inter-tick times by reading the counter on their interval + * timer. Since these timers wrap every tick, they're not really + * useful as clocksources. Wrapping them to act like one is possible + * but not very efficient. So we provide a callout these arches + * can implement for use with the jiffies clocksource to provide + * finer then tick granular time. + */ +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +extern u32 arch_gettimeoffset(void); +#else +static inline u32 arch_gettimeoffset(void) { return 0; } +#endif + extern void do_gettimeofday(struct timeval *tv); extern int do_settimeofday(struct timespec *tv); extern int do_sys_settimeofday(struct timespec *tv, struct timezone *tz); diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..e97c50f8458 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -77,6 +77,10 @@ static void clocksource_forward_now(void) clock->cycle_last = cycle_now; nsec = cyc2ns(clock, cycle_delta); + + /* If arch requires, add in gettimeoffset() */ + nsec += arch_gettimeoffset(); + timespec_add_ns(&xtime, nsec); nsec = ((s64)cycle_delta * clock->mult_orig) >> clock->shift; @@ -111,6 +115,9 @@ void getnstimeofday(struct timespec *ts) /* convert to nanoseconds: */ nsecs = cyc2ns(clock, cycle_delta); + /* If arch requires, add in gettimeoffset() */ + nsecs += arch_gettimeoffset(); + } while (read_seqretry(&xtime_lock, seq)); timespec_add_ns(ts, nsecs); From eefd7f03b86b8a319890e7fac5a6fcc7f8694b76 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 2 May 2009 19:05:37 -0400 Subject: [PATCH 0750/2061] ext4: fix the length returned by fiemap for an unallocated extent If the file's blocks have not yet been allocated because of delayed allocation, the length of the extent returned by fiemap is incorrect. This commit fixes this bug. Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 5f7295287de..4fec6b74638 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3244,8 +3244,15 @@ static int ext4_ext_fiemap_cb(struct inode *inode, struct ext4_ext_path *path, * XXX this might miss a single-block extent at EXT_MAX_BLOCK */ if (ext4_ext_next_allocated_block(path) == EXT_MAX_BLOCK || - newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) + newex->ec_block + newex->ec_len - 1 == EXT_MAX_BLOCK) { + loff_t size = i_size_read(inode); + loff_t bs = EXT4_BLOCK_SIZE(inode->i_sb); + flags |= FIEMAP_EXTENT_LAST; + if ((flags & FIEMAP_EXTENT_DELALLOC) && + logical+length > size) + length = (size - logical + bs - 1) & ~(bs-1); + } error = fiemap_fill_next_extent(fieinfo, logical, physical, length, flags); From 955ce5f5be67dfe0d1d096b543af33fe8a1ce3dd Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 2 May 2009 20:35:09 -0400 Subject: [PATCH 0751/2061] ext4: Convert ext4_lock_group to use sb_bgl_lock We have sb_bgl_lock() and ext4_group_info.bb_state bit spinlock to protech group information. The later is only used within mballoc code. Consolidate them to use sb_bgl_lock(). This makes the mballoc.c code much simpler and also avoid confusion with two locks protecting same info. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 12 ++++---- fs/ext4/ext4.h | 26 ++++++---------- fs/ext4/ialloc.c | 29 +++++++++--------- fs/ext4/mballoc.c | 78 ++++++++++++++++------------------------------- fs/ext4/super.c | 6 ++-- 5 files changed, 59 insertions(+), 92 deletions(-) diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 92f557d957d..e2126d70dff 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -326,16 +326,16 @@ ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -451,7 +451,7 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, down_write(&grp->alloc_sem); for (i = 0, blocks_freed = 0; i < count; i++) { BUFFER_TRACE(bitmap_bh, "clear bit"); - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), + if (!ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), bit + i, bitmap_bh->b_data)) { ext4_error(sb, __func__, "bit already cleared for block %llu", @@ -461,11 +461,11 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb, blocks_freed++; } } - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); blk_free_count = blocks_freed + ext4_free_blks_count(sb, desc); ext4_free_blks_set(sb, desc, blk_free_count); desc->bg_checksum = ext4_group_desc_csum(sbi, block_group, desc); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, blocks_freed); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5973f3261b0..149e02dc360 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -963,12 +963,6 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) (ino >= EXT4_FIRST_INO(sb) && ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } - -static inline spinlock_t * -sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group) -{ - return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group); -} #else /* Assume that user mode programs are passing in an ext4fs superblock, not * a kernel struct super_block. This will allow us to call the feature-test @@ -1568,33 +1562,31 @@ struct ext4_group_info { }; #define EXT4_GROUP_INFO_NEED_INIT_BIT 0 -#define EXT4_GROUP_INFO_LOCKED_BIT 1 #define EXT4_MB_GRP_NEED_INIT(grp) \ (test_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &((grp)->bb_state))) +static inline spinlock_t *ext4_group_lock_ptr(struct super_block *sb, + ext4_group_t group) +{ + return bgl_lock_ptr(EXT4_SB(sb)->s_blockgroup_lock, group); +} + static inline void ext4_lock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_lock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_lock(ext4_group_lock_ptr(sb, group)); } static inline void ext4_unlock_group(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - bit_spin_unlock(EXT4_GROUP_INFO_LOCKED_BIT, &(grinfo->bb_state)); + spin_unlock(ext4_group_lock_ptr(sb, group)); } static inline int ext4_is_group_locked(struct super_block *sb, ext4_group_t group) { - struct ext4_group_info *grinfo = ext4_get_group_info(sb, group); - - return bit_spin_is_locked(EXT4_GROUP_INFO_LOCKED_BIT, - &(grinfo->bb_state)); + return spin_is_locked(ext4_group_lock_ptr(sb, group)); } /* diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 916d05c881c..82f7d1d7eae 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -122,16 +122,16 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group) unlock_buffer(bh); return bh; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) { ext4_init_inode_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); unlock_buffer(bh); return bh; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), block_group)); + ext4_unlock_group(sb, block_group); if (buffer_uptodate(bh)) { /* * if not uninit if bh is uptodate, @@ -246,9 +246,8 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - spin_lock(sb_bgl_lock(sbi, block_group)); - cleared = ext4_clear_bit(bit, bitmap_bh->b_data); - spin_unlock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit_atomic(ext4_group_lock_ptr(sb, block_group), + bit, bitmap_bh->b_data); if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); @@ -260,7 +259,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) if (fatal) goto error_return; if (gdp) { - spin_lock(sb_bgl_lock(sbi, block_group)); + ext4_lock_group(sb, block_group); count = ext4_free_inodes_count(sb, gdp) + 1; ext4_free_inodes_set(sb, gdp, count); if (is_directory) { @@ -276,7 +275,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) } gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_inc(&sbi->s_freeinodes_counter); if (is_directory) percpu_counter_dec(&sbi->s_dirs_counter); @@ -707,10 +706,10 @@ static int find_group_other(struct super_block *sb, struct inode *parent, /* * claim the inode from the inode bitmap. If the group - * is uninit we need to take the groups's sb_bgl_lock + * is uninit we need to take the groups's ext4_group_lock * and clear the uninit flag. The inode bitmap update * and group desc uninit flag clear should be done - * after holding sb_bgl_lock so that ext4_read_inode_bitmap + * after holding ext4_group_lock so that ext4_read_inode_bitmap * doesn't race with the ext4_claim_inode */ static int ext4_claim_inode(struct super_block *sb, @@ -721,7 +720,7 @@ static int ext4_claim_inode(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL); - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); if (ext4_set_bit(ino, inode_bitmap_bh->b_data)) { /* not a free inode */ retval = 1; @@ -730,7 +729,7 @@ static int ext4_claim_inode(struct super_block *sb, ino++; if ((group == 0 && ino < EXT4_FIRST_INO(sb)) || ino > EXT4_INODES_PER_GROUP(sb)) { - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); ext4_error(sb, __func__, "reserved inode or inode > inodes count - " "block_group = %u, inode=%lu", group, @@ -779,7 +778,7 @@ static int ext4_claim_inode(struct super_block *sb, } gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); err_ret: - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); return retval; } @@ -935,7 +934,7 @@ got: } free = 0; - spin_lock(sb_bgl_lock(sbi, group)); + ext4_lock_group(sb, group); /* recheck and clear flag under lock if we still need to */ if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { free = ext4_free_blocks_after_init(sb, group, gdp); @@ -944,7 +943,7 @@ got: gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp); } - spin_unlock(sb_bgl_lock(sbi, group)); + ext4_unlock_group(sb, group); /* Don't need to dirty bitmap block if we didn't change it */ if (free) { diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index df75855ae6f..e76459cedcd 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -372,24 +372,12 @@ static inline void mb_set_bit(int bit, void *addr) ext4_set_bit(bit, addr); } -static inline void mb_set_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_set_bit_atomic(lock, bit, addr); -} - static inline void mb_clear_bit(int bit, void *addr) { addr = mb_correct_addr_and_bit(&bit, addr); ext4_clear_bit(bit, addr); } -static inline void mb_clear_bit_atomic(spinlock_t *lock, int bit, void *addr) -{ - addr = mb_correct_addr_and_bit(&bit, addr); - ext4_clear_bit_atomic(lock, bit, addr); -} - static inline int mb_find_next_zero_bit(void *addr, int max, int start) { int fix = 0, ret, tmpmax; @@ -803,17 +791,17 @@ static int ext4_mb_init_cache(struct page *page, char *incore) unlock_buffer(bh[i]); continue; } - spin_lock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_lock_group(sb, first_group + i); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { ext4_init_block_bitmap(sb, bh[i], first_group + i, desc); set_bitmap_uptodate(bh[i]); set_buffer_uptodate(bh[i]); - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); unlock_buffer(bh[i]); continue; } - spin_unlock(sb_bgl_lock(EXT4_SB(sb), first_group + i)); + ext4_unlock_group(sb, first_group + i); if (buffer_uptodate(bh[i])) { /* * if not uninit if bh is uptodate, @@ -1080,7 +1068,7 @@ static int mb_find_order_for_block(struct ext4_buddy *e4b, int block) return 0; } -static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_clear_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1093,15 +1081,12 @@ static void mb_clear_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_clear_bit_atomic(lock, cur, bm); - else - mb_clear_bit(cur, bm); + mb_clear_bit(cur, bm); cur++; } } -static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) +static void mb_set_bits(void *bm, int cur, int len) { __u32 *addr; @@ -1114,10 +1099,7 @@ static void mb_set_bits(spinlock_t *lock, void *bm, int cur, int len) cur += 32; continue; } - if (lock) - mb_set_bit_atomic(lock, cur, bm); - else - mb_set_bit(cur, bm); + mb_set_bit(cur, bm); cur++; } } @@ -1332,8 +1314,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) e4b->bd_info->bb_counters[ord]++; } - mb_set_bits(sb_bgl_lock(EXT4_SB(e4b->bd_sb), ex->fe_group), - EXT4_MB_BITMAP(e4b), ex->fe_start, len0); + mb_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0); mb_check_buddy(e4b); return ret; @@ -2756,7 +2737,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery) return 0; } -/* need to called with ext4 group lock (ext4_lock_group) */ +/* need to called with the ext4 group lock held */ static void ext4_mb_cleanup_pa(struct ext4_group_info *grp) { struct ext4_prealloc_space *pa; @@ -2993,14 +2974,17 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. */ - mb_set_bits(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group), - bitmap_bh->b_data, ac->ac_b_ex.fe_start, - ac->ac_b_ex.fe_len); + ext4_lock_group(sb, ac->ac_b_ex.fe_group); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start, + ac->ac_b_ex.fe_len); + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh); if (!err) err = -EAGAIN; goto out_err; } + + ext4_lock_group(sb, ac->ac_b_ex.fe_group); #ifdef AGGRESSIVE_CHECK { int i; @@ -3010,9 +2994,7 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, } } #endif - spin_lock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); - mb_set_bits(NULL, bitmap_bh->b_data, - ac->ac_b_ex.fe_start, ac->ac_b_ex.fe_len); + mb_set_bits(bitmap_bh->b_data, ac->ac_b_ex.fe_start,ac->ac_b_ex.fe_len); if (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT); ext4_free_blks_set(sb, gdp, @@ -3022,7 +3004,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, len = ext4_free_blks_count(sb, gdp) - ac->ac_b_ex.fe_len; ext4_free_blks_set(sb, gdp, len); gdp->bg_checksum = ext4_group_desc_csum(sbi, ac->ac_b_ex.fe_group, gdp); - spin_unlock(sb_bgl_lock(sbi, ac->ac_b_ex.fe_group)); + + ext4_unlock_group(sb, ac->ac_b_ex.fe_group); percpu_counter_sub(&sbi->s_freeblocks_counter, ac->ac_b_ex.fe_len); /* * Now reduce the dirty block count also. Should not go negative @@ -3455,7 +3438,7 @@ ext4_mb_use_preallocated(struct ext4_allocation_context *ac) * the function goes through all block freed in the group * but not yet committed and marks them used in in-core bitmap. * buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with the ext4 group lock held */ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3469,9 +3452,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, while (n) { entry = rb_entry(n, struct ext4_free_data, node); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, entry->start_blk, - entry->count); + mb_set_bits(bitmap, entry->start_blk, entry->count); n = rb_next(n); } return; @@ -3480,7 +3461,7 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, /* * the function goes through all preallocation in this group and marks them * used in in-core bitmap. buddy must be generated from this bitmap - * Need to be called with ext4 group lock (ext4_lock_group) + * Need to be called with ext4 group lock held */ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, ext4_group_t group) @@ -3512,8 +3493,7 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap, if (unlikely(len == 0)) continue; BUG_ON(groupnr != group); - mb_set_bits(sb_bgl_lock(EXT4_SB(sb), group), - bitmap, start, len); + mb_set_bits(bitmap, start, len); preallocated += len; count++; } @@ -4856,29 +4836,25 @@ do_more: new_entry->group = block_group; new_entry->count = count; new_entry->t_tid = handle->h_transaction->t_tid; + ext4_lock_group(sb, block_group); - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + mb_clear_bits(bitmap_bh->b_data, bit, count); ext4_mb_free_metadata(handle, &e4b, new_entry); - ext4_unlock_group(sb, block_group); } else { - ext4_lock_group(sb, block_group); /* need to update group_info->bb_free and bitmap * with group lock held. generate_buddy look at * them with group lock_held */ - mb_clear_bits(sb_bgl_lock(sbi, block_group), bitmap_bh->b_data, - bit, count); + ext4_lock_group(sb, block_group); + mb_clear_bits(bitmap_bh->b_data, bit, count); mb_free_blocks(inode, &e4b, bit, count); ext4_mb_return_to_preallocation(inode, &e4b, block, count); - ext4_unlock_group(sb, block_group); } - spin_lock(sb_bgl_lock(sbi, block_group)); ret = ext4_free_blks_count(sb, gdp) + count; ext4_free_blks_set(sb, gdp, ret); gdp->bg_checksum = ext4_group_desc_csum(sbi, block_group, gdp); - spin_unlock(sb_bgl_lock(sbi, block_group)); + ext4_unlock_group(sb, block_group); percpu_counter_add(&sbi->s_freeblocks_counter, count); if (sbi->s_log_groups_per_flex) { diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 39223a52bc7..dc34ed3d132 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1784,18 +1784,18 @@ static int ext4_check_descriptors(struct super_block *sb) "(block %llu)!\n", i, inode_table); return 0; } - spin_lock(sb_bgl_lock(sbi, i)); + ext4_lock_group(sb, i); if (!ext4_group_desc_csum_verify(sbi, i, gdp)) { printk(KERN_ERR "EXT4-fs: ext4_check_descriptors: " "Checksum for group %u failed (%u!=%u)\n", i, le16_to_cpu(ext4_group_desc_csum(sbi, i, gdp)), le16_to_cpu(gdp->bg_checksum)); if (!(sb->s_flags & MS_RDONLY)) { - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); return 0; } } - spin_unlock(sb_bgl_lock(sbi, i)); + ext4_unlock_group(sb, i); if (!flexbg_flag) first_block += EXT4_BLOCKS_PER_GROUP(sb); } From dab6f6a3401f596fe934f41fc5da3f401adfdfb1 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 2 May 2009 08:02:36 +0200 Subject: [PATCH 0752/2061] perf_counter tools: fix build error ctype.h crawled out of the bit bucket :) Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 1 - 1 file changed, 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index 3a3deb3fbbc..ddfdcf86fb2 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include From a454ab3110175d710f4f9a96226a26ce4d5d5de2 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 3 May 2009 10:09:03 +0200 Subject: [PATCH 0753/2061] x86, mm: fault.c, use printk_once() in is_errata93() Andrew pointed out that the 'once' variable has a needlessly function-global scope. We can in fact eliminate it completely, via the use of printk_once(). [ Impact: cleanup ] Reported-by: Andrew Morton Cc: Linus Torvalds Signed-off-by: Ingo Molnar --- arch/x86/mm/fault.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index 24a36a6426a..b9ca6d767db 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -514,8 +514,6 @@ bad: static int is_errata93(struct pt_regs *regs, unsigned long address) { #ifdef CONFIG_X86_64 - static int once; - if (address != regs->ip) return 0; @@ -525,10 +523,7 @@ static int is_errata93(struct pt_regs *regs, unsigned long address) address |= 0xffffffffUL << 32; if ((address >= (u64)_stext && address <= (u64)_etext) || (address >= MODULES_VADDR && address <= MODULES_END)) { - if (!once) { - printk(errata93_warning); - once = 1; - } + printk_once(errata93_warning); regs->ip = address; return 1; } From 0cd5f7b0c7fd96f9f00187ef2ad2328de28ae326 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:00:44 +0000 Subject: [PATCH 0754/2061] sh: remove obsolete no_irq_type Impact: cleanup convert the last remaining users to no_irq_chip Signed-off-by: Thomas Gleixner Signed-off-by: Paul Mundt --- arch/sh/kernel/sh_ksyms_32.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 528de2955c8..2d868c60aa6 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -19,13 +19,12 @@ #include extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); -extern struct hw_interrupt_type no_irq_type; /* platform dependent support */ EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(kernel_thread); EXPORT_SYMBOL(irq_desc); -EXPORT_SYMBOL(no_irq_type); +EXPORT_SYMBOL(no_irq_chip); EXPORT_SYMBOL(strlen); From d80498398276ca8eee7ebdbe0d47e06d01317439 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:01:20 +0000 Subject: [PATCH 0755/2061] sh: remove obsolete hw_interrupt_type Impact: cleanup Convert the last remaining users to struct irq_chip and remove the define. Signed-off-by: Thomas Gleixner Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/imask.c | 2 +- arch/sh/kernel/cpu/irq/intc-sh5.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c index 301b505c427..f43753b54e1 100644 --- a/arch/sh/kernel/cpu/irq/imask.c +++ b/arch/sh/kernel/cpu/irq/imask.c @@ -39,7 +39,7 @@ static unsigned int startup_imask_irq(unsigned int irq) return 0; /* never anything pending */ } -static struct hw_interrupt_type imask_irq_type = { +static struct irq_chip imask_irq_type = { .typename = "SR.IMASK", .startup = startup_imask_irq, .shutdown = shutdown_imask_irq, diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c index 726f0335da7..d8679a4d93a 100644 --- a/arch/sh/kernel/cpu/irq/intc-sh5.c +++ b/arch/sh/kernel/cpu/irq/intc-sh5.c @@ -84,7 +84,7 @@ static void disable_intc_irq(unsigned int irq); static void mask_and_ack_intc(unsigned int); static void end_intc_irq(unsigned int irq); -static struct hw_interrupt_type intc_irq_type = { +static struct irq_chip intc_irq_type = { .typename = "INTC", .startup = startup_intc_irq, .shutdown = shutdown_intc_irq, From 7563431107f6debf57c1dbecfb9498cf31a1c036 Mon Sep 17 00:00:00 2001 From: john stultz Date: Fri, 1 May 2009 13:10:28 -0700 Subject: [PATCH 0756/2061] time: sh: convert to use arch_getoffset() infrastructure Convert sh to use GENERIC_TIME via the arch_getoffset() infrastructure. Signed-off-by: John Stultz Signed-off-by: Andrew Morton Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 2 +- arch/sh/include/asm/timer.h | 4 +- arch/sh/kernel/time_32.c | 63 +++--------------------------- arch/sh/kernel/time_64.c | 53 +------------------------ arch/sh/kernel/timers/timer-mtu2.c | 2 +- 5 files changed, 11 insertions(+), 113 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 7e96adea960..6f91478826d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -75,7 +75,7 @@ config GENERIC_IOMAP bool config GENERIC_TIME - def_bool n + def_bool y config GENERIC_CLOCKEVENTS def_bool n diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index 8f80a55c04a..9c968d19cb9 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -9,7 +9,7 @@ struct sys_timer_ops { int (*init)(void); int (*start)(void); int (*stop)(void); -#ifndef CONFIG_GENERIC_TIME +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET unsigned long (*get_offset)(void); #endif }; @@ -26,7 +26,7 @@ struct sys_timer { extern struct sys_timer tmu_timer, mtu2_timer; extern struct sys_timer *sys_timer; -#ifndef CONFIG_GENERIC_TIME +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET static inline unsigned long get_timer_offset(void) { return sys_timer->ops->get_offset(); diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index 457332116e1..9d34dff1499 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -83,65 +83,12 @@ static int __init rtc_generic_init(void) } module_init(rtc_generic_init); -#ifndef CONFIG_GENERIC_TIME -void do_gettimeofday(struct timeval *tv) +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET +u32 arch_gettimeoffset(void) { - unsigned long flags; - unsigned long seq; - unsigned long usec, sec; - - do { - /* - * Turn off IRQs when grabbing xtime_lock, so that - * the sys_timer get_offset code doesn't have to handle it. - */ - seq = read_seqbegin_irqsave(&xtime_lock, flags); - usec = get_timer_offset(); - sec = xtime.tv_sec; - usec += xtime.tv_nsec / NSEC_PER_USEC; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; + return get_timer_offset() * 1000; } -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= get_timer_offset() * NSEC_PER_USEC; - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - - return 0; -} -EXPORT_SYMBOL(do_settimeofday); -#endif /* !CONFIG_GENERIC_TIME */ +#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */ /* last time the RTC clock got updated */ static long last_rtc_update; @@ -238,7 +185,7 @@ struct clocksource clocksource_sh = { .name = "SuperH", }; -#ifdef CONFIG_GENERIC_TIME +#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET unsigned long long sched_clock(void) { unsigned long long cycles; diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c index 988c77c3723..f4f5e8ad5be 100644 --- a/arch/sh/kernel/time_64.c +++ b/arch/sh/kernel/time_64.c @@ -144,59 +144,10 @@ static unsigned long usecs_since_tick(void) return result; } -void do_gettimeofday(struct timeval *tv) +u32 arch_gettimeoffset(void) { - unsigned long flags; - unsigned long seq; - unsigned long usec, sec; - - do { - seq = read_seqbegin_irqsave(&xtime_lock, flags); - usec = usecs_since_tick(); - sec = xtime.tv_sec; - usec += xtime.tv_nsec / 1000; - } while (read_seqretry_irqrestore(&xtime_lock, seq, flags)); - - while (usec >= 1000000) { - usec -= 1000000; - sec++; - } - - tv->tv_sec = sec; - tv->tv_usec = usec; + return usecs_since_tick() * 1000; } -EXPORT_SYMBOL(do_gettimeofday); - -int do_settimeofday(struct timespec *tv) -{ - time_t wtm_sec, sec = tv->tv_sec; - long wtm_nsec, nsec = tv->tv_nsec; - - if ((unsigned long)tv->tv_nsec >= NSEC_PER_SEC) - return -EINVAL; - - write_seqlock_irq(&xtime_lock); - /* - * This is revolting. We need to set "xtime" correctly. However, the - * value in this location is the value at the most recent update of - * wall time. Discover what correction gettimeofday() would have - * made, and then undo it! - */ - nsec -= 1000 * usecs_since_tick(); - - wtm_sec = wall_to_monotonic.tv_sec + (xtime.tv_sec - sec); - wtm_nsec = wall_to_monotonic.tv_nsec + (xtime.tv_nsec - nsec); - - set_normalized_timespec(&xtime, sec, nsec); - set_normalized_timespec(&wall_to_monotonic, wtm_sec, wtm_nsec); - - ntp_clear(); - write_sequnlock_irq(&xtime_lock); - clock_was_set(); - - return 0; -} -EXPORT_SYMBOL(do_settimeofday); /* Dummy RTC ops */ static void null_rtc_get_time(struct timespec *tv) diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c index 9b0ef012647..8a1dcc2c372 100644 --- a/arch/sh/kernel/timers/timer-mtu2.c +++ b/arch/sh/kernel/timers/timer-mtu2.c @@ -191,7 +191,7 @@ struct sys_timer_ops mtu2_timer_ops = { .init = mtu2_timer_init, .start = mtu2_timer_start, .stop = mtu2_timer_stop, -#ifndef CONFIG_GENERIC_TIME +#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET .get_offset = mtu2_timer_get_offset, #endif }; From d5ed4c2e5ce9f5f6fd6a5a39ee1196a1f8a46eed Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 07:02:49 +0000 Subject: [PATCH 0757/2061] clocksource: SuperH MTU2 Timer driver This patch adds a MTU2 driver for the SuperH architecture. The MTU2 driver is a platform driver with early platform support to allow using a MTU2 channel as only clockevent during system bootup. Clocksource on sh2a is currently unsupported due to code generation issues with 64-bit math, so at this point only periodic clockevent support is in place. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 11 ++ drivers/clocksource/Makefile | 1 + drivers/clocksource/sh_mtu2.c | 357 ++++++++++++++++++++++++++++++++++ include/linux/sh_mtu2.h | 12 ++ 4 files changed, 381 insertions(+) create mode 100644 drivers/clocksource/sh_mtu2.c create mode 100644 include/linux/sh_mtu2.h diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6f91478826d..6d0dd378eca 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -113,6 +113,9 @@ config SYS_SUPPORTS_PCI config SYS_SUPPORTS_CMT bool +config SYS_SUPPORTS_MTU2 + bool + config STACKTRACE_SUPPORT def_bool y @@ -478,6 +481,14 @@ config SH_MTU2 help This enables the use of the MTU2 as the system timer. +config SH_TIMER_MTU2 + bool "MTU2 timer driver" + depends on SYS_SUPPORTS_MTU2 && !SH_MTU2 + default y + select GENERIC_CLOCKEVENTS + help + This enables build of the MTU2 timer driver. + config SH_TIMER_IRQ int default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 1efb2879a94..9785586dc8c 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_X86_CYCLONE_TIMER) += cyclone.o obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o +obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c new file mode 100644 index 00000000000..420566f4c50 --- /dev/null +++ b/drivers/clocksource/sh_mtu2.c @@ -0,0 +1,357 @@ +/* + * SuperH Timer Support - MTU2 + * + * Copyright (C) 2009 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sh_mtu2_priv { + void __iomem *mapbase; + struct clk *clk; + struct irqaction irqaction; + struct platform_device *pdev; + unsigned long rate; + unsigned long periodic; + struct clock_event_device ced; +}; + +static DEFINE_SPINLOCK(sh_mtu2_lock); + +#define TSTR -1 /* shared register */ +#define TCR 0 /* channel register */ +#define TMDR 1 /* channel register */ +#define TIOR 2 /* channel register */ +#define TIER 3 /* channel register */ +#define TSR 4 /* channel register */ +#define TCNT 5 /* channel register */ +#define TGR 6 /* channel register */ + +static unsigned long mtu2_reg_offs[] = { + [TCR] = 0, + [TMDR] = 1, + [TIOR] = 2, + [TIER] = 4, + [TSR] = 5, + [TCNT] = 6, + [TGR] = 8, +}; + +static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) + return ioread8(base + cfg->channel_offset); + + offs = mtu2_reg_offs[reg_nr]; + + if ((reg_nr == TCNT) || (reg_nr == TGR)) + return ioread16(base + offs); + else + return ioread8(base + offs); +} + +static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, + unsigned long value) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) { + iowrite8(value, base + cfg->channel_offset); + return; + } + + offs = mtu2_reg_offs[reg_nr]; + + if ((reg_nr == TCNT) || (reg_nr == TGR)) + iowrite16(value, base + offs); + else + iowrite8(value, base + offs); +} + +static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + unsigned long flags, value; + + /* start stop register shared by multiple timer channels */ + spin_lock_irqsave(&sh_mtu2_lock, flags); + value = sh_mtu2_read(p, TSTR); + + if (start) + value |= 1 << cfg->timer_bit; + else + value &= ~(1 << cfg->timer_bit); + + sh_mtu2_write(p, TSTR, value); + spin_unlock_irqrestore(&sh_mtu2_lock, flags); +} + +static int sh_mtu2_enable(struct sh_mtu2_priv *p) +{ + struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + int ret; + + /* enable clock */ + ret = clk_enable(p->clk); + if (ret) { + pr_err("sh_mtu2: cannot enable clock \"%s\"\n", cfg->clk); + return ret; + } + + /* make sure channel is disabled */ + sh_mtu2_start_stop_ch(p, 0); + + p->rate = clk_get_rate(p->clk) / 64; + p->periodic = (p->rate + HZ/2) / HZ; + + /* "Periodic Counter Operation" */ + sh_mtu2_write(p, TCR, 0x23); /* TGRA clear, divide clock by 64 */ + sh_mtu2_write(p, TIOR, 0); + sh_mtu2_write(p, TGR, p->periodic); + sh_mtu2_write(p, TCNT, 0); + sh_mtu2_write(p, TMDR, 0); + sh_mtu2_write(p, TIER, 0x01); + + /* enable channel */ + sh_mtu2_start_stop_ch(p, 1); + + return 0; +} + +static void sh_mtu2_disable(struct sh_mtu2_priv *p) +{ + /* disable channel */ + sh_mtu2_start_stop_ch(p, 0); + + /* stop clock */ + clk_disable(p->clk); +} + +static irqreturn_t sh_mtu2_interrupt(int irq, void *dev_id) +{ + struct sh_mtu2_priv *p = dev_id; + + /* acknowledge interrupt */ + sh_mtu2_read(p, TSR); + sh_mtu2_write(p, TSR, 0xfe); + + /* notify clockevent layer */ + p->ced.event_handler(&p->ced); + return IRQ_HANDLED; +} + +static struct sh_mtu2_priv *ced_to_sh_mtu2(struct clock_event_device *ced) +{ + return container_of(ced, struct sh_mtu2_priv, ced); +} + +static void sh_mtu2_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + struct sh_mtu2_priv *p = ced_to_sh_mtu2(ced); + int disabled = 0; + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_PERIODIC: + sh_mtu2_disable(p); + disabled = 1; + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + pr_info("sh_mtu2: %s used for periodic clock events\n", + ced->name); + sh_mtu2_enable(p); + break; + case CLOCK_EVT_MODE_UNUSED: + if (!disabled) + sh_mtu2_disable(p); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + default: + break; + } +} + +static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p, + char *name, unsigned long rating) +{ + struct clock_event_device *ced = &p->ced; + int ret; + + memset(ced, 0, sizeof(*ced)); + + ced->name = name; + ced->features = CLOCK_EVT_FEAT_PERIODIC; + ced->rating = rating; + ced->cpumask = cpumask_of(0); + ced->set_mode = sh_mtu2_clock_event_mode; + + ret = setup_irq(p->irqaction.irq, &p->irqaction); + if (ret) { + pr_err("sh_mtu2: failed to request irq %d\n", + p->irqaction.irq); + return; + } + + pr_info("sh_mtu2: %s used for clock events\n", ced->name); + clockevents_register_device(ced); +} + +int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, + unsigned long clockevent_rating) +{ + if (clockevent_rating) + sh_mtu2_register_clockevent(p, name, clockevent_rating); + + return 0; +} + +static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) +{ + struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct resource *res; + int irq, ret; + ret = -ENXIO; + + memset(p, 0, sizeof(*p)); + p->pdev = pdev; + + if (!cfg) { + dev_err(&p->pdev->dev, "missing platform data\n"); + goto err0; + } + + platform_set_drvdata(pdev, p); + + res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&p->pdev->dev, "failed to get I/O memory\n"); + goto err0; + } + + irq = platform_get_irq(p->pdev, 0); + if (irq < 0) { + dev_err(&p->pdev->dev, "failed to get irq\n"); + goto err0; + } + + /* map memory, let mapbase point to our channel */ + p->mapbase = ioremap_nocache(res->start, resource_size(res)); + if (p->mapbase == NULL) { + pr_err("sh_mtu2: failed to remap I/O memory\n"); + goto err0; + } + + /* setup data for setup_irq() (too early for request_irq()) */ + p->irqaction.name = cfg->name; + p->irqaction.handler = sh_mtu2_interrupt; + p->irqaction.dev_id = p; + p->irqaction.irq = irq; + p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; + p->irqaction.mask = CPU_MASK_NONE; + + /* get hold of clock */ + p->clk = clk_get(&p->pdev->dev, cfg->clk); + if (IS_ERR(p->clk)) { + pr_err("sh_mtu2: cannot get clock \"%s\"\n", cfg->clk); + ret = PTR_ERR(p->clk); + goto err1; + } + + return sh_mtu2_register(p, cfg->name, cfg->clockevent_rating); + err1: + iounmap(p->mapbase); + err0: + return ret; +} + +static int __devinit sh_mtu2_probe(struct platform_device *pdev) +{ + struct sh_mtu2_priv *p = platform_get_drvdata(pdev); + struct sh_mtu2_config *cfg = pdev->dev.platform_data; + int ret; + + if (p) { + pr_info("sh_mtu2: %s kept as earlytimer\n", cfg->name); + return 0; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + ret = sh_mtu2_setup(p, pdev); + if (ret) { + kfree(p); + platform_set_drvdata(pdev, NULL); + } + return ret; +} + +static int __devexit sh_mtu2_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent */ +} + +static struct platform_driver sh_mtu2_device_driver = { + .probe = sh_mtu2_probe, + .remove = __devexit_p(sh_mtu2_remove), + .driver = { + .name = "sh_mtu2", + } +}; + +static int __init sh_mtu2_init(void) +{ + return platform_driver_register(&sh_mtu2_device_driver); +} + +static void __exit sh_mtu2_exit(void) +{ + platform_driver_unregister(&sh_mtu2_device_driver); +} + +early_platform_init("earlytimer", &sh_mtu2_device_driver); +module_init(sh_mtu2_init); +module_exit(sh_mtu2_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("SuperH MTU2 Timer Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h new file mode 100644 index 00000000000..a98876340ff --- /dev/null +++ b/include/linux/sh_mtu2.h @@ -0,0 +1,12 @@ +#ifndef __SH_MTU2_H__ +#define __SH_MTU2_H__ + +struct sh_mtu2_config { + char *name; + int channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; +}; + +#endif /* __SH_MTU2_H__ */ From da107c6ef919b3afd9c9b405a4f71e03b5725b04 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 07:06:26 +0000 Subject: [PATCH 0758/2061] sh: sh2a MTU2 platform data This patch adds MTU2 platform data for the following cpus: - sh7201 (3/5 channels) - sh7203/sh7263 (2/4 channels) - sh7206 (3/5 channels) - MXG (3/5 channels) Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 5 ++ arch/sh/kernel/cpu/sh2a/setup-mxg.c | 111 +++++++++++++++++++++++- arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 115 +++++++++++++++++++++++++ arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 71 +++++++++++++++ arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 104 ++++++++++++++++++++++ 5 files changed, 405 insertions(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6d0dd378eca..2061488cc17 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -214,27 +214,32 @@ config CPU_SUBTYPE_SH7201 bool "Support SH7201 processor" select CPU_SH2A select CPU_HAS_FPU + select SYS_SUPPORTS_MTU2 config CPU_SUBTYPE_SH7203 bool "Support SH7203 processor" select CPU_SH2A select CPU_HAS_FPU select SYS_SUPPORTS_CMT + select SYS_SUPPORTS_MTU2 config CPU_SUBTYPE_SH7206 bool "Support SH7206 processor" select CPU_SH2A select SYS_SUPPORTS_CMT + select SYS_SUPPORTS_MTU2 config CPU_SUBTYPE_SH7263 bool "Support SH7263 processor" select CPU_SH2A select CPU_HAS_FPU select SYS_SUPPORTS_CMT + select SYS_SUPPORTS_MTU2 config CPU_SUBTYPE_MXG bool "Support MX-G processor" select CPU_SH2A + select SYS_SUPPORTS_MTU2 help Select MX-G if running on an R8A03022BG part. diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 844293723cf..870030aa05b 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -11,6 +11,7 @@ #include #include #include +#include enum { UNUSED = 0, @@ -24,7 +25,7 @@ enum { SCIF0, SCIF1, - MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5 + MTU2_GROUP1, MTU2_GROUP2, MTU2_GROUP3, MTU2_GROUP4, MTU2_GROUP5, MTU2_TGI3B, MTU2_TGI3C, /* interrupt groups */ @@ -113,6 +114,99 @@ static struct intc_mask_reg mask_registers[] __initdata = { static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups, mask_registers, prio_registers, NULL); +static struct sh_mtu2_config mtu2_0_platform_data = { + .name = "MTU2_0", + .channel_offset = -0x80, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_0_resources[] = { + [0] = { + .name = "MTU2_0", + .start = 0xff801300, + .end = 0xff801326, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 228, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_0_device = { + .name = "sh_mtu2", + .id = 0, + .dev = { + .platform_data = &mtu2_0_platform_data, + }, + .resource = mtu2_0_resources, + .num_resources = ARRAY_SIZE(mtu2_0_resources), +}; + +static struct sh_mtu2_config mtu2_1_platform_data = { + .name = "MTU2_1", + .channel_offset = -0x100, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_1_resources[] = { + [0] = { + .name = "MTU2_1", + .start = 0xff801380, + .end = 0xff801390, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 234, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_1_device = { + .name = "sh_mtu2", + .id = 1, + .dev = { + .platform_data = &mtu2_1_platform_data, + }, + .resource = mtu2_1_resources, + .num_resources = ARRAY_SIZE(mtu2_1_resources), +}; + +static struct sh_mtu2_config mtu2_2_platform_data = { + .name = "MTU2_2", + .channel_offset = 0x80, + .timer_bit = 2, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_2_resources[] = { + [0] = { + .name = "MTU2_2", + .start = 0xff801000, + .end = 0xff80100a, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 240, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_2_device = { + .name = "sh_mtu2", + .id = 2, + .dev = { + .platform_data = &mtu2_2_platform_data, + }, + .resource = mtu2_2_resources, + .num_resources = ARRAY_SIZE(mtu2_2_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xff804000, @@ -134,6 +228,9 @@ static struct platform_device sci_device = { static struct platform_device *mxg_devices[] __initdata = { &sci_device, + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, }; static int __init mxg_devices_setup(void) @@ -147,3 +244,15 @@ void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); } + +static struct platform_device *mxg_early_devices[] __initdata = { + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(mxg_early_devices, + ARRAY_SIZE(mxg_early_devices)); +} diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index 00f42f9e3f5..074aa9062e4 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -12,6 +12,8 @@ #include #include #include +#include +#include enum { UNUSED = 0, @@ -249,9 +251,105 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; +static struct sh_mtu2_config mtu2_0_platform_data = { + .name = "MTU2_0", + .channel_offset = -0x80, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_0_resources[] = { + [0] = { + .name = "MTU2_0", + .start = 0xfffe4300, + .end = 0xfffe4326, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 108, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_0_device = { + .name = "sh_mtu2", + .id = 0, + .dev = { + .platform_data = &mtu2_0_platform_data, + }, + .resource = mtu2_0_resources, + .num_resources = ARRAY_SIZE(mtu2_0_resources), +}; + +static struct sh_mtu2_config mtu2_1_platform_data = { + .name = "MTU2_1", + .channel_offset = -0x100, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_1_resources[] = { + [0] = { + .name = "MTU2_1", + .start = 0xfffe4380, + .end = 0xfffe4390, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 116, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_1_device = { + .name = "sh_mtu2", + .id = 1, + .dev = { + .platform_data = &mtu2_1_platform_data, + }, + .resource = mtu2_1_resources, + .num_resources = ARRAY_SIZE(mtu2_1_resources), +}; + +static struct sh_mtu2_config mtu2_2_platform_data = { + .name = "MTU2_2", + .channel_offset = 0x80, + .timer_bit = 2, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_2_resources[] = { + [0] = { + .name = "MTU2_2", + .start = 0xfffe4000, + .end = 0xfffe400a, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 124, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_2_device = { + .name = "sh_mtu2", + .id = 2, + .dev = { + .platform_data = &mtu2_2_platform_data, + }, + .resource = mtu2_2_resources, + .num_resources = ARRAY_SIZE(mtu2_2_resources), +}; + static struct platform_device *sh7201_devices[] __initdata = { &sci_device, &rtc_device, + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, }; static int __init sh7201_devices_setup(void) @@ -265,3 +363,20 @@ void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); } + +static struct platform_device *sh7201_early_devices[] __initdata = { + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, +}; + +#define STBCR3 0xfffe0408 + +void __init plat_early_device_setup(void) +{ + /* enable MTU2 clock */ + __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); + + early_platform_add_devices(sh7201_early_devices, + ARRAY_SIZE(sh7201_early_devices)); +} diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 0836acee228..3448164b573 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -12,6 +12,7 @@ #include #include #include +#include #include enum { @@ -271,6 +272,68 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; +static struct sh_mtu2_config mtu2_0_platform_data = { + .name = "MTU2_0", + .channel_offset = -0x80, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_0_resources[] = { + [0] = { + .name = "MTU2_0", + .start = 0xfffe4300, + .end = 0xfffe4326, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 146, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_0_device = { + .name = "sh_mtu2", + .id = 0, + .dev = { + .platform_data = &mtu2_0_platform_data, + }, + .resource = mtu2_0_resources, + .num_resources = ARRAY_SIZE(mtu2_0_resources), +}; + +static struct sh_mtu2_config mtu2_1_platform_data = { + .name = "MTU2_1", + .channel_offset = -0x100, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_1_resources[] = { + [0] = { + .name = "MTU2_1", + .start = 0xfffe4380, + .end = 0xfffe4390, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 153, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_1_device = { + .name = "sh_mtu2", + .id = 1, + .dev = { + .platform_data = &mtu2_1_platform_data, + }, + .resource = mtu2_1_resources, + .num_resources = ARRAY_SIZE(mtu2_1_resources), +}; + static struct resource rtc_resources[] = { [0] = { .start = 0xffff2000, @@ -295,6 +358,8 @@ static struct platform_device *sh7203_devices[] __initdata = { &sci_device, &cmt0_device, &cmt1_device, + &mtu2_0_device, + &mtu2_1_device, &rtc_device, }; @@ -313,8 +378,11 @@ void __init plat_irq_setup(void) static struct platform_device *sh7203_early_devices[] __initdata = { &cmt0_device, &cmt1_device, + &mtu2_0_device, + &mtu2_1_device, }; +#define STBCR3 0xfffe0408 #define STBCR4 0xfffe040c void __init plat_early_device_setup(void) @@ -322,6 +390,9 @@ void __init plat_early_device_setup(void) /* enable CMT clock */ __raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4); + /* enable MTU2 clock */ + __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); + early_platform_add_devices(sh7203_early_devices, ARRAY_SIZE(sh7203_early_devices)); } diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index f9606e3d251..e700559b6b8 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -13,6 +13,7 @@ #include #include #include +#include #include enum { @@ -231,10 +232,106 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; +static struct sh_mtu2_config mtu2_0_platform_data = { + .name = "MTU2_0", + .channel_offset = -0x80, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_0_resources[] = { + [0] = { + .name = "MTU2_0", + .start = 0xfffe4300, + .end = 0xfffe4326, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 156, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_0_device = { + .name = "sh_mtu2", + .id = 0, + .dev = { + .platform_data = &mtu2_0_platform_data, + }, + .resource = mtu2_0_resources, + .num_resources = ARRAY_SIZE(mtu2_0_resources), +}; + +static struct sh_mtu2_config mtu2_1_platform_data = { + .name = "MTU2_1", + .channel_offset = -0x100, + .timer_bit = 1, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_1_resources[] = { + [0] = { + .name = "MTU2_1", + .start = 0xfffe4380, + .end = 0xfffe4390, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 164, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_1_device = { + .name = "sh_mtu2", + .id = 1, + .dev = { + .platform_data = &mtu2_1_platform_data, + }, + .resource = mtu2_1_resources, + .num_resources = ARRAY_SIZE(mtu2_1_resources), +}; + +static struct sh_mtu2_config mtu2_2_platform_data = { + .name = "MTU2_2", + .channel_offset = 0x80, + .timer_bit = 2, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource mtu2_2_resources[] = { + [0] = { + .name = "MTU2_2", + .start = 0xfffe4000, + .end = 0xfffe400a, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 180, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device mtu2_2_device = { + .name = "sh_mtu2", + .id = 2, + .dev = { + .platform_data = &mtu2_2_platform_data, + }, + .resource = mtu2_2_resources, + .num_resources = ARRAY_SIZE(mtu2_2_resources), +}; + static struct platform_device *sh7206_devices[] __initdata = { &sci_device, &cmt0_device, &cmt1_device, + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, }; static int __init sh7206_devices_setup(void) @@ -252,8 +349,12 @@ void __init plat_irq_setup(void) static struct platform_device *sh7206_early_devices[] __initdata = { &cmt0_device, &cmt1_device, + &mtu2_0_device, + &mtu2_1_device, + &mtu2_2_device, }; +#define STBCR3 0xfffe0408 #define STBCR4 0xfffe040c void __init plat_early_device_setup(void) @@ -261,6 +362,9 @@ void __init plat_early_device_setup(void) /* enable CMT clock */ __raw_writeb(__raw_readb(STBCR4) & ~0x04, STBCR4); + /* enable MTU2 clock */ + __raw_writeb(__raw_readb(STBCR3) & ~0x20, STBCR3); + early_platform_add_devices(sh7206_early_devices, ARRAY_SIZE(sh7206_early_devices)); } From 3280c8865e1b738604bacdea54738acef31e8c12 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 30 Apr 2009 07:12:09 +0000 Subject: [PATCH 0759/2061] sh: remove old MTU2 driver This patch removes the old MTU2 driver (CONFIG_SH_MTU2/timer-mtu2.c) As replacement, select the sh_cmt driver with CONFIG_SH_TIMER_MTU2 and configure timer channel using platform data. If multiple MTU channels are enabled using platform data, use the earlytimer parameter on the kernel command line to select channel. For instance, use "earlytimer=sh_mtu2.0" to select the first channel. To verify which timer is being used, look at printouts or the timer irq count in /proc/interrupts. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 9 +- arch/sh/include/asm/timer.h | 2 +- arch/sh/kernel/timers/Makefile | 1 - arch/sh/kernel/timers/timer-mtu2.c | 202 ----------------------------- arch/sh/kernel/timers/timer.c | 3 - 5 files changed, 2 insertions(+), 215 deletions(-) delete mode 100644 arch/sh/kernel/timers/timer-mtu2.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 2061488cc17..bda3dc1b0c2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -479,16 +479,9 @@ config SH_TIMER_CMT help This enables build of the CMT timer driver. -config SH_MTU2 - bool "MTU2 timer support" - depends on CPU_SH2A && !GENERIC_TIME - default y - help - This enables the use of the MTU2 as the system timer. - config SH_TIMER_MTU2 bool "MTU2 timer driver" - depends on SYS_SUPPORTS_MTU2 && !SH_MTU2 + depends on SYS_SUPPORTS_MTU2 default y select GENERIC_CLOCKEVENTS help diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index 9c968d19cb9..581e3fe3fe0 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -23,7 +23,7 @@ struct sys_timer { #define TICK_SIZE (tick_nsec / 1000) -extern struct sys_timer tmu_timer, mtu2_timer; +extern struct sys_timer tmu_timer; extern struct sys_timer *sys_timer; #ifdef CONFIG_ARCH_USES_GETTIMEOFFSET diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile index 1e9a104c2ae..44156df0bde 100644 --- a/arch/sh/kernel/timers/Makefile +++ b/arch/sh/kernel/timers/Makefile @@ -5,6 +5,5 @@ obj-y := timer.o obj-$(CONFIG_SH_TMU) += timer-tmu.o -obj-$(CONFIG_SH_MTU2) += timer-mtu2.o obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += timer-broadcast.o diff --git a/arch/sh/kernel/timers/timer-mtu2.c b/arch/sh/kernel/timers/timer-mtu2.c deleted file mode 100644 index 8a1dcc2c372..00000000000 --- a/arch/sh/kernel/timers/timer-mtu2.c +++ /dev/null @@ -1,202 +0,0 @@ -/* - * arch/sh/kernel/timers/timer-mtu2.c - MTU2 Timer Support - * - * Copyright (C) 2005 Paul Mundt - * - * Based off of arch/sh/kernel/timers/timer-tmu.c - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include - -/* - * We use channel 1 for our lowly system timer. Channel 2 would be the other - * likely candidate, but we leave it alone as it has higher divisors that - * would be of more use to other more interesting applications. - * - * TODO: Presently we only implement a 16-bit single-channel system timer. - * However, we can implement channel cascade if we go the overflow route and - * get away with using 2 MTU2 channels as a 32-bit timer. - */ -#define MTU2_TSTR 0xfffe4280 -#define MTU2_TCR_1 0xfffe4380 -#define MTU2_TMDR_1 0xfffe4381 -#define MTU2_TIOR_1 0xfffe4382 -#define MTU2_TIER_1 0xfffe4384 -#define MTU2_TSR_1 0xfffe4385 -#define MTU2_TCNT_1 0xfffe4386 /* 16-bit counter */ - -#if defined(CONFIG_CPU_SUBTYPE_SH7201) || \ - defined(CONFIG_CPU_SUBTYPE_SH7203) -#define MTU2_TGRA_1 0xfffe4388 -#else -#define MTU2_TGRA_1 0xfffe438a -#endif - -#define STBCR3 0xfffe0408 - -#define MTU2_TSTR_CST1 (1 << 1) /* Counter Start 1 */ - -#define MTU2_TSR_TGFA (1 << 0) /* GRA compare match */ - -#define MTU2_TIER_TGIEA (1 << 0) /* GRA compare match interrupt enable */ - -#define MTU2_TCR_INIT 0x22 - -#define MTU2_TCR_CALIB 0x00 - -static unsigned long mtu2_timer_get_offset(void) -{ - int count; - static int count_p = 0x7fff; /* for the first call after boot */ - static unsigned long jiffies_p = 0; - - /* - * cache volatile jiffies temporarily; we have IRQs turned off. - */ - unsigned long jiffies_t; - - /* timer count may underflow right here */ - count = ctrl_inw(MTU2_TCNT_1); /* read the latched count */ - - jiffies_t = jiffies; - - /* - * avoiding timer inconsistencies (they are rare, but they happen)... - * there is one kind of problem that must be avoided here: - * 1. the timer counter underflows - */ - - if (jiffies_t == jiffies_p) { - if (count > count_p) { - if (ctrl_inb(MTU2_TSR_1) & MTU2_TSR_TGFA) { - count -= LATCH; - } else { - printk("%s (): hardware timer problem?\n", - __func__); - } - } - } else - jiffies_p = jiffies_t; - - count_p = count; - - count = ((LATCH-1) - count) * TICK_SIZE; - count = (count + LATCH/2) / LATCH; - - return count; -} - -static irqreturn_t mtu2_timer_interrupt(int irq, void *dev_id) -{ - unsigned long timer_status; - - /* Clear TGFA bit */ - timer_status = ctrl_inb(MTU2_TSR_1); - timer_status &= ~MTU2_TSR_TGFA; - ctrl_outb(timer_status, MTU2_TSR_1); - - /* Do timer tick */ - handle_timer_tick(); - - return IRQ_HANDLED; -} - -static struct irqaction mtu2_irq = { - .name = "timer", - .handler = mtu2_timer_interrupt, - .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, -}; - -static unsigned int divisors[] = { 1, 4, 16, 64, 1, 1, 256 }; - -static void mtu2_clk_init(struct clk *clk) -{ - u8 idx = MTU2_TCR_INIT & 0x7; - - clk->rate = clk->parent->rate / divisors[idx]; - /* Start TCNT counting */ - ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR); - -} - -static void mtu2_clk_recalc(struct clk *clk) -{ - u8 idx = ctrl_inb(MTU2_TCR_1) & 0x7; - clk->rate = clk->parent->rate / divisors[idx]; -} - -static struct clk_ops mtu2_clk_ops = { - .init = mtu2_clk_init, - .recalc = mtu2_clk_recalc, -}; - -static struct clk mtu2_clk1 = { - .name = "mtu2_clk1", - .ops = &mtu2_clk_ops, -}; - -static int mtu2_timer_start(void) -{ - ctrl_outb(ctrl_inb(MTU2_TSTR) | MTU2_TSTR_CST1, MTU2_TSTR); - return 0; -} - -static int mtu2_timer_stop(void) -{ - ctrl_outb(ctrl_inb(MTU2_TSTR) & ~MTU2_TSTR_CST1, MTU2_TSTR); - return 0; -} - -static int mtu2_timer_init(void) -{ - unsigned long interval; - - setup_irq(CONFIG_SH_TIMER_IRQ, &mtu2_irq); - - mtu2_clk1.parent = clk_get(NULL, "module_clk"); - - ctrl_outb(ctrl_inb(STBCR3) & (~0x20), STBCR3); - - /* Normal operation */ - ctrl_outb(0, MTU2_TMDR_1); - ctrl_outb(MTU2_TCR_INIT, MTU2_TCR_1); - ctrl_outb(0x01, MTU2_TIOR_1); - - /* Enable underflow interrupt */ - ctrl_outb(ctrl_inb(MTU2_TIER_1) | MTU2_TIER_TGIEA, MTU2_TIER_1); - - interval = CONFIG_SH_PCLK_FREQ / 16 / HZ; - printk(KERN_INFO "Interval = %ld\n", interval); - - ctrl_outw(interval, MTU2_TGRA_1); - ctrl_outw(0, MTU2_TCNT_1); - - clk_register(&mtu2_clk1); - clk_enable(&mtu2_clk1); - - return 0; -} - -struct sys_timer_ops mtu2_timer_ops = { - .init = mtu2_timer_init, - .start = mtu2_timer_start, - .stop = mtu2_timer_stop, -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - .get_offset = mtu2_timer_get_offset, -#endif -}; - -struct sys_timer mtu2_timer = { - .name = "mtu2", - .ops = &mtu2_timer_ops, -}; diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c index f3bd1413d56..920891acb77 100644 --- a/arch/sh/kernel/timers/timer.c +++ b/arch/sh/kernel/timers/timer.c @@ -16,9 +16,6 @@ static struct sys_timer *sys_timers[] = { #ifdef CONFIG_SH_TMU &tmu_timer, -#endif -#ifdef CONFIG_SH_MTU2 - &mtu2_timer, #endif NULL, }; From 9570ef20423b549757aa484ad388f9a7d5bdc4d9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 06:51:00 +0000 Subject: [PATCH 0760/2061] clocksource: SuperH TMU Timer driver This patch adds a TMU driver for the SuperH architecture. The TMU driver is a platform driver with early platform support to allow using a TMU channel as clockevent or clocksource during system bootup or later. Clocksource or clockevent can be selected. Both periodic and oneshot clockevents are supported. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 12 + drivers/clocksource/Makefile | 1 + drivers/clocksource/sh_tmu.c | 461 +++++++++++++++++++++++++++++++++++ include/linux/sh_tmu.h | 13 + 4 files changed, 487 insertions(+) create mode 100644 drivers/clocksource/sh_tmu.c create mode 100644 include/linux/sh_tmu.h diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index bda3dc1b0c2..1f74737c4f2 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -116,6 +116,9 @@ config SYS_SUPPORTS_CMT config SYS_SUPPORTS_MTU2 bool +config SYS_SUPPORTS_TMU + bool + config STACKTRACE_SUPPORT def_bool y @@ -470,6 +473,15 @@ config SH_TMU help This enables the use of the TMU as the system timer. +config SH_TIMER_TMU + bool "TMU timer driver" + depends on !SH_TMU && SYS_SUPPORTS_TMU + default y + select GENERIC_TIME + select GENERIC_CLOCKEVENTS + help + This enables the build of the TMU timer driver. + config SH_TIMER_CMT bool "CMT timer driver" depends on SYS_SUPPORTS_CMT diff --git a/drivers/clocksource/Makefile b/drivers/clocksource/Makefile index 9785586dc8c..eef216f7f61 100644 --- a/drivers/clocksource/Makefile +++ b/drivers/clocksource/Makefile @@ -4,3 +4,4 @@ obj-$(CONFIG_X86_PM_TIMER) += acpi_pm.o obj-$(CONFIG_SCx200HR_TIMER) += scx200_hrt.o obj-$(CONFIG_SH_TIMER_CMT) += sh_cmt.o obj-$(CONFIG_SH_TIMER_MTU2) += sh_mtu2.o +obj-$(CONFIG_SH_TIMER_TMU) += sh_tmu.o diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c new file mode 100644 index 00000000000..21bd77aa6a3 --- /dev/null +++ b/drivers/clocksource/sh_tmu.c @@ -0,0 +1,461 @@ +/* + * SuperH Timer Support - TMU + * + * Copyright (C) 2009 Magnus Damm + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +struct sh_tmu_priv { + void __iomem *mapbase; + struct clk *clk; + struct irqaction irqaction; + struct platform_device *pdev; + unsigned long rate; + unsigned long periodic; + struct clock_event_device ced; + struct clocksource cs; +}; + +static DEFINE_SPINLOCK(sh_tmu_lock); + +#define TSTR -1 /* shared register */ +#define TCOR 0 /* channel register */ +#define TCNT 1 /* channel register */ +#define TCR 2 /* channel register */ + +static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) + return ioread8(base - cfg->channel_offset); + + offs = reg_nr << 2; + + if (reg_nr == TCR) + return ioread16(base + offs); + else + return ioread32(base + offs); +} + +static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, + unsigned long value) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + void __iomem *base = p->mapbase; + unsigned long offs; + + if (reg_nr == TSTR) { + iowrite8(value, base - cfg->channel_offset); + return; + } + + offs = reg_nr << 2; + + if (reg_nr == TCR) + iowrite16(value, base + offs); + else + iowrite32(value, base + offs); +} + +static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + unsigned long flags, value; + + /* start stop register shared by multiple timer channels */ + spin_lock_irqsave(&sh_tmu_lock, flags); + value = sh_tmu_read(p, TSTR); + + if (start) + value |= 1 << cfg->timer_bit; + else + value &= ~(1 << cfg->timer_bit); + + sh_tmu_write(p, TSTR, value); + spin_unlock_irqrestore(&sh_tmu_lock, flags); +} + +static int sh_tmu_enable(struct sh_tmu_priv *p) +{ + struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + int ret; + + /* enable clock */ + ret = clk_enable(p->clk); + if (ret) { + pr_err("sh_tmu: cannot enable clock \"%s\"\n", cfg->clk); + return ret; + } + + /* make sure channel is disabled */ + sh_tmu_start_stop_ch(p, 0); + + /* maximum timeout */ + sh_tmu_write(p, TCOR, 0xffffffff); + sh_tmu_write(p, TCNT, 0xffffffff); + + /* configure channel to parent clock / 4, irq off */ + p->rate = clk_get_rate(p->clk) / 4; + sh_tmu_write(p, TCR, 0x0000); + + /* enable channel */ + sh_tmu_start_stop_ch(p, 1); + + return 0; +} + +static void sh_tmu_disable(struct sh_tmu_priv *p) +{ + /* disable channel */ + sh_tmu_start_stop_ch(p, 0); + + /* stop clock */ + clk_disable(p->clk); +} + +static void sh_tmu_set_next(struct sh_tmu_priv *p, unsigned long delta, + int periodic) +{ + /* stop timer */ + sh_tmu_start_stop_ch(p, 0); + + /* acknowledge interrupt */ + sh_tmu_read(p, TCR); + + /* enable interrupt */ + sh_tmu_write(p, TCR, 0x0020); + + /* reload delta value in case of periodic timer */ + if (periodic) + sh_tmu_write(p, TCOR, delta); + else + sh_tmu_write(p, TCOR, 0); + + sh_tmu_write(p, TCNT, delta); + + /* start timer */ + sh_tmu_start_stop_ch(p, 1); +} + +static irqreturn_t sh_tmu_interrupt(int irq, void *dev_id) +{ + struct sh_tmu_priv *p = dev_id; + + /* disable or acknowledge interrupt */ + if (p->ced.mode == CLOCK_EVT_MODE_ONESHOT) + sh_tmu_write(p, TCR, 0x0000); + else + sh_tmu_write(p, TCR, 0x0020); + + /* notify clockevent layer */ + p->ced.event_handler(&p->ced); + return IRQ_HANDLED; +} + +static struct sh_tmu_priv *cs_to_sh_tmu(struct clocksource *cs) +{ + return container_of(cs, struct sh_tmu_priv, cs); +} + +static cycle_t sh_tmu_clocksource_read(struct clocksource *cs) +{ + struct sh_tmu_priv *p = cs_to_sh_tmu(cs); + + return sh_tmu_read(p, TCNT) ^ 0xffffffff; +} + +static int sh_tmu_clocksource_enable(struct clocksource *cs) +{ + struct sh_tmu_priv *p = cs_to_sh_tmu(cs); + int ret; + + ret = sh_tmu_enable(p); + if (ret) + return ret; + + /* TODO: calculate good shift from rate and counter bit width */ + cs->shift = 10; + cs->mult = clocksource_hz2mult(p->rate, cs->shift); + return 0; +} + +static void sh_tmu_clocksource_disable(struct clocksource *cs) +{ + sh_tmu_disable(cs_to_sh_tmu(cs)); +} + +static int sh_tmu_register_clocksource(struct sh_tmu_priv *p, + char *name, unsigned long rating) +{ + struct clocksource *cs = &p->cs; + + cs->name = name; + cs->rating = rating; + cs->read = sh_tmu_clocksource_read; + cs->enable = sh_tmu_clocksource_enable; + cs->disable = sh_tmu_clocksource_disable; + cs->mask = CLOCKSOURCE_MASK(32); + cs->flags = CLOCK_SOURCE_IS_CONTINUOUS; + pr_info("sh_tmu: %s used as clock source\n", cs->name); + clocksource_register(cs); + return 0; +} + +static struct sh_tmu_priv *ced_to_sh_tmu(struct clock_event_device *ced) +{ + return container_of(ced, struct sh_tmu_priv, ced); +} + +static void sh_tmu_clock_event_start(struct sh_tmu_priv *p, int periodic) +{ + struct clock_event_device *ced = &p->ced; + + sh_tmu_enable(p); + + /* TODO: calculate good shift from rate and counter bit width */ + + ced->shift = 32; + ced->mult = div_sc(p->rate, NSEC_PER_SEC, ced->shift); + ced->max_delta_ns = clockevent_delta2ns(0xffffffff, ced); + ced->min_delta_ns = 5000; + + if (periodic) { + p->periodic = (p->rate + HZ/2) / HZ; + sh_tmu_set_next(p, p->periodic, 1); + } +} + +static void sh_tmu_clock_event_mode(enum clock_event_mode mode, + struct clock_event_device *ced) +{ + struct sh_tmu_priv *p = ced_to_sh_tmu(ced); + int disabled = 0; + + /* deal with old setting first */ + switch (ced->mode) { + case CLOCK_EVT_MODE_PERIODIC: + case CLOCK_EVT_MODE_ONESHOT: + sh_tmu_disable(p); + disabled = 1; + break; + default: + break; + } + + switch (mode) { + case CLOCK_EVT_MODE_PERIODIC: + pr_info("sh_tmu: %s used for periodic clock events\n", + ced->name); + sh_tmu_clock_event_start(p, 1); + break; + case CLOCK_EVT_MODE_ONESHOT: + pr_info("sh_tmu: %s used for oneshot clock events\n", + ced->name); + sh_tmu_clock_event_start(p, 0); + break; + case CLOCK_EVT_MODE_UNUSED: + if (!disabled) + sh_tmu_disable(p); + break; + case CLOCK_EVT_MODE_SHUTDOWN: + default: + break; + } +} + +static int sh_tmu_clock_event_next(unsigned long delta, + struct clock_event_device *ced) +{ + struct sh_tmu_priv *p = ced_to_sh_tmu(ced); + + BUG_ON(ced->mode != CLOCK_EVT_MODE_ONESHOT); + + /* program new delta value */ + sh_tmu_set_next(p, delta, 0); + return 0; +} + +static void sh_tmu_register_clockevent(struct sh_tmu_priv *p, + char *name, unsigned long rating) +{ + struct clock_event_device *ced = &p->ced; + int ret; + + memset(ced, 0, sizeof(*ced)); + + ced->name = name; + ced->features = CLOCK_EVT_FEAT_PERIODIC; + ced->features |= CLOCK_EVT_FEAT_ONESHOT; + ced->rating = rating; + ced->cpumask = cpumask_of(0); + ced->set_next_event = sh_tmu_clock_event_next; + ced->set_mode = sh_tmu_clock_event_mode; + + ret = setup_irq(p->irqaction.irq, &p->irqaction); + if (ret) { + pr_err("sh_tmu: failed to request irq %d\n", + p->irqaction.irq); + return; + } + + pr_info("sh_tmu: %s used for clock events\n", ced->name); + clockevents_register_device(ced); +} + +static int sh_tmu_register(struct sh_tmu_priv *p, char *name, + unsigned long clockevent_rating, + unsigned long clocksource_rating) +{ + if (clockevent_rating) + sh_tmu_register_clockevent(p, name, clockevent_rating); + else if (clocksource_rating) + sh_tmu_register_clocksource(p, name, clocksource_rating); + + return 0; +} + +static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) +{ + struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct resource *res; + int irq, ret; + ret = -ENXIO; + + memset(p, 0, sizeof(*p)); + p->pdev = pdev; + + if (!cfg) { + dev_err(&p->pdev->dev, "missing platform data\n"); + goto err0; + } + + platform_set_drvdata(pdev, p); + + res = platform_get_resource(p->pdev, IORESOURCE_MEM, 0); + if (!res) { + dev_err(&p->pdev->dev, "failed to get I/O memory\n"); + goto err0; + } + + irq = platform_get_irq(p->pdev, 0); + if (irq < 0) { + dev_err(&p->pdev->dev, "failed to get irq\n"); + goto err0; + } + + /* map memory, let mapbase point to our channel */ + p->mapbase = ioremap_nocache(res->start, resource_size(res)); + if (p->mapbase == NULL) { + pr_err("sh_tmu: failed to remap I/O memory\n"); + goto err0; + } + + /* setup data for setup_irq() (too early for request_irq()) */ + p->irqaction.name = cfg->name; + p->irqaction.handler = sh_tmu_interrupt; + p->irqaction.dev_id = p; + p->irqaction.irq = irq; + p->irqaction.flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL; + p->irqaction.mask = CPU_MASK_NONE; + + /* get hold of clock */ + p->clk = clk_get(&p->pdev->dev, cfg->clk); + if (IS_ERR(p->clk)) { + pr_err("sh_tmu: cannot get clock \"%s\"\n", cfg->clk); + ret = PTR_ERR(p->clk); + goto err1; + } + + return sh_tmu_register(p, cfg->name, + cfg->clockevent_rating, + cfg->clocksource_rating); + err1: + iounmap(p->mapbase); + err0: + return ret; +} + +static int __devinit sh_tmu_probe(struct platform_device *pdev) +{ + struct sh_tmu_priv *p = platform_get_drvdata(pdev); + struct sh_tmu_config *cfg = pdev->dev.platform_data; + int ret; + + if (p) { + pr_info("sh_tmu: %s kept as earlytimer\n", cfg->name); + return 0; + } + + p = kmalloc(sizeof(*p), GFP_KERNEL); + if (p == NULL) { + dev_err(&pdev->dev, "failed to allocate driver data\n"); + return -ENOMEM; + } + + ret = sh_tmu_setup(p, pdev); + if (ret) { + kfree(p); + platform_set_drvdata(pdev, NULL); + } + return ret; +} + +static int __devexit sh_tmu_remove(struct platform_device *pdev) +{ + return -EBUSY; /* cannot unregister clockevent and clocksource */ +} + +static struct platform_driver sh_tmu_device_driver = { + .probe = sh_tmu_probe, + .remove = __devexit_p(sh_tmu_remove), + .driver = { + .name = "sh_tmu", + } +}; + +static int __init sh_tmu_init(void) +{ + return platform_driver_register(&sh_tmu_device_driver); +} + +static void __exit sh_tmu_exit(void) +{ + platform_driver_unregister(&sh_tmu_device_driver); +} + +early_platform_init("earlytimer", &sh_tmu_device_driver); +module_init(sh_tmu_init); +module_exit(sh_tmu_exit); + +MODULE_AUTHOR("Magnus Damm"); +MODULE_DESCRIPTION("SuperH TMU Timer Driver"); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h new file mode 100644 index 00000000000..b1195e8a37d --- /dev/null +++ b/include/linux/sh_tmu.h @@ -0,0 +1,13 @@ +#ifndef __SH_TMU_H__ +#define __SH_TMU_H__ + +struct sh_tmu_config { + char *name; + unsigned long channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; + unsigned long clocksource_rating; +}; + +#endif /* __SH_TMU_H__ */ From d43a41bf8b504a1d9f0b4ce7e17d803f4ef39d84 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 1 May 2009 06:58:52 +0000 Subject: [PATCH 0761/2061] sh: TMU platform data for sh7722 This patch adds TMU platform data for sh7722. Only clockevent mode is enabled for now, clocksource requires this patch: "clocksource: setup mult_orig in clocksource_enable()" Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 1 + arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 99 ++++++++++++++++++++++++++ 2 files changed, 100 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 1f74737c4f2..c3e9455498a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -423,6 +423,7 @@ config CPU_SUBTYPE_SH7722 select ARCH_SPARSEMEM_ENABLE select SYS_SUPPORTS_NUMA select SYS_SUPPORTS_CMT + select SYS_SUPPORTS_TMU config CPU_SUBTYPE_SH7366 bool "Support SH7366 processor" diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 793c50da6a8..512735c5cc8 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -209,6 +210,98 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; +static struct sh_tmu_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu0", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_tmu_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu0", + .clocksource_rating = 0, /* disabled for now */ +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_tmu_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu0", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xffe00000, @@ -243,6 +336,9 @@ static struct platform_device sci_device = { static struct platform_device *sh7722_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, &rtc_device, &usbf_device, &iic_device, @@ -271,6 +367,9 @@ __initcall(sh7722_devices_setup); static struct platform_device *sh7722_early_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, }; void __init plat_early_device_setup(void) From 46a12f7426d71cabc08972cf8d3ffdd441d26a3a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 17:57:17 +0900 Subject: [PATCH 0762/2061] sh: Consolidate MTU2/CMT/TMU timer platform data. All of the SH timers use a roughly identical structure for platform data, which presently is broken out for each block. Consolidate all of these definitions, as there is no reason for them to be broken out in the first place. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh2/setup-sh7619.c | 6 +++--- arch/sh/kernel/cpu/sh2a/setup-mxg.c | 8 ++++---- arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 8 ++++---- arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 11 +++++------ arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 13 ++++++------- arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 11 +++++------ arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 4 ++-- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 4 ++-- drivers/clocksource/sh_cmt.c | 14 +++++++------- drivers/clocksource/sh_mtu2.c | 14 +++++++------- drivers/clocksource/sh_tmu.c | 14 +++++++------- include/linux/sh_cmt.h | 13 ------------- include/linux/sh_mtu2.h | 12 ------------ include/linux/sh_timer.h | 13 +++++++++++++ include/linux/sh_tmu.h | 13 ------------- 17 files changed, 69 insertions(+), 97 deletions(-) delete mode 100644 include/linux/sh_cmt.h delete mode 100644 include/linux/sh_mtu2.h create mode 100644 include/linux/sh_timer.h delete mode 100644 include/linux/sh_tmu.h diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index d70c263eb07..94ac27fc223 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include enum { @@ -111,7 +111,7 @@ static struct platform_device eth_device = { .resource = eth_resources, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -143,7 +143,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index 870030aa05b..a452d964906 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -11,7 +11,7 @@ #include #include #include -#include +#include enum { UNUSED = 0, @@ -114,7 +114,7 @@ static struct intc_mask_reg mask_registers[] __initdata = { static DECLARE_INTC_DESC(intc_desc, "mxg", vectors, groups, mask_registers, prio_registers, NULL); -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -145,7 +145,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -176,7 +176,7 @@ static struct platform_device mtu2_1_device = { .num_resources = ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index 074aa9062e4..772358b7685 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include enum { @@ -251,7 +251,7 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -282,7 +282,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -313,7 +313,7 @@ static struct platform_device mtu2_1_device = { .num_resources = ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index 3448164b573..d7493418ba6 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -11,8 +11,7 @@ #include #include #include -#include -#include +#include #include enum { @@ -208,7 +207,7 @@ static struct platform_device sci_device = { }, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -240,7 +239,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, @@ -272,7 +271,7 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -303,7 +302,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index e700559b6b8..2fc6bff5c5f 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -12,8 +12,7 @@ #include #include #include -#include -#include +#include #include enum { @@ -168,7 +167,7 @@ static struct platform_device sci_device = { }, }; -static struct sh_cmt_config cmt0_platform_data = { +static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, @@ -200,7 +199,7 @@ static struct platform_device cmt0_device = { .num_resources = ARRAY_SIZE(cmt0_resources), }; -static struct sh_cmt_config cmt1_platform_data = { +static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, @@ -232,7 +231,7 @@ static struct platform_device cmt1_device = { .num_resources = ARRAY_SIZE(cmt1_resources), }; -static struct sh_mtu2_config mtu2_0_platform_data = { +static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, @@ -263,7 +262,7 @@ static struct platform_device mtu2_0_device = { .num_resources = ARRAY_SIZE(mtu2_0_resources), }; -static struct sh_mtu2_config mtu2_1_platform_data = { +static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, @@ -294,7 +293,7 @@ static struct platform_device mtu2_1_device = { .num_resources = ARRAY_SIZE(mtu2_1_resources), }; -static struct sh_mtu2_config mtu2_2_platform_data = { +static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index cb5b4db1ca2..f327b7eaf47 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -12,7 +12,7 @@ #include #include #include -#include +#include #include static struct resource iic0_resources[] = { @@ -141,7 +141,7 @@ static struct platform_device jpu_device = { .num_resources = ARRAY_SIZE(jpu_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 2a771f48e9e..7361ea989b9 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include static struct resource iic_resources[] = { @@ -148,7 +148,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 512735c5cc8..624e8e5a605 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include #include @@ -178,7 +177,7 @@ static struct platform_device jpu_device = { .num_resources = ARRAY_SIZE(jpu_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, @@ -210,7 +209,7 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; -static struct sh_tmu_config tmu0_platform_data = { +static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, @@ -241,7 +240,7 @@ static struct platform_device tmu0_device = { .num_resources = ARRAY_SIZE(tmu0_resources), }; -static struct sh_tmu_config tmu1_platform_data = { +static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, @@ -272,7 +271,7 @@ static struct platform_device tmu1_device = { .num_resources = ARRAY_SIZE(tmu1_resources), }; -static struct sh_tmu_config tmu2_platform_data = { +static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index dbb44949ed1..fb9b5427a06 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -13,7 +13,7 @@ #include #include #include -#include +#include #include #include @@ -101,7 +101,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index 8429396acb5..e074951b88b 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -18,7 +18,7 @@ #include #include #include -#include +#include #include #include #include @@ -230,7 +230,7 @@ static struct platform_device veu1_device = { .num_resources = ARRAY_SIZE(veu1_resources), }; -static struct sh_cmt_config cmt_platform_data = { +static struct sh_timer_config cmt_platform_data = { .name = "CMT", .channel_offset = 0x60, .timer_bit = 5, diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index 4ff1508e5ab..aeb8c9b27b5 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include struct sh_cmt_priv { void __iomem *mapbase; @@ -59,7 +59,7 @@ static DEFINE_SPINLOCK(sh_cmt_lock); static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -83,7 +83,7 @@ static inline unsigned long sh_cmt_read(struct sh_cmt_priv *p, int reg_nr) static inline void sh_cmt_write(struct sh_cmt_priv *p, int reg_nr, unsigned long value) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -131,7 +131,7 @@ static unsigned long sh_cmt_get_counter(struct sh_cmt_priv *p, static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -149,7 +149,7 @@ static void sh_cmt_start_stop_ch(struct sh_cmt_priv *p, int start) static int sh_cmt_enable(struct sh_cmt_priv *p, unsigned long *rate) { - struct sh_cmt_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -560,7 +560,7 @@ int sh_cmt_register(struct sh_cmt_priv *p, char *name, static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) { - struct sh_cmt_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -638,7 +638,7 @@ static int sh_cmt_setup(struct sh_cmt_priv *p, struct platform_device *pdev) static int __devinit sh_cmt_probe(struct platform_device *pdev) { struct sh_cmt_priv *p = platform_get_drvdata(pdev); - struct sh_cmt_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index 420566f4c50..ef02185a827 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include struct sh_mtu2_priv { void __iomem *mapbase; @@ -63,7 +63,7 @@ static unsigned long mtu2_reg_offs[] = { static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -81,7 +81,7 @@ static inline unsigned long sh_mtu2_read(struct sh_mtu2_priv *p, int reg_nr) static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, unsigned long value) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -100,7 +100,7 @@ static inline void sh_mtu2_write(struct sh_mtu2_priv *p, int reg_nr, static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -118,7 +118,7 @@ static void sh_mtu2_start_stop_ch(struct sh_mtu2_priv *p, int start) static int sh_mtu2_enable(struct sh_mtu2_priv *p) { - struct sh_mtu2_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -243,7 +243,7 @@ int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) { - struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -303,7 +303,7 @@ static int sh_mtu2_setup(struct sh_mtu2_priv *p, struct platform_device *pdev) static int __devinit sh_mtu2_probe(struct platform_device *pdev) { struct sh_mtu2_priv *p = platform_get_drvdata(pdev); - struct sh_mtu2_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/drivers/clocksource/sh_tmu.c b/drivers/clocksource/sh_tmu.c index 21bd77aa6a3..d6ea4398bf6 100644 --- a/drivers/clocksource/sh_tmu.c +++ b/drivers/clocksource/sh_tmu.c @@ -29,7 +29,7 @@ #include #include #include -#include +#include struct sh_tmu_priv { void __iomem *mapbase; @@ -51,7 +51,7 @@ static DEFINE_SPINLOCK(sh_tmu_lock); static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -69,7 +69,7 @@ static inline unsigned long sh_tmu_read(struct sh_tmu_priv *p, int reg_nr) static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, unsigned long value) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; void __iomem *base = p->mapbase; unsigned long offs; @@ -88,7 +88,7 @@ static inline void sh_tmu_write(struct sh_tmu_priv *p, int reg_nr, static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; unsigned long flags, value; /* start stop register shared by multiple timer channels */ @@ -106,7 +106,7 @@ static void sh_tmu_start_stop_ch(struct sh_tmu_priv *p, int start) static int sh_tmu_enable(struct sh_tmu_priv *p) { - struct sh_tmu_config *cfg = p->pdev->dev.platform_data; + struct sh_timer_config *cfg = p->pdev->dev.platform_data; int ret; /* enable clock */ @@ -345,7 +345,7 @@ static int sh_tmu_register(struct sh_tmu_priv *p, char *name, static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) { - struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; struct resource *res; int irq, ret; ret = -ENXIO; @@ -407,7 +407,7 @@ static int sh_tmu_setup(struct sh_tmu_priv *p, struct platform_device *pdev) static int __devinit sh_tmu_probe(struct platform_device *pdev) { struct sh_tmu_priv *p = platform_get_drvdata(pdev); - struct sh_tmu_config *cfg = pdev->dev.platform_data; + struct sh_timer_config *cfg = pdev->dev.platform_data; int ret; if (p) { diff --git a/include/linux/sh_cmt.h b/include/linux/sh_cmt.h deleted file mode 100644 index 68cacde5954..00000000000 --- a/include/linux/sh_cmt.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SH_CMT_H__ -#define __SH_CMT_H__ - -struct sh_cmt_config { - char *name; - unsigned long channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; - unsigned long clocksource_rating; -}; - -#endif /* __SH_CMT_H__ */ diff --git a/include/linux/sh_mtu2.h b/include/linux/sh_mtu2.h deleted file mode 100644 index a98876340ff..00000000000 --- a/include/linux/sh_mtu2.h +++ /dev/null @@ -1,12 +0,0 @@ -#ifndef __SH_MTU2_H__ -#define __SH_MTU2_H__ - -struct sh_mtu2_config { - char *name; - int channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; -}; - -#endif /* __SH_MTU2_H__ */ diff --git a/include/linux/sh_timer.h b/include/linux/sh_timer.h new file mode 100644 index 00000000000..864bd56bd3b --- /dev/null +++ b/include/linux/sh_timer.h @@ -0,0 +1,13 @@ +#ifndef __SH_TIMER_H__ +#define __SH_TIMER_H__ + +struct sh_timer_config { + char *name; + long channel_offset; + int timer_bit; + char *clk; + unsigned long clockevent_rating; + unsigned long clocksource_rating; +}; + +#endif /* __SH_TIMER_H__ */ diff --git a/include/linux/sh_tmu.h b/include/linux/sh_tmu.h deleted file mode 100644 index b1195e8a37d..00000000000 --- a/include/linux/sh_tmu.h +++ /dev/null @@ -1,13 +0,0 @@ -#ifndef __SH_TMU_H__ -#define __SH_TMU_H__ - -struct sh_tmu_config { - char *name; - unsigned long channel_offset; - int timer_bit; - char *clk; - unsigned long clockevent_rating; - unsigned long clocksource_rating; -}; - -#endif /* __SH_TMU_H__ */ From d1fcc0a8db5e47c1abaa783a3e83dbf5f2184969 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 18:05:42 +0900 Subject: [PATCH 0763/2061] clocksource: sh_mtu2/cmt_register() should be static. Neither of these need to be exported, so just make them static. Signed-off-by: Paul Mundt --- drivers/clocksource/sh_cmt.c | 6 +++--- drivers/clocksource/sh_mtu2.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/clocksource/sh_cmt.c b/drivers/clocksource/sh_cmt.c index aeb8c9b27b5..cf56a2af5fe 100644 --- a/drivers/clocksource/sh_cmt.c +++ b/drivers/clocksource/sh_cmt.c @@ -537,9 +537,9 @@ static void sh_cmt_register_clockevent(struct sh_cmt_priv *p, clockevents_register_device(ced); } -int sh_cmt_register(struct sh_cmt_priv *p, char *name, - unsigned long clockevent_rating, - unsigned long clocksource_rating) +static int sh_cmt_register(struct sh_cmt_priv *p, char *name, + unsigned long clockevent_rating, + unsigned long clocksource_rating) { if (p->width == (sizeof(p->max_match_value) * 8)) p->max_match_value = ~0; diff --git a/drivers/clocksource/sh_mtu2.c b/drivers/clocksource/sh_mtu2.c index ef02185a827..d1ae75454d1 100644 --- a/drivers/clocksource/sh_mtu2.c +++ b/drivers/clocksource/sh_mtu2.c @@ -232,8 +232,8 @@ static void sh_mtu2_register_clockevent(struct sh_mtu2_priv *p, clockevents_register_device(ced); } -int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, - unsigned long clockevent_rating) +static int sh_mtu2_register(struct sh_mtu2_priv *p, char *name, + unsigned long clockevent_rating) { if (clockevent_rating) sh_mtu2_register_clockevent(p, name, clockevent_rating); From 938edae11ee3a7b20b6d754074a0f2c2edc4534b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 18:12:26 +0900 Subject: [PATCH 0764/2061] sh: select both GENERIC_TIME and GENERIC_CLOCKEVENTS. Now that the rest of the timers that didn't support clockevents have been rewritten, both of these can be enabled by default. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index c3e9455498a..565f39042ad 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -78,7 +78,7 @@ config GENERIC_TIME def_bool y config GENERIC_CLOCKEVENTS - def_bool n + def_bool y config GENERIC_CLOCKEVENTS_BROADCAST bool @@ -469,8 +469,6 @@ config SH_TMU bool "TMU timer support" depends on CPU_SH3 || CPU_SH4 default y - select GENERIC_TIME - select GENERIC_CLOCKEVENTS help This enables the use of the TMU as the system timer. @@ -478,8 +476,6 @@ config SH_TIMER_TMU bool "TMU timer driver" depends on !SH_TMU && SYS_SUPPORTS_TMU default y - select GENERIC_TIME - select GENERIC_CLOCKEVENTS help This enables the build of the TMU timer driver. @@ -487,8 +483,6 @@ config SH_TIMER_CMT bool "CMT timer driver" depends on SYS_SUPPORTS_CMT default y - select GENERIC_CLOCKEVENTS - select GENERIC_TIME help This enables build of the CMT timer driver. @@ -496,7 +490,6 @@ config SH_TIMER_MTU2 bool "MTU2 timer driver" depends on SYS_SUPPORTS_MTU2 default y - select GENERIC_CLOCKEVENTS help This enables build of the MTU2 timer driver. From dec56e6312434b2536fedf9d7e9e73d666aaf0a8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 18:18:14 +0900 Subject: [PATCH 0765/2061] sh: Kill off the now unused ARCH_USES_GETTIMEOFFSET code. Now that the stragglers (MTU2/CMT/etc.) have been rewritten and we are selecting both GENERIC_TIME and GENERIC_CLOCKEVENTS, the get_offset() timer op is completely unused. As a result, we are now able to kill off the ARCH_USES_GETTIMEOFFSET references. Signed-off-by: Paul Mundt --- arch/sh/include/asm/timer.h | 10 ---------- arch/sh/kernel/time_32.c | 9 --------- 2 files changed, 19 deletions(-) diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index 581e3fe3fe0..cb16645d839 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -9,9 +9,6 @@ struct sys_timer_ops { int (*init)(void); int (*start)(void); int (*stop)(void); -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET - unsigned long (*get_offset)(void); -#endif }; struct sys_timer { @@ -26,13 +23,6 @@ struct sys_timer { extern struct sys_timer tmu_timer; extern struct sys_timer *sys_timer; -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -static inline unsigned long get_timer_offset(void) -{ - return sys_timer->ops->get_offset(); -} -#endif - /* arch/sh/kernel/timers/timer.c */ struct sys_timer *get_sys_timer(void); diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time_32.c index 9d34dff1499..d41ca4cf20c 100644 --- a/arch/sh/kernel/time_32.c +++ b/arch/sh/kernel/time_32.c @@ -83,13 +83,6 @@ static int __init rtc_generic_init(void) } module_init(rtc_generic_init); -#ifdef CONFIG_ARCH_USES_GETTIMEOFFSET -u32 arch_gettimeoffset(void) -{ - return get_timer_offset() * 1000; -} -#endif /* CONFIG_ARCH_USES_GETTIMEOFFSET */ - /* last time the RTC clock got updated */ static long last_rtc_update; @@ -185,7 +178,6 @@ struct clocksource clocksource_sh = { .name = "SuperH", }; -#ifndef CONFIG_ARCH_USES_GETTIMEOFFSET unsigned long long sched_clock(void) { unsigned long long cycles; @@ -197,7 +189,6 @@ unsigned long long sched_clock(void) cycles = clocksource_sh.read(&clocksource_sh); return cyc2ns(&clocksource_sh, cycles); } -#endif static void __init sh_late_time_init(void) { From 25483efeb2e56521e418a59fa93401be156dc3bb Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 3 May 2009 18:29:27 +0900 Subject: [PATCH 0766/2061] sh: Move dummy clockevents broadcast timer to its new home. The old arch/sh/kernel/timers/ directly will be going away completely once the rest of the TMU users are migrated, so move the dummy broadcast driver up a level in preparation. Signed-off-by: Paul Mundt --- arch/sh/kernel/Makefile_32 | 2 ++ arch/sh/kernel/Makefile_64 | 2 ++ arch/sh/kernel/{timers/timer-broadcast.c => localtimer.c} | 0 arch/sh/kernel/timers/Makefile | 2 -- 4 files changed, 4 insertions(+), 2 deletions(-) rename arch/sh/kernel/{timers/timer-broadcast.c => localtimer.c} (100%) diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32 index 82a3a150c00..5b1d4d0fe04 100644 --- a/arch/sh/kernel/Makefile_32 +++ b/arch/sh/kernel/Makefile_32 @@ -32,4 +32,6 @@ obj-$(CONFIG_DYNAMIC_FTRACE) += ftrace.o obj-$(CONFIG_DUMP_CODE) += disassemble.o obj-$(CONFIG_HIBERNATION) += swsusp.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += localtimer.o + EXTRA_CFLAGS += -Werror diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index fe425d7f687..d256c774695 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -17,4 +17,6 @@ obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_IO_TRAPPED) += io_trapped.o obj-$(CONFIG_GENERIC_GPIO) += gpio.o +obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += localtimer.o + EXTRA_CFLAGS += -Werror diff --git a/arch/sh/kernel/timers/timer-broadcast.c b/arch/sh/kernel/localtimer.c similarity index 100% rename from arch/sh/kernel/timers/timer-broadcast.c rename to arch/sh/kernel/localtimer.c diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile index 44156df0bde..8bceba59e76 100644 --- a/arch/sh/kernel/timers/Makefile +++ b/arch/sh/kernel/timers/Makefile @@ -5,5 +5,3 @@ obj-y := timer.o obj-$(CONFIG_SH_TMU) += timer-tmu.o - -obj-$(CONFIG_GENERIC_CLOCKEVENTS_BROADCAST) += timer-broadcast.o From b82914ce33146186d554b0f5c41e4e13693614ce Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 18:54:32 +0200 Subject: [PATCH 0767/2061] perf_counter: round-robin per-CPU counters too This used to be unstable when we had the rq->lock dependencies, but now that they are that of the past we can turn on percpu counter RR too. [ Impact: handle counter over-commit for per-CPU counters too ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 8660ae57953..b9679c36bcc 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1069,18 +1069,14 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = &curr->perf_counter_ctx; - const int rotate_percpu = 0; - if (rotate_percpu) - perf_counter_cpu_sched_out(cpuctx); + perf_counter_cpu_sched_out(cpuctx); perf_counter_task_sched_out(curr, cpu); - if (rotate_percpu) - rotate_ctx(&cpuctx->ctx); + rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); - if (rotate_percpu) - perf_counter_cpu_sched_in(cpuctx, cpu); + perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); } From ba77813a2a22d631fe5bc0bf1ec0d11350544b70 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 4 May 2009 18:47:44 +0200 Subject: [PATCH 0768/2061] perf_counter: x86: fixup nmi_watchdog vs perf_counter boo-boo Invert the atomic_inc_not_zero() test so that we will indeed detect the first activation. Also rename the global num_counters, since its easy to confuse with x86_pmu.num_counters. [ Impact: fix non-working perfcounters on AMD CPUs, cleanup ] Signed-off-by: Peter Zijlstra LKML-Reference: <1241455664.7620.4938.camel@twins> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index d4c0cc9d326..196b58f0444 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -171,7 +171,7 @@ again: return new_raw_count; } -static atomic_t num_counters; +static atomic_t active_counters; static DEFINE_MUTEX(pmc_reserve_mutex); static bool reserve_pmc_hardware(void) @@ -224,7 +224,7 @@ static void release_pmc_hardware(void) static void hw_perf_counter_destroy(struct perf_counter *counter) { - if (atomic_dec_and_mutex_lock(&num_counters, &pmc_reserve_mutex)) { + if (atomic_dec_and_mutex_lock(&active_counters, &pmc_reserve_mutex)) { release_pmc_hardware(); mutex_unlock(&pmc_reserve_mutex); } @@ -248,12 +248,12 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -ENODEV; err = 0; - if (atomic_inc_not_zero(&num_counters)) { + if (!atomic_inc_not_zero(&active_counters)) { mutex_lock(&pmc_reserve_mutex); - if (atomic_read(&num_counters) == 0 && !reserve_pmc_hardware()) + if (atomic_read(&active_counters) == 0 && !reserve_pmc_hardware()) err = -EBUSY; else - atomic_inc(&num_counters); + atomic_inc(&active_counters); mutex_unlock(&pmc_reserve_mutex); } if (err) @@ -280,7 +280,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) if (capable(CAP_SYS_ADMIN) && hw_event->nmi) hwc->nmi = 1; - hwc->irq_period = hw_event->irq_period; + hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) hwc->irq_period = x86_pmu.max_period; @@ -871,7 +871,7 @@ perf_counter_nmi_handler(struct notifier_block *self, struct pt_regs *regs; int ret; - if (!atomic_read(&num_counters)) + if (!atomic_read(&active_counters)) return NOTIFY_DONE; switch (cmd) { From 0d905bca23aca5c86a10ee101bcd3b1abbd40b25 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:13:30 +0200 Subject: [PATCH 0769/2061] perf_counter: initialize the per-cpu context earlier percpu scheduling for perfcounters wants to take the context lock, but that lock first needs to be initialized. Currently it is an early_initcall() - but that is too late, the task tick runs much sooner than that. Call it explicitly from the scheduler init sequence instead. [ Impact: fix access-before-init crash ] LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 5 ++++- kernel/perf_counter.c | 5 +---- kernel/sched.c | 5 ++++- 3 files changed, 9 insertions(+), 6 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f776851f8c4..a356fa69796 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -573,6 +573,8 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern void perf_counter_init(void); + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } @@ -600,9 +602,10 @@ perf_counter_mmap(unsigned long addr, unsigned long len, static inline void perf_counter_munmap(unsigned long addr, unsigned long len, - unsigned long pgoff, struct file *file) { } + unsigned long pgoff, struct file *file) { } static inline void perf_counter_comm(struct task_struct *tsk) { } +static inline void perf_counter_init(void) { } #endif #endif /* __KERNEL__ */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index b9679c36bcc..fcdafa234a5 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3265,15 +3265,12 @@ static struct notifier_block __cpuinitdata perf_cpu_nb = { .notifier_call = perf_cpu_notify, }; -static int __init perf_counter_init(void) +void __init perf_counter_init(void) { perf_cpu_notify(&perf_cpu_nb, (unsigned long)CPU_UP_PREPARE, (void *)(long)smp_processor_id()); register_cpu_notifier(&perf_cpu_nb); - - return 0; } -early_initcall(perf_counter_init); static ssize_t perf_show_reserve_percpu(struct sysdev_class *class, char *buf) { diff --git a/kernel/sched.c b/kernel/sched.c index 2f600e30dcf..a728976a3a6 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -8996,7 +8997,7 @@ void __init sched_init(void) * 1024) and two child groups A0 and A1 (of weight 1024 each), * then A0's share of the cpu resource is: * - * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% + * A0's bandwidth = 1024 / (10*1024 + 1024 + 1024) = 8.33% * * We achieve this by letting init_task_group's tasks sit * directly in rq->cfs (i.e init_task_group->se[] = NULL). @@ -9097,6 +9098,8 @@ void __init sched_init(void) alloc_bootmem_cpumask_var(&cpu_isolated_map); #endif /* SMP */ + perf_counter_init(); + scheduler_running = 1; } From 1dce8d99b85aba6eddb8b8260baea944922e6fe7 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:23:18 +0200 Subject: [PATCH 0770/2061] perf_counter: convert perf_resource_mutex to a spinlock Now percpu counters can be initialized very early. But the init sequence uses mutex_lock(). Fortunately, perf_resource_mutex should be a spinlock anyway, so convert it. [ Impact: fix crash due to early init mutex use ] LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fcdafa234a5..5f86a1156c9 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,9 +46,9 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ /* - * Mutex for (sysadmin-configurable) counter reservations: + * Lock for (sysadmin-configurable) counter reservations: */ -static DEFINE_MUTEX(perf_resource_mutex); +static DEFINE_SPINLOCK(perf_resource_lock); /* * Architecture provided APIs - weak aliases: @@ -3207,9 +3207,9 @@ static void __cpuinit perf_counter_init_cpu(int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); __perf_counter_init_context(&cpuctx->ctx, NULL); - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); cpuctx->max_pertask = perf_max_counters - perf_reserved_percpu; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); hw_perf_counter_setup(cpu); } @@ -3292,7 +3292,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, if (val > perf_max_counters) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_reserved_percpu = val; for_each_online_cpu(cpu) { cpuctx = &per_cpu(perf_cpu_context, cpu); @@ -3302,7 +3302,7 @@ perf_set_reserve_percpu(struct sysdev_class *class, cpuctx->max_pertask = mpt; spin_unlock_irq(&cpuctx->ctx.lock); } - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; } @@ -3324,9 +3324,9 @@ perf_set_overcommit(struct sysdev_class *class, const char *buf, size_t count) if (val > 1) return -EINVAL; - mutex_lock(&perf_resource_mutex); + spin_lock(&perf_resource_lock); perf_overcommit = val; - mutex_unlock(&perf_resource_mutex); + spin_unlock(&perf_resource_lock); return count; } From 066d7dea32c9bffe6decc0abe465627656cdd84e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 4 May 2009 19:04:09 +0200 Subject: [PATCH 0771/2061] perf_counter: fix fixed-purpose counter support on v2 Intel-PERFMON Fixed-purpose counters stopped working in a simple 'perf stat ls' run: cache references cache misses Due to: ef7b3e0: perf_counter, x86: remove vendor check in fixed_mode_idx() Which made x86_pmu.num_counters_fixed matter: if it's nonzero, the fixed-purpose counters are utilized. But on v2 perfmon this field is not set (despite there being fixed-purpose PMCs). So add a quirk to set the number of fixed-purpose counters to at least three. [ Impact: add quirk for three fixed-purpose counters on certain Intel CPUs ] Cc: Robert Richter Cc: Paul Mackerras Cc: Peter Zijlstra LKML-Reference: <1241002046-8832-28-git-send-email-robert.richter@amd.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 196b58f0444..a6878b0798e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -962,7 +962,13 @@ static int intel_pmu_init(void) x86_pmu = intel_pmu; x86_pmu.version = version; x86_pmu.num_counters = eax.split.num_counters; - x86_pmu.num_counters_fixed = edx.split.num_counters_fixed; + + /* + * Quirk: v2 perfmon does not report fixed-purpose counters, so + * assume at least 3 counters: + */ + x86_pmu.num_counters_fixed = max((int)edx.split.num_counters_fixed, 3); + x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; From d6ce96dabe2c4409fd009ec14250a1fdbab4b133 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 01:15:24 -0400 Subject: [PATCH 0772/2061] ring-buffer: export symbols I'm adding a module to do a series of tests on the ring buffer as well as benchmarks. This module needs to have more of the ring buffer API exported. There's nothing wrong with reading the ring buffer from a module. [ Impact: allow modules to read pages from the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f4cc59040eb..3e86da9b2a0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2802,6 +2802,7 @@ void *ring_buffer_alloc_read_page(struct ring_buffer *buffer) return bpage; } +EXPORT_SYMBOL_GPL(ring_buffer_alloc_read_page); /** * ring_buffer_free_read_page - free an allocated read page @@ -2814,6 +2815,7 @@ void ring_buffer_free_read_page(struct ring_buffer *buffer, void *data) { free_page((unsigned long)data); } +EXPORT_SYMBOL_GPL(ring_buffer_free_read_page); /** * ring_buffer_read_page - extract a page from the ring buffer @@ -2959,6 +2961,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, out: return ret; } +EXPORT_SYMBOL_GPL(ring_buffer_read_page); static ssize_t rb_simple_read(struct file *filp, char __user *ubuf, From f0d2c681ac0a85142fc8abe65fc33fcad35cb9b7 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 13:43:37 -0400 Subject: [PATCH 0773/2061] ring-buffer: add counters for commit overrun and nmi dropped entries The WARN_ON in the ring buffer when a commit is preempted and the buffer is filled by preceding writes can happen in normal operations. The WARN_ON makes it look like a bug, not to mention, because it does not stop tracing and calls printk which can also recurse, this is prone to deadlock (the WARN_ON is not in a position to recurse). This patch removes the WARN_ON and replaces it with a counter that can be retrieved by a tracer. This counter is called commit_overrun. While at it, I added a nmi_dropped counter to count any time an NMI entry is dropped because the NMI could not take the spinlock. [ Impact: prevent deadlock by printing normal case warning ] Signed-off-by: Steven Rostedt --- include/linux/ring_buffer.h | 2 ++ kernel/trace/ring_buffer.c | 52 ++++++++++++++++++++++++++++++++++--- 2 files changed, 51 insertions(+), 3 deletions(-) diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index 1c2f80911fb..f1345828c7c 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -153,6 +153,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu); void ring_buffer_normalize_time_stamp(struct ring_buffer *buffer, diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3e86da9b2a0..26e1359fe19 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -402,6 +402,8 @@ struct ring_buffer_per_cpu { struct buffer_page *tail_page; /* write to tail */ struct buffer_page *commit_page; /* committed pages */ struct buffer_page *reader_page; + unsigned long nmi_dropped; + unsigned long commit_overrun; unsigned long overrun; unsigned long entries; u64 write_stamp; @@ -1216,8 +1218,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * simply fail. */ if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; goto out_reset; + } } else __raw_spin_lock(&cpu_buffer->lock); @@ -1238,8 +1242,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, * about it. */ if (unlikely(next_page == commit_page)) { - /* This can easily happen on small ring buffers */ - WARN_ON_ONCE(buffer->pages > 2); + cpu_buffer->commit_overrun++; goto out_reset; } @@ -1925,6 +1928,47 @@ unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu) } EXPORT_SYMBOL_GPL(ring_buffer_overrun_cpu); +/** + * ring_buffer_nmi_dropped_cpu - get the number of nmis that were dropped + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long ring_buffer_nmi_dropped_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->nmi_dropped; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_nmi_dropped_cpu); + +/** + * ring_buffer_commit_overrun_cpu - get the number of overruns caused by commits + * @buffer: The ring buffer + * @cpu: The per CPU buffer to get the number of overruns from + */ +unsigned long +ring_buffer_commit_overrun_cpu(struct ring_buffer *buffer, int cpu) +{ + struct ring_buffer_per_cpu *cpu_buffer; + unsigned long ret; + + if (!cpumask_test_cpu(cpu, buffer->cpumask)) + return 0; + + cpu_buffer = buffer->buffers[cpu]; + ret = cpu_buffer->commit_overrun; + + return ret; +} +EXPORT_SYMBOL_GPL(ring_buffer_commit_overrun_cpu); + /** * ring_buffer_entries - get the number of entries in a buffer * @buffer: The ring buffer @@ -2595,6 +2639,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; + cpu_buffer->nmi_dropped = 0; + cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; cpu_buffer->entries = 0; From c8d771835e18c938dae8690611d65fe98ad30f58 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 29 Apr 2009 18:03:45 -0400 Subject: [PATCH 0774/2061] tracing: export stats of ring buffers to userspace This patch adds stats to the ftrace ring buffers: # cat /debugfs/tracing/per_cpu/cpu0/stats entries: 42360 overrun: 30509326 commit overrun: 0 nmi dropped: 0 Where entries are the total number of data entries in the buffer. overrun is the number of entries not consumed and were overwritten by the writer. commit overrun is the number of entries dropped due to nested writers wrapping the buffer before the initial writer finished the commit. nmi dropped is the number of entries dropped due to the ring buffer lock being held when an nmi was going to write to the ring buffer. Note, this field will be meaningless and will go away when the ring buffer becomes lockless. [ Impact: let userspace know what is happening in the ring buffers ] Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index f5427e0fc98..74df029056b 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3595,6 +3595,45 @@ static const struct file_operations tracing_buffers_fops = { .llseek = no_llseek, }; +static ssize_t +tracing_stats_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *ppos) +{ + unsigned long cpu = (unsigned long)filp->private_data; + struct trace_array *tr = &global_trace; + struct trace_seq *s; + unsigned long cnt; + + s = kmalloc(sizeof(*s), GFP_ATOMIC); + if (!s) + return ENOMEM; + + trace_seq_init(s); + + cnt = ring_buffer_entries_cpu(tr->buffer, cpu); + trace_seq_printf(s, "entries: %ld\n", cnt); + + cnt = ring_buffer_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "overrun: %ld\n", cnt); + + cnt = ring_buffer_commit_overrun_cpu(tr->buffer, cpu); + trace_seq_printf(s, "commit overrun: %ld\n", cnt); + + cnt = ring_buffer_nmi_dropped_cpu(tr->buffer, cpu); + trace_seq_printf(s, "nmi dropped: %ld\n", cnt); + + count = simple_read_from_buffer(ubuf, count, ppos, s->buffer, s->len); + + kfree(s); + + return count; +} + +static const struct file_operations tracing_stats_fops = { + .open = tracing_open_generic, + .read = tracing_stats_read, +}; + #ifdef CONFIG_DYNAMIC_FTRACE int __weak ftrace_arch_read_dyn_info(char *buf, int size) @@ -3708,6 +3747,9 @@ static void tracing_init_debugfs_percpu(long cpu) trace_create_file("trace_pipe_raw", 0444, d_cpu, (void *) cpu, &tracing_buffers_fops); + + trace_create_file("stats", 0444, d_cpu, + (void *) cpu, &tracing_stats_fops); } #ifdef CONFIG_FTRACE_SELFTEST From 60aa605dfce2976e54fa76e805ab0f221372d4d9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:21 +0200 Subject: [PATCH 0775/2061] sched: rt: document the risk of small values in the bandwidth settings Thomas noted that we should disallow sysctl_sched_rt_runtime == 0 for (!RT_GROUP) since the root group always has some RT tasks in it. Further, update the documentation to inspire clue. [ Impact: exclude corner-case sysctl_sched_rt_runtime value ] Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra LKML-Reference: <20090505155436.863098054@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/scheduler/sched-rt-group.txt | 18 ++++++++++++++++++ kernel/sched.c | 7 +++++++ 2 files changed, 25 insertions(+) diff --git a/Documentation/scheduler/sched-rt-group.txt b/Documentation/scheduler/sched-rt-group.txt index 5ba4d3fc625..eb74b014a3f 100644 --- a/Documentation/scheduler/sched-rt-group.txt +++ b/Documentation/scheduler/sched-rt-group.txt @@ -4,6 +4,7 @@ CONTENTS ======== +0. WARNING 1. Overview 1.1 The problem 1.2 The solution @@ -14,6 +15,23 @@ CONTENTS 3. Future plans +0. WARNING +========== + + Fiddling with these settings can result in an unstable system, the knobs are + root only and assumes root knows what he is doing. + +Most notable: + + * very small values in sched_rt_period_us can result in an unstable + system when the period is smaller than either the available hrtimer + resolution, or the time it takes to handle the budget refresh itself. + + * very small values in sched_rt_runtime_us can result in an unstable + system when the runtime is so small the system has difficulty making + forward progress (NOTE: the migration thread and kstopmachine both + are real-time processes). + 1. Overview =========== diff --git a/kernel/sched.c b/kernel/sched.c index 54d67b94f1a..2a43a581ead 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -9917,6 +9917,13 @@ static int sched_rt_global_constraints(void) if (sysctl_sched_rt_period <= 0) return -EINVAL; + /* + * There's always some RT tasks in the root group + * -- migration, kstopmachine etc.. + */ + if (sysctl_sched_rt_runtime == 0) + return -EBUSY; + spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); for_each_possible_cpu(i) { struct rt_rq *rt_rq = &cpu_rq(i)->rt; From c66de4a5be7913247bd83d79168f8e4420c9cfbc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:22 +0200 Subject: [PATCH 0776/2061] perf_counter: uncouple data_head updates from wakeups Keep data_head up-to-date irrespective of notifications. This fixes the case where you disable a counter and don't get a notification for the last few pending events, and it also allows polling usage. [ Impact: increase precision of perfcounter mmap-ed fields ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155436.925084300@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 +++- kernel/perf_counter.c | 20 +++++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a356fa69796..17b63105f2a 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -362,9 +362,11 @@ struct perf_mmap_data { atomic_t head; /* write position */ atomic_t events; /* event limit */ - atomic_t wakeup_head; /* completed head */ + atomic_t done_head; /* completed head */ atomic_t lock; /* concurrent writes */ + atomic_t wakeup; /* needs a wakeup */ + struct perf_counter_mmap_page *user_page; void *data_pages[0]; }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5f86a1156c9..ba5e921e1f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1696,7 +1696,6 @@ struct perf_output_handle { struct perf_mmap_data *data; unsigned int offset; unsigned int head; - int wakeup; int nmi; int overflow; int locked; @@ -1752,8 +1751,7 @@ static void perf_output_unlock(struct perf_output_handle *handle) struct perf_mmap_data *data = handle->data; int head, cpu; - if (handle->wakeup) - data->wakeup_head = data->head; + data->done_head = data->head; if (!handle->locked) goto out; @@ -1764,13 +1762,11 @@ again: * before we publish the new head, matched by a rmb() in userspace when * reading this position. */ - while ((head = atomic_xchg(&data->wakeup_head, 0))) { + while ((head = atomic_xchg(&data->done_head, 0))) data->user_page->data_head = head; - handle->wakeup = 1; - } /* - * NMI can happen here, which means we can miss a wakeup_head update. + * NMI can happen here, which means we can miss a done_head update. */ cpu = atomic_xchg(&data->lock, 0); @@ -1779,7 +1775,7 @@ again: /* * Therefore we have to validate we did not indeed do so. */ - if (unlikely(atomic_read(&data->wakeup_head))) { + if (unlikely(atomic_read(&data->done_head))) { /* * Since we had it locked, we can lock it again. */ @@ -1789,7 +1785,7 @@ again: goto again; } - if (handle->wakeup) + if (atomic_xchg(&data->wakeup, 0)) perf_output_wakeup(handle); out: local_irq_restore(handle->flags); @@ -1824,7 +1820,9 @@ static int perf_output_begin(struct perf_output_handle *handle, handle->offset = offset; handle->head = head; - handle->wakeup = (offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT); + + if ((offset >> PAGE_SHIFT) != (head >> PAGE_SHIFT)) + atomic_set(&data->wakeup, 1); return 0; @@ -1882,7 +1880,7 @@ static void perf_output_end(struct perf_output_handle *handle) int events = atomic_inc_return(&data->events); if (events >= wakeup_events) { atomic_sub(wakeup_events, &data->events); - handle->wakeup = 1; + atomic_set(&data->wakeup, 1); } } From 6de6a7b95705b859b61430fa3afa1403034eb3e6 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:23 +0200 Subject: [PATCH 0777/2061] perf_counter: add ioctl(PERF_COUNTER_IOC_RESET) Provide a way to reset an existing counter - this eases PAPI libraries around perfcounters. Similar to read() it doesn't collapse pending child counters. [ Impact: new perfcounter fd ioctl method to reset counters ] Suggested-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090505155437.022272933@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 8 ++++++++ 2 files changed, 9 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 17b63105f2a..0fcbf34a4f7 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -160,6 +160,7 @@ struct perf_counter_hw_event { #define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) #define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) +#define PERF_COUNTER_IOC_RESET _IO ('$', 3) /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ba5e921e1f3..6e6834e0587 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1288,6 +1288,11 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) return events; } +static void perf_counter_reset(struct perf_counter *counter) +{ + atomic_set(&counter->count, 0); +} + static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; @@ -1303,6 +1308,9 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case PERF_COUNTER_IOC_REFRESH: perf_counter_refresh(counter, arg); break; + case PERF_COUNTER_IOC_RESET: + perf_counter_reset(counter); + break; default: err = -ENOTTY; } From c5078f78b455fbf67ea71442c7e7ca8acf9ff095 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:24 +0200 Subject: [PATCH 0778/2061] perf_counter: provide an mlock threshold Provide a threshold to relax the mlock accounting, increasing usability. Each counter gets perf_counter_mlock_kb for free. [ Impact: allow more mmap buffering ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.112113632@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 15 +++++++++++---- kernel/sysctl.c | 8 ++++++++ 3 files changed, 21 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0fcbf34a4f7..00081d84169 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -358,6 +358,7 @@ struct file; struct perf_mmap_data { struct rcu_head rcu_head; int nr_pages; /* nr of data pages */ + int nr_locked; /* nr pages mlocked */ atomic_t poll; /* POLL_ for wakeups */ atomic_t head; /* write position */ @@ -575,6 +576,7 @@ struct perf_callchain_entry { extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; +extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6e6834e0587..2d134273830 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -44,6 +44,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ +int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1461,7 +1462,7 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { - vma->vm_mm->locked_vm -= counter->data->nr_pages + 1; + vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); } @@ -1480,6 +1481,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) unsigned long nr_pages; unsigned long locked, lock_limit; int ret = 0; + long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1507,8 +1509,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - locked = vma->vm_mm->locked_vm; - locked += nr_pages + 1; + extra = nr_pages /* + 1 only account the data pages */; + extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + if (extra < 0) + extra = 0; + + locked = vma->vm_mm->locked_vm + extra; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; @@ -1524,7 +1530,8 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); - vma->vm_mm->locked_vm += nr_pages + 1; + vma->vm_mm->locked_vm += extra; + counter->data->nr_locked = extra; unlock: mutex_unlock(&counter->mmap_mutex); diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 8203d70928d..3b05c2b088d 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -920,6 +920,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_mlock_kb", + .data = &sysctl_perf_counter_mlock, + .maxlen = sizeof(sysctl_perf_counter_mlock), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read From 22c1558e51c210787c6cf75d8905246fc91ec030 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:25 +0200 Subject: [PATCH 0779/2061] perf_counter: fix the output lock Use -1 instead of 0 as unlocked, since 0 is a valid cpu number. ( This is not an issue right now but will be once we allow multiple counters to output to the same mmap area. ) [ Impact: prepare code for multi-counter profile output ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.232686598@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2d134273830..c881afef997 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1409,6 +1409,7 @@ static int perf_mmap_data_alloc(struct perf_counter *counter, int nr_pages) } data->nr_pages = nr_pages; + atomic_set(&data->lock, -1); rcu_assign_pointer(counter->data, data); @@ -1755,7 +1756,7 @@ static void perf_output_lock(struct perf_output_handle *handle) if (in_nmi() && atomic_read(&data->lock) == cpu) return; - while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); handle->locked = 1; @@ -1784,7 +1785,7 @@ again: * NMI can happen here, which means we can miss a done_head update. */ - cpu = atomic_xchg(&data->lock, 0); + cpu = atomic_xchg(&data->lock, -1); WARN_ON_ONCE(cpu != smp_processor_id()); /* @@ -1794,7 +1795,7 @@ again: /* * Since we had it locked, we can lock it again. */ - while (atomic_cmpxchg(&data->lock, 0, cpu) != 0) + while (atomic_cmpxchg(&data->lock, -1, cpu) != -1) cpu_relax(); goto again; From 2023b359214bbc5bad31571cf50d7fb83b535c0a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:26 +0200 Subject: [PATCH 0780/2061] perf_counter: inheritable sample counters Redirect the output to the parent counter and put in some sanity checks. [ Impact: new perfcounter feature - inherited sampling counters ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.331556171@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 ++++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c881afef997..60e55f0b48f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -738,10 +738,18 @@ static void perf_counter_enable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -static void perf_counter_refresh(struct perf_counter *counter, int refresh) +static int perf_counter_refresh(struct perf_counter *counter, int refresh) { + /* + * not supported on inherited counters + */ + if (counter->hw_event.inherit) + return -EINVAL; + atomic_add(refresh, &counter->event_limit); perf_counter_enable(counter); + + return 0; } /* @@ -1307,7 +1315,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) perf_counter_disable_family(counter); break; case PERF_COUNTER_IOC_REFRESH: - perf_counter_refresh(counter, arg); + err = perf_counter_refresh(counter, arg); break; case PERF_COUNTER_IOC_RESET: perf_counter_reset(counter); @@ -1814,6 +1822,12 @@ static int perf_output_begin(struct perf_output_handle *handle, struct perf_mmap_data *data; unsigned int offset, head; + /* + * For inherited counters we send all the output towards the parent. + */ + if (counter->parent) + counter = counter->parent; + rcu_read_lock(); data = rcu_dereference(counter->data); if (!data) @@ -1995,6 +2009,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + /* + * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. + */ if (record_type & PERF_RECORD_GROUP) { struct perf_counter *leader, *sub; u64 nr = counter->nr_siblings; @@ -2281,6 +2298,11 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + /* + * XXX event_limit might not quite work as expected on inherited + * counters + */ + counter->pending_kill = POLL_IN; if (events && atomic_dec_and_test(&counter->event_limit)) { ret = 1; @@ -2801,6 +2823,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + /* + * we currently do not support PERF_RECORD_GROUP on inherited counters + */ + if (hw_event->inherit && (hw_event->record_type & PERF_RECORD_GROUP)) + goto done; + if (perf_event_raw(hw_event)) { pmu = hw_perf_counter_init(counter); goto done; From 16c8a10932aef971292c9570eb5f60b5d4e83ed2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 5 May 2009 17:50:27 +0200 Subject: [PATCH 0781/2061] perf_counter: tools: update the tools to support process and inherited counters "perf record": - per task counter - inherit switch - nmi switch "perf report": - userspace/kernel filter "perf stat": - userspace/kernel filter Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090505155437.389163017@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 155 ++++++++++++-------- Documentation/perf_counter/builtin-stat.c | 24 ++- Documentation/perf_counter/perf-report.cc | 27 +++- 3 files changed, 140 insertions(+), 66 deletions(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index ddfdcf86fb2..5f5e6df0260 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -45,7 +45,10 @@ static unsigned int mmap_pages = 16; static int output; static char *output_name = "output.perf"; static int group = 0; -static unsigned int realtime_prio = 0; +static unsigned int realtime_prio = 0; +static int system_wide = 0; +static int inherit = 1; +static int nmi = 1; const unsigned int default_count[] = { 1000000, @@ -167,7 +170,7 @@ static void display_events_help(void) static void display_help(void) { printf( - "Usage: perf-record []\n" + "Usage: perf-record [] \n" "perf-record Options (up to %d event types can be specified at once):\n\n", MAX_COUNTERS); @@ -178,12 +181,13 @@ static void display_help(void) " -m pages --mmap_pages= # number of mmap data pages\n" " -o file --output= # output file\n" " -r prio --realtime= # use RT prio\n" + " -s --system # system wide profiling\n" ); exit(0); } -static void process_options(int argc, char *argv[]) +static void process_options(int argc, const char *argv[]) { int error = 0, counter; @@ -196,9 +200,12 @@ static void process_options(int argc, char *argv[]) {"mmap_pages", required_argument, NULL, 'm'}, {"output", required_argument, NULL, 'o'}, {"realtime", required_argument, NULL, 'r'}, + {"system", no_argument, NULL, 's'}, + {"inherit", no_argument, NULL, 'i'}, + {"nmi", no_argument, NULL, 'n'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:c:e:m:o:r:", + int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin", long_options, &option_index); if (c == -1) break; @@ -209,9 +216,16 @@ static void process_options(int argc, char *argv[]) case 'm': mmap_pages = atoi(optarg); break; case 'o': output_name = strdup(optarg); break; case 'r': realtime_prio = atoi(optarg); break; + case 's': system_wide ^= 1; break; + case 'i': inherit ^= 1; break; + case 'n': nmi ^= 1; break; default: error = 1; break; } } + + if (argc - optind == 0) + error = 1; + if (error) display_help(); @@ -325,18 +339,82 @@ static void mmap_read(struct mmap_data *md) static volatile int done = 0; -static void sigchld_handler(int sig) +static void sig_handler(int sig) { - if (sig == SIGCHLD) - done = 1; + done = 1; } -int cmd_record(int argc, char **argv) +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + +static int nr_poll; +static int nr_cpu; + +static void open_counters(int cpu) { - struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; - struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; - int i, counter, group_fd, nr_poll = 0; + int counter, group_fd; + int track = 1; + pid_t pid = -1; + + if (cpu < 0) + pid = 0; + + group_fd = -1; + for (counter = 0; counter < nr_counters; counter++) { + + memset(&hw_event, 0, sizeof(hw_event)); + hw_event.config = event_id[counter]; + hw_event.irq_period = event_count[counter]; + hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; + hw_event.nmi = nmi; + hw_event.mmap = track; + hw_event.comm = track; + hw_event.inherit = (cpu < 0) && inherit; + + track = 0; // only the first counter needs these + + fd[nr_cpu][counter] = + sys_perf_counter_open(&hw_event, pid, cpu, group_fd, 0); + + if (fd[nr_cpu][counter] < 0) { + int err = errno; + printf("kerneltop error: syscall returned with %d (%s)\n", + fd[nr_cpu][counter], strerror(err)); + if (err == EPERM) + printf("Are you root?\n"); + exit(-1); + } + assert(fd[nr_cpu][counter] >= 0); + fcntl(fd[nr_cpu][counter], F_SETFL, O_NONBLOCK); + + /* + * First counter acts as the group leader: + */ + if (group && group_fd == -1) + group_fd = fd[nr_cpu][counter]; + + event_array[nr_poll].fd = fd[nr_cpu][counter]; + event_array[nr_poll].events = POLLIN; + nr_poll++; + + mmap_array[nr_cpu][counter].counter = counter; + mmap_array[nr_cpu][counter].prev = 0; + mmap_array[nr_cpu][counter].mask = mmap_pages*page_size - 1; + mmap_array[nr_cpu][counter].base = mmap(NULL, (mmap_pages+1)*page_size, + PROT_READ, MAP_SHARED, fd[nr_cpu][counter], 0); + if (mmap_array[nr_cpu][counter].base == MAP_FAILED) { + printf("kerneltop error: failed to mmap with %d (%s)\n", + errno, strerror(errno)); + exit(-1); + } + } + nr_cpu++; +} + +int cmd_record(int argc, const char **argv) +{ + int i, counter; pid_t pid; int ret; @@ -357,54 +435,13 @@ int cmd_record(int argc, char **argv) argc -= optind; argv += optind; - for (i = 0; i < nr_cpus; i++) { - group_fd = -1; - for (counter = 0; counter < nr_counters; counter++) { + if (!system_wide) + open_counters(-1); + else for (i = 0; i < nr_cpus; i++) + open_counters(i); - memset(&hw_event, 0, sizeof(hw_event)); - hw_event.config = event_id[counter]; - hw_event.irq_period = event_count[counter]; - hw_event.record_type = PERF_RECORD_IP | PERF_RECORD_TID; - hw_event.nmi = 1; - hw_event.mmap = 1; - hw_event.comm = 1; - - fd[i][counter] = sys_perf_counter_open(&hw_event, -1, i, group_fd, 0); - if (fd[i][counter] < 0) { - int err = errno; - printf("kerneltop error: syscall returned with %d (%s)\n", - fd[i][counter], strerror(err)); - if (err == EPERM) - printf("Are you root?\n"); - exit(-1); - } - assert(fd[i][counter] >= 0); - fcntl(fd[i][counter], F_SETFL, O_NONBLOCK); - - /* - * First counter acts as the group leader: - */ - if (group && group_fd == -1) - group_fd = fd[i][counter]; - - event_array[nr_poll].fd = fd[i][counter]; - event_array[nr_poll].events = POLLIN; - nr_poll++; - - mmap_array[i][counter].counter = counter; - mmap_array[i][counter].prev = 0; - mmap_array[i][counter].mask = mmap_pages*page_size - 1; - mmap_array[i][counter].base = mmap(NULL, (mmap_pages+1)*page_size, - PROT_READ, MAP_SHARED, fd[i][counter], 0); - if (mmap_array[i][counter].base == MAP_FAILED) { - printf("kerneltop error: failed to mmap with %d (%s)\n", - errno, strerror(errno)); - exit(-1); - } - } - } - - signal(SIGCHLD, sigchld_handler); + signal(SIGCHLD, sig_handler); + signal(SIGINT, sig_handler); pid = fork(); if (pid < 0) @@ -434,7 +471,7 @@ int cmd_record(int argc, char **argv) while (!done) { int hits = events; - for (i = 0; i < nr_cpus; i++) { + for (i = 0; i < nr_cpu; i++) { for (counter = 0; counter < nr_counters; counter++) mmap_read(&mmap_array[i][counter]); } diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 6de38d25688..e2fa117eab5 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -87,6 +87,9 @@ #include "perf.h" +#define EVENT_MASK_KERNEL 1 +#define EVENT_MASK_USER 2 + static int system_wide = 0; static int nr_counters = 0; @@ -104,6 +107,7 @@ static __u64 event_id[MAX_COUNTERS] = { static int default_interval = 100000; static int event_count[MAX_COUNTERS]; static int fd[MAX_NR_CPUS][MAX_COUNTERS]; +static int event_mask[MAX_COUNTERS]; static int tid = -1; static int profile_cpu = -1; @@ -258,12 +262,23 @@ static __u64 match_event_symbols(char *str) __u64 config, id; int type; unsigned int i; + char mask_str[4]; if (sscanf(str, "r%llx", &config) == 1) return config | PERF_COUNTER_RAW_MASK; - if (sscanf(str, "%d:%llu", &type, &id) == 2) - return EID(type, id); + switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) { + case 3: + if (strchr(mask_str, 'u')) + event_mask[nr_counters] |= EVENT_MASK_USER; + if (strchr(mask_str, 'k')) + event_mask[nr_counters] |= EVENT_MASK_KERNEL; + case 2: + return EID(type, id); + + default: + break; + } for (i = 0; i < ARRAY_SIZE(event_symbols); i++) { if (!strncmp(str, event_symbols[i].symbol, @@ -313,6 +328,11 @@ static void create_perfstat_counter(int counter) hw_event.config = event_id[counter]; hw_event.record_type = 0; hw_event.nmi = 0; + hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL; + hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER; + +printf("exclude: %d\n", event_mask[counter]); + if (scale) hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; diff --git a/Documentation/perf_counter/perf-report.cc b/Documentation/perf_counter/perf-report.cc index 911d7f3e7a6..8855107fe6b 100644 --- a/Documentation/perf_counter/perf-report.cc +++ b/Documentation/perf_counter/perf-report.cc @@ -33,8 +33,13 @@ #include +#define SHOW_KERNEL 1 +#define SHOW_USER 2 +#define SHOW_HV 4 + static char const *input_name = "output.perf"; static int input; +static int show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV; static unsigned long page_size; static unsigned long mmap_window = 32; @@ -359,15 +364,21 @@ static void process_options(int argc, char *argv[]) /** Options for getopt */ static struct option long_options[] = { {"input", required_argument, NULL, 'i'}, + {"no-user", no_argument, NULL, 'u'}, + {"no-kernel", no_argument, NULL, 'k'}, + {"no-hv", no_argument, NULL, 'h'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:i:", + int c = getopt_long(argc, argv, "+:i:kuh", long_options, &option_index); if (c == -1) break; switch (c) { case 'i': input_name = strdup(optarg); break; + case 'k': show_mask &= ~SHOW_KERNEL; break; + case 'u': show_mask &= ~SHOW_USER; break; + case 'h': show_mask &= ~SHOW_HV; break; default: error = 1; break; } } @@ -443,22 +454,28 @@ more: if (event->header.misc & PERF_EVENT_MISC_OVERFLOW) { std::string comm, sym, level; + int show = 0; char output[1024]; if (event->header.misc & PERF_EVENT_MISC_KERNEL) { + show |= SHOW_KERNEL; level = " [k] "; sym = resolve_kernel_symbol(event->ip.ip); } else if (event->header.misc & PERF_EVENT_MISC_USER) { + show |= SHOW_USER; level = " [.] "; sym = resolve_user_symbol(event->ip.pid, event->ip.ip); } else { + show |= SHOW_HV; level = " [H] "; } - comm = resolve_comm(event->ip.pid); - snprintf(output, sizeof(output), "%16s %s %s", - comm.c_str(), level.c_str(), sym.c_str()); - hist[output]++; + if (show & show_mask) { + comm = resolve_comm(event->ip.pid); + snprintf(output, sizeof(output), "%16s %s %s", + comm.c_str(), level.c_str(), sym.c_str()); + hist[output]++; + } total++; From e4906eff9e6fbd2d311abcbcc53d5a531773c982 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 30 Apr 2009 20:49:44 -0400 Subject: [PATCH 0782/2061] ring-buffer: convert cpu buffer entries to local_t The entries counter in cpu buffer is not atomic. It can be updated by other interrupts or from another CPU (readers). But making entries into "atomic_t" causes an atomic operation that can hurt performance. Instead we convert it to a local_t that will increment a counter with a local CPU atomic operation (if the arch supports it). Instead of fighting with readers and overwrites that decrement the counter, I added a "read" counter. Every time a reader reads an entry it is incremented. We already have a overrun counter and with that, the entries counter and the read counter, we can calculate the total number of entries in the buffer with: (entries - overrun) - read As long as the total number of entries in the ring buffer is less than the word size, this will work. But since the entries counter was previously a long, this is no different than what we had before. Thanks to Andrew Morton for pointing out in the first version that atomic_t does not replace unsigned long. I switched to atomic_long_t even though it is signed. A negative count is most likely a bug. [ Impact: keep accurate count of cpu buffer entries ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 26e1359fe19..c792ea893b0 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -405,7 +405,8 @@ struct ring_buffer_per_cpu { unsigned long nmi_dropped; unsigned long commit_overrun; unsigned long overrun; - unsigned long entries; + unsigned long read; + local_t entries; u64 write_stamp; u64 read_stamp; atomic_t record_disabled; @@ -997,7 +998,6 @@ static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; cpu_buffer->overrun++; - cpu_buffer->entries--; } } @@ -1588,7 +1588,7 @@ EXPORT_SYMBOL_GPL(ring_buffer_lock_reserve); static void rb_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); /* Only process further if we own the commit */ if (!rb_is_commit(cpu_buffer, event)) @@ -1722,7 +1722,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * The commit is still visible by the reader, so we * must increment entries. */ - cpu_buffer->entries++; + local_inc(&cpu_buffer->entries); out: /* * If a write came in and pushed the tail page @@ -1902,7 +1902,8 @@ unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu) return 0; cpu_buffer = buffer->buffers[cpu]; - ret = cpu_buffer->entries; + ret = (local_read(&cpu_buffer->entries) - cpu_buffer->overrun) + - cpu_buffer->read; return ret; } @@ -1985,7 +1986,8 @@ unsigned long ring_buffer_entries(struct ring_buffer *buffer) /* if you care about this being correct, lock the buffer */ for_each_buffer_cpu(buffer, cpu) { cpu_buffer = buffer->buffers[cpu]; - entries += cpu_buffer->entries; + entries += (local_read(&cpu_buffer->entries) - + cpu_buffer->overrun) - cpu_buffer->read; } return entries; @@ -2225,7 +2227,7 @@ static void rb_advance_reader(struct ring_buffer_per_cpu *cpu_buffer) if (event->type_len <= RINGBUF_TYPE_DATA_TYPE_LEN_MAX || rb_discarded_event(event)) - cpu_buffer->entries--; + cpu_buffer->read++; rb_update_read_stamp(cpu_buffer, event); @@ -2642,7 +2644,8 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->nmi_dropped = 0; cpu_buffer->commit_overrun = 0; cpu_buffer->overrun = 0; - cpu_buffer->entries = 0; + cpu_buffer->read = 0; + local_set(&cpu_buffer->entries, 0); cpu_buffer->write_stamp = 0; cpu_buffer->read_stamp = 0; @@ -2813,7 +2816,7 @@ static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, /* Only count data entries */ if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) continue; - cpu_buffer->entries--; + cpu_buffer->read++; } __raw_spin_unlock(&cpu_buffer->lock); } From 41c51c98f588edcdf6141cff1895df738e03ddd4 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 3 May 2009 23:11:18 +0200 Subject: [PATCH 0783/2061] rcu: rcu_sched_grace_period(): kill the bogus flush_signals() As a kernel thread, rcu_sched_grace_period() runs with all signals ignored. It can never receive a signal even if it sleeps in TASK_INTERRUPTIBLE, it needs the explicit allow_signal() to be visible for signals. [ Impact: reduce kernel size, remove dead code ] Signed-off-by: Oleg Nesterov Reviewed-by: Paul E. McKenney Cc: Andrew Morton LKML-Reference: <20090503211118.GA22973@redhat.com> Signed-off-by: Ingo Molnar --- kernel/rcupreempt.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/kernel/rcupreempt.c b/kernel/rcupreempt.c index ce97a4df64d..beb0e659adc 100644 --- a/kernel/rcupreempt.c +++ b/kernel/rcupreempt.c @@ -1356,17 +1356,11 @@ static int rcu_sched_grace_period(void *arg) rcu_ctrlblk.sched_sleep = rcu_sched_sleeping; spin_unlock_irqrestore(&rcu_ctrlblk.schedlock, flags); - ret = 0; + ret = 0; /* unused */ __wait_event_interruptible(rcu_ctrlblk.sched_wq, rcu_ctrlblk.sched_sleep != rcu_sched_sleeping, ret); - /* - * Signals would prevent us from sleeping, and we cannot - * do much with them in any case. So flush them. - */ - if (ret) - flush_signals(current); couldsleepnext = 0; } while (!kthread_should_stop()); From 778c55d44eb4f5f658915ed631d68ed9d1ac3ad1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 18:44:45 -0400 Subject: [PATCH 0784/2061] ring-buffer: record page entries in buffer page descriptor Currently, when the ring buffer writer overflows the buffer and must write over non consumed data, we increment the overrun counter by reading the entries on the page we are about to overwrite. This reads the entries one by one. This is not very effecient. This patch adds another entry counter into each buffer page descriptor that keeps track of the number of entries on the page. Now on overwrite, the overrun counter simply needs to add the number of entries that is on the page it is about to overwrite. [ Impact: speed up of ring buffer in overwrite mode ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 39 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 26 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index c792ea893b0..342eacc4baa 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -321,9 +321,10 @@ struct buffer_data_page { }; struct buffer_page { + struct list_head list; /* list of buffer pages */ local_t write; /* index for next write */ unsigned read; /* index for next read */ - struct list_head list; /* list of free pages */ + local_t entries; /* entries on this page */ struct buffer_data_page *page; /* Actual data page */ }; @@ -977,30 +978,6 @@ static inline unsigned rb_head_size(struct ring_buffer_per_cpu *cpu_buffer) return rb_page_commit(cpu_buffer->head_page); } -/* - * When the tail hits the head and the buffer is in overwrite mode, - * the head jumps to the next page and all content on the previous - * page is discarded. But before doing so, we update the overrun - * variable of the buffer. - */ -static void rb_update_overflow(struct ring_buffer_per_cpu *cpu_buffer) -{ - struct ring_buffer_event *event; - unsigned long head; - - for (head = 0; head < rb_head_size(cpu_buffer); - head += rb_event_length(event)) { - - event = __rb_page_index(cpu_buffer->head_page, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->overrun++; - } -} - static inline void rb_inc_page(struct ring_buffer_per_cpu *cpu_buffer, struct buffer_page **bpage) { @@ -1253,7 +1230,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, /* tail_page has not moved yet? */ if (tail_page == cpu_buffer->tail_page) { /* count overflows */ - rb_update_overflow(cpu_buffer); + cpu_buffer->overrun += + local_read(&head_page->entries); rb_inc_page(cpu_buffer, &head_page); cpu_buffer->head_page = head_page; @@ -1268,6 +1246,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, */ if (tail_page == cpu_buffer->tail_page) { local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); local_set(&next_page->page->commit, 0); cpu_buffer->tail_page = next_page; @@ -1313,6 +1292,10 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, event = __rb_page_index(tail_page, tail); rb_update_event(event, type, length); + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + /* * If this is a commit and the tail is zero, then update * this page's time stamp. @@ -2183,6 +2166,7 @@ rb_get_reader_page(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->reader_page->list.prev = reader->list.prev; local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); /* Make the reader page now replace the head */ @@ -2629,6 +2613,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) cpu_buffer->head_page = list_entry(cpu_buffer->pages.next, struct buffer_page, list); local_set(&cpu_buffer->head_page->write, 0); + local_set(&cpu_buffer->head_page->entries, 0); local_set(&cpu_buffer->head_page->page->commit, 0); cpu_buffer->head_page->read = 0; @@ -2638,6 +2623,7 @@ rb_reset_cpu(struct ring_buffer_per_cpu *cpu_buffer) INIT_LIST_HEAD(&cpu_buffer->reader_page->list); local_set(&cpu_buffer->reader_page->write, 0); + local_set(&cpu_buffer->reader_page->entries, 0); local_set(&cpu_buffer->reader_page->page->commit, 0); cpu_buffer->reader_page->read = 0; @@ -2996,6 +2982,7 @@ int ring_buffer_read_page(struct ring_buffer *buffer, bpage = reader->page; reader->page = *data_page; local_set(&reader->write, 0); + local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; From afbab76a62b69ea6197e19727d4b8a8aef8deb25 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 19:40:05 -0400 Subject: [PATCH 0785/2061] ring-buffer: have read page swap increment counter with page entries In the swap page ring buffer code that is used by the ftrace splice code, we scan the page to increment the counter of entries read. With the number of entries already in the page we simply need to add it. [ Impact: speed up reading page from ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 342eacc4baa..9e42a742a3f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2785,28 +2785,6 @@ out: } EXPORT_SYMBOL_GPL(ring_buffer_swap_cpu); -static void rb_remove_entries(struct ring_buffer_per_cpu *cpu_buffer, - struct buffer_data_page *bpage, - unsigned int offset) -{ - struct ring_buffer_event *event; - unsigned long head; - - __raw_spin_lock(&cpu_buffer->lock); - for (head = offset; head < local_read(&bpage->commit); - head += rb_event_length(event)) { - - event = __rb_data_page_index(bpage, head); - if (RB_WARN_ON(cpu_buffer, rb_null_event(event))) - return; - /* Only count data entries */ - if (event->type_len > RINGBUF_TYPE_DATA_TYPE_LEN_MAX) - continue; - cpu_buffer->read++; - } - __raw_spin_unlock(&cpu_buffer->lock); -} - /** * ring_buffer_alloc_read_page - allocate a page to read from buffer * @buffer: the buffer to allocate for. @@ -2977,6 +2955,9 @@ int ring_buffer_read_page(struct ring_buffer *buffer, /* we copied everything to the beginning */ read = 0; } else { + /* update the entry counter */ + cpu_buffer->read += local_read(&reader->entries); + /* swap the pages */ rb_init_page(bpage); bpage = reader->page; @@ -2985,9 +2966,6 @@ int ring_buffer_read_page(struct ring_buffer *buffer, local_set(&reader->entries, 0); reader->read = 0; *data_page = bpage; - - /* update the entry counter */ - rb_remove_entries(cpu_buffer, bpage, read); } ret = read; From 41ede23eded40832c955d98d4b71bc244809abb3 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 1 May 2009 20:26:54 -0400 Subject: [PATCH 0786/2061] ring-buffer: disable writers when resetting buffers As a precaution, it is best to disable writing to the ring buffers when reseting them. [ Impact: prevent weird things if write happens during reset ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 9e42a742a3f..7876df00695 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -2650,6 +2650,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) if (!cpumask_test_cpu(cpu, buffer->cpumask)) return; + atomic_inc(&cpu_buffer->record_disabled); + spin_lock_irqsave(&cpu_buffer->reader_lock, flags); __raw_spin_lock(&cpu_buffer->lock); @@ -2659,6 +2661,8 @@ void ring_buffer_reset_cpu(struct ring_buffer *buffer, int cpu) __raw_spin_unlock(&cpu_buffer->lock); spin_unlock_irqrestore(&cpu_buffer->reader_lock, flags); + + atomic_dec(&cpu_buffer->record_disabled); } EXPORT_SYMBOL_GPL(ring_buffer_reset_cpu); From 31b6e76e21b2ffd3cb2f6fe4149790a9fdadce2d Mon Sep 17 00:00:00 2001 From: Tim Abbott Date: Thu, 30 Apr 2009 20:06:11 -0400 Subject: [PATCH 0787/2061] ftrace: use .sched.text, not .text.sched in recordmcount.pl The only references in the kernel to the .text.sched section are in recordmcount.pl. Since the code it has is intended to be example code it should refer to real kernel sections. So change it to .sched.text instead. [ Impact: consistency in comments ] Signed-off-by: Tim Abbott LKML-Reference: <1241136371-10768-1-git-send-email-tabbott@mit.edu> Acked-by: Sam Ravnborg Signed-off-by: Steven Rostedt --- scripts/recordmcount.pl | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/scripts/recordmcount.pl b/scripts/recordmcount.pl index 409596eca12..0fae7da0529 100755 --- a/scripts/recordmcount.pl +++ b/scripts/recordmcount.pl @@ -26,7 +26,7 @@ # which will also be the location of that section after final link. # e.g. # -# .section ".text.sched" +# .section ".sched.text", "ax" # .globl my_func # my_func: # [...] @@ -39,7 +39,7 @@ # [...] # # Both relocation offsets for the mcounts in the above example will be -# offset from .text.sched. If we make another file called tmp.s with: +# offset from .sched.text. If we make another file called tmp.s with: # # .section __mcount_loc # .quad my_func + 0x5 @@ -51,7 +51,7 @@ # But this gets hard if my_func is not globl (a static function). # In such a case we have: # -# .section ".text.sched" +# .section ".sched.text", "ax" # my_func: # [...] # call mcount (offset: 0x5) From 94487d6d53af5acae10cf9fd52f74498994d46b1 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 19:22:53 -0400 Subject: [PATCH 0788/2061] tracing: use proper export symbol for tracing api When adding the EXPORT_SYMBOL to some of the tracing API, I accidently used EXPORT_SYMBOL instead of EXPORT_SYMBOL_GPL. This patch fixes that mistake. [ Impact: export the tracing code only for GPL modules ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 74df029056b..4164a344e72 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -887,21 +887,21 @@ trace_current_buffer_lock_reserve(int type, unsigned long len, return trace_buffer_lock_reserve(&global_trace, type, len, flags, pc); } -EXPORT_SYMBOL(trace_current_buffer_lock_reserve); +EXPORT_SYMBOL_GPL(trace_current_buffer_lock_reserve); void trace_current_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 1); } -EXPORT_SYMBOL(trace_current_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_current_buffer_unlock_commit); void trace_nowake_buffer_unlock_commit(struct ring_buffer_event *event, unsigned long flags, int pc) { __trace_buffer_unlock_commit(&global_trace, event, flags, pc, 0); } -EXPORT_SYMBOL(trace_nowake_buffer_unlock_commit); +EXPORT_SYMBOL_GPL(trace_nowake_buffer_unlock_commit); void trace_current_buffer_discard_commit(struct ring_buffer_event *event) { From 53fc0e2259f261602a2750dcc82b8d7bf04d3c35 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 5 May 2009 13:12:48 -0400 Subject: [PATCH 0789/2061] integrity: lsm audit rule matching fix An audit subsystem change replaced AUDIT_EQUAL with Audit_equal. Update calls to security_filter_rule_init()/match() to reflect the change. Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_policy.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index b5291ad5ef5..b168c1d595c 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -96,7 +96,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule, if ((rule->flags & IMA_UID) && rule->uid != tsk->cred->uid) return false; for (i = 0; i < MAX_LSM_RULES; i++) { - int rc; + int rc = 0; u32 osid, sid; if (!rule->lsm[i].rule) @@ -109,7 +109,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule, security_inode_getsecid(inode, &osid); rc = security_filter_rule_match(osid, rule->lsm[i].type, - AUDIT_EQUAL, + Audit_equal, rule->lsm[i].rule, NULL); break; @@ -119,7 +119,7 @@ static bool ima_match_rules(struct ima_measure_rule_entry *rule, security_task_getsecid(tsk, &sid); rc = security_filter_rule_match(sid, rule->lsm[i].type, - AUDIT_EQUAL, + Audit_equal, rule->lsm[i].rule, NULL); default: @@ -227,7 +227,7 @@ static int ima_lsm_rule_init(struct ima_measure_rule_entry *entry, entry->lsm[lsm_rule].type = audit_type; result = security_filter_rule_init(entry->lsm[lsm_rule].type, - AUDIT_EQUAL, args, + Audit_equal, args, &entry->lsm[lsm_rule].rule); return result; } From e5e520a715dcea6b72f6b9417b203a4b1e813a8b Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 5 May 2009 13:13:00 -0400 Subject: [PATCH 0790/2061] integrity: use audit_log_string Based on a request from Eric Paris to simplify parsing, replace audit_log_format statements containing "%s" with audit_log_string(). Signed-off-by: Mimi Zohar Acked-by: Eric Paris Signed-off-by: James Morris --- security/integrity/ima/ima_audit.c | 17 ++++------------- 1 file changed, 4 insertions(+), 13 deletions(-) diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index 1e082bb987b..c1461150691 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c @@ -54,19 +54,10 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, audit_get_loginuid(current), audit_get_sessionid(current)); audit_log_task_context(ab); - switch (audit_msgno) { - case AUDIT_INTEGRITY_DATA: - case AUDIT_INTEGRITY_METADATA: - case AUDIT_INTEGRITY_PCR: - case AUDIT_INTEGRITY_STATUS: - audit_log_format(ab, " op=%s cause=%s", op, cause); - break; - case AUDIT_INTEGRITY_HASH: - audit_log_format(ab, " op=%s hash=%s", op, cause); - break; - default: - audit_log_format(ab, " op=%s", op); - } + audit_log_format(ab, " op="); + audit_log_string(ab, op); + audit_log_format(ab, " cause="); + audit_log_string(ab, cause); audit_log_format(ab, " comm="); audit_log_untrustedstring(ab, current->comm); if (fname) { From 07ff7a0b187f3951788f64ae1f30e8109bc8e9eb Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 5 May 2009 13:13:10 -0400 Subject: [PATCH 0791/2061] integrity: remove __setup auditing msgs Remove integrity audit messages from __setup() Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_audit.c | 13 ++----------- security/integrity/ima/ima_main.c | 16 ++-------------- 2 files changed, 4 insertions(+), 25 deletions(-) diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index c1461150691..b628eea477a 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c @@ -22,18 +22,9 @@ static int ima_audit; static int __init ima_audit_setup(char *str) { unsigned long audit; - int rc, result = 0; - char *op = "ima_audit"; - char *cause; - rc = strict_strtoul(str, 0, &audit); - if (rc || audit > 1) - result = 1; - else - ima_audit = audit; - cause = ima_audit ? "enabled" : "not_enabled"; - integrity_audit_msg(AUDIT_INTEGRITY_STATUS, NULL, NULL, - op, cause, result, 0); + if (!strict_strtoul(str, 0, &audit)) + ima_audit = audit ? 1 : 0; return 1; } __setup("ima_audit=", ima_audit_setup); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index f4e7266f5ae..122f17fc7fc 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -29,20 +29,8 @@ int ima_initialized; char *ima_hash = "sha1"; static int __init hash_setup(char *str) { - const char *op = "hash_setup"; - const char *hash = "sha1"; - int result = 0; - int audit_info = 0; - - if (strncmp(str, "md5", 3) == 0) { - hash = "md5"; - ima_hash = str; - } else if (strncmp(str, "sha1", 4) != 0) { - hash = "invalid_hash_type"; - result = 1; - } - integrity_audit_msg(AUDIT_INTEGRITY_HASH, NULL, NULL, op, hash, - result, audit_info); + if (strncmp(str, "md5", 3) == 0) + ima_hash = "md5"; return 1; } __setup("ima_hash=", hash_setup); From aa20ae8444fc6c318272c643f856d8d8ad3e198d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 21:16:11 -0400 Subject: [PATCH 0792/2061] ring-buffer: move big if statement down In the hot path of the ring buffer "__rb_reserve_next" there's a big if statement that does not even return back to the work flow. code; if (cross to next page) { [ lots of code ] return; } more code; The condition is even the unlikely path, although we do not denote it with an unlikely because gcc is fine with it. The condition is true when the write crosses a page boundary, and we need to start at a new page. Having this if statement makes it hard to read, but calling another function to do the work is also not appropriate, because we are using a lot of variables that were set before the if statement, and we do not want to send them as parameters. This patch changes it to a goto: code; if (cross to next page) goto next_page; more code; return; next_page: [ lots of code] This makes the code easier to understand, and a bit more obvious. The output from gcc is practically identical. For some reason, gcc decided to use different registers when I switched it to a goto. But other than that, the logic is the same. [ Impact: easier to read code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 224 +++++++++++++++++++------------------ 1 file changed, 114 insertions(+), 110 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 7876df00695..424129eb20a 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1159,6 +1159,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, unsigned type, unsigned long length, u64 *ts) { struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; + struct buffer_page *next_page; unsigned long tail, write; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; @@ -1173,116 +1174,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, tail = write - length; /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) { - struct buffer_page *next_page = tail_page; - - local_irq_save(flags); - /* - * Since the write to the buffer is still not - * fully lockless, we must be careful with NMIs. - * The locks in the writers are taken when a write - * crosses to a new page. The locks protect against - * races with the readers (this will soon be fixed - * with a lockless solution). - * - * Because we can not protect against NMIs, and we - * want to keep traces reentrant, we need to manage - * what happens when we are in an NMI. - * - * NMIs can happen after we take the lock. - * If we are in an NMI, only take the lock - * if it is not already taken. Otherwise - * simply fail. - */ - if (unlikely(in_nmi())) { - if (!__raw_spin_trylock(&cpu_buffer->lock)) { - cpu_buffer->nmi_dropped++; - goto out_reset; - } - } else - __raw_spin_lock(&cpu_buffer->lock); - - lock_taken = true; - - rb_inc_page(cpu_buffer, &next_page); - - head_page = cpu_buffer->head_page; - reader_page = cpu_buffer->reader_page; - - /* we grabbed the lock before incrementing */ - if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) - goto out_reset; - - /* - * If for some reason, we had an interrupt storm that made - * it all the way around the buffer, bail, and warn - * about it. - */ - if (unlikely(next_page == commit_page)) { - cpu_buffer->commit_overrun++; - goto out_reset; - } - - if (next_page == head_page) { - if (!(buffer->flags & RB_FL_OVERWRITE)) - goto out_reset; - - /* tail_page has not moved yet? */ - if (tail_page == cpu_buffer->tail_page) { - /* count overflows */ - cpu_buffer->overrun += - local_read(&head_page->entries); - - rb_inc_page(cpu_buffer, &head_page); - cpu_buffer->head_page = head_page; - cpu_buffer->head_page->read = 0; - } - } - - /* - * If the tail page is still the same as what we think - * it is, then it is up to us to update the tail - * pointer. - */ - if (tail_page == cpu_buffer->tail_page) { - local_set(&next_page->write, 0); - local_set(&next_page->entries, 0); - local_set(&next_page->page->commit, 0); - cpu_buffer->tail_page = next_page; - - /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); - cpu_buffer->tail_page->page->time_stamp = *ts; - } - - /* - * The actual tail page has moved forward. - */ - if (tail < BUF_PAGE_SIZE) { - /* Mark the rest of the page with padding */ - event = __rb_page_index(tail_page, tail); - rb_event_set_padding(event); - } - - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); - - /* - * If this was a commit entry that failed, - * increment that too - */ - if (tail_page == cpu_buffer->commit_page && - tail == rb_commit_index(cpu_buffer)) { - rb_set_commit_to_write(cpu_buffer); - } - - __raw_spin_unlock(&cpu_buffer->lock); - local_irq_restore(flags); - - /* fail and let the caller try again */ - return ERR_PTR(-EAGAIN); - } + if (write > BUF_PAGE_SIZE) + goto next_page; /* We reserved something on the buffer */ @@ -1305,6 +1198,117 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return event; + next_page: + + next_page = tail_page; + + local_irq_save(flags); + /* + * Since the write to the buffer is still not + * fully lockless, we must be careful with NMIs. + * The locks in the writers are taken when a write + * crosses to a new page. The locks protect against + * races with the readers (this will soon be fixed + * with a lockless solution). + * + * Because we can not protect against NMIs, and we + * want to keep traces reentrant, we need to manage + * what happens when we are in an NMI. + * + * NMIs can happen after we take the lock. + * If we are in an NMI, only take the lock + * if it is not already taken. Otherwise + * simply fail. + */ + if (unlikely(in_nmi())) { + if (!__raw_spin_trylock(&cpu_buffer->lock)) { + cpu_buffer->nmi_dropped++; + goto out_reset; + } + } else + __raw_spin_lock(&cpu_buffer->lock); + + lock_taken = true; + + rb_inc_page(cpu_buffer, &next_page); + + head_page = cpu_buffer->head_page; + reader_page = cpu_buffer->reader_page; + + /* we grabbed the lock before incrementing */ + if (RB_WARN_ON(cpu_buffer, next_page == reader_page)) + goto out_reset; + + /* + * If for some reason, we had an interrupt storm that made + * it all the way around the buffer, bail, and warn + * about it. + */ + if (unlikely(next_page == commit_page)) { + cpu_buffer->commit_overrun++; + goto out_reset; + } + + if (next_page == head_page) { + if (!(buffer->flags & RB_FL_OVERWRITE)) + goto out_reset; + + /* tail_page has not moved yet? */ + if (tail_page == cpu_buffer->tail_page) { + /* count overflows */ + cpu_buffer->overrun += + local_read(&head_page->entries); + + rb_inc_page(cpu_buffer, &head_page); + cpu_buffer->head_page = head_page; + cpu_buffer->head_page->read = 0; + } + } + + /* + * If the tail page is still the same as what we think + * it is, then it is up to us to update the tail + * pointer. + */ + if (tail_page == cpu_buffer->tail_page) { + local_set(&next_page->write, 0); + local_set(&next_page->entries, 0); + local_set(&next_page->page->commit, 0); + cpu_buffer->tail_page = next_page; + + /* reread the time stamp */ + *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + cpu_buffer->tail_page->page->time_stamp = *ts; + } + + /* + * The actual tail page has moved forward. + */ + if (tail < BUF_PAGE_SIZE) { + /* Mark the rest of the page with padding */ + event = __rb_page_index(tail_page, tail); + rb_event_set_padding(event); + } + + if (tail <= BUF_PAGE_SIZE) + /* Set the write back to the previous setting */ + local_set(&tail_page->write, tail); + + /* + * If this was a commit entry that failed, + * increment that too + */ + if (tail_page == cpu_buffer->commit_page && + tail == rb_commit_index(cpu_buffer)) { + rb_set_commit_to_write(cpu_buffer); + } + + __raw_spin_unlock(&cpu_buffer->lock); + local_irq_restore(flags); + + /* fail and let the caller try again */ + return ERR_PTR(-EAGAIN); + out_reset: /* reset write */ if (tail <= BUF_PAGE_SIZE) From c898faf91b3ec6b0f6efa35831b3984fa3331db0 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Tue, 5 May 2009 17:28:56 -0400 Subject: [PATCH 0793/2061] x86: 46 bit physical address support on 64 bits Extend the maximum addressable memory on x86-64 from 2^44 to 2^46 bytes. This requires some shuffling around of the vmalloc and virtual memmap memory areas, to keep them away from the direct mapping of up to 64TB of physical memory. This patch also introduces a guard hole between the vmalloc area and the virtual memory map space. There's really no good reason why we wouldn't have a guard hole there. [ Impact: future hardware enablement ] Signed-off-by: Rik van Riel LKML-Reference: <20090505172856.6820db22@cuia.bos.redhat.com> Signed-off-by: H. Peter Anvin --- Documentation/x86/x86_64/mm.txt | 9 +++++---- arch/x86/include/asm/page_64_types.h | 2 +- arch/x86/include/asm/pgtable_64_types.h | 8 ++++---- arch/x86/include/asm/sparsemem.h | 2 +- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 29b52b14d0b..53941323584 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,10 +6,11 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff880000000000 - ffffc0ffffffffff (=57 TB) direct mapping of all phys. memory -ffffc10000000000 - ffffc1ffffffffff (=40 bits) hole -ffffc20000000000 - ffffe1ffffffffff (=45 bits) vmalloc/ioremap space -ffffe20000000000 - ffffe2ffffffffff (=40 bits) virtual memory map (1TB) +ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory +ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole +ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space +ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole +ffffea0000000000 - ffffeaffffffffff (=40 bits) virtual memory map (1TB) ... unused hole ... ffffffff80000000 - ffffffffa0000000 (=512 MB) kernel text mapping, from phys 0 ffffffffa0000000 - fffffffffff00000 (=1536 MB) module mapping space diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index 3f587188ae6..6fadb020bd2 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -47,7 +47,7 @@ #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) -/* See Documentation/x86_64/mm.txt for a description of the memory map. */ +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define __PHYSICAL_MASK_SHIFT 46 #define __VIRTUAL_MASK_SHIFT 48 diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h index fbf42b8e038..766ea16fbbb 100644 --- a/arch/x86/include/asm/pgtable_64_types.h +++ b/arch/x86/include/asm/pgtable_64_types.h @@ -51,11 +51,11 @@ typedef struct { pteval_t pte; } pte_t; #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) - +/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ #define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) -#define VMALLOC_START _AC(0xffffc20000000000, UL) -#define VMALLOC_END _AC(0xffffe1ffffffffff, UL) -#define VMEMMAP_START _AC(0xffffe20000000000, UL) +#define VMALLOC_START _AC(0xffffc90000000000, UL) +#define VMALLOC_END _AC(0xffffe8ffffffffff, UL) +#define VMEMMAP_START _AC(0xffffea0000000000, UL) #define MODULES_VADDR _AC(0xffffffffa0000000, UL) #define MODULES_END _AC(0xffffffffff000000, UL) #define MODULES_LEN (MODULES_END - MODULES_VADDR) diff --git a/arch/x86/include/asm/sparsemem.h b/arch/x86/include/asm/sparsemem.h index e3cc3c063ec..4517d6b9318 100644 --- a/arch/x86/include/asm/sparsemem.h +++ b/arch/x86/include/asm/sparsemem.h @@ -27,7 +27,7 @@ #else /* CONFIG_X86_32 */ # define SECTION_SIZE_BITS 27 /* matt - 128 is convenient right now */ # define MAX_PHYSADDR_BITS 44 -# define MAX_PHYSMEM_BITS 44 /* Can be max 45 bits */ +# define MAX_PHYSMEM_BITS 46 #endif #endif /* CONFIG_SPARSEMEM */ From 2feceeff1e771850e49f9074307f071964fd9e3e Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 19:07:07 -0700 Subject: [PATCH 0794/2061] x86: fix typo in address space documentation Fix a trivial typo in Documentation/x86/x86_64/mm.txt. [ Impact: documentation only ] Signed-off-by: H. Peter Anvin Cc: Rik van Riel --- Documentation/x86/x86_64/mm.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/x86/x86_64/mm.txt b/Documentation/x86/x86_64/mm.txt index 53941323584..d6498e3cd71 100644 --- a/Documentation/x86/x86_64/mm.txt +++ b/Documentation/x86/x86_64/mm.txt @@ -6,7 +6,7 @@ Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm hole caused by [48:63] sign extension ffff800000000000 - ffff80ffffffffff (=40 bits) guard hole -ffff880000000000 - ffffc8ffffffffff (=64 TB) direct mapping of all phys. memory +ffff880000000000 - ffffc7ffffffffff (=64 TB) direct mapping of all phys. memory ffffc80000000000 - ffffc8ffffffffff (=40 bits) hole ffffc90000000000 - ffffe8ffffffffff (=45 bits) vmalloc/ioremap space ffffe90000000000 - ffffe9ffffffffff (=40 bits) hole From 5092dbc96f3acdac5433b27c06860352dc6d23b9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Tue, 5 May 2009 22:47:18 -0400 Subject: [PATCH 0795/2061] ring-buffer: add benchmark and tester This patch adds code that can benchmark the ring buffer as well as test it. This code can be compiled into the kernel (not recommended) or as a module. A separate ring buffer is used to not interfer with other users, like ftrace. It creates a producer and a consumer (option to disable creation of the consumer) and will run for 10 seconds, then sleep for 10 seconds and then repeat. While running, the producer will write 10 byte loads into the ring buffer with just putting in the current CPU number. The reader will continually try to read the buffer. The reader will alternate from reading the buffer via event by event, or by full pages. The output is a pr_info, thus it will fill up the syslogs. Starting ring buffer hammer End ring buffer hammer Time: 9000349 (usecs) Overruns: 12578640 Read: 5358440 (by events) Entries: 0 Total: 17937080 Missed: 0 Hit: 17937080 Entries per millisec: 1993 501 ns per entry Sleeping for 10 secs Starting ring buffer hammer End ring buffer hammer Time: 9936350 (usecs) Overruns: 0 Read: 28146644 (by pages) Entries: 74 Total: 28146718 Missed: 0 Hit: 28146718 Entries per millisec: 2832 353 ns per entry Sleeping for 10 secs Time: is the time the test ran Overruns: the number of events that were overwritten and not read Read: the number of events read (either by pages or events) Entries: the number of entries left in the buffer (the by pages will only read full pages) Total: Entries + Read + Overruns Missed: the number of entries that failed to write Hit: the number of entries that were written The above example shows that it takes ~353 nanosecs per entry when there is a reader, reading by pages (and no overruns) The event by event reader slowed the producer down to 501 nanosecs. [ Impact: see how changes to the ring buffer affect stability and performance ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 16 ++ kernel/trace/Makefile | 1 + kernel/trace/ring_buffer_benchmark.c | 379 +++++++++++++++++++++++++++ 3 files changed, 396 insertions(+) create mode 100644 kernel/trace/ring_buffer_benchmark.c diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 450d3c2cfbd..50f62a296e1 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -471,6 +471,22 @@ config MMIOTRACE_TEST Say N, unless you absolutely know what you are doing. +config RING_BUFFER_BENCHMARK + tristate "Ring buffer benchmark stress tester" + depends on RING_BUFFER + help + This option creates a test to stress the ring buffer and bench mark it. + It creates its own ring buffer such that it will not interfer with + any other users of the ring buffer (such as ftrace). It then creates + a producer and consumer that will run for 10 seconds and sleep for + 10 seconds. Each interval it will print out the number of events + it recorded and give a rough estimate of how long each iteration took. + + It does not disable interrupts or raise its priority, so it may be + affected by processes that are running. + + If unsure, say N + endif # FTRACE endif # TRACING_SUPPORT diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index fb9d7f96489..7c34cbfff96 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -17,6 +17,7 @@ endif obj-$(CONFIG_FUNCTION_TRACER) += libftrace.o obj-$(CONFIG_RING_BUFFER) += ring_buffer.o +obj-$(CONFIG_RING_BUFFER_BENCHMARK) += ring_buffer_benchmark.o obj-$(CONFIG_TRACING) += trace.o obj-$(CONFIG_TRACING) += trace_clock.o diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c new file mode 100644 index 00000000000..747244acb8f --- /dev/null +++ b/kernel/trace/ring_buffer_benchmark.c @@ -0,0 +1,379 @@ +/* + * ring buffer tester and benchmark + * + * Copyright (C) 2009 Steven Rostedt + */ +#include +#include +#include +#include +#include + +struct rb_page { + u64 ts; + local_t commit; + char data[4080]; +}; + +/* run time and sleep time in seconds */ +#define RUN_TIME 10 +#define SLEEP_TIME 10 + +/* number of events for writer to wake up the reader */ +static int wakeup_interval = 100; + +static int reader_finish; +static struct completion read_start; +static struct completion read_done; + +static struct ring_buffer *buffer; +static struct task_struct *producer; +static struct task_struct *consumer; +static unsigned long read; + +static int disable_reader; +module_param(disable_reader, uint, 0644); +MODULE_PARM_DESC(disable_reader, "only run producer"); + +static int read_events; + +static int kill_test; + +#define KILL_TEST() \ + do { \ + if (!kill_test) { \ + kill_test = 1; \ + WARN_ON(1); \ + } \ + } while (0) + +enum event_status { + EVENT_FOUND, + EVENT_DROPPED, +}; + +static enum event_status read_event(int cpu) +{ + struct ring_buffer_event *event; + int *entry; + u64 ts; + + event = ring_buffer_consume(buffer, cpu, &ts); + if (!event) + return EVENT_DROPPED; + + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + return EVENT_DROPPED; + } + + read++; + return EVENT_FOUND; +} + +static enum event_status read_page(int cpu) +{ + struct ring_buffer_event *event; + struct rb_page *rpage; + unsigned long commit; + void *bpage; + int *entry; + int ret; + int inc; + int i; + + bpage = ring_buffer_alloc_read_page(buffer); + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); + if (ret >= 0) { + rpage = bpage; + commit = local_read(&rpage->commit); + for (i = 0; i < commit && !kill_test; i += inc) { + + if (i >= (PAGE_SIZE - offsetof(struct rb_page, data))) { + KILL_TEST(); + break; + } + + inc = -1; + event = (void *)&rpage->data[i]; + switch (event->type_len) { + case RINGBUF_TYPE_PADDING: + /* We don't expect any padding */ + KILL_TEST(); + break; + case RINGBUF_TYPE_TIME_EXTEND: + inc = 8; + break; + case 0: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + if (!event->array[0]) { + KILL_TEST(); + break; + } + inc = event->array[0]; + break; + default: + entry = ring_buffer_event_data(event); + if (*entry != cpu) { + KILL_TEST(); + break; + } + read++; + inc = ((event->type_len + 1) * 4); + } + if (kill_test) + break; + + if (inc <= 0) { + KILL_TEST(); + break; + } + } + } + ring_buffer_free_read_page(buffer, bpage); + + if (ret < 0) + return EVENT_DROPPED; + return EVENT_FOUND; +} + +static void ring_buffer_consumer(void) +{ + /* toggle between reading pages and events */ + read_events ^= 1; + + read = 0; + while (!reader_finish && !kill_test) { + int found; + + do { + int cpu; + + found = 0; + for_each_online_cpu(cpu) { + enum event_status stat; + + if (read_events) + stat = read_event(cpu); + else + stat = read_page(cpu); + + if (kill_test) + break; + if (stat == EVENT_FOUND) + found = 1; + } + } while (found && !kill_test); + + set_current_state(TASK_INTERRUPTIBLE); + if (reader_finish) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + reader_finish = 0; + complete(&read_done); +} + +static void ring_buffer_producer(void) +{ + struct timeval start_tv; + struct timeval end_tv; + unsigned long long time; + unsigned long long entries; + unsigned long long overruns; + unsigned long missed = 0; + unsigned long hit = 0; + unsigned long avg; + int cnt = 0; + + /* + * Hammer the buffer for 10 secs (this may + * make the system stall) + */ + pr_info("Starting ring buffer hammer\n"); + do_gettimeofday(&start_tv); + do { + struct ring_buffer_event *event; + int *entry; + + event = ring_buffer_lock_reserve(buffer, 10); + if (!event) { + missed++; + } else { + hit++; + entry = ring_buffer_event_data(event); + *entry = smp_processor_id(); + ring_buffer_unlock_commit(buffer, event); + } + do_gettimeofday(&end_tv); + + if (consumer && !(++cnt % wakeup_interval)) + wake_up_process(consumer); + + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); + pr_info("End ring buffer hammer\n"); + + if (consumer) { + /* Init both completions here to avoid races */ + init_completion(&read_start); + init_completion(&read_done); + /* the completions must be visible before the finish var */ + smp_wmb(); + reader_finish = 1; + /* finish var visible before waking up the consumer */ + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_done); + } + + time = end_tv.tv_sec - start_tv.tv_sec; + time *= 1000000; + time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); + + entries = ring_buffer_entries(buffer); + overruns = ring_buffer_overruns(buffer); + + if (kill_test) + pr_info("ERROR!\n"); + pr_info("Time: %lld (usecs)\n", time); + pr_info("Overruns: %lld\n", overruns); + if (disable_reader) + pr_info("Read: (reader disabled)\n"); + else + pr_info("Read: %ld (by %s)\n", read, + read_events ? "events" : "pages"); + pr_info("Entries: %lld\n", entries); + pr_info("Total: %lld\n", entries + overruns + read); + pr_info("Missed: %ld\n", missed); + pr_info("Hit: %ld\n", hit); + + do_div(time, 1000); + if (time) + hit /= (long)time; + else + pr_info("TIME IS ZERO??\n"); + + pr_info("Entries per millisec: %ld\n", hit); + + if (hit) { + avg = 1000000 / hit; + pr_info("%ld ns per entry\n", avg); + } +} + +static void wait_to_die(void) +{ + set_current_state(TASK_INTERRUPTIBLE); + while (!kthread_should_stop()) { + schedule(); + set_current_state(TASK_INTERRUPTIBLE); + } + __set_current_state(TASK_RUNNING); +} + +static int ring_buffer_consumer_thread(void *arg) +{ + while (!kthread_should_stop() && !kill_test) { + complete(&read_start); + + ring_buffer_consumer(); + + set_current_state(TASK_INTERRUPTIBLE); + if (kthread_should_stop() || kill_test) + break; + + schedule(); + __set_current_state(TASK_RUNNING); + } + __set_current_state(TASK_RUNNING); + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int ring_buffer_producer_thread(void *arg) +{ + init_completion(&read_start); + + while (!kthread_should_stop() && !kill_test) { + ring_buffer_reset(buffer); + + if (consumer) { + smp_wmb(); + wake_up_process(consumer); + wait_for_completion(&read_start); + } + + ring_buffer_producer(); + + pr_info("Sleeping for 10 secs\n"); + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(HZ * SLEEP_TIME); + __set_current_state(TASK_RUNNING); + } + + if (kill_test) + wait_to_die(); + + return 0; +} + +static int __init ring_buffer_benchmark_init(void) +{ + int ret; + + /* make a one meg buffer in overwite mode */ + buffer = ring_buffer_alloc(1000000, RB_FL_OVERWRITE); + if (!buffer) + return -ENOMEM; + + if (!disable_reader) { + consumer = kthread_create(ring_buffer_consumer_thread, + NULL, "rb_consumer"); + ret = PTR_ERR(consumer); + if (IS_ERR(consumer)) + goto out_fail; + } + + producer = kthread_run(ring_buffer_producer_thread, + NULL, "rb_producer"); + ret = PTR_ERR(producer); + + if (IS_ERR(producer)) + goto out_kill; + + return 0; + + out_kill: + if (consumer) + kthread_stop(consumer); + + out_fail: + ring_buffer_free(buffer); + return ret; +} + +static void __exit ring_buffer_benchmark_exit(void) +{ + kthread_stop(producer); + if (consumer) + kthread_stop(consumer); + ring_buffer_free(buffer); +} + +module_init(ring_buffer_benchmark_init); +module_exit(ring_buffer_benchmark_exit); + +MODULE_AUTHOR("Steven Rostedt"); +MODULE_DESCRIPTION("ring_buffer_benchmark"); +MODULE_LICENSE("GPL"); From fd6da10a617f483348ee32bcfe53fd20c302eca1 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:13 +0800 Subject: [PATCH 0796/2061] tracing/events: don't say hi when loading the trace event sample The sample is useful for testing, and I'm using it. But after loading the module, it keeps saying hi every 10 seconds, this may be disturbing. Also Steven said commenting out the "hi" helped in causing races. :) [ Impact: make testing a bit easier ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6AD.2070008@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/trace_events/trace-events-sample.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/samples/trace_events/trace-events-sample.c b/samples/trace_events/trace-events-sample.c index f33b3ba744a..aabc4e97091 100644 --- a/samples/trace_events/trace-events-sample.c +++ b/samples/trace_events/trace-events-sample.c @@ -16,10 +16,6 @@ static void simple_thread_func(int cnt) set_current_state(TASK_INTERRUPTIBLE); schedule_timeout(HZ); trace_foo_bar("hello", cnt); - - if (!(cnt % 10)) - /* It is really important that I say "hi!" */ - printk(KERN_EMERG "hi!\n"); } static int simple_thread(void *arg) From 96d17980fabeb757706d2d6db5a28580a6156bfc Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:32:32 +0800 Subject: [PATCH 0797/2061] tracing/events: make SAMPLE_TRACE_EVENTS default to n Normally a config should be default to n. This patch also makes the sample module-only, like SAMPLE_MARKERS and SAMPLE_TRACEPOINTS. [ Impact: don't build trace event sample by default ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A00F6C0.8090803@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- samples/Kconfig | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/samples/Kconfig b/samples/Kconfig index 93f41c05109..b75d28cba3f 100644 --- a/samples/Kconfig +++ b/samples/Kconfig @@ -20,9 +20,8 @@ config SAMPLE_TRACEPOINTS This build tracepoints example modules. config SAMPLE_TRACE_EVENTS - tristate "Build trace_events examples" - depends on EVENT_TRACING - default m + tristate "Build trace_events examples -- loadable modules only" + depends on EVENT_TRACING && m help This build trace event example modules. From 2df75e415709ad12862028916c772c1f377f6a7c Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:04 +0800 Subject: [PATCH 0798/2061] tracing/events: fix memory leak when unloading module When unloading a module, memory allocated by init_preds() and trace_define_field() is not freed. [ Impact: fix memory leak ] Signed-off-by: Li Zefan Acked-by: Frederic Weisbecker Acked-by: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A00F6E0.3040503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/linux/ftrace_event.h | 1 + kernel/trace/trace_events.c | 18 ++++++++++++++++++ kernel/trace/trace_events_filter.c | 22 +++++++++++++++------- 3 files changed, 34 insertions(+), 7 deletions(-) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 5fff40c9ff5..662c1becf36 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -116,6 +116,7 @@ struct ftrace_event_call { #define MAX_FILTER_STR_VAL 128 extern int init_preds(struct ftrace_event_call *call); +extern void destroy_preds(struct ftrace_event_call *call); extern int filter_match_preds(struct ftrace_event_call *call, void *rec); extern int filter_current_check_discard(struct ftrace_event_call *call, void *rec, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f789ca540fe..f251a150e75 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -60,6 +60,22 @@ err: } EXPORT_SYMBOL_GPL(trace_define_field); +#ifdef CONFIG_MODULES + +static void trace_destroy_fields(struct ftrace_event_call *call) +{ + struct ftrace_event_field *field, *next; + + list_for_each_entry_safe(field, next, &call->fields, link) { + list_del(&field->link); + kfree(field->type); + kfree(field->name); + kfree(field); + } +} + +#endif /* CONFIG_MODULES */ + static void ftrace_clear_events(void) { struct ftrace_event_call *call; @@ -925,6 +941,8 @@ static void trace_module_remove_events(struct module *mod) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); list_del(&call->list); + trace_destroy_fields(call); + destroy_preds(call); } } diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index f49486687ee..ce07b818671 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -346,6 +346,20 @@ static void filter_disable_preds(struct ftrace_event_call *call) filter->preds[i]->fn = filter_pred_none; } +void destroy_preds(struct ftrace_event_call *call) +{ + struct event_filter *filter = call->filter; + int i; + + for (i = 0; i < MAX_FILTER_PRED; i++) { + if (filter->preds[i]) + filter_free_pred(filter->preds[i]); + } + kfree(filter->preds); + kfree(filter); + call->filter = NULL; +} + int init_preds(struct ftrace_event_call *call) { struct event_filter *filter; @@ -374,13 +388,7 @@ int init_preds(struct ftrace_event_call *call) return 0; oom: - for (i = 0; i < MAX_FILTER_PRED; i++) { - if (filter->preds[i]) - filter_free_pred(filter->preds[i]); - } - kfree(filter->preds); - kfree(call->filter); - call->filter = NULL; + destroy_preds(call); return -ENOMEM; } From 20c8928abe70e204bd077ab6cfe23002d7788983 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Wed, 6 May 2009 10:33:45 +0800 Subject: [PATCH 0799/2061] tracing/events: fix concurrent access to ftrace_events list A module will add/remove its trace events when it gets loaded/unloaded, so the ftrace_events list is not "const", and concurrent access needs to be protected. This patch thus fixes races between loading/unloding modules and read 'available_events' or read/write 'set_event', etc. Below shows how to reproduce the race: # for ((; ;)) { cat /mnt/tracing/available_events; } > /dev/null & # for ((; ;)) { insmod trace-events-sample.ko; rmmod sample; } & After a while: BUG: unable to handle kernel paging request at 0010011c IP: [] t_next+0x1b/0x2d ... Call Trace: [] ? seq_read+0x217/0x30d [] ? seq_read+0x0/0x30d [] ? vfs_read+0x8f/0x136 [] ? sys_read+0x40/0x65 [] ? sysenter_do_call+0x12/0x36 [ Impact: fix races when concurrent accessing ftrace_events list ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker Cc: Tom Zanussi Cc: Peter Zijlstra LKML-Reference: <4A00F709.3080800@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace.h | 1 + kernel/trace/trace_event_profile.c | 19 ++++++++++++++----- kernel/trace/trace_events.c | 20 +++++++++++--------- kernel/trace/trace_events_filter.c | 10 +++++++--- 4 files changed, 33 insertions(+), 17 deletions(-) diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 7736fe8c1b7..777c6c3a0cd 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -825,6 +825,7 @@ static int filter_pred_##size(struct filter_pred *pred, void *event, \ return match; \ } +extern struct mutex event_mutex; extern struct list_head ftrace_events; extern const char *__start___trace_bprintk_fmt[]; diff --git a/kernel/trace/trace_event_profile.c b/kernel/trace/trace_event_profile.c index 7bf2ad65eee..5b5895afecf 100644 --- a/kernel/trace/trace_event_profile.c +++ b/kernel/trace/trace_event_profile.c @@ -10,21 +10,30 @@ int ftrace_profile_enable(int event_id) { struct ftrace_event_call *event; + int ret = -EINVAL; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_enable(event); + if (event->id == event_id) { + ret = event->profile_enable(event); + break; + } } + mutex_unlock(&event_mutex); - return -EINVAL; + return ret; } void ftrace_profile_disable(int event_id) { struct ftrace_event_call *event; + mutex_lock(&event_mutex); list_for_each_entry(event, &ftrace_events, list) { - if (event->id == event_id) - return event->profile_disable(event); + if (event->id == event_id) { + event->profile_disable(event); + break; + } } + mutex_unlock(&event_mutex); } diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f251a150e75..8d579ff2361 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -21,7 +21,7 @@ #define TRACE_SYSTEM "TRACE_SYSTEM" -static DEFINE_MUTEX(event_mutex); +DEFINE_MUTEX(event_mutex); LIST_HEAD(ftrace_events); @@ -80,6 +80,7 @@ static void ftrace_clear_events(void) { struct ftrace_event_call *call; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (call->enabled) { @@ -87,6 +88,7 @@ static void ftrace_clear_events(void) call->unregfunc(); } } + mutex_unlock(&event_mutex); } static void ftrace_event_enable_disable(struct ftrace_event_call *call, @@ -274,6 +276,9 @@ t_next(struct seq_file *m, void *v, loff_t *pos) static void *t_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return t_next(m, NULL, pos); } @@ -303,6 +308,9 @@ s_next(struct seq_file *m, void *v, loff_t *pos) static void *s_start(struct seq_file *m, loff_t *pos) { + mutex_lock(&event_mutex); + if (*pos == 0) + m->private = ftrace_events.next; return s_next(m, NULL, pos); } @@ -319,12 +327,12 @@ static int t_show(struct seq_file *m, void *v) static void t_stop(struct seq_file *m, void *p) { + mutex_unlock(&event_mutex); } static int ftrace_event_seq_open(struct inode *inode, struct file *file) { - int ret; const struct seq_operations *seq_ops; if ((file->f_mode & FMODE_WRITE) && @@ -332,13 +340,7 @@ ftrace_event_seq_open(struct inode *inode, struct file *file) ftrace_clear_events(); seq_ops = inode->i_private; - ret = seq_open(file, seq_ops); - if (!ret) { - struct seq_file *m = file->private_data; - - m->private = ftrace_events.next; - } - return ret; + return seq_open(file, seq_ops); } static ssize_t diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index ce07b818671..7ac69108527 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -408,6 +408,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) filter->n_preds = 0; } + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->define_fields) continue; @@ -417,6 +418,7 @@ static void filter_free_subsystem_preds(struct event_subsystem *system) remove_filter_string(call->filter); } } + mutex_unlock(&event_mutex); } static int filter_add_pred_fn(struct filter_parse_state *ps, @@ -567,6 +569,7 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, { struct event_filter *filter = system->filter; struct ftrace_event_call *call; + int err = 0; if (!filter->preds) { filter->preds = kzalloc(MAX_FILTER_PRED * sizeof(pred), @@ -584,8 +587,8 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, filter->preds[filter->n_preds] = pred; filter->n_preds++; + mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { - int err; if (!call->define_fields) continue; @@ -597,12 +600,13 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, if (err) { filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - return err; + break; } replace_filter_string(call->filter, filter_string); } + mutex_unlock(&event_mutex); - return 0; + return err; } static void parse_init(struct filter_parse_state *ps, From de1d7286060430e79a1d50ad6e5fee8fe863c5f6 Mon Sep 17 00:00:00 2001 From: Mathieu Desnoyers Date: Tue, 5 May 2009 16:49:59 +0800 Subject: [PATCH 0800/2061] tracepoint: trace_sched_migrate_task(): remove parameter The orig_cpu parameter in trace_sched_migrate_task() is not necessary, it can be got by using task_cpu(p) in the probe. [ Impact: micro-optimization ] Signed-off-by: Mathieu Desnoyers [ modified from Mathieu's patch. The original patch is at: http://marc.info/?l=linux-kernel&m=123791201716239&w=2 ] Signed-off-by: Xiao Guangrong Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org Cc: Li Zefan Cc: zhaolei@cn.fujitsu.com Cc: laijs@cn.fujitsu.com LKML-Reference: <49FFFDB7.1050402@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/events/sched.h | 6 +++--- kernel/sched.c | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index ffa1cab586b..dd4033cf5b0 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -180,9 +180,9 @@ TRACE_EVENT(sched_switch, */ TRACE_EVENT(sched_migrate_task, - TP_PROTO(struct task_struct *p, int orig_cpu, int dest_cpu), + TP_PROTO(struct task_struct *p, int dest_cpu), - TP_ARGS(p, orig_cpu, dest_cpu), + TP_ARGS(p, dest_cpu), TP_STRUCT__entry( __array( char, comm, TASK_COMM_LEN ) @@ -196,7 +196,7 @@ TRACE_EVENT(sched_migrate_task, memcpy(__entry->comm, p->comm, TASK_COMM_LEN); __entry->pid = p->pid; __entry->prio = p->prio; - __entry->orig_cpu = orig_cpu; + __entry->orig_cpu = task_cpu(p); __entry->dest_cpu = dest_cpu; ), diff --git a/kernel/sched.c b/kernel/sched.c index 9f7ffd00b6e..9cdedbd181c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -1954,7 +1954,7 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) clock_offset = old_rq->clock - new_rq->clock; - trace_sched_migrate_task(p, task_cpu(p), new_cpu); + trace_sched_migrate_task(p, new_cpu); #ifdef CONFIG_SCHEDSTATS if (p->se.wait_start) From a42aaa3bbce85ac487ad4fad5db99e8e91b7aac1 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:27:26 -0400 Subject: [PATCH 0801/2061] blktrace: correct remap names This attempts to clarify names utilized during block I/O remap operations (partition, volume manager). It correctly matches up the /from/ information for both device & sector. This takes in the concept from Kosaki Motohiro and extends it to include better naming for the "device_from" field. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF4FAE.3000301@hp.com> Signed-off-by: Ingo Molnar --- include/linux/blktrace_api.h | 4 ++-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 24 ++++++++++++------------ 3 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 62763c95285..82b4636030e 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -116,9 +116,9 @@ struct blk_io_trace { * The remap event */ struct blk_io_trace_remap { - __be32 device; __be32 device_from; - __be64 sector; + __be32 device_to; + __be64 sector_from; }; enum { diff --git a/include/trace/block.h b/include/trace/block.h index 25b7068b819..87f6456fd32 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t from, sector_t to), - TP_ARGS(q, bio, dev, from, to)); + sector_t to, sector_t from), + TP_ARGS(q, bio, dev, to, from)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index c32062bd10b..f8d46d6f5d3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,8 +830,8 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @from: source sector * @to: target sector + * @from: source sector * * Description: * Device mapper or raid target sometimes need to split a bio because @@ -839,7 +839,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t from, sector_t to) + dev_t dev, sector_t to, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -847,9 +847,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, if (likely(!bt)) return; - r.device = cpu_to_be32(dev); - r.device_from = cpu_to_be32(bio->bi_bdev->bd_dev); - r.sector = cpu_to_be64(to); + r.device_from = cpu_to_be32(dev); + r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); + r.sector_from = cpu_to_be64(from); __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); @@ -1028,11 +1028,11 @@ static void get_pdu_remap(const struct trace_entry *ent, struct blk_io_trace_remap *r) { const struct blk_io_trace_remap *__r = pdu_start(ent); - __u64 sector = __r->sector; + __u64 sector_from = __r->sector_from; - r->device = be32_to_cpu(__r->device); r->device_from = be32_to_cpu(__r->device_from); - r->sector = be64_to_cpu(sector); + r->device_to = be32_to_cpu(__r->device_to); + r->sector_from = be64_to_cpu(sector_from); } typedef int (blk_log_action_t) (struct trace_iterator *iter, const char *act); @@ -1148,13 +1148,13 @@ static int blk_log_with_error(struct trace_seq *s, static int blk_log_remap(struct trace_seq *s, const struct trace_entry *ent) { - struct blk_io_trace_remap r = { .device = 0, }; + struct blk_io_trace_remap r = { .device_from = 0, }; get_pdu_remap(ent, &r); return trace_seq_printf(s, "%llu + %u <- (%d,%d) %llu\n", - t_sector(ent), - t_sec(ent), MAJOR(r.device), MINOR(r.device), - (unsigned long long)r.sector); + t_sector(ent), t_sec(ent), + MAJOR(r.device_from), MINOR(r.device_from), + (unsigned long long)r.sector_from); } static int blk_log_plug(struct trace_seq *s, const struct trace_entry *ent) From 22a7c31a9659deaddafbbcec6562d44141e84474 Mon Sep 17 00:00:00 2001 From: "Alan D. Brunelle" Date: Mon, 4 May 2009 16:35:08 -0400 Subject: [PATCH 0802/2061] blktrace: from-sector redundant in trace_block_remap Remove redundant from-sector parameter: it's /always/ the bio's sector passed in. [ Impact: cleanup ] Signed-off-by: Alan D. Brunelle Reviewed-by: Li Zefan Reviewed-by: KOSAKI Motohiro Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo LKML-Reference: <49FF517C.7000503@hp.com> Signed-off-by: Ingo Molnar --- block/blk-core.c | 5 ++--- drivers/md/dm.c | 3 +-- include/trace/block.h | 4 ++-- kernel/trace/blktrace.c | 8 ++++---- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 07ab75403e1..a5f747a8312 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1275,7 +1275,7 @@ static inline void blk_partition_remap(struct bio *bio) bio->bi_bdev = bdev->bd_contains; trace_block_remap(bdev_get_queue(bio->bi_bdev), bio, - bdev->bd_dev, bio->bi_sector, + bdev->bd_dev, bio->bi_sector - p->start_sect); } } @@ -1444,8 +1444,7 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; if (old_sector != -1) - trace_block_remap(q, bio, old_dev, bio->bi_sector, - old_sector); + trace_block_remap(q, bio, old_dev, old_sector); trace_block_bio_queue(q, bio); diff --git a/drivers/md/dm.c b/drivers/md/dm.c index 8a994be035b..b01514afb6b 100644 --- a/drivers/md/dm.c +++ b/drivers/md/dm.c @@ -657,8 +657,7 @@ static void __map_bio(struct dm_target *ti, struct bio *clone, /* the bio has been remapped so dispatch it */ trace_block_remap(bdev_get_queue(clone->bi_bdev), clone, - tio->io->bio->bi_bdev->bd_dev, - clone->bi_sector, sector); + tio->io->bio->bi_bdev->bd_dev, sector); generic_make_request(clone); } else if (r < 0 || r == DM_MAPIO_REQUEUE) { diff --git a/include/trace/block.h b/include/trace/block.h index 87f6456fd32..8ac945b7746 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t to, sector_t from), - TP_ARGS(q, bio, dev, to, from)); + sector_t to), + TP_ARGS(q, bio, dev, to)); #endif diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index f8d46d6f5d3..e099f8cc1d1 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -830,7 +830,6 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * @q: queue the io is for * @bio: the source bio * @dev: target device - * @to: target sector * @from: source sector * * Description: @@ -839,7 +838,7 @@ static void blk_add_trace_split(struct request_queue *q, struct bio *bio, * **/ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, - dev_t dev, sector_t to, sector_t from) + dev_t dev, sector_t from) { struct blk_trace *bt = q->blk_trace; struct blk_io_trace_remap r; @@ -851,8 +850,9 @@ static void blk_add_trace_remap(struct request_queue *q, struct bio *bio, r.device_to = cpu_to_be32(bio->bi_bdev->bd_dev); r.sector_from = cpu_to_be64(from); - __blk_add_trace(bt, from, bio->bi_size, bio->bi_rw, BLK_TA_REMAP, - !bio_flagged(bio, BIO_UPTODATE), sizeof(r), &r); + __blk_add_trace(bt, bio->bi_sector, bio->bi_size, bio->bi_rw, + BLK_TA_REMAP, !bio_flagged(bio, BIO_UPTODATE), + sizeof(r), &r); } /** From 48dd0fed90e2b1f1ba87401439b85942181c6df3 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Wed, 6 May 2009 15:45:45 +0530 Subject: [PATCH 0803/2061] tracing: trace_output.c, fix false positive compiler warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This compiler warning: CC kernel/trace/trace_output.o kernel/trace/trace_output.c: In function ‘register_ftrace_event’: kernel/trace/trace_output.c:544: warning: ‘list’ may be used uninitialized in this function Is wrong as 'list' is always initialized - but GCC (4.3.2) does not recognize this relationship properly. Work around the warning by initializing the variable to NULL. [ Impact: fix false positive compiler warning ] Signed-off-by: Jaswinder Singh Rajput Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/trace/trace_output.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 5fc51f0f75f..8bd9a2c1a46 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -541,7 +541,7 @@ int register_ftrace_event(struct trace_event *event) INIT_LIST_HEAD(&event->list); if (!event->type) { - struct list_head *list; + struct list_head *list = NULL; if (next_event_type > FTRACE_MAX_EVENT) { From 35cf723e99c0e26ddf51f037dffaa4ff2c2c9106 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 6 May 2009 12:33:38 +0200 Subject: [PATCH 0804/2061] tracing: small trave_events sample Makefile cleanup Use -I$(src) to add the current directory the include path. [ Impact: cleanup ] Signed-off-by: Christoph Hellwig Acked-by: Steven Rostedt LKML-Reference: Signed-off-by: Ingo Molnar --- samples/trace_events/Makefile | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/samples/trace_events/Makefile b/samples/trace_events/Makefile index 06c6dea1eb8..0d428dc6728 100644 --- a/samples/trace_events/Makefile +++ b/samples/trace_events/Makefile @@ -1,8 +1,6 @@ # builds the trace events example kernel modules; # then to use one (as root): insmod -PWD := $(shell pwd) - -CFLAGS_trace-events-sample.o := -I$(PWD)/samples/trace_events/ +CFLAGS_trace-events-sample.o := -I$(src) obj-$(CONFIG_SAMPLE_TRACE_EVENTS) += trace-events-sample.o From 8e7abf1c62941ebb7a1416cbc62392c8a0902625 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 10:26:45 -0400 Subject: [PATCH 0805/2061] ring-buffer: remove unneeded conditional in rb_reserve_next The code in __rb_reserve_next checks on page overflow if it is the original commiter and then resets the page back to the original setting. Although this is fine, and the code is correct, it is a bit fragil. Some experimental work I did breaks it easily. The better and more robust solution is to have all commiters that overflow the page, simply subtract what they added. [ Impact: more robust ring buffer account management ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 424129eb20a..03ed52b67db 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1290,9 +1290,8 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, rb_event_set_padding(event); } - if (tail <= BUF_PAGE_SIZE) - /* Set the write back to the previous setting */ - local_set(&tail_page->write, tail); + /* Set the write back to the previous setting */ + local_sub(length, &tail_page->write); /* * If this was a commit entry that failed, @@ -1311,8 +1310,7 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, out_reset: /* reset write */ - if (tail <= BUF_PAGE_SIZE) - local_set(&tail_page->write, tail); + local_sub(length, &tail_page->write); if (likely(lock_taken)) __raw_spin_unlock(&cpu_buffer->lock); From 00c81a58c5b4e0de14ee33bfbc3d71c90f69f9ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 12:40:51 -0400 Subject: [PATCH 0806/2061] ring-buffer: check for failed allocation in ring buffer benchmark The result of the allocation of the ring buffer read page in the ring buffer bench mark does not check the return to see if a page was actually allocated. This patch fixes that. [ Impact: avoid NULL dereference ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 747244acb8f..dcd75e9e49f 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -84,6 +84,9 @@ static enum event_status read_page(int cpu) int i; bpage = ring_buffer_alloc_read_page(buffer); + if (!bpage) + return EVENT_DROPPED; + ret = ring_buffer_read_page(buffer, &bpage, PAGE_SIZE, cpu, 1); if (ret >= 0) { rpage = bpage; From 6634ff26cce2da04e5c2a5481bcb8888e7d01786 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 15:30:07 -0400 Subject: [PATCH 0807/2061] ring-buffer: make moving the tail page a separate function Ingo Molnar thought the code would be cleaner if we used a function call instead of a goto for moving the tail page. After implementing this, it seems that gcc still inlines the result and the output is pretty much the same. Since this is considered a cleaner approach, might as well implement it. [ Impact: code clean up ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 89 +++++++++++++++++++++----------------- 1 file changed, 49 insertions(+), 40 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 03ed52b67db..3ae5ccf2c0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1154,51 +1154,18 @@ static unsigned rb_calculate_event_length(unsigned length) return length; } + static struct ring_buffer_event * -__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length, u64 *ts) +rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, + unsigned long length, unsigned long tail, + struct buffer_page *commit_page, + struct buffer_page *tail_page, u64 *ts) { - struct buffer_page *tail_page, *head_page, *reader_page, *commit_page; - struct buffer_page *next_page; - unsigned long tail, write; + struct buffer_page *next_page, *head_page, *reader_page; struct ring_buffer *buffer = cpu_buffer->buffer; struct ring_buffer_event *event; - unsigned long flags; bool lock_taken = false; - - commit_page = cpu_buffer->commit_page; - /* we just need to protect against interrupts */ - barrier(); - tail_page = cpu_buffer->tail_page; - write = local_add_return(length, &tail_page->write); - tail = write - length; - - /* See if we shot pass the end of this buffer page */ - if (write > BUF_PAGE_SIZE) - goto next_page; - - /* We reserved something on the buffer */ - - if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) - return NULL; - - event = __rb_page_index(tail_page, tail); - rb_update_event(event, type, length); - - /* The passed in type is zero for DATA */ - if (likely(!type)) - local_inc(&tail_page->entries); - - /* - * If this is a commit and the tail is zero, then update - * this page's time stamp. - */ - if (!tail && rb_is_commit(cpu_buffer, event)) - cpu_buffer->commit_page->page->time_stamp = *ts; - - return event; - - next_page: + unsigned long flags; next_page = tail_page; @@ -1318,6 +1285,48 @@ __rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, return NULL; } +static struct ring_buffer_event * +__rb_reserve_next(struct ring_buffer_per_cpu *cpu_buffer, + unsigned type, unsigned long length, u64 *ts) +{ + struct buffer_page *tail_page, *commit_page; + struct ring_buffer_event *event; + unsigned long tail, write; + + commit_page = cpu_buffer->commit_page; + /* we just need to protect against interrupts */ + barrier(); + tail_page = cpu_buffer->tail_page; + write = local_add_return(length, &tail_page->write); + tail = write - length; + + /* See if we shot pass the end of this buffer page */ + if (write > BUF_PAGE_SIZE) + return rb_move_tail(cpu_buffer, length, tail, + commit_page, tail_page, ts); + + /* We reserved something on the buffer */ + + if (RB_WARN_ON(cpu_buffer, write > BUF_PAGE_SIZE)) + return NULL; + + event = __rb_page_index(tail_page, tail); + rb_update_event(event, type, length); + + /* The passed in type is zero for DATA */ + if (likely(!type)) + local_inc(&tail_page->entries); + + /* + * If this is a commit and the tail is zero, then update + * this page's time stamp. + */ + if (!tail && rb_is_commit(cpu_buffer, event)) + cpu_buffer->commit_page->page->time_stamp = *ts; + + return event; +} + static int rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, u64 *ts, u64 *delta) From 3e07a4f680adc66dfa175aa5021aedf340251b12 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 18:36:59 -0400 Subject: [PATCH 0808/2061] ring-buffer: change test to be more latency friendly The ring buffer benchmark/test runs a producer for 10 seconds. This is done with preemption and interrupts enabled. But if the kernel is not compiled with CONFIG_PREEMPT, it basically stops everything but interrupts for 10 seconds. Although this is just a test and is not for production, this attribute can be quite annoying. It can also spawn badness elsewhere. This patch solves the issues by calling "cond_resched" when the system is not compiled with CONFIG_PREEMPT. It also keeps track of the time spent to call cond_resched such that it does not go against the time calculations. That is, if the task schedules away, the time scheduled out is removed from the test data. Note, this only works for non PREEMPT because we do not know when the task is scheduled out if we have PREEMPT enabled. [ Impact: prevent test from stopping the world for 10 seconds ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 31 ++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index dcd75e9e49f..a26fc67b63b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,6 +185,35 @@ static void ring_buffer_consumer(void) complete(&read_done); } +/* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call cond_resched + * and also add any time that was lost by a rescedule. + */ +#ifdef CONFIG_PREEMPT +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ +} +#else +static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) +{ + struct timeval tv; + + cond_resched(); + do_gettimeofday(&tv); + if (tv.tv_usec < end_tv->tv_usec) { + tv.tv_usec += 1000000; + tv.tv_sec--; + } + start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; + start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; + if (start_tv->tv_usec > 1000000) { + start_tv->tv_usec -= 1000000; + start_tv->tv_sec++; + } +} +#endif + static void ring_buffer_producer(void) { struct timeval start_tv; @@ -221,6 +250,8 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); + sched_if_needed(&start_tv, &end_tv); + } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From 71e1c8ac42ae4038ddb1367cce7097ab868dc532 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:20:39 -0400 Subject: [PATCH 0809/2061] tracing: update sample with TRACE_INCLUDE_FILE When creating trace events for ftrace, the header file with the TRACE_EVENT macros must also have a macro called TRACE_SYSTEM. This macro describes the name of the system the TRACE_EVENTS are defined for. It also doubles as a way for the define_trace.h file to include the file that included it. For example: in irq.h #define TRACE_SYSTEM irq [...] #include The define_trace will use TRACE_SYSTEM to include irq.h. But if the name of the trace system does not match the name of the trace header file, one can override it with: Which will change define_trace.h to inclued foo_trace.h instead of foo.h The sample comments this, but people that use the sample code will more likely use the code and not read the comments. This patch changes the sample code to use the TRACE_INCLUDE_FILE to better show developers how to use it. [ Impact: make sample less confusing to developers ] Reported-by: Christoph Hellwig Signed-off-by: Steven Rostedt --- samples/trace_events/trace-events-sample.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/samples/trace_events/trace-events-sample.h b/samples/trace_events/trace-events-sample.h index eab46443e61..128a897687c 100644 --- a/samples/trace_events/trace-events-sample.h +++ b/samples/trace_events/trace-events-sample.h @@ -31,7 +31,7 @@ * */ #undef TRACE_SYSTEM -#define TRACE_SYSTEM trace-events-sample +#define TRACE_SYSTEM sample /* * The TRACE_EVENT macro is broken up into 5 parts. @@ -120,5 +120,10 @@ TRACE_EVENT(foo_bar, * result. */ #undef TRACE_INCLUDE_PATH +#undef TRACE_INCLUDE_FILE #define TRACE_INCLUDE_PATH . +/* + * TRACE_INCLUDE_FILE is not needed if the filename and TRACE_SYSTEM are equal + */ +#define TRACE_INCLUDE_FILE trace-events-sample #include From 9456f0fa6d3cb944d3b9fc31c9a244e0362c26ea Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 21:54:09 -0400 Subject: [PATCH 0810/2061] tracing: reset ring buffer when removing modules with events Li Zefan found that there's a race using the event ids of events and modules. When a module is loaded, an event id is incremented. We only have 16 bits for event ids (65536) and there is a possible (but highly unlikely) race that we could load and unload a module that registers events so many times that the event id counter overflows. When it overflows, it then restarts and goes looking for available ids. An id is available if it was added by a module and released. The race is if you have one module add an id, and then is removed. Another module loaded can use that same event id. But if the old module still had events in the ring buffer, the new module's call back would get bogus data. At best (and most likely) the output would just be garbage. But if the module for some reason used pointers (not recommended) then this could potentially crash. The safest thing to do is just reset the ring buffer if a module that registered events is removed. [ Impact: prevent unpredictable results of event id overflows ] Reported-by: Li Zefan LKML-Reference: <49FEAFD0.30106@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace.c | 10 ++++++++++ kernel/trace/trace.h | 2 ++ kernel/trace/trace_events.c | 9 +++++++++ 3 files changed, 21 insertions(+) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4164a344e72..dd40d232034 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -639,6 +639,16 @@ void tracing_reset_online_cpus(struct trace_array *tr) tracing_reset(tr, cpu); } +void tracing_reset_current(int cpu) +{ + tracing_reset(&global_trace, cpu); +} + +void tracing_reset_current_online_cpus(void) +{ + tracing_reset_online_cpus(&global_trace); +} + #define SAVED_CMDLINES 128 #define NO_CMDLINE_MAP UINT_MAX static unsigned map_pid_to_cmdline[PID_MAX_DEFAULT+1]; diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index 777c6c3a0cd..ba25793ffe6 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -409,6 +409,8 @@ int tracing_is_enabled(void); void trace_wake_up(void); void tracing_reset(struct trace_array *tr, int cpu); void tracing_reset_online_cpus(struct trace_array *tr); +void tracing_reset_current(int cpu); +void tracing_reset_current_online_cpus(void); int tracing_open_generic(struct inode *inode, struct file *filp); struct dentry *trace_create_file(const char *name, mode_t mode, diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d579ff2361..6d2c842a024 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -932,9 +932,11 @@ static void trace_module_remove_events(struct module *mod) { struct ftrace_module_file_ops *file_ops; struct ftrace_event_call *call, *p; + bool found = false; list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { + found = true; if (call->enabled) { call->enabled = 0; call->unregfunc(); @@ -957,6 +959,13 @@ static void trace_module_remove_events(struct module *mod) list_del(&file_ops->list); kfree(file_ops); } + + /* + * It is safest to reset the ring buffer if the module being unloaded + * registered any events. + */ + if (found) + tracing_reset_current_online_cpus(); } static int trace_module_notify(struct notifier_block *self, From 8ae79a138e88aceeeb07077bff2883245fb7c218 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Wed, 6 May 2009 22:52:15 -0400 Subject: [PATCH 0811/2061] tracing: add hierarchical enabling of events With the current event directory, you can only enable individual events. The file debugfs/tracing/set_event is used to be able to enable or disable several events at once. But that can still be awkward. This patch adds hierarchical enabling of events. That is, each directory in debugfs/tracing/events has an "enable" file. This file can enable or disable all events within the directory and below. # echo 1 > /debugfs/tracing/events/enable will enable all events. # echo 1 > /debugfs/tracing/events/sched/enable will enable all events in the sched subsystem. # echo 1 > /debugfs/tracing/events/enable # echo 0 > /debugfs/tracing/events/irq/enable will enable all events, but then disable just the irq subsystem events. When reading one of these enable files, there are four results: 0 - all events this file affects are disabled 1 - all events this file affects are enabled X - there is a mixture of events enabled and disabled ? - this file does not affect any event Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 140 ++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 6d2c842a024..87feb0117ce 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -400,6 +400,133 @@ event_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return cnt; } +static ssize_t +system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + struct ftrace_event_call *call; + char buf[2]; + int set = -1; + int all = 0; + int ret; + + if (system[0] == '*') + all = 1; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + if (!call->name || !call->regfunc) + continue; + + if (!all && strcmp(call->system, system) != 0) + continue; + + /* + * We need to find out if all the events are set + * or if all events or cleared, or if we have + * a mixture. + */ + if (call->enabled) { + switch (set) { + case -1: + set = 1; + break; + case 0: + set = 2; + break; + } + } else { + switch (set) { + case -1: + set = 0; + break; + case 1: + set = 2; + break; + } + } + /* + * If we have a mixture, no need to look further. + */ + if (set == 2) + break; + } + mutex_unlock(&event_mutex); + + buf[1] = '\n'; + switch (set) { + case 0: + buf[0] = '0'; + break; + case 1: + buf[0] = '1'; + break; + case 2: + buf[0] = 'X'; + break; + default: + buf[0] = '?'; + } + + ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); + + return ret; +} + +static ssize_t +system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, + loff_t *ppos) +{ + const char *system = filp->private_data; + unsigned long val; + char *command; + char buf[64]; + ssize_t ret; + + if (cnt >= sizeof(buf)) + return -EINVAL; + + if (copy_from_user(&buf, ubuf, cnt)) + return -EFAULT; + + buf[cnt] = 0; + + ret = strict_strtoul(buf, 10, &val); + if (ret < 0) + return ret; + + ret = tracing_update_buffers(); + if (ret < 0) + return ret; + + switch (val) { + case 0: + case 1: + break; + + default: + return -EINVAL; + } + + command = kstrdup(system, GFP_KERNEL); + if (!command) + return -ENOMEM; + + ret = ftrace_set_clr_event(command, val); + if (ret) + goto out_free; + + ret = cnt; + + out_free: + kfree(command); + + *ppos += cnt; + + return ret; +} + extern char *__bad_type_size(void); #undef FIELD @@ -686,6 +813,12 @@ static const struct file_operations ftrace_subsystem_filter_fops = { .write = subsystem_filter_write, }; +static const struct file_operations ftrace_system_enable_fops = { + .open = tracing_open_generic, + .read = system_enable_read, + .write = system_enable_write, +}; + static const struct file_operations ftrace_show_header_fops = { .open = tracing_open_generic, .read = show_header, @@ -768,6 +901,10 @@ event_subsystem_dir(const char *name, struct dentry *d_events) "'%s/filter' entry\n", name); } + entry = trace_create_file("enable", 0644, system->entry, + (void *)system->name, + &ftrace_system_enable_fops); + return system->entry; } @@ -1041,6 +1178,9 @@ static __init int event_trace_init(void) ring_buffer_print_entry_header, &ftrace_show_header_fops); + trace_create_file("enable", 0644, d_events, + "*:*", &ftrace_system_enable_fops); + for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ if (!call->name) From 975e5f45500dff6d15c0001bb662e9aac0ce0076 Mon Sep 17 00:00:00 2001 From: Samuel Bronson Date: Wed, 6 May 2009 22:27:55 -0400 Subject: [PATCH 0812/2061] x86: use symbolic name for VM86_SIGNAL when used as vm86 default return This code has apparently used "0" and not VM86_SIGNAL since Linux 1.1.9, when Linus added VM86_SIGNAL to vm86.h. This patch changes the code to use the symbolic name. The magic 0 tripped me up in trying to extend the vm86(2) manpage to actually explain vm86()'s interface -- my greps for VM86_SIGNAL came up fruitless. [ Impact: cleanup; no object code change ] Signed-off-by: Samuel Bronson Signed-off-by: H. Peter Anvin --- arch/x86/kernel/vm86_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index d7ac84e7fc1..b8035a0f404 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -318,9 +318,9 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk } /* - * Save old state, set default return value (%ax) to 0 + * Save old state, set default return value (%ax) to 0 (VM86_SIGNAL) */ - info->regs32->ax = 0; + info->regs32->ax = VM86_SIGNAL; tsk->thread.saved_sp0 = tsk->thread.sp0; tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); From 40c8bca76ecaa6b663d403d34f0fcd422bbdbffd Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 May 2009 15:24:36 +0900 Subject: [PATCH 0813/2061] sh: Flag IRQSTACKS as BROKEN for now. There still seems to be some stack corruption to sort out here, so flag this as BROKEN until this issue is sorted out. Signed-off-by: Paul Mundt --- arch/sh/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 0d62681f72a..fdb049373e7 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -92,7 +92,7 @@ config 4KSTACKS config IRQSTACKS bool "Use separate kernel stacks when processing interrupts" - depends on DEBUG_KERNEL && SUPERH32 + depends on DEBUG_KERNEL && SUPERH32 && BROKEN help If you say Y here the kernel will use separate kernel stacks for handling hard and soft interrupts. This can help avoid From 643bec956544d376b7c2a80a3d5c3d0bf94da8d3 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 7 May 2009 09:12:50 +0200 Subject: [PATCH 0814/2061] x86: clean up arch/x86/kernel/tsc_sync.c a bit - remove unused define - make the lock variable definition stand out some more - convert KERN_* to pr_info() / pr_warning() [ Impact: cleanup ] LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/tsc_sync.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/tsc_sync.c b/arch/x86/kernel/tsc_sync.c index bf36328f6ef..027b5b49899 100644 --- a/arch/x86/kernel/tsc_sync.c +++ b/arch/x86/kernel/tsc_sync.c @@ -34,6 +34,7 @@ static __cpuinitdata atomic_t stop_count; * of a critical section, to be able to prove TSC time-warps: */ static __cpuinitdata raw_spinlock_t sync_lock = __RAW_SPIN_LOCK_UNLOCKED; + static __cpuinitdata cycles_t last_tsc; static __cpuinitdata cycles_t max_warp; static __cpuinitdata int nr_warps; @@ -113,13 +114,12 @@ void __cpuinit check_tsc_sync_source(int cpu) return; if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) { - printk(KERN_INFO - "Skipping synchronization checks as TSC is reliable.\n"); + pr_info("Skipping synchronization checks as TSC is reliable.\n"); return; } - printk(KERN_INFO "checking TSC synchronization [CPU#%d -> CPU#%d]:", - smp_processor_id(), cpu); + pr_info("checking TSC synchronization [CPU#%d -> CPU#%d]:", + smp_processor_id(), cpu); /* * Reset it - in case this is a second bootup: @@ -143,8 +143,8 @@ void __cpuinit check_tsc_sync_source(int cpu) if (nr_warps) { printk("\n"); - printk(KERN_WARNING "Measured %Ld cycles TSC warp between CPUs," - " turning off TSC clock.\n", max_warp); + pr_warning("Measured %Ld cycles TSC warp between CPUs, " + "turning off TSC clock.\n", max_warp); mark_tsc_unstable("check_tsc_sync_source failed"); } else { printk(" passed.\n"); @@ -195,5 +195,3 @@ void __cpuinit check_tsc_sync_target(void) while (atomic_read(&stop_count) != cpus) cpu_relax(); } -#undef NR_LOOPS - From aa47b7e0f89b9998dad4d1667447e8cb7703ff4e Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Mon, 4 May 2009 01:38:05 -0700 Subject: [PATCH 0815/2061] sched: emit thread info flags with stack trace When a thread is oom killed and fails to exit, it's helpful to know which threads have access to memory reserves if the machine livelocks. This is done by testing for the TIF_MEMDIE thread info flag and should be displayed alongside stack traces to identify tasks that have access to such reserves but are still stuck allocating pages, for instance. It would probably be helpful in other cases as well, so all thread info flags are emitted when showing a task. ( v2: fix warning reported by Stephen Rothwell ) [ Impact: extend debug printout info ] Signed-off-by: David Rientjes Cc: Peter Zijlstra Cc: Stephen Rothwell LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/sched.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/sched.c b/kernel/sched.c index 2a43a581ead..5aa63f50c69 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -6610,8 +6610,9 @@ void sched_show_task(struct task_struct *p) #ifdef CONFIG_DEBUG_STACK_USAGE free = stack_not_used(p); #endif - printk(KERN_CONT "%5lu %5d %6d\n", free, - task_pid_nr(p), task_pid_nr(p->real_parent)); + printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, + task_pid_nr(p), task_pid_nr(p->real_parent), + (unsigned long)task_thread_info(p)->flags); show_stack(p, NULL); } From ee1acbfabd5270b40ce2cfdc202070b7ca91cdff Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 May 2009 16:38:16 +0900 Subject: [PATCH 0816/2061] sh: Handle shm_align_mask also for HAVE_ARCH_UNMAPPED_AREA_TOPDOWN. Presently shm_align_mask is only looked at for the bottom up case, but we still want this for proper colouring constraints in the topdown case. Signed-off-by: Paul Mundt --- arch/sh/include/asm/cacheflush.h | 2 - arch/sh/include/asm/pgtable.h | 4 + arch/sh/mm/mmap.c | 136 ++++++++++++++++++++++++++++++- 3 files changed, 136 insertions(+), 6 deletions(-) diff --git a/arch/sh/include/asm/cacheflush.h b/arch/sh/include/asm/cacheflush.h index 09acbc32d6c..4c5462daa74 100644 --- a/arch/sh/include/asm/cacheflush.h +++ b/arch/sh/include/asm/cacheflush.h @@ -75,7 +75,5 @@ extern void copy_from_user_page(struct vm_area_struct *vma, #define flush_cache_vmap(start, end) flush_cache_all() #define flush_cache_vunmap(start, end) flush_cache_all() -#define HAVE_ARCH_UNMAPPED_AREA - #endif /* __KERNEL__ */ #endif /* __ASM_SH_CACHEFLUSH_H */ diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index b517ae08b9c..2a011b18090 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -154,6 +154,10 @@ extern void kmap_coherent_init(void); #define kmap_coherent_init() do { } while (0) #endif +/* arch/sh/mm/mmap.c */ +#define HAVE_ARCH_UNMAPPED_AREA +#define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN + #include #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sh/mm/mmap.c b/arch/sh/mm/mmap.c index 931f4d003fa..1b5fdfb4e0c 100644 --- a/arch/sh/mm/mmap.c +++ b/arch/sh/mm/mmap.c @@ -1,7 +1,7 @@ /* * arch/sh/mm/mmap.c * - * Copyright (C) 2008 Paul Mundt + * Copyright (C) 2008 - 2009 Paul Mundt * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -21,9 +21,26 @@ EXPORT_SYMBOL(shm_align_mask); /* * To avoid cache aliases, we map the shared page with same color. */ -#define COLOUR_ALIGN(addr, pgoff) \ - ((((addr) + shm_align_mask) & ~shm_align_mask) + \ - (((pgoff) << PAGE_SHIFT) & shm_align_mask)) +static inline unsigned long COLOUR_ALIGN(unsigned long addr, + unsigned long pgoff) +{ + unsigned long base = (addr + shm_align_mask) & ~shm_align_mask; + unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask; + + return base + off; +} + +static inline unsigned long COLOUR_ALIGN_DOWN(unsigned long addr, + unsigned long pgoff) +{ + unsigned long base = addr & ~shm_align_mask; + unsigned long off = (pgoff << PAGE_SHIFT) & shm_align_mask; + + if (base + off <= addr) + return base + off; + + return base - off; +} unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) @@ -103,6 +120,117 @@ full_search: addr = COLOUR_ALIGN(addr, pgoff); } } + +unsigned long +arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + const unsigned long len, const unsigned long pgoff, + const unsigned long flags) +{ + struct vm_area_struct *vma; + struct mm_struct *mm = current->mm; + unsigned long addr = addr0; + int do_colour_align; + + if (flags & MAP_FIXED) { + /* We do not accept a shared mapping if it would violate + * cache aliasing constraints. + */ + if ((flags & MAP_SHARED) && + ((addr - (pgoff << PAGE_SHIFT)) & shm_align_mask)) + return -EINVAL; + return addr; + } + + if (unlikely(len > TASK_SIZE)) + return -ENOMEM; + + do_colour_align = 0; + if (filp || (flags & MAP_SHARED)) + do_colour_align = 1; + + /* requesting a specific address */ + if (addr) { + if (do_colour_align) + addr = COLOUR_ALIGN(addr, pgoff); + else + addr = PAGE_ALIGN(addr); + + vma = find_vma(mm, addr); + if (TASK_SIZE - len >= addr && + (!vma || addr + len <= vma->vm_start)) + return addr; + } + + /* check if free_area_cache is useful for us */ + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = mm->mmap_base; + } + + /* either no address requested or can't fit in requested address hole */ + addr = mm->free_area_cache; + if (do_colour_align) { + unsigned long base = COLOUR_ALIGN_DOWN(addr-len, pgoff); + + addr = base + len; + } + + /* make sure it can fit in the remaining address space */ + if (likely(addr > len)) { + vma = find_vma(mm, addr-len); + if (!vma || addr <= vma->vm_start) { + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr-len); + } + } + + if (unlikely(mm->mmap_base < len)) + goto bottomup; + + addr = mm->mmap_base-len; + if (do_colour_align) + addr = COLOUR_ALIGN_DOWN(addr, pgoff); + + do { + /* + * Lookup failure means no vma is above this address, + * else if new region fits below vma->vm_start, + * return with success: + */ + vma = find_vma(mm, addr); + if (likely(!vma || addr+len <= vma->vm_start)) { + /* remember the address as a hint for next time */ + return (mm->free_area_cache = addr); + } + + /* remember the largest hole we saw so far */ + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + + /* try just below the current vma->vm_start */ + addr = vma->vm_start-len; + if (do_colour_align) + addr = COLOUR_ALIGN_DOWN(addr, pgoff); + } while (likely(len < vma->vm_start)); + +bottomup: + /* + * A failed mmap() very likely causes application failure, + * so fall back to the bottom-up function here. This scenario + * can happen with large stack limits and large mmap() + * allocations. + */ + mm->cached_hole_size = ~0UL; + mm->free_area_cache = TASK_UNMAPPED_BASE; + addr = arch_get_unmapped_area(filp, addr0, len, pgoff, flags); + /* + * Restore the topdown base: + */ + mm->free_area_cache = mm->mmap_base; + mm->cached_hole_size = ~0UL; + + return addr; +} #endif /* CONFIG_MMU */ /* From e8808c1019b048a43686dbd25c188a035842c2e2 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 02:48:52 +0200 Subject: [PATCH 0817/2061] tracing/filters: support for filters of dynamic sized arrays Currently the filtering infrastructure supports well the numeric types and fixed sized array types. But the recently added __string() field uses a specific indirect offset mechanism which requires a specific predicate. Until now it wasn't supported. This patch adds this support and implies very few changes, only a new predicate is needed, the management of this specific field can be done through the usual string helpers in the filtering infrastructure. [ Impact: support all kinds of strings in the tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 44 ++++++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 7ac69108527..01c76eb3e16 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -151,6 +151,7 @@ static int filter_pred_or(struct filter_pred *pred __attribute((unused)), return val1 || val2; } +/* Filter predicate for fixed sized arrays of characters */ static int filter_pred_string(struct filter_pred *pred, void *event, int val1, int val2) { @@ -164,6 +165,30 @@ static int filter_pred_string(struct filter_pred *pred, void *event, return match; } +/* + * Filter predicate for dynamic sized arrays of characters. + * These are implemented through a list of strings at the end + * of the entry. + * Also each of these strings have a field in the entry which + * contains its offset from the beginning of the entry. + * We have then first to get this field, dereference it + * and add it to the address of the entry, and at last we have + * the address of the string. + */ +static int filter_pred_strloc(struct filter_pred *pred, void *event, + int val1, int val2) +{ + int str_loc = *(int *)(event + pred->offset); + char *addr = (char *)(event + str_loc); + int cmp, match; + + cmp = strncmp(addr, pred->str_val, pred->str_len); + + match = (!cmp) ^ pred->not; + + return match; +} + static int filter_pred_none(struct filter_pred *pred, void *event, int val1, int val2) { @@ -446,10 +471,18 @@ static int filter_add_pred_fn(struct filter_parse_state *ps, return 0; } +enum { + FILTER_STATIC_STRING = 1, + FILTER_DYN_STRING +}; + static int is_string_field(const char *type) { if (strchr(type, '[') && strstr(type, "char")) - return 1; + return FILTER_STATIC_STRING; + + if (!strcmp(type, "__str_loc")) + return FILTER_DYN_STRING; return 0; } @@ -512,6 +545,7 @@ static int filter_add_pred(struct filter_parse_state *ps, struct ftrace_event_field *field; filter_pred_fn_t fn; unsigned long long val; + int string_type; pred->fn = filter_pred_none; @@ -536,8 +570,12 @@ static int filter_add_pred(struct filter_parse_state *ps, return -EINVAL; } - if (is_string_field(field->type)) { - fn = filter_pred_string; + string_type = is_string_field(field->type); + if (string_type) { + if (string_type == FILTER_STATIC_STRING) + fn = filter_pred_string; + else + fn = filter_pred_strloc; pred->str_len = field->size; if (pred->op == OP_NE) pred->not = 1; From 5928c3cc0ffcb6894bbab6be591b7ae1786b2d87 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 3 May 2009 03:03:57 +0200 Subject: [PATCH 0818/2061] tracing/filters: support for operator reserved characters in strings When we set a filter for an event, such as: echo "name == my_lock_name" > \ /debug/tracing/events/lockdep/lock_acquired/filter then the following order of token type is parsed: - space - operator - parentheses - operand Because the operators and parentheses have a higher precedence than the operand characters, which is normal, then we can't use any string containing such special characters: ()=<>!&| To get this support and also avoid ambiguous intepretation from the parser or the human, we can do it using double quotes so that we keep the usual languages habits. Then after this patch you can still declare string condition like before: echo name == myname But if you want to compare against a string containing an operator character, you can use double quotes: echo 'name == "&myname"' Don't forget to include the whole expression into single quotes or the double ones will be eaten by echo. [ Impact: support strings with special characters for tracing filters ] Cc: Tom Zanussi Cc: Steven Rostedt Cc: Li Zefan Cc: Zhaolei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events_filter.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 01c76eb3e16..8c62e5bdff0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -851,10 +851,19 @@ static void postfix_clear(struct filter_parse_state *ps) static int filter_parse(struct filter_parse_state *ps) { + int in_string = 0; int op, top_op; char ch; while ((ch = infix_next(ps))) { + if (ch == '"') { + in_string ^= 1; + continue; + } + + if (in_string) + goto parse_operand; + if (isspace(ch)) continue; @@ -908,6 +917,7 @@ static int filter_parse(struct filter_parse_state *ps) } continue; } +parse_operand: if (append_operand_char(ps, ch)) { parse_error(ps, FILT_ERR_OPERAND_TOO_LONG, 0); return -EINVAL; From d94fc523f3c35bd8013f04827e94756cbc0212f4 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 7 May 2009 15:11:15 +0800 Subject: [PATCH 0819/2061] tracing/events: fix concurrent access to ftrace_events list, fix In filter_add_subsystem_pred() we should release event_mutex before calling filter_free_subsystem_preds(), since both functions hold event_mutex. [ Impact: fix deadlock when writing invalid pred into subsystem filter ] Signed-off-by: Li Zefan Cc: tzanussi@gmail.com Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: rostedt@goodmis.org LKML-Reference: <4A028993.7020509@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events_filter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 8c62e5bdff0..85ad6a8939a 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -636,14 +636,15 @@ static int filter_add_subsystem_pred(struct filter_parse_state *ps, err = filter_add_pred(ps, call, pred); if (err) { + mutex_unlock(&event_mutex); filter_free_subsystem_preds(system); parse_error(ps, FILT_ERR_BAD_SUBSYS_FILTER, 0); - break; + goto out; } replace_filter_string(call->filter, filter_string); } mutex_unlock(&event_mutex); - +out: return err; } From ae318a148e4d255dfbc87d963fdd6031c2af9c46 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 May 2009 17:55:09 +0900 Subject: [PATCH 0820/2061] sh: sh64 still needs ARCH_USES_GETTIMEOFFSET temporarily. sh64 is still using this, so re-enable it temporarily while SH-5 gets converted to use the generic code. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 565f39042ad..02688d7909a 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -141,6 +141,10 @@ config ARCH_NO_VIRT_TO_BUS config ARCH_HAS_DEFAULT_IDLE def_bool y +config ARCH_USES_GETTIMEOFFSET + def_bool y + depends on SUPERH64 + config IO_TRAPPED bool From 0fb849b9d743a20056f2418cd955e5c650658663 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 May 2009 18:10:27 +0900 Subject: [PATCH 0821/2061] sh: Integrate the SH-5 onchip_remap() more coherently. Presently this is special-cased for early initialization. While there are situations where these static early initializations are still necessary, with minor changes it is possible to use this for the regular ioremap implementation as well. This allows us to kill off the special-casing for the remap completely and to start tidying up all of the SH-5 special-casing in drivers. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-cayman/irq.c | 2 +- arch/sh/boards/mach-cayman/setup.c | 2 +- arch/sh/drivers/pci/pci-sh5.c | 4 +- arch/sh/include/asm/io.h | 7 - arch/sh/kernel/cpu/irq/intc-sh5.c | 2 +- arch/sh/kernel/cpu/sh5/Makefile | 3 + arch/sh/kernel/cpu/sh5/clock-sh5.c | 2 +- arch/sh/kernel/cpu/sh5/entry.S | 8 +- arch/sh/kernel/cpu/sh5/setup-sh5.c | 46 ++++++ arch/sh/kernel/time_64.c | 4 +- arch/sh/mm/ioremap_64.c | 255 ++++++++++------------------- drivers/serial/sh-sci.c | 6 - 12 files changed, 150 insertions(+), 191 deletions(-) create mode 100644 arch/sh/kernel/cpu/sh5/setup-sh5.c diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c index da62ad51699..dbb00c99132 100644 --- a/arch/sh/boards/mach-cayman/irq.c +++ b/arch/sh/boards/mach-cayman/irq.c @@ -161,7 +161,7 @@ void init_cayman_irq(void) { int i; - epld_virt = onchip_remap(EPLD_BASE, 1024, "EPLD"); + epld_virt = (unsigned long)ioremap_nocache(EPLD_BASE, 1024); if (!epld_virt) { printk(KERN_ERR "Cayman IRQ: Unable to remap EPLD\n"); return; diff --git a/arch/sh/boards/mach-cayman/setup.c b/arch/sh/boards/mach-cayman/setup.c index e7f9cc5f2ff..7e8216ac31b 100644 --- a/arch/sh/boards/mach-cayman/setup.c +++ b/arch/sh/boards/mach-cayman/setup.c @@ -102,7 +102,7 @@ static int __init smsc_superio_setup(void) { unsigned char devid, devrev; - smsc_superio_virt = onchip_remap(SMSC_SUPERIO_BASE, 1024, "SMSC SuperIO"); + smsc_superio_virt = (unsigned long)ioremap_nocache(SMSC_SUPERIO_BASE, 1024); if (!smsc_superio_virt) { panic("Unable to remap SMSC SuperIO\n"); } diff --git a/arch/sh/drivers/pci/pci-sh5.c b/arch/sh/drivers/pci/pci-sh5.c index cf431852213..873ed2b4405 100644 --- a/arch/sh/drivers/pci/pci-sh5.c +++ b/arch/sh/drivers/pci/pci-sh5.c @@ -119,12 +119,12 @@ static int __init sh5pci_init(void) return -EINVAL; } - pcicr_virt = onchip_remap(SH5PCI_ICR_BASE, 1024, "PCICR"); + pcicr_virt = (unsigned long)ioremap_nocache(SH5PCI_ICR_BASE, 1024); if (!pcicr_virt) { panic("Unable to remap PCICR\n"); } - PCI_IO_AREA = onchip_remap(SH5PCI_IO_BASE, 0x10000, "PCIIO"); + PCI_IO_AREA = (unsigned long)ioremap_nocache(SH5PCI_IO_BASE, 0x10000); if (!PCI_IO_AREA) { panic("Unable to remap PCIIO\n"); } diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index ef217344afc..c7c360b5866 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -224,11 +224,6 @@ void __iomem *__ioremap(unsigned long offset, unsigned long size, unsigned long flags); void __iounmap(void __iomem *addr); -/* arch/sh/mm/ioremap_64.c */ -unsigned long onchip_remap(unsigned long addr, unsigned long size, - const char *name); -extern void onchip_unmap(unsigned long vaddr); - static inline void __iomem * __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags) { @@ -263,8 +258,6 @@ __ioremap_mode(unsigned long offset, unsigned long size, unsigned long flags) return __ioremap(offset, size, flags); } #else -#define onchip_remap(addr, size, name) (addr) -#define onchip_unmap(addr) do { } while (0) #define __ioremap_mode(offset, size, flags) ((void __iomem *)(offset)) #define __iounmap(addr) do { } while (0) #endif /* CONFIG_MMU */ diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c index d8679a4d93a..02d8137547e 100644 --- a/arch/sh/kernel/cpu/irq/intc-sh5.c +++ b/arch/sh/kernel/cpu/irq/intc-sh5.c @@ -188,7 +188,7 @@ void __init plat_irq_setup(void) unsigned long reg; int i; - intc_virt = onchip_remap(INTC_BASE, 1024, "INTC"); + intc_virt = (unsigned long)ioremap_nocache(INTC_BASE, 1024); if (!intc_virt) { panic("Unable to remap INTC\n"); } diff --git a/arch/sh/kernel/cpu/sh5/Makefile b/arch/sh/kernel/cpu/sh5/Makefile index ce4602ea23a..a184a31e686 100644 --- a/arch/sh/kernel/cpu/sh5/Makefile +++ b/arch/sh/kernel/cpu/sh5/Makefile @@ -6,6 +6,9 @@ obj-y := entry.o probe.o switchto.o obj-$(CONFIG_SH_FPU) += fpu.o obj-$(CONFIG_KALLSYMS) += unwind.o +# CPU subtype setup +obj-$(CONFIG_CPU_SH5) += setup-sh5.o + # Primary on-chip clocks (common) clock-$(CONFIG_CPU_SH5) := clock-sh5.o diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c index 52c49248833..5486324880e 100644 --- a/arch/sh/kernel/cpu/sh5/clock-sh5.c +++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c @@ -71,7 +71,7 @@ static struct clk_ops *sh5_clk_ops[] = { void __init arch_init_clk_ops(struct clk_ops **ops, int idx) { - cprc_base = onchip_remap(CPRC_BASE, 1024, "CPRC"); + cprc_base = (unsigned long)ioremap_nocache(CPRC_BASE, 1024); BUG_ON(!cprc_base); if (idx < ARRAY_SIZE(sh5_clk_ops)) diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S index 7e49cb812f8..516297515b2 100644 --- a/arch/sh/kernel/cpu/sh5/entry.S +++ b/arch/sh/kernel/cpu/sh5/entry.S @@ -1410,8 +1410,8 @@ peek_real_address_q: r2(out) : result quadword This is provided as a cheapskate way of manipulating device - registers for debugging (to avoid the need to onchip_remap the debug - module, and to avoid the need to onchip_remap the watchpoint + registers for debugging (to avoid the need to ioremap the debug + module, and to avoid the need to ioremap the watchpoint controller in a way that identity maps sufficient bits to avoid the SH5-101 cut2 silicon defect). @@ -1459,8 +1459,8 @@ poke_real_address_q: r3 : quadword value to write. This is provided as a cheapskate way of manipulating device - registers for debugging (to avoid the need to onchip_remap the debug - module, and to avoid the need to onchip_remap the watchpoint + registers for debugging (to avoid the need to ioremap the debug + module, and to avoid the need to ioremap the watchpoint controller in a way that identity maps sufficient bits to avoid the SH5-101 cut2 silicon defect). diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c new file mode 100644 index 00000000000..d8d59fea108 --- /dev/null +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -0,0 +1,46 @@ +/* + * SH5-101/SH5-103 CPU Setup + * + * Copyright (C) 2009 Paul Mundt + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include +#include +#include +#include +#include +#include + +static struct plat_sci_port sci_platform_data[] = { + { + .mapbase = PHYS_PERIPHERAL_BLOCK + 0x01030000, + .flags = UPF_BOOT_AUTOCONF | UPF_IOREMAP, + .type = PORT_SCIF, + .irqs = { 39, 40, 42, 0 }, + }, { + .flags = 0, + } +}; + +static struct platform_device sci_device = { + .name = "sh-sci", + .id = -1, + .dev = { + .platform_data = sci_platform_data, + }, +}; + +static struct platform_device *sh5_devices[] __initdata = { + &sci_device, +}; + +static int __init sh5_devices_setup(void) +{ + return platform_add_devices(sh5_devices, + ARRAY_SIZE(sh5_devices)); +} +__initcall(sh5_devices_setup); diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c index f4f5e8ad5be..7bfeaa09b94 100644 --- a/arch/sh/kernel/time_64.c +++ b/arch/sh/kernel/time_64.c @@ -243,12 +243,12 @@ void __init time_init(void) unsigned long interval; struct clk *clk; - tmu_base = onchip_remap(TMU_BASE, 1024, "TMU"); + tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024); if (!tmu_base) { panic("Unable to remap TMU\n"); } - rtc_base = onchip_remap(RTC_BASE, 1024, "RTC"); + rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024); if (!rtc_base) { panic("Unable to remap RTC\n"); } diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c index 31e1bb5effb..2331229f812 100644 --- a/arch/sh/mm/ioremap_64.c +++ b/arch/sh/mm/ioremap_64.c @@ -27,88 +27,17 @@ #include #include -static void shmedia_mapioaddr(unsigned long, unsigned long); -static unsigned long shmedia_ioremap(struct resource *, u32, int); - -/* - * Generic mapping function (not visible outside): - */ - -/* - * Remap an arbitrary physical address space into the kernel virtual - * address space. Needed when the kernel wants to access high addresses - * directly. - * - * NOTE! We need to allow non-page-aligned mappings too: we will obviously - * have to convert them into an offset in a page-aligned mapping, but the - * caller shouldn't need to know that small detail. - */ -void *__ioremap(unsigned long phys_addr, unsigned long size, - unsigned long flags) -{ - void * addr; - struct vm_struct * area; - unsigned long offset, last_addr; - pgprot_t pgprot; - - /* Don't allow wraparound or zero size */ - last_addr = phys_addr + size - 1; - if (!size || last_addr < phys_addr) - return NULL; - - pgprot = __pgprot(_PAGE_PRESENT | _PAGE_READ | - _PAGE_WRITE | _PAGE_DIRTY | - _PAGE_ACCESSED | _PAGE_SHARED | flags); - - /* - * Mappings have to be page-aligned - */ - offset = phys_addr & ~PAGE_MASK; - phys_addr &= PAGE_MASK; - size = PAGE_ALIGN(last_addr + 1) - phys_addr; - - /* - * Ok, go for it.. - */ - area = get_vm_area(size, VM_IOREMAP); - if (!area) - return NULL; - pr_debug("Get vm_area returns %p addr %p\n", area, area->addr); - area->phys_addr = phys_addr; - addr = area->addr; - if (ioremap_page_range((unsigned long)addr, (unsigned long)addr + size, - phys_addr, pgprot)) { - vunmap(addr); - return NULL; - } - return (void *) (offset + (char *)addr); -} -EXPORT_SYMBOL(__ioremap); - -void __iounmap(void *addr) -{ - struct vm_struct *area; - - vfree((void *) (PAGE_MASK & (unsigned long) addr)); - area = remove_vm_area((void *) (PAGE_MASK & (unsigned long) addr)); - if (!area) { - printk(KERN_ERR "iounmap: bad address %p\n", addr); - return; - } - - kfree(area); -} -EXPORT_SYMBOL(__iounmap); - static struct resource shmedia_iomap = { .name = "shmedia_iomap", .start = IOBASE_VADDR + PAGE_SIZE, .end = IOBASE_END - 1, }; -static void shmedia_mapioaddr(unsigned long pa, unsigned long va); +static void shmedia_mapioaddr(unsigned long pa, unsigned long va, + unsigned long flags); static void shmedia_unmapioaddr(unsigned long vaddr); -static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz); +static void __iomem *shmedia_ioremap(struct resource *res, u32 pa, + int sz, unsigned long flags); /* * We have the same problem as the SPARC, so lets have the same comment: @@ -130,18 +59,18 @@ static struct xresource xresv[XNRES]; static struct xresource *xres_alloc(void) { - struct xresource *xrp; - int n; + struct xresource *xrp; + int n; - xrp = xresv; - for (n = 0; n < XNRES; n++) { - if (xrp->xflag == 0) { - xrp->xflag = 1; - return xrp; - } - xrp++; - } - return NULL; + xrp = xresv; + for (n = 0; n < XNRES; n++) { + if (xrp->xflag == 0) { + xrp->xflag = 1; + return xrp; + } + xrp++; + } + return NULL; } static void xres_free(struct xresource *xrp) @@ -161,76 +90,71 @@ static struct resource *shmedia_find_resource(struct resource *root, return NULL; } -static unsigned long shmedia_alloc_io(unsigned long phys, unsigned long size, - const char *name) +static void __iomem *shmedia_alloc_io(unsigned long phys, unsigned long size, + const char *name, unsigned long flags) { - static int printed_full = 0; - struct xresource *xres; - struct resource *res; - char *tack; - int tlen; + static int printed_full; + struct xresource *xres; + struct resource *res; + char *tack; + int tlen; - if (name == NULL) name = "???"; + if (name == NULL) + name = "???"; - if ((xres = xres_alloc()) != 0) { - tack = xres->xname; - res = &xres->xres; - } else { - if (!printed_full) { - printk("%s: done with statics, switching to kmalloc\n", - __func__); - printed_full = 1; - } - tlen = strlen(name); - tack = kmalloc(sizeof (struct resource) + tlen + 1, GFP_KERNEL); - if (!tack) - return -ENOMEM; - memset(tack, 0, sizeof(struct resource)); - res = (struct resource *) tack; - tack += sizeof (struct resource); - } + xres = xres_alloc(); + if (xres != 0) { + tack = xres->xname; + res = &xres->xres; + } else { + if (!printed_full) { + printk(KERN_NOTICE "%s: done with statics, " + "switching to kmalloc\n", __func__); + printed_full = 1; + } + tlen = strlen(name); + tack = kmalloc(sizeof(struct resource) + tlen + 1, GFP_KERNEL); + if (!tack) + return NULL; + memset(tack, 0, sizeof(struct resource)); + res = (struct resource *) tack; + tack += sizeof(struct resource); + } - strncpy(tack, name, XNMLN); - tack[XNMLN] = 0; - res->name = tack; + strncpy(tack, name, XNMLN); + tack[XNMLN] = 0; + res->name = tack; - return shmedia_ioremap(res, phys, size); + return shmedia_ioremap(res, phys, size, flags); } -static unsigned long shmedia_ioremap(struct resource *res, u32 pa, int sz) +static void __iomem *shmedia_ioremap(struct resource *res, u32 pa, int sz, + unsigned long flags) { - unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK); + unsigned long offset = ((unsigned long) pa) & (~PAGE_MASK); unsigned long round_sz = (offset + sz + PAGE_SIZE-1) & PAGE_MASK; - unsigned long va; - unsigned int psz; + unsigned long va; + unsigned int psz; - if (allocate_resource(&shmedia_iomap, res, round_sz, + if (allocate_resource(&shmedia_iomap, res, round_sz, shmedia_iomap.start, shmedia_iomap.end, PAGE_SIZE, NULL, NULL) != 0) { - panic("alloc_io_res(%s): cannot occupy\n", - (res->name != NULL)? res->name: "???"); - } + panic("alloc_io_res(%s): cannot occupy\n", + (res->name != NULL) ? res->name : "???"); + } - va = res->start; - pa &= PAGE_MASK; + va = res->start; + pa &= PAGE_MASK; psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE; - /* log at boot time ... */ - printk("mapioaddr: %6s [%2d page%s] va 0x%08lx pa 0x%08x\n", - ((res->name != NULL) ? res->name : "???"), - psz, psz == 1 ? " " : "s", va, pa); + for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) { + shmedia_mapioaddr(pa, va, flags); + va += PAGE_SIZE; + pa += PAGE_SIZE; + } - for (psz = res->end - res->start + 1; psz != 0; psz -= PAGE_SIZE) { - shmedia_mapioaddr(pa, va); - va += PAGE_SIZE; - pa += PAGE_SIZE; - } - - res->start += offset; - res->end = res->start + sz - 1; /* not strictly necessary.. */ - - return res->start; + return (void __iomem *)(unsigned long)(res->start + offset); } static void shmedia_free_io(struct resource *res) @@ -249,14 +173,12 @@ static void shmedia_free_io(struct resource *res) static __init_refok void *sh64_get_page(void) { - extern int after_bootmem; void *page; - if (after_bootmem) { - page = (void *)get_zeroed_page(GFP_ATOMIC); - } else { + if (after_bootmem) + page = (void *)get_zeroed_page(GFP_KERNEL); + else page = alloc_bootmem_pages(PAGE_SIZE); - } if (!page || ((unsigned long)page & ~PAGE_MASK)) panic("sh64_get_page: Out of memory already?\n"); @@ -264,17 +186,20 @@ static __init_refok void *sh64_get_page(void) return page; } -static void shmedia_mapioaddr(unsigned long pa, unsigned long va) +static void shmedia_mapioaddr(unsigned long pa, unsigned long va, + unsigned long flags) { pgd_t *pgdp; pud_t *pudp; pmd_t *pmdp; pte_t *ptep, pte; pgprot_t prot; - unsigned long flags = 1; /* 1 = CB0-1 device */ pr_debug("shmedia_mapiopage pa %08lx va %08lx\n", pa, va); + if (!flags) + flags = 1; /* 1 = CB0-1 device */ + pgdp = pgd_offset_k(va); if (pgd_none(*pgdp) || !pgd_present(*pgdp)) { pudp = (pud_t *)sh64_get_page(); @@ -288,7 +213,7 @@ static void shmedia_mapioaddr(unsigned long pa, unsigned long va) } pmdp = pmd_offset(pudp, va); - if (pmd_none(*pmdp) || !pmd_present(*pmdp) ) { + if (pmd_none(*pmdp) || !pmd_present(*pmdp)) { ptep = (pte_t *)sh64_get_page(); set_pmd(pmdp, __pmd((unsigned long)ptep + _PAGE_TABLE)); } @@ -336,17 +261,19 @@ static void shmedia_unmapioaddr(unsigned long vaddr) pte_clear(&init_mm, vaddr, ptep); } -unsigned long onchip_remap(unsigned long phys, unsigned long size, const char *name) +void __iomem *__ioremap(unsigned long offset, unsigned long size, + unsigned long flags) { - if (size < PAGE_SIZE) - size = PAGE_SIZE; + char name[14]; - return shmedia_alloc_io(phys, size, name); + sprintf(name, "phys_%08x", (u32)offset); + return shmedia_alloc_io(offset, size, name, flags); } -EXPORT_SYMBOL(onchip_remap); +EXPORT_SYMBOL(__ioremap); -void onchip_unmap(unsigned long vaddr) +void __iounmap(void __iomem *virtual) { + unsigned long vaddr = (unsigned long)virtual & PAGE_MASK; struct resource *res; unsigned int psz; @@ -357,10 +284,7 @@ void onchip_unmap(unsigned long vaddr) return; } - psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE; - - printk(KERN_DEBUG "unmapioaddr: %6s [%2d page%s] freed\n", - res->name, psz, psz == 1 ? " " : "s"); + psz = (res->end - res->start + (PAGE_SIZE - 1)) / PAGE_SIZE; shmedia_free_io(res); @@ -371,9 +295,8 @@ void onchip_unmap(unsigned long vaddr) kfree(res); } } -EXPORT_SYMBOL(onchip_unmap); +EXPORT_SYMBOL(__iounmap); -#ifdef CONFIG_PROC_FS static int ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, void *data) @@ -385,7 +308,10 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, for (r = ((struct resource *)data)->child; r != NULL; r = r->sibling) { if (p + 32 >= e) /* Better than nothing */ break; - if ((nm = r->name) == 0) nm = "???"; + nm = r->name; + if (nm == NULL) + nm = "???"; + p += sprintf(p, "%08lx-%08lx: %s\n", (unsigned long)r->start, (unsigned long)r->end, nm); @@ -393,14 +319,11 @@ ioremap_proc_info(char *buf, char **start, off_t fpos, int length, int *eof, return p-buf; } -#endif /* CONFIG_PROC_FS */ static int __init register_proc_onchip(void) { -#ifdef CONFIG_PROC_FS - create_proc_read_entry("io_map",0,0, ioremap_proc_info, &shmedia_iomap); -#endif + create_proc_read_entry("io_map", 0, 0, ioremap_proc_info, + &shmedia_iomap); return 0; } - -__initcall(register_proc_onchip); +late_initcall(register_proc_onchip); diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index dbf5357a77b..728d6a062bf 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -985,13 +985,7 @@ static void sci_config_port(struct uart_port *port, int flags) port->type = s->type; if (port->flags & UPF_IOREMAP && !port->membase) { -#if defined(CONFIG_SUPERH64) - port->mapbase = onchip_remap(SCIF_ADDR_SH5, 1024, "SCIF"); - port->membase = (void __iomem *)port->mapbase; -#else port->membase = ioremap_nocache(port->mapbase, 0x40); -#endif - dev_err(port->dev, "can't remap port#%d\n", port->line); } } From c51279ec0d2d959e13831ae84b714301f0494f27 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 7 May 2009 18:17:20 +0900 Subject: [PATCH 0822/2061] sh: Kill off unused SH-5 irq_describe cruft. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-cayman/irq.c | 15 --------------- arch/sh/include/cpu-sh5/cpu/irq.h | 1 - arch/sh/kernel/cpu/irq/intc-sh5.c | 22 ---------------------- 3 files changed, 38 deletions(-) diff --git a/arch/sh/boards/mach-cayman/irq.c b/arch/sh/boards/mach-cayman/irq.c index dbb00c99132..33f77085631 100644 --- a/arch/sh/boards/mach-cayman/irq.c +++ b/arch/sh/boards/mach-cayman/irq.c @@ -142,21 +142,6 @@ int cayman_irq_demux(int evt) return irq; } -#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL) -int cayman_irq_describe(char* p, int irq) -{ - if (irq < NR_INTC_IRQS) { - return intc_irq_describe(p, irq); - } else if (irq < NR_INTC_IRQS + 8) { - return sprintf(p, "(SMSC %d)", irq - NR_INTC_IRQS); - } else if ((irq >= NR_INTC_IRQS + 24) && (irq < NR_INTC_IRQS + 32)) { - return sprintf(p, "(PCI2 %d)", irq - (NR_INTC_IRQS + 24)); - } - - return 0; -} -#endif - void init_cayman_irq(void) { int i; diff --git a/arch/sh/include/cpu-sh5/cpu/irq.h b/arch/sh/include/cpu-sh5/cpu/irq.h index f0f0756e6e8..0ccf257a72d 100644 --- a/arch/sh/include/cpu-sh5/cpu/irq.h +++ b/arch/sh/include/cpu-sh5/cpu/irq.h @@ -111,7 +111,6 @@ #define TOP_PRIORITY 15 extern int intc_evt_to_irq[(0xE20/0x20)+1]; -int intc_irq_describe(char* p, int irq); extern int platform_int_priority[NR_INTC_IRQS]; #endif /* __ASM_SH_CPU_SH5_IRQ_H */ diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c index 02d8137547e..a619eaf6e6b 100644 --- a/arch/sh/kernel/cpu/irq/intc-sh5.c +++ b/arch/sh/kernel/cpu/irq/intc-sh5.c @@ -160,28 +160,6 @@ void make_intc_irq(unsigned int irq) disable_intc_irq(irq); } -#if defined(CONFIG_PROC_FS) && defined(CONFIG_SYSCTL) -static int IRQ_to_vectorN[NR_INTC_IRQS] = { - 0x12, 0x15, 0x18, 0x1B, 0x40, 0x41, 0x42, 0x43, /* 0- 7 */ - -1, -1, -1, -1, 0x50, 0x51, 0x52, 0x53, /* 8-15 */ - 0x54, 0x55, 0x32, 0x33, 0x34, 0x35, 0x36, -1, /* 16-23 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 24-31 */ - 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x38, /* 32-39 */ - 0x39, 0x3A, 0x3B, -1, -1, -1, -1, -1, /* 40-47 */ - -1, -1, -1, -1, -1, -1, -1, -1, /* 48-55 */ - -1, -1, -1, -1, -1, -1, -1, 0x2B, /* 56-63 */ - -}; - -int intc_irq_describe(char* p, int irq) -{ - if (irq < NR_INTC_IRQS) - return sprintf(p, "(0x%3x)", IRQ_to_vectorN[irq]*0x20); - else - return 0; -} -#endif - void __init plat_irq_setup(void) { unsigned long long __dummy0, __dummy1=~0x00000000100000f0; From 29c8000ee7da3a6756d26143991e573eaaf2a9f6 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:13:42 -0400 Subject: [PATCH 0823/2061] ring-buffer: remove complex calculations in ring-buffer-test Ingo Molnar thought that the code to calculate the time in cond_resched is a bit too ugly and is not needed. This patch removes it and replaces it with a simple call to cond_resched. I kept the comment that explains the reason for the cond_resched. [ Impact: remove ugly code ] Reported-by: Ingo Molnar Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 37 ++++++---------------------- 1 file changed, 7 insertions(+), 30 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a26fc67b63b..f4ceb453c7d 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -185,35 +185,6 @@ static void ring_buffer_consumer(void) complete(&read_done); } -/* - * If we are a non preempt kernel, the 10 second run will - * stop everything while it runs. Instead, we will call cond_resched - * and also add any time that was lost by a rescedule. - */ -#ifdef CONFIG_PREEMPT -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ -} -#else -static void sched_if_needed(struct timeval *start_tv, struct timeval *end_tv) -{ - struct timeval tv; - - cond_resched(); - do_gettimeofday(&tv); - if (tv.tv_usec < end_tv->tv_usec) { - tv.tv_usec += 1000000; - tv.tv_sec--; - } - start_tv->tv_sec += tv.tv_sec - end_tv->tv_sec; - start_tv->tv_usec += tv.tv_usec - end_tv->tv_usec; - if (start_tv->tv_usec > 1000000) { - start_tv->tv_usec -= 1000000; - start_tv->tv_sec++; - } -} -#endif - static void ring_buffer_producer(void) { struct timeval start_tv; @@ -250,7 +221,13 @@ static void ring_buffer_producer(void) if (consumer && !(++cnt % wakeup_interval)) wake_up_process(consumer); - sched_if_needed(&start_tv, &end_tv); + /* + * If we are a non preempt kernel, the 10 second run will + * stop everything while it runs. Instead, we will call + * cond_resched and also add any time that was lost by a + * rescedule. + */ + cond_resched(); } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 11:49:35 -0400 Subject: [PATCH 0824/2061] tracing: append ":*" to internal setting of system events The system enabling of events uses the same code as the set_event file. It passes in the name of the system to the parser and that will enable all the events that has that system as a name. The problem is that it will also enable events with the same name as the system. If you have system name foo, and system name bar, but within the system bar, there exists an event called foo. By setting the system name foo, you will also be enabling the event foo in the system bar. This is not an expected result. The solution is to pass in "foo:*", which will only enable the system foo and not events called foo. [ Impact: prevent accidental enabling of events with same name as a system ] Reported-by: Li Zefan Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 87feb0117ce..8d0fae3af59 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -509,9 +509,11 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, return -EINVAL; } - command = kstrdup(system, GFP_KERNEL); + /* +3 for the ":*\0" */ + command = kmalloc(strlen(system)+3, GFP_KERNEL); if (!command) return -ENOMEM; + sprintf(command, "%s:*", system); ret = ftrace_set_clr_event(command, val); if (ret) @@ -1179,7 +1181,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*:*", &ftrace_system_enable_fops); + "*", &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ From 65b77242043f74bca6a0d733c0e48ef03a8c9893 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 12:49:27 -0400 Subject: [PATCH 0825/2061] tracing: have menu default enabled when kernel debug is configured Tracing can be very helpful to debug the kernel. When DEBUG_KERNEL is enabled it is nice to enable the trace menu as well. This patch only make the tracing menu enabled by default, it does not make any of the tracers enabled. And the menu is only enabled by default if DEBUG_KERNEL is enabled. [ Impact: show tracing options to those debugging the kernel ] Signed-off-by: Steven Rostedt --- kernel/trace/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 50f62a296e1..f61be301578 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -79,6 +79,7 @@ if TRACING_SUPPORT menuconfig FTRACE bool "Tracers" + default y if DEBUG_KERNEL help Enable the kernel tracing infrastructure. From 0574ea421b90e0e45a72c447dd3c2c79ffd8c153 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 14:20:28 -0400 Subject: [PATCH 0826/2061] ring-buffer: only periodically call cond_resched to ring-buffer-benchmark Calling cond_resched at every iteration of the loop adds a bit of overhead to the benchmark. This patch does two things. 1) only calls cond-resched when CONFIG_PREEMPT is not enabled 2) only calls cond-resched after so many traces has been performed. [ Impact: less overhead to the ring-buffer-benchmark ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index f4ceb453c7d..a7c048bb446 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -218,16 +218,23 @@ static void ring_buffer_producer(void) } do_gettimeofday(&end_tv); - if (consumer && !(++cnt % wakeup_interval)) + cnt++; + if (consumer && !(cnt % wakeup_interval)) wake_up_process(consumer); +#ifndef CONFIG_PREEMPT /* * If we are a non preempt kernel, the 10 second run will * stop everything while it runs. Instead, we will call * cond_resched and also add any time that was lost by a * rescedule. + * + * Do a cond resched at the same frequency we would wake up + * the reader. */ - cond_resched(); + if (cnt % wakeup_interval) + cond_resched(); +#endif } while (end_tv.tv_sec < (start_tv.tv_sec + RUN_TIME) && !kill_test); pr_info("End ring buffer hammer\n"); From 7da3046d6ce6ea97494020081c509b642b7016af Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:52:20 -0400 Subject: [PATCH 0827/2061] ring-buffer: add total count in ring-buffer-benchmark It is nice to see the overhead of the benchmark test when tracing is disabled. That is, we turn off the ring buffer just to see what the cost of running the loop that calls into the ring buffer is. Currently, if no entries wer made, we get 0. This is not informative. This patch changes it to check if we had any "missed" (non recorded) events. If so, a total count is also reported. [ Impact: evaluate the over head of the ring buffer benchmark test ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a7c048bb446..a21aa7b3d05 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -285,6 +285,17 @@ static void ring_buffer_producer(void) avg = 1000000 / hit; pr_info("%ld ns per entry\n", avg); } + + + if (missed) { + if (time) + missed /= (long)time; + + pr_info("Total iterations per millisec: %ld\n", hit + missed); + + avg = 1000000 / (hit + missed); + pr_info("%ld ns per entry\n", avg); + } } static void wait_to_die(void) From 74f4fd21664148b8c454cc07bfe74e4dd51cf07b Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 7 May 2009 19:58:55 -0400 Subject: [PATCH 0828/2061] ring-buffer: change WARN_ON from checking preempt_count to preemptible There's a WARN_ON in the ring buffer code that makes sure preemption is disabled. It checks "!preempt_count()". But when CONFIG_PREEMPT is not enabled, preempt_count() is always zero, and this will trigger the warning. [ Impact: prevent false warning on non preemptible kernels ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 3ae5ccf2c0f..361170609bd 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1688,7 +1688,7 @@ void ring_buffer_discard_commit(struct ring_buffer *buffer, * committed yet. Thus we can assume that preemption * is still disabled. */ - RB_WARN_ON(buffer, !preempt_count()); + RB_WARN_ON(buffer, preemptible()); cpu = smp_processor_id(); cpu_buffer = buffer->buffers[cpu]; From 5dafc91fca9135a7b0acb76d9709f4abfad92d6e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 7 May 2009 10:17:44 +0000 Subject: [PATCH 0829/2061] sh: sh7785 early scif fix This patch moves the SH4 case of EARLY_SCIF_CONSOLE_PORT so the SH7785 default value gets used. Without this patch the value for SH7785 is set to 0xffe80000. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig.debug | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index fdb049373e7..fafa907e14a 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -38,10 +38,10 @@ config EARLY_SCIF_CONSOLE_PORT default "0xffe00000" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7763 || \ CPU_SUBTYPE_SH7722 || CPU_SUBTYPE_SH7366 || \ CPU_SUBTYPE_SH7343 - default "0xffe80000" if CPU_SH4 default "0xffea0000" if CPU_SUBTYPE_SH7785 default "0xfffe8000" if CPU_SUBTYPE_SH7203 default "0xfffe9800" if CPU_SUBTYPE_SH7206 || CPU_SUBTYPE_SH7263 + default "0xffe80000" if CPU_SH4 default "0x00000000" config EARLY_PRINTK From b3cacf318172757783d8272fc333284dd4f50140 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 7 May 2009 10:31:39 +0000 Subject: [PATCH 0830/2061] sh: call clock framework init() callback once Make sure that clk->ops->init() only gets called once in the case of CLK_ALWAYS_ENABLED. Without this patch the init() callback may be called multiple times. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 1dc896483b5..099373ae57d 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -99,15 +99,18 @@ static int __clk_enable(struct clk *clk) * changes and the clock needs to hunt for the proper set of * divisors to use before it can effectively recalc. */ + + if (clk->flags & CLK_ALWAYS_ENABLED) { + kref_get(&clk->kref); + return 0; + } + if (unlikely(atomic_read(&clk->kref.refcount) == 1)) if (clk->ops && clk->ops->init) clk->ops->init(clk); kref_get(&clk->kref); - if (clk->flags & CLK_ALWAYS_ENABLED) - return 0; - if (likely(clk->ops && clk->ops->enable)) clk->ops->enable(clk); From 06ee846a25642c434c55209151c633f538c12022 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 7 May 2009 10:44:55 +0000 Subject: [PATCH 0831/2061] sh: r7785 highlander clock fixes Update the r7785 highlander defconfig to fix PCLK value and that mode4 is set high. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/configs/r7785rp_defconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index 890fe3cc914..5e677a8e3a3 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -269,7 +269,7 @@ CONFIG_SH_R7785RP=y # CONFIG_SH_TMU=y CONFIG_SH_TIMER_IRQ=28 -CONFIG_SH_PCLK_FREQ=50000000 +CONFIG_SH_PCLK_FREQ=33333333 CONFIG_TICK_ONESHOT=y CONFIG_NO_HZ=y CONFIG_HIGH_RES_TIMERS=y @@ -333,7 +333,7 @@ CONFIG_GUSA=y CONFIG_ZERO_PAGE_OFFSET=0x00001000 CONFIG_BOOT_LINK_OFFSET=0x00800000 CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" +CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high" # # Bus options From e367592cc93ac653e7bc0bebbc9bb713a77e2696 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 7 May 2009 10:55:37 +0000 Subject: [PATCH 0832/2061] sh: TMU platform data for sh7785 This patch adds TMU platform data for sh7785. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 1 + arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 204 +++++++++++++++++++++++++ 2 files changed, 205 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 02688d7909a..b773e7d1b06 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -393,6 +393,7 @@ config CPU_SUBTYPE_SH7785 select CPU_SHX2 select ARCH_SPARSEMEM_ENABLE select SYS_SUPPORTS_NUMA + select SYS_SUPPORTS_TMU config CPU_SUBTYPE_SH7786 bool "Support SH7786 processor" diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index d80802a49db..dc5d3e507a2 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -13,8 +13,191 @@ #include #include #include +#include #include +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 28, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 29, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 30, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffdc0008, + .end = 0xffdc0013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 96, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffdc0014, + .end = 0xffdc001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 97, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffdc0020, + .end = 0xffdc002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 98, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xffea0000, @@ -60,6 +243,12 @@ static struct platform_device sci_device = { }; static struct platform_device *sh7785_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, &sci_device, }; @@ -70,6 +259,21 @@ static int __init sh7785_devices_setup(void) } __initcall(sh7785_devices_setup); +static struct platform_device *sh7785_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7785_early_devices, + ARRAY_SIZE(sh7785_early_devices)); +} + enum { UNUSED = 0, From 4fa48e1774992fd03a4cd83a0f2adecb288c46f8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 15:28:15 +0900 Subject: [PATCH 0833/2061] sh: Enable new TMU driver support for all SH-3 and SH-4 CPUs. The TMU block is supported on all SH-3 and SH-4 subtypes, so just select it there, rather than conditionalizing it per subtype. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index b773e7d1b06..88eb118e9fa 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -168,6 +168,7 @@ config CPU_SH3 bool select CPU_HAS_INTEVT select CPU_HAS_SR_RB + select SYS_SUPPORTS_TMU config CPU_SH4 bool @@ -175,6 +176,7 @@ config CPU_SH4 select CPU_HAS_SR_RB select CPU_HAS_PTEA if !CPU_SH4A || CPU_SHX2 select CPU_HAS_FPU if !CPU_SH4AL_DSP + select SYS_SUPPORTS_TMU config CPU_SH4A bool @@ -393,7 +395,6 @@ config CPU_SUBTYPE_SH7785 select CPU_SHX2 select ARCH_SPARSEMEM_ENABLE select SYS_SUPPORTS_NUMA - select SYS_SUPPORTS_TMU config CPU_SUBTYPE_SH7786 bool "Support SH7786 processor" @@ -428,7 +429,6 @@ config CPU_SUBTYPE_SH7722 select ARCH_SPARSEMEM_ENABLE select SYS_SUPPORTS_NUMA select SYS_SUPPORTS_CMT - select SYS_SUPPORTS_TMU config CPU_SUBTYPE_SH7366 bool "Support SH7366 processor" From c2ecb4c4a7da16288062a057b693b7b1e16aaf88 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 15:39:25 +0900 Subject: [PATCH 0834/2061] sh: Move out rtc-sh registration from time_64.c to setup-sh5.c Now that the onchip_remap() mess is sorted out, the rtc-sh support code for SH-5 can follow the same approach as the other CPUs. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh5/setup-sh5.c | 31 ++++++++++++++++++ arch/sh/kernel/time_64.c | 52 +----------------------------- 2 files changed, 32 insertions(+), 51 deletions(-) diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index d8d59fea108..9a362c8f3fe 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -34,8 +34,39 @@ static struct platform_device sci_device = { }, }; +static struct resource rtc_resources[] = { + [0] = { + .start = PHYS_PERIPHERAL_BLOCK + 0x01040000, + .end = PHYS_PERIPHERAL_BLOCK + 0x01040000 + 0x58 - 1, + .flags = IORESOURCE_IO, + }, + [1] = { + /* Period IRQ */ + .start = IRQ_PRI, + .flags = IORESOURCE_IRQ, + }, + [2] = { + /* Carry IRQ */ + .start = IRQ_CUI, + .flags = IORESOURCE_IRQ, + }, + [3] = { + /* Alarm IRQ */ + .start = IRQ_ATI, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device rtc_device = { + .name = "sh-rtc", + .id = -1, + .num_resources = ARRAY_SIZE(rtc_resources), + .resource = rtc_resources, +}; + static struct platform_device *sh5_devices[] __initdata = { &sci_device, + &rtc_device, }; static int __init sh5_devices_setup(void) diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c index 7bfeaa09b94..b4fe770e2a3 100644 --- a/arch/sh/kernel/time_64.c +++ b/arch/sh/kernel/time_64.c @@ -46,12 +46,6 @@ #define TMU_TSTR_INIT 1 #define TMU_TSTR_OFF 0 -/* Real Time Clock */ -#define RTC_BLOCK_OFF 0x01040000 -#define RTC_BASE PHYS_PERIPHERAL_BLOCK + RTC_BLOCK_OFF -#define RTC_RCR1_CIE 0x10 /* Carry Interrupt Enable */ -#define RTC_RCR1 (rtc_base + 0x38) - /* Time Management Unit */ #define TMU_BLOCK_OFF 0x01020000 #define TMU_BASE PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF @@ -68,8 +62,7 @@ #define TICK_SIZE (tick_nsec / 1000) -static unsigned long tmu_base, rtc_base; -unsigned long cprc_base; +static unsigned long tmu_base; /* Variables to allow interpolation of time of day to resolution better than a * jiffy. */ @@ -248,11 +241,6 @@ void __init time_init(void) panic("Unable to remap TMU\n"); } - rtc_base = (unsigned long)ioremap_nocache(RTC_BASE, 1024); - if (!rtc_base) { - panic("Unable to remap RTC\n"); - } - clk = clk_get(NULL, "cpu_clk"); scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) / (unsigned long long)(clk_get_rate(clk) / HZ)); @@ -274,41 +262,3 @@ void __init time_init(void) ctrl_outl(interval, TMU0_TCNT); ctrl_outb(TMU_TSTR_INIT, TMU_TSTR); } - -static struct resource rtc_resources[] = { - [0] = { - /* RTC base, filled in by rtc_init */ - .flags = IORESOURCE_IO, - }, - [1] = { - /* Period IRQ */ - .start = IRQ_PRI, - .flags = IORESOURCE_IRQ, - }, - [2] = { - /* Carry IRQ */ - .start = IRQ_CUI, - .flags = IORESOURCE_IRQ, - }, - [3] = { - /* Alarm IRQ */ - .start = IRQ_ATI, - .flags = IORESOURCE_IRQ, - }, -}; - -static struct platform_device rtc_device = { - .name = "sh-rtc", - .id = -1, - .num_resources = ARRAY_SIZE(rtc_resources), - .resource = rtc_resources, -}; - -static int __init rtc_init(void) -{ - rtc_resources[0].start = rtc_base; - rtc_resources[0].end = rtc_resources[0].start + 0x58 - 1; - - return platform_device_register(&rtc_device); -} -device_initcall(rtc_init); From add47067a8ca324e9d26c4373350dc3cce7f1e7f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:12:17 +0900 Subject: [PATCH 0835/2061] sh: Finish the sh64 migration off of ARCH_USES_GETTIMEOFFSET. This adds sh_tmu support to the SH-5 subtypes, which subsequently allows us to kill off time_64.c and use the now generic time_32.c. As a bonus, SH-5 now supports highres timers and tickless for the first time. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 5 +- arch/sh/kernel/Makefile_64 | 2 +- arch/sh/kernel/cpu/sh5/setup-sh5.c | 118 +++++++++++++ arch/sh/kernel/time_64.c | 264 ----------------------------- 4 files changed, 120 insertions(+), 269 deletions(-) delete mode 100644 arch/sh/kernel/time_64.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 88eb118e9fa..ca5c09b241c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -141,10 +141,6 @@ config ARCH_NO_VIRT_TO_BUS config ARCH_HAS_DEFAULT_IDLE def_bool y -config ARCH_USES_GETTIMEOFFSET - def_bool y - depends on SUPERH64 - config IO_TRAPPED bool @@ -190,6 +186,7 @@ config CPU_SH4AL_DSP config CPU_SH5 bool select CPU_HAS_FPU + select SYS_SUPPORTS_TMU config CPU_SHX2 bool diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index d256c774695..4667d4cac95 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -2,7 +2,7 @@ extra-y := head_64.o init_task.o vmlinux.lds obj-y := debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \ ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \ - syscalls_64.o time_64.o topology.o traps.o traps_64.o + syscalls_64.o time_32.o topology.o traps.o traps_64.o obj-y += cpu/ timers/ obj-$(CONFIG_VSYSCALL) += vsyscall/ diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index 9a362c8f3fe..678d69bdebb 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -13,6 +13,7 @@ #include #include #include +#include #include static struct plat_sci_port sci_platform_data[] = { @@ -64,6 +65,110 @@ static struct platform_device rtc_device = { .resource = rtc_resources, }; +#define TMU_BLOCK_OFF 0x01020000 +#define TMU_BASE PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF +#define TMU0_BASE (TMU_BASE + 0x8 + (0xc * 0x0)) +#define TMU1_BASE (TMU_BASE + 0x8 + (0xc * 0x1)) +#define TMU2_BASE (TMU_BASE + 0x8 + (0xc * 0x2)) + +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = TMU0_BASE, + .end = TMU0_BASE + 0xc - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_TUNI0, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = TMU1_BASE, + .end = TMU1_BASE + 0xc - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_TUNI1, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = TMU2_BASE, + .end = TMU2_BASE + 0xc - 1, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = IRQ_TUNI2, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct platform_device *sh5_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + static struct platform_device *sh5_devices[] __initdata = { &sci_device, &rtc_device, @@ -71,7 +176,20 @@ static struct platform_device *sh5_devices[] __initdata = { static int __init sh5_devices_setup(void) { + int ret; + + ret = platform_add_devices(sh5_early_devices, + ARRAY_SIZE(sh5_early_devices)); + if (unlikely(ret != 0)) + return ret; + return platform_add_devices(sh5_devices, ARRAY_SIZE(sh5_devices)); } __initcall(sh5_devices_setup); + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh5_early_devices, + ARRAY_SIZE(sh5_early_devices)); +} diff --git a/arch/sh/kernel/time_64.c b/arch/sh/kernel/time_64.c deleted file mode 100644 index b4fe770e2a3..00000000000 --- a/arch/sh/kernel/time_64.c +++ /dev/null @@ -1,264 +0,0 @@ -/* - * arch/sh/kernel/time_64.c - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * Copyright (C) 2003 - 2007 Paul Mundt - * Copyright (C) 2003 Richard Curnow - * - * Original TMU/RTC code taken from sh version. - * Copyright (C) 1999 Tetsuya Okada & Niibe Yutaka - * Some code taken from i386 version. - * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include /* required by inline __asm__ stmt. */ -#include -#include -#include -#include -#include -#include - -#define TMU_TOCR_INIT 0x00 -#define TMU0_TCR_INIT 0x0020 -#define TMU_TSTR_INIT 1 -#define TMU_TSTR_OFF 0 - -/* Time Management Unit */ -#define TMU_BLOCK_OFF 0x01020000 -#define TMU_BASE PHYS_PERIPHERAL_BLOCK + TMU_BLOCK_OFF -#define TMU0_BASE tmu_base + 0x8 + (0xc * 0x0) -#define TMU1_BASE tmu_base + 0x8 + (0xc * 0x1) -#define TMU2_BASE tmu_base + 0x8 + (0xc * 0x2) - -#define TMU_TOCR tmu_base+0x0 /* Byte access */ -#define TMU_TSTR tmu_base+0x4 /* Byte access */ - -#define TMU0_TCOR TMU0_BASE+0x0 /* Long access */ -#define TMU0_TCNT TMU0_BASE+0x4 /* Long access */ -#define TMU0_TCR TMU0_BASE+0x8 /* Word access */ - -#define TICK_SIZE (tick_nsec / 1000) - -static unsigned long tmu_base; - -/* Variables to allow interpolation of time of day to resolution better than a - * jiffy. */ - -/* This is effectively protected by xtime_lock */ -static unsigned long ctc_last_interrupt; -static unsigned long long usecs_per_jiffy = 1000000/HZ; /* Approximation */ - -#define CTC_JIFFY_SCALE_SHIFT 40 - -/* 2**CTC_JIFFY_SCALE_SHIFT / ctc_ticks_per_jiffy */ -static unsigned long long scaled_recip_ctc_ticks_per_jiffy; - -/* Estimate number of microseconds that have elapsed since the last timer tick, - by scaling the delta that has occurred in the CTC register. - - WARNING WARNING WARNING : This algorithm relies on the CTC decrementing at - the CPU clock rate. If the CPU sleeps, the CTC stops counting. Bear this - in mind if enabling SLEEP_WORKS in process.c. In that case, this algorithm - probably needs to use TMU.TCNT0 instead. This will work even if the CPU is - sleeping, though will be coarser. - - FIXME : What if usecs_per_tick is moving around too much, e.g. if an adjtime - is running or if the freq or tick arguments of adjtimex are modified after - we have calibrated the scaling factor? This will result in either a jump at - the end of a tick period, or a wrap backwards at the start of the next one, - if the application is reading the time of day often enough. I think we - ought to do better than this. For this reason, usecs_per_jiffy is left - separated out in the calculation below. This allows some future hook into - the adjtime-related stuff in kernel/timer.c to remove this hazard. - -*/ - -static unsigned long usecs_since_tick(void) -{ - unsigned long long current_ctc; - long ctc_ticks_since_interrupt; - unsigned long long ull_ctc_ticks_since_interrupt; - unsigned long result; - - unsigned long long mul1_out; - unsigned long long mul1_out_high; - unsigned long long mul2_out_low, mul2_out_high; - - /* Read CTC register */ - asm ("getcon cr62, %0" : "=r" (current_ctc)); - /* Note, the CTC counts down on each CPU clock, not up. - Note(2), use long type to get correct wraparound arithmetic when - the counter crosses zero. */ - ctc_ticks_since_interrupt = (long) ctc_last_interrupt - (long) current_ctc; - ull_ctc_ticks_since_interrupt = (unsigned long long) ctc_ticks_since_interrupt; - - /* Inline assembly to do 32x32x32->64 multiplier */ - asm volatile ("mulu.l %1, %2, %0" : - "=r" (mul1_out) : - "r" (ull_ctc_ticks_since_interrupt), "r" (usecs_per_jiffy)); - - mul1_out_high = mul1_out >> 32; - - asm volatile ("mulu.l %1, %2, %0" : - "=r" (mul2_out_low) : - "r" (mul1_out), "r" (scaled_recip_ctc_ticks_per_jiffy)); - -#if 1 - asm volatile ("mulu.l %1, %2, %0" : - "=r" (mul2_out_high) : - "r" (mul1_out_high), "r" (scaled_recip_ctc_ticks_per_jiffy)); -#endif - - result = (unsigned long) (((mul2_out_high << 32) + mul2_out_low) >> CTC_JIFFY_SCALE_SHIFT); - - return result; -} - -u32 arch_gettimeoffset(void) -{ - return usecs_since_tick() * 1000; -} - -/* Dummy RTC ops */ -static void null_rtc_get_time(struct timespec *tv) -{ - tv->tv_sec = mktime(2000, 1, 1, 0, 0, 0); - tv->tv_nsec = 0; -} - -static int null_rtc_set_time(const time_t secs) -{ - return 0; -} - -void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; -int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; - -/* last time the RTC clock got updated */ -static long last_rtc_update; - -/* - * timer_interrupt() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -static inline void do_timer_interrupt(void) -{ - unsigned long long current_ctc; - - if (current->pid) - profile_tick(CPU_PROFILING); - - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - asm ("getcon cr62, %0" : "=r" (current_ctc)); - ctc_last_interrupt = (unsigned long) current_ctc; - - do_timer(1); - - /* - * If we have an externally synchronized Linux clock, then update - * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if (ntp_synced() && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { - if (rtc_sh_set_time(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60 s */ - last_rtc_update = xtime.tv_sec - 600; - } - write_sequnlock(&xtime_lock); - -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif -} - -/* - * This is the same as the above, except we _also_ save the current - * Time Stamp Counter value at the time of the timer interrupt, so that - * we later on can estimate the time of day more exactly. - */ -static irqreturn_t timer_interrupt(int irq, void *dev_id) -{ - unsigned long timer_status; - - /* Clear UNF bit */ - timer_status = ctrl_inw(TMU0_TCR); - timer_status &= ~0x100; - ctrl_outw(timer_status, TMU0_TCR); - - do_timer_interrupt(); - - return IRQ_HANDLED; -} - -static struct irqaction irq0 = { - .handler = timer_interrupt, - .flags = IRQF_DISABLED, - .name = "timer", -}; - -void __init time_init(void) -{ - unsigned long interval; - struct clk *clk; - - tmu_base = (unsigned long)ioremap_nocache(TMU_BASE, 1024); - if (!tmu_base) { - panic("Unable to remap TMU\n"); - } - - clk = clk_get(NULL, "cpu_clk"); - scaled_recip_ctc_ticks_per_jiffy = ((1ULL << CTC_JIFFY_SCALE_SHIFT) / - (unsigned long long)(clk_get_rate(clk) / HZ)); - - rtc_sh_get_time(&xtime); - - setup_irq(TIMER_IRQ, &irq0); - - clk = clk_get(NULL, "module_clk"); - interval = (clk_get_rate(clk)/(HZ*4)); - - printk("Interval = %ld\n", interval); - - /* Start TMU0 */ - ctrl_outb(TMU_TSTR_OFF, TMU_TSTR); - ctrl_outb(TMU_TOCR_INIT, TMU_TOCR); - ctrl_outw(TMU0_TCR_INIT, TMU0_TCR); - ctrl_outl(interval, TMU0_TCOR); - ctrl_outl(interval, TMU0_TCNT); - ctrl_outb(TMU_TSTR_INIT, TMU_TSTR); -} From b179b72fad5c88c3b616fb88a9ae7cbbc1a750d3 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:17:36 +0900 Subject: [PATCH 0836/2061] sh: Rename arch/sh/kernel/time_32.c to arch/sh/kernel/time.c. This is now fully generic, and used both by _32 and _64 variants. Rename it accordingly. Signed-off-by: Paul Mundt --- arch/sh/kernel/Makefile_32 | 2 +- arch/sh/kernel/Makefile_64 | 2 +- arch/sh/kernel/{time_32.c => time.c} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename arch/sh/kernel/{time_32.c => time.c} (100%) diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32 index 5b1d4d0fe04..aee08afe5ff 100644 --- a/arch/sh/kernel/Makefile_32 +++ b/arch/sh/kernel/Makefile_32 @@ -11,7 +11,7 @@ endif obj-y := debugtraps.o idle.o io.o io_generic.o irq.o \ machvec.o process_32.o ptrace_32.o setup.o signal_32.o \ - sys_sh.o sys_sh32.o syscalls_32.o time_32.o topology.o \ + sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o \ traps.o traps_32.o obj-y += cpu/ timers/ diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index 4667d4cac95..31f6ebf375e 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -2,7 +2,7 @@ extra-y := head_64.o init_task.o vmlinux.lds obj-y := debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \ ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \ - syscalls_64.o time_32.o topology.o traps.o traps_64.o + syscalls_64.o time.o topology.o traps.o traps_64.o obj-y += cpu/ timers/ obj-$(CONFIG_VSYSCALL) += vsyscall/ diff --git a/arch/sh/kernel/time_32.c b/arch/sh/kernel/time.c similarity index 100% rename from arch/sh/kernel/time_32.c rename to arch/sh/kernel/time.c From 6d134b9e8d3f32331ad2faca2db8186f54198931 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:36:13 +0900 Subject: [PATCH 0837/2061] sh: Wire up GENERIC_CMOS_UPDATE for the platforms that need it. Now that everything has converted over to generic timekeeping, we need an alternate method for keeping the RTC updated for those platforms that are still using the rtc_sh_get/set_time pairs, presently limited to SH-03 and the Dreamcast. This wires up the GENERIC_CMOS_UPDATE hooks for those to maintain the same behaviour. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 4 ++++ arch/sh/kernel/time.c | 14 ++++++++++++++ 2 files changed, 18 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ca5c09b241c..d88a61b11d3 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -83,6 +83,10 @@ config GENERIC_CLOCKEVENTS config GENERIC_CLOCKEVENTS_BROADCAST bool +config GENERIC_CMOS_UPDATE + def_bool y + depends on SH_SH03 || SH_DREAMCAST + config GENERIC_LOCKBREAK def_bool y depends on SMP && PREEMPT diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index d41ca4cf20c..77b841a99c0 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -46,6 +46,20 @@ static int null_rtc_set_time(const time_t secs) void (*rtc_sh_get_time)(struct timespec *) = null_rtc_get_time; int (*rtc_sh_set_time)(const time_t) = null_rtc_set_time; +#ifdef CONFIG_GENERIC_CMOS_UPDATE +unsigned long read_persistent_clock(void) +{ + struct timespec tv; + rtc_sh_get_time(&tv); + return tv.tv_sec; +} + +int update_persistent_clock(struct timespec now) +{ + return rtc_sh_set_time(now.tv_sec); +} +#endif + unsigned int get_rtc_time(struct rtc_time *tm) { if (rtc_sh_get_time != null_rtc_get_time) { From 5ac5496411b30d41945a996fe7a7fb5abccf2aaa Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:44:00 +0900 Subject: [PATCH 0838/2061] sh: Kill off dead handle_timer_tick() code. Nothing is using this anymore now that we have fully converted to generic time, so kill it off completely. Signed-off-by: Paul Mundt --- arch/sh/include/asm/timer.h | 5 ---- arch/sh/kernel/time.c | 53 +++---------------------------------- 2 files changed, 4 insertions(+), 54 deletions(-) diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index cb16645d839..6c0851188c0 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -18,17 +18,12 @@ struct sys_timer { struct sys_timer_ops *ops; }; -#define TICK_SIZE (tick_nsec / 1000) - extern struct sys_timer tmu_timer; extern struct sys_timer *sys_timer; /* arch/sh/kernel/timers/timer.c */ struct sys_timer *get_sys_timer(void); -/* arch/sh/kernel/time.c */ -void handle_timer_tick(void); - extern struct clocksource clocksource_sh; #endif /* __ASM_SH_TIMER_H */ diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index 77b841a99c0..f4c304adec4 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -1,13 +1,14 @@ /* - * arch/sh/kernel/time_32.c + * arch/sh/kernel/time.c * * Copyright (C) 1999 Tetsuya Okada & Niibe Yutaka * Copyright (C) 2000 Philipp Rumpf * Copyright (C) 2002 - 2009 Paul Mundt * Copyright (C) 2002 M. R. Brown * - * Some code taken from i386 version. - * Copyright (C) 1991, 1992, 1995 Linus Torvalds + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. */ #include #include @@ -23,7 +24,6 @@ #include #include #include -#include struct sys_timer *sys_timer; @@ -97,50 +97,6 @@ static int __init rtc_generic_init(void) } module_init(rtc_generic_init); -/* last time the RTC clock got updated */ -static long last_rtc_update; - -/* - * handle_timer_tick() needs to keep up the real-time clock, - * as well as call the "do_timer()" routine every clocktick - */ -void handle_timer_tick(void) -{ - if (current->pid) - profile_tick(CPU_PROFILING); - - /* - * Here we are in the timer irq handler. We just have irqs locally - * disabled but we don't know if the timer_bh is running on the other - * CPU. We need to avoid to SMP race with it. NOTE: we don' t need - * the irq version of write_lock because as just said we have irq - * locally disabled. -arca - */ - write_seqlock(&xtime_lock); - do_timer(1); - - /* - * If we have an externally synchronized Linux clock, then update - * RTC clock accordingly every ~11 minutes. Set_rtc_mmss() has to be - * called as close as possible to 500 ms before the new second starts. - */ - if (ntp_synced() && - xtime.tv_sec > last_rtc_update + 660 && - (xtime.tv_nsec / 1000) >= 500000 - ((unsigned) TICK_SIZE) / 2 && - (xtime.tv_nsec / 1000) <= 500000 + ((unsigned) TICK_SIZE) / 2) { - if (rtc_sh_set_time(xtime.tv_sec) == 0) - last_rtc_update = xtime.tv_sec; - else - /* do it again in 60s */ - last_rtc_update = xtime.tv_sec - 600; - } - write_sequnlock(&xtime_lock); - -#ifndef CONFIG_SMP - update_process_times(user_mode(get_irq_regs())); -#endif -} - #ifdef CONFIG_PM int timer_suspend(struct sys_device *dev, pm_message_t state) { @@ -242,4 +198,3 @@ void __init time_init(void) late_time_init = sh_late_time_init; } - From 6459d7bb72e9767bc7d22f2ee44aab35188e4b8a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:47:48 +0900 Subject: [PATCH 0839/2061] sh: Kill off dead timer sysclass pm hooks. With the conversion to generic clockevents these are completely unused, so just kill it off. Signed-off-by: Paul Mundt --- arch/sh/kernel/time.c | 45 ------------------------------------------- 1 file changed, 45 deletions(-) diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index f4c304adec4..c26576a5a45 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -97,51 +97,6 @@ static int __init rtc_generic_init(void) } module_init(rtc_generic_init); -#ifdef CONFIG_PM -int timer_suspend(struct sys_device *dev, pm_message_t state) -{ - struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev); - - sys_timer->ops->stop(); - - return 0; -} - -int timer_resume(struct sys_device *dev) -{ - struct sys_timer *sys_timer = container_of(dev, struct sys_timer, dev); - - sys_timer->ops->start(); - - return 0; -} -#else -#define timer_suspend NULL -#define timer_resume NULL -#endif - -static struct sysdev_class timer_sysclass = { - .name = "timer", - .suspend = timer_suspend, - .resume = timer_resume, -}; - -static int __init timer_init_sysfs(void) -{ - int ret; - - if (!sys_timer) - return 0; - - ret = sysdev_class_register(&timer_sysclass); - if (ret != 0) - return ret; - - sys_timer->dev.cls = &timer_sysclass; - return sysdev_register(&sys_timer->dev); -} -device_initcall(timer_init_sysfs); - void (*board_time_init)(void); struct clocksource clocksource_sh = { From cd1408f22d2fd8f0d09c082b07cf0b9bcfb6eac9 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:57:35 +0900 Subject: [PATCH 0840/2061] sh: mach-sh03: Give the sh03 rtc its own spinlock. This converts the sh03 rtc code off of using the global rtc_lock and on to its own spinlock. There are no other possible users of the rtc_lock, so serializing with it is not necessary. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-sh03/rtc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/sh/boards/mach-sh03/rtc.c b/arch/sh/boards/mach-sh03/rtc.c index 0a9266bb51c..a8b9f844ab5 100644 --- a/arch/sh/boards/mach-sh03/rtc.c +++ b/arch/sh/boards/mach-sh03/rtc.c @@ -35,13 +35,13 @@ #define RTC_BUSY 1 #define RTC_STOP 2 -extern spinlock_t rtc_lock; +static DEFINE_SPINLOCK(sh03_rtc_lock); unsigned long get_cmos_time(void) { unsigned int year, mon, day, hour, min, sec; - spin_lock(&rtc_lock); + spin_lock(&sh03_rtc_lock); again: do { sec = (ctrl_inb(RTC_SEC1) & 0xf) + (ctrl_inb(RTC_SEC10) & 0x7) * 10; @@ -73,7 +73,7 @@ unsigned long get_cmos_time(void) goto again; } - spin_unlock(&rtc_lock); + spin_unlock(&sh03_rtc_lock); return mktime(year, mon, day, hour, min, sec); } @@ -91,7 +91,7 @@ static int set_rtc_mmss(unsigned long nowtime) int i; /* gets recalled with irq locally disabled */ - spin_lock(&rtc_lock); + spin_lock(&sh03_rtc_lock); for (i = 0 ; i < 1000000 ; i++) /* may take up to 1 second... */ if (!(ctrl_inb(RTC_CTL) & RTC_BUSY)) break; @@ -113,7 +113,7 @@ static int set_rtc_mmss(unsigned long nowtime) cmos_minutes, real_minutes); retval = -1; } - spin_unlock(&rtc_lock); + spin_unlock(&sh03_rtc_lock); return retval; } From 1af2fe45fec15bdba446c22b9b602699cdabfc9f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 16:59:05 +0900 Subject: [PATCH 0841/2061] sh: Kill off the global rtc_lock with extreme prejudice. Now that all of the possible users for rtc_lock have gone away, it is no longer necessary to keep this lock definition around. This follows several other architectures that have either recently dropped it or never supported it in the first place. Signed-off-by: Paul Mundt --- arch/sh/kernel/time.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index c26576a5a45..e0aa769481f 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -17,7 +17,6 @@ #include #include #include -#include /* for rtc_lock */ #include #include #include @@ -27,10 +26,6 @@ struct sys_timer *sys_timer; -/* Move this somewhere more sensible.. */ -DEFINE_SPINLOCK(rtc_lock); -EXPORT_SYMBOL(rtc_lock); - /* Dummy RTC ops */ static void null_rtc_get_time(struct timespec *tv) { From cb3a86c89ebdf917b88665f70e06863986fbab5c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 17:25:35 +0900 Subject: [PATCH 0842/2061] sh: Kill off sh64's hand-rolled syscall tracer. This is no longer necessary, as there are now sufficient generic alternatives available. Signed-off-by: Paul Mundt --- arch/sh/Kconfig.debug | 8 -- arch/sh/Makefile | 34 +-------- arch/sh/kernel/cpu/sh5/entry.S | 57 -------------- arch/sh/lib64/.gitignore | 1 - arch/sh/lib64/dbg.c | 134 --------------------------------- 5 files changed, 2 insertions(+), 232 deletions(-) delete mode 100644 arch/sh/lib64/.gitignore diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index fafa907e14a..44627ef5254 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -137,12 +137,4 @@ config SH64_SR_WATCH bool "Debug: set SR.WATCH to enable hardware watchpoints and trace" depends on SUPERH64 -config POOR_MANS_STRACE - bool "Debug: enable rudimentary strace facility" - depends on SUPERH64 - help - This option allows system calls to be traced to the console. It also - aids in detecting kernel stack underflow. It is useful for debugging - early-userland problems (e.g. init incurring fatal exceptions.) - endmenu diff --git a/arch/sh/Makefile b/arch/sh/Makefile index bece1f7535f..493da979eff 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -193,7 +193,7 @@ zImage uImage uImage.srec vmlinux.srec: vmlinux compressed: zImage -archprepare: maketools arch/sh/lib64/syscalltab.h +archprepare: maketools archclean: $(Q)$(MAKE) $(clean)=$(boot) @@ -205,34 +205,4 @@ define archhelp @echo ' uImage.srec - Create an S-record for U-Boot' endef -define filechk_gen-syscalltab - (set -e; \ - echo "/*"; \ - echo " * DO NOT MODIFY."; \ - echo " *"; \ - echo " * This file was generated by arch/sh/Makefile"; \ - echo " * Any changes will be reverted at build time."; \ - echo " */"; \ - echo ""; \ - echo "#ifndef __SYSCALLTAB_H"; \ - echo "#define __SYSCALLTAB_H"; \ - echo ""; \ - echo "#include "; \ - echo ""; \ - echo "struct syscall_info {"; \ - echo " const char *name;"; \ - echo "} syscall_info_table[] = {"; \ - sed -e '/^.*\.long /!d;s// { "/;s/\(\([^/]*\)\/\)\{1\}.*/\2/; \ - s/[ \t]*$$//g;s/$$/" },/;s/\("\)sys_/\1/g'; \ - echo "};"; \ - echo ""; \ - echo "#define NUM_SYSCALL_INFO_ENTRIES ARRAY_SIZE(syscall_info_table)";\ - echo ""; \ - echo "#endif /* __SYSCALLTAB_H */" ) -endef - -arch/sh/lib64/syscalltab.h: arch/sh/kernel/syscalls_64.S - $(call filechk,gen-syscalltab) - -CLEAN_FILES += arch/sh/lib64/syscalltab.h \ - include/asm-sh/machtypes.h +CLEAN_FILES += include/asm-sh/machtypes.h diff --git a/arch/sh/kernel/cpu/sh5/entry.S b/arch/sh/kernel/cpu/sh5/entry.S index 516297515b2..b0aacf67525 100644 --- a/arch/sh/kernel/cpu/sh5/entry.S +++ b/arch/sh/kernel/cpu/sh5/entry.S @@ -812,27 +812,6 @@ no_underflow: ! exceptions add SP, ZERO, r14 -#ifdef CONFIG_POOR_MANS_STRACE - /* We've pushed all the registers now, so only r2-r4 hold anything - * useful. Move them into callee save registers */ - or r2, ZERO, r28 - or r3, ZERO, r29 - or r4, ZERO, r30 - - /* Preserve r2 as the event code */ - movi evt_debug, r3 - ori r3, 1, r3 - ptabs r3, tr0 - - or SP, ZERO, r6 - getcon TRA, r5 - blink tr0, LINK - - or r28, ZERO, r2 - or r29, ZERO, r3 - or r30, ZERO, r4 -#endif - /* For syscall and debug race condition, get TRA now */ getcon TRA, r5 @@ -887,11 +866,6 @@ no_underflow: */ .global ret_from_irq ret_from_irq: -#ifdef CONFIG_POOR_MANS_STRACE - pta evt_debug_ret_from_irq, tr0 - ori SP, 0, r2 - blink tr0, LINK -#endif ld.q SP, FRAME_S(FSSR), r6 shlri r6, 30, r6 andi r6, 1, r6 @@ -905,12 +879,6 @@ ret_from_irq: ret_from_exception: preempt_stop() -#ifdef CONFIG_POOR_MANS_STRACE - pta evt_debug_ret_from_exc, tr0 - ori SP, 0, r2 - blink tr0, LINK -#endif - ld.q SP, FRAME_S(FSSR), r6 shlri r6, 30, r6 andi r6, 1, r6 @@ -1236,18 +1204,6 @@ syscall_bad: .global syscall_ret syscall_ret: st.q SP, FRAME_R(9), r2 /* Expecting SP back to BASIC frame */ - -#ifdef CONFIG_POOR_MANS_STRACE - /* nothing useful in registers at this point */ - - movi evt_debug2, r5 - ori r5, 1, r5 - ptabs r5, tr0 - ld.q SP, FRAME_R(9), r2 - or SP, ZERO, r3 - blink tr0, LINK -#endif - ld.q SP, FRAME_S(FSPC), r2 addi r2, 4, r2 /* Move PC, being pre-execution event */ st.q SP, FRAME_S(FSPC), r2 @@ -1268,25 +1224,12 @@ ret_from_fork: ptabs r5, tr0 blink tr0, LINK -#ifdef CONFIG_POOR_MANS_STRACE - /* nothing useful in registers at this point */ - - movi evt_debug2, r5 - ori r5, 1, r5 - ptabs r5, tr0 - ld.q SP, FRAME_R(9), r2 - or SP, ZERO, r3 - blink tr0, LINK -#endif - ld.q SP, FRAME_S(FSPC), r2 addi r2, 4, r2 /* Move PC, being pre-execution event */ st.q SP, FRAME_S(FSPC), r2 pta ret_from_syscall, tr0 blink tr0, ZERO - - syscall_allowed: /* Use LINK to deflect the exit point, default is syscall_ret */ pta syscall_ret, tr0 diff --git a/arch/sh/lib64/.gitignore b/arch/sh/lib64/.gitignore deleted file mode 100644 index 3508c2cb23c..00000000000 --- a/arch/sh/lib64/.gitignore +++ /dev/null @@ -1 +0,0 @@ -syscalltab.h diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c index 2fb8eaf6de6..25f2481bd89 100644 --- a/arch/sh/lib64/dbg.c +++ b/arch/sh/lib64/dbg.c @@ -135,140 +135,6 @@ void print_itlb(void) (" =============================================================\n"); } -/* ======================================================================= */ - -#ifdef CONFIG_POOR_MANS_STRACE - -#include "syscalltab.h" - -struct ring_node { - int evt; - int ret_addr; - int event; - int tra; - int pid; - unsigned long sp; - unsigned long pc; -}; - -static struct ring_node event_ring[16]; -static int event_ptr = 0; - -struct stored_syscall_data { - int pid; - int syscall_number; -}; - -#define N_STORED_SYSCALLS 16 - -static struct stored_syscall_data stored_syscalls[N_STORED_SYSCALLS]; -static int syscall_next=0; -static int syscall_next_print=0; - -void evt_debug(int evt, int ret_addr, int event, int tra, struct pt_regs *regs) -{ - int syscallno = tra & 0xff; - unsigned long sp; - unsigned long stack_bottom; - int pid; - struct ring_node *rr; - - pid = current->pid; - stack_bottom = (unsigned long) task_stack_page(current); - asm volatile("ori r15, 0, %0" : "=r" (sp)); - rr = event_ring + event_ptr; - rr->evt = evt; - rr->ret_addr = ret_addr; - rr->event = event; - rr->tra = tra; - rr->pid = pid; - rr->sp = sp; - rr->pc = regs->pc; - - if (sp < stack_bottom + 3092) { - int i, j; - printk("evt_debug : stack underflow report\n"); - for (j=0, i = event_ptr; j<16; j++) { - rr = event_ring + i; - printk("evt=%08x event=%08x tra=%08x pid=%5d sp=%08lx pc=%08lx\n", - rr->evt, rr->event, rr->tra, rr->pid, rr->sp, rr->pc); - i--; - i &= 15; - } - panic("STACK UNDERFLOW\n"); - } - - event_ptr = (event_ptr + 1) & 15; - - if ((event == 2) && (evt == 0x160)) { - if (syscallno < NUM_SYSCALL_INFO_ENTRIES) { - /* Store the syscall information to print later. We - * can't print this now - currently we're running with - * SR.BL=1, so we can't take a tlbmiss (which could occur - * in the console drivers under printk). - * - * Just overwrite old entries on ring overflow - this - * is only for last-hope debugging. */ - stored_syscalls[syscall_next].pid = current->pid; - stored_syscalls[syscall_next].syscall_number = syscallno; - syscall_next++; - syscall_next &= (N_STORED_SYSCALLS - 1); - } - } -} - -static void drain_syscalls(void) { - while (syscall_next_print != syscall_next) { - printk("Task %d: %s()\n", - stored_syscalls[syscall_next_print].pid, - syscall_info_table[stored_syscalls[syscall_next_print].syscall_number].name); - syscall_next_print++; - syscall_next_print &= (N_STORED_SYSCALLS - 1); - } -} - -void evt_debug2(unsigned int ret) -{ - drain_syscalls(); - printk("Task %d: syscall returns %08x\n", current->pid, ret); -} - -void evt_debug_ret_from_irq(struct pt_regs *regs) -{ - int pid; - struct ring_node *rr; - - pid = current->pid; - rr = event_ring + event_ptr; - rr->evt = 0xffff; - rr->ret_addr = 0; - rr->event = 0; - rr->tra = 0; - rr->pid = pid; - rr->pc = regs->pc; - event_ptr = (event_ptr + 1) & 15; -} - -void evt_debug_ret_from_exc(struct pt_regs *regs) -{ - int pid; - struct ring_node *rr; - - pid = current->pid; - rr = event_ring + event_ptr; - rr->evt = 0xfffe; - rr->ret_addr = 0; - rr->event = 0; - rr->tra = 0; - rr->pid = pid; - rr->pc = regs->pc; - event_ptr = (event_ptr + 1) & 15; -} - -#endif /* CONFIG_POOR_MANS_STRACE */ - -/* ======================================================================= */ - void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs) { From ef9f89996e450e9686816b6fc1e34d7b9a65766c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 17:36:57 +0900 Subject: [PATCH 0843/2061] sh: Kill off unused sh64 debug code. None of the print_page() code and associated helpers are presently used by anything in-tree, so just kill it off. Signed-off-by: Paul Mundt --- arch/sh/lib64/dbg.c | 48 ------------------------------------------ arch/sh/lib64/udelay.c | 2 +- 2 files changed, 1 insertion(+), 49 deletions(-) diff --git a/arch/sh/lib64/dbg.c b/arch/sh/lib64/dbg.c index 25f2481bd89..6152a6a6d9c 100644 --- a/arch/sh/lib64/dbg.c +++ b/arch/sh/lib64/dbg.c @@ -246,51 +246,3 @@ void show_excp_regs(char *from, int trapnr, int signr, struct pt_regs *regs) print_dtlb(); print_itlb(); } - -/* ======================================================================= */ - -/* -** Depending on scan the MMU, Data or Instruction side -** looking for a valid mapping matching Eaddr & asid. -** Return -1 if not found or the TLB id entry otherwise. -** Note: it works only for 4k pages! -*/ -static unsigned long -lookup_mmu_side(unsigned long base, unsigned long Eaddr, unsigned long asid) -{ - regType_t pteH; - unsigned long epn; - int count; - - epn = Eaddr & 0xfffff000; - - for (count = 0; count < MAX_TLBs; count++, base += TLB_STEP) { - pteH = getConfigReg(base); - if (GET_VALID(pteH)) - if ((unsigned long) GET_EPN(pteH) == epn) - if ((unsigned long) GET_ASID(pteH) == asid) - break; - } - return ((unsigned long) ((count < MAX_TLBs) ? base : -1)); -} - -unsigned long lookup_dtlb(unsigned long Eaddr) -{ - unsigned long asid = get_asid(); - return (lookup_mmu_side((u64) DTLB_BASE, Eaddr, asid)); -} - -unsigned long lookup_itlb(unsigned long Eaddr) -{ - unsigned long asid = get_asid(); - return (lookup_mmu_side((u64) ITLB_BASE, Eaddr, asid)); -} - -void print_page(struct page *page) -{ - printk(" page[%p] -> index 0x%lx, count 0x%x, flags 0x%lx\n", - page, page->index, page_count(page), page->flags); - printk(" address_space = %p, pages =%ld\n", page->mapping, - page->mapping->nrpages); - -} diff --git a/arch/sh/lib64/udelay.c b/arch/sh/lib64/udelay.c index d76bd801194..f215b063da7 100644 --- a/arch/sh/lib64/udelay.c +++ b/arch/sh/lib64/udelay.c @@ -33,7 +33,7 @@ void __delay(unsigned long loops) :"0"(loops)); } -inline void __const_udelay(unsigned long xloops) +void __const_udelay(unsigned long xloops) { __delay(xloops * (HZ * cpu_data[raw_smp_processor_id()].loops_per_jiffy)); } From 7d170b1bc540a1d83098a9f27cf4939e026fda81 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 17:41:59 +0900 Subject: [PATCH 0844/2061] sh: Move out cayman-specific panic handler code to its own file. This moves out the cayman-specific panic handler code to a better location, and leaves the generic implementation a simple stub that is still used under emulation. Signed-off-by: Paul Mundt --- arch/sh/boards/mach-cayman/Makefile | 2 +- arch/sh/boards/mach-cayman/panic.c | 49 +++++++++++++++++++++++++++++ arch/sh/lib64/panic.c | 43 ------------------------- 3 files changed, 50 insertions(+), 44 deletions(-) create mode 100644 arch/sh/boards/mach-cayman/panic.c diff --git a/arch/sh/boards/mach-cayman/Makefile b/arch/sh/boards/mach-cayman/Makefile index cafe1ac3b29..00fa3eaecb1 100644 --- a/arch/sh/boards/mach-cayman/Makefile +++ b/arch/sh/boards/mach-cayman/Makefile @@ -1,4 +1,4 @@ # # Makefile for the Hitachi Cayman specific parts of the kernel # -obj-y := setup.o irq.o +obj-y := setup.o irq.o panic.o diff --git a/arch/sh/boards/mach-cayman/panic.c b/arch/sh/boards/mach-cayman/panic.c new file mode 100644 index 00000000000..d1e67306d07 --- /dev/null +++ b/arch/sh/boards/mach-cayman/panic.c @@ -0,0 +1,49 @@ +/* + * Copyright (C) 2003 Richard Curnow, SuperH UK Limited + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include + +/* THIS IS A PHYSICAL ADDRESS */ +#define HDSP2534_ADDR (0x04002100) + +static void poor_mans_delay(void) +{ + int i; + + for (i = 0; i < 2500000; i++) + cpu_relax(); +} + +static void show_value(unsigned long x) +{ + int i; + unsigned nibble; + for (i = 0; i < 8; i++) { + nibble = ((x >> (i * 4)) & 0xf); + + __raw_writeb(nibble + ((nibble > 9) ? 55 : 48), + HDSP2534_ADDR + 0xe0 + ((7 - i) << 2)); + } +} + +void +panic_handler(unsigned long panicPC, unsigned long panicSSR, + unsigned long panicEXPEVT) +{ + while (1) { + /* This piece of code displays the PC on the LED display */ + show_value(panicPC); + poor_mans_delay(); + show_value(panicSSR); + poor_mans_delay(); + show_value(panicEXPEVT); + poor_mans_delay(); + } +} diff --git a/arch/sh/lib64/panic.c b/arch/sh/lib64/panic.c index da32ba7b5fc..38c954e04f6 100644 --- a/arch/sh/lib64/panic.c +++ b/arch/sh/lib64/panic.c @@ -6,53 +6,10 @@ * for more details. */ -#include -#include -#include - -/* THIS IS A PHYSICAL ADDRESS */ -#define HDSP2534_ADDR (0x04002100) - -#ifdef CONFIG_SH_CAYMAN - -static void poor_mans_delay(void) -{ - int i; - for (i = 0; i < 2500000; i++) { - } /* poor man's delay */ -} - -static void show_value(unsigned long x) -{ - int i; - unsigned nibble; - for (i = 0; i < 8; i++) { - nibble = ((x >> (i * 4)) & 0xf); - - ctrl_outb(nibble + ((nibble > 9) ? 55 : 48), - HDSP2534_ADDR + 0xe0 + ((7 - i) << 2)); - } -} - -#endif - void panic_handler(unsigned long panicPC, unsigned long panicSSR, unsigned long panicEXPEVT) { -#ifdef CONFIG_SH_CAYMAN - while (1) { - /* This piece of code displays the PC on the LED display */ - show_value(panicPC); - poor_mans_delay(); - show_value(panicSSR); - poor_mans_delay(); - show_value(panicEXPEVT); - poor_mans_delay(); - } -#endif - /* Never return from the panic handler */ for (;;) ; - } From 4f5ecaa05493dfddf155b40224b951592bfce325 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 8 May 2009 08:23:29 +0000 Subject: [PATCH 0845/2061] sh: clock framework update, fix count and kill off kref This patch updates the clock framework use count code. With this patch the enable() and disable() callbacks only get called when counting from and to zero. While at it the kref stuff gets replaced with an int. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 4 +-- arch/sh/kernel/cpu/clock.c | 69 ++++++++++++++++++++----------------- 2 files changed, 39 insertions(+), 34 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 2f6c9627bc1..b1f29199e4b 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -1,7 +1,6 @@ #ifndef __ASM_SH_CLOCK_H #define __ASM_SH_CLOCK_H -#include #include #include #include @@ -28,7 +27,7 @@ struct clk { struct clk *parent; struct clk_ops *ops; - struct kref kref; + int usecount; unsigned long rate; unsigned long flags; @@ -37,6 +36,7 @@ struct clk { #define CLK_ALWAYS_ENABLED (1 << 0) #define CLK_RATE_PROPAGATES (1 << 1) +#define CLK_NEEDS_INIT (1 << 2) /* Should be defined by processor-specific code */ void arch_init_clk_ops(struct clk_ops **, int type); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 099373ae57d..133dbe40334 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -19,7 +19,6 @@ #include #include #include -#include #include #include #include @@ -90,7 +89,7 @@ static void propagate_rate(struct clk *clk) } } -static int __clk_enable(struct clk *clk) +static void __clk_init(struct clk *clk) { /* * See if this is the first time we're enabling the clock, some @@ -100,19 +99,33 @@ static int __clk_enable(struct clk *clk) * divisors to use before it can effectively recalc. */ - if (clk->flags & CLK_ALWAYS_ENABLED) { - kref_get(&clk->kref); - return 0; - } - - if (unlikely(atomic_read(&clk->kref.refcount) == 1)) + if (clk->flags & CLK_NEEDS_INIT) { if (clk->ops && clk->ops->init) clk->ops->init(clk); - kref_get(&clk->kref); + clk->flags &= ~CLK_NEEDS_INIT; + } +} - if (likely(clk->ops && clk->ops->enable)) - clk->ops->enable(clk); +static int __clk_enable(struct clk *clk) +{ + if (!clk) + return -EINVAL; + + clk->usecount++; + + /* nothing to do if always enabled */ + if (clk->flags & CLK_ALWAYS_ENABLED) + return 0; + + if (clk->usecount == 1) { + __clk_init(clk); + + __clk_enable(clk->parent); + + if (clk->ops && clk->ops->enable) + clk->ops->enable(clk); + } return 0; } @@ -122,11 +135,6 @@ int clk_enable(struct clk *clk) unsigned long flags; int ret; - if (!clk) - return -EINVAL; - - clk_enable(clk->parent); - spin_lock_irqsave(&clock_lock, flags); ret = __clk_enable(clk); spin_unlock_irqrestore(&clock_lock, flags); @@ -135,21 +143,23 @@ int clk_enable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_enable); -static void clk_kref_release(struct kref *kref) -{ - /* Nothing to do */ -} - static void __clk_disable(struct clk *clk) { - int count = kref_put(&clk->kref, clk_kref_release); + if (!clk) + return; + + clk->usecount--; + + WARN_ON(clk->usecount < 0); if (clk->flags & CLK_ALWAYS_ENABLED) return; - if (!count) { /* count reaches zero, disable the clock */ + if (clk->usecount == 0) { if (likely(clk->ops && clk->ops->disable)) clk->ops->disable(clk); + + __clk_disable(clk->parent); } } @@ -157,14 +167,9 @@ void clk_disable(struct clk *clk) { unsigned long flags; - if (!clk) - return; - spin_lock_irqsave(&clock_lock, flags); __clk_disable(clk); spin_unlock_irqrestore(&clock_lock, flags); - - clk_disable(clk->parent); } EXPORT_SYMBOL_GPL(clk_disable); @@ -173,14 +178,14 @@ int clk_register(struct clk *clk) mutex_lock(&clock_list_sem); list_add(&clk->node, &clock_list); - kref_init(&clk->kref); + clk->usecount = 0; + clk->flags |= CLK_NEEDS_INIT; mutex_unlock(&clock_list_sem); if (clk->flags & CLK_ALWAYS_ENABLED) { + __clk_init(clk); pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name); - if (clk->ops && clk->ops->init) - clk->ops->init(clk); if (clk->ops && clk->ops->enable) clk->ops->enable(clk); pr_debug( "Enabled."); @@ -356,7 +361,7 @@ static int show_clocks(char *buf, char **start, off_t off, p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name, rate / 1000000, (rate % 1000000) / 10000, ((clk->flags & CLK_ALWAYS_ENABLED) || - (atomic_read(&clk->kref.refcount) != 1)) ? + clk->usecount > 0) ? "enabled" : "disabled"); } From 583d1d549ff107500481461cec0325189c5349ec Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 8 May 2009 08:27:19 +0000 Subject: [PATCH 0846/2061] sh: enable TMU clocksource on sh7722 This patch enables the TMU clocksource on sh7722. To prioritize TMU over CMT we also adjust the CMT clock source rating. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 624e8e5a605..b1c3e7f9284 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -183,7 +183,7 @@ static struct sh_timer_config cmt_platform_data = { .timer_bit = 5, .clk = "cmt0", .clockevent_rating = 125, - .clocksource_rating = 200, + .clocksource_rating = 125, }; static struct resource cmt_resources[] = { @@ -245,7 +245,7 @@ static struct sh_timer_config tmu1_platform_data = { .channel_offset = 0x10, .timer_bit = 1, .clk = "tmu0", - .clocksource_rating = 0, /* disabled for now */ + .clocksource_rating = 200, }; static struct resource tmu1_resources[] = { From 47dd6f4439811c6c83d9abf2f364bcb6d2684640 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 8 May 2009 08:32:18 +0000 Subject: [PATCH 0847/2061] sh: TMU platform data for sh7723 This patch adds TMU platform data for sh7723. Both clockevent and clocksource support is enabled. While at it, adjust the CMT clocksource rating to prioritize the TMU. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 196 ++++++++++++++++++++++++- 1 file changed, 195 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index fb9b5427a06..484ca2dabde 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -107,7 +107,7 @@ static struct sh_timer_config cmt_platform_data = { .timer_bit = 5, .clk = "cmt0", .clockevent_rating = 125, - .clocksource_rating = 200, + .clocksource_rating = 125, }; static struct resource cmt_resources[] = { @@ -133,6 +133,188 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu0", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu0", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu0", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu1", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffd90008, + .end = 0xffd90013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 57, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu1", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffd90014, + .end = 0xffd9001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 58, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu1", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffd90020, + .end = 0xffd9002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 57, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xffe00000, @@ -255,6 +437,12 @@ static struct platform_device iic_device = { static struct platform_device *sh7723_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, &sci_device, &rtc_device, &iic_device, @@ -282,6 +470,12 @@ __initcall(sh7723_devices_setup); static struct platform_device *sh7723_early_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, }; void __init plat_early_device_setup(void) From 6b2e8523df148c15ea5abf13075026fb8bdb3f86 Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Thu, 7 May 2009 11:56:49 -0700 Subject: [PATCH 0848/2061] xen: reserve Xen start_info rather than e820 reserving Use reserve_early rather than e820 reservations for Xen start info and mfn->pfn table, so that the memory use is a bit more self-documenting. [ Impact: cleanup ] Signed-off-by: Jeremy Fitzhardinge Cc: Xen-devel Cc: Linus Torvalds LKML-Reference: <4A032EF1.6070708@goop.org> Signed-off-by: Ingo Molnar --- arch/x86/xen/setup.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c index 15c6c68db6a..ad0047f47cd 100644 --- a/arch/x86/xen/setup.c +++ b/arch/x86/xen/setup.c @@ -61,9 +61,9 @@ char * __init xen_memory_setup(void) * - xen_start_info * See comment above "struct start_info" in */ - e820_add_region(__pa(xen_start_info->mfn_list), - xen_start_info->pt_base - xen_start_info->mfn_list, - E820_RESERVED); + reserve_early(__pa(xen_start_info->mfn_list), + __pa(xen_start_info->pt_base), + "XEN START INFO"); sanitize_e820_map(e820.map, ARRAY_SIZE(e820.map), &e820.nr_map); From b366328335b98373325977e116d2423f500708ac Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 18:01:03 +0900 Subject: [PATCH 0849/2061] sh: Drop dead rules from arch/sh/kernel/Makefile_64. Several of these options are specific to the SHcompact ISA and will need to be rewritten for SHmedia if they are to be supported at all. Drop the impossible rules for now. Signed-off-by: Paul Mundt --- arch/sh/kernel/Makefile_64 | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index 31f6ebf375e..c845a0d6197 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -5,13 +5,10 @@ obj-y := debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \ syscalls_64.o time.o topology.o traps.o traps_64.o obj-y += cpu/ timers/ -obj-$(CONFIG_VSYSCALL) += vsyscall/ obj-$(CONFIG_SMP) += smp.o -obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o obj-$(CONFIG_SH_CPU_FREQ) += cpufreq.o obj-$(CONFIG_MODULES) += sh_ksyms_64.o module.o obj-$(CONFIG_EARLY_PRINTK) += early_printk.o -obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-$(CONFIG_STACKTRACE) += stacktrace.o obj-$(CONFIG_IO_TRAPPED) += io_trapped.o From 1dcdb5a9e7c235e6e80f1f4d5b8247b3e5347e48 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:11 +0200 Subject: [PATCH 0850/2061] oprofile: re-add force_arch_perfmon option This re-adds the force_arch_perfmon option that was in the original arch perfmon patchkit. Originally this was rejected in favour of a generalized perfmon=name option, but it turned out implementing the later in a reliable way is hard (and it would have been easy to crash the kernel if a user gets it wrong) But now Atom and Core i7 support being readded a user would need to update their oprofile userland to beyond 0.9.4 to use oprofile again on Atom or Core i7. To avoid this problem readd the force_arch_perfmon option. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- Documentation/kernel-parameters.txt | 6 ++++++ arch/x86/oprofile/nmi_int.c | 6 ++++++ 2 files changed, 12 insertions(+) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 90b3924071b..9b9566bf330 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1650,6 +1650,12 @@ and is between 256 and 4096 characters. It is defined in the file oprofile.timer= [HW] Use timer interrupt instead of performance counters + oprofile.force_arch_perfmon=1 [X86] + Force use of architectural perfmon instead of + the CPU specific event set. + This might be useful if you have older oprofile + userland or if you want common events over Intel CPUs. + osst= [HW,SCSI] SCSI Tape Driver Format: , See also Documentation/scsi/st.txt. diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 202864ad49a..e5171c99e15 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -389,10 +389,16 @@ static int __init p4_init(char **cpu_type) return 0; } +int force_arch_perfmon; +module_param(force_arch_perfmon, int, 0); + static int __init ppro_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; + if (force_arch_perfmon && cpu_has_arch_perfmon) + return 0; + switch (cpu_model) { case 0 ... 2: *cpu_type = "i386/ppro"; From 1f3d7b60691993d8d368d8dd7d5d85871d41e8f5 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:12 +0200 Subject: [PATCH 0851/2061] oprofile: remove undocumented oprofile.p4force option There are no new P4s and the oprofile code knows about all existing ones, so we don't really need the p4force option anymore. Remove it. Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index e5171c99e15..f472c0c48a3 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -356,14 +356,11 @@ static void exit_sysfs(void) #define exit_sysfs() do { } while (0) #endif /* CONFIG_PM */ -static int p4force; -module_param(p4force, int, 0); - static int __init p4_init(char **cpu_type) { __u8 cpu_model = boot_cpu_data.x86_model; - if (!p4force && (cpu_model > 6 || cpu_model == 5)) + if (cpu_model > 6 || cpu_model == 5) return 0; #ifndef CONFIG_SMP From 6adf406f0a0eaf37251018d15f51e93f5b538ee6 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Mon, 27 Apr 2009 17:44:13 +0200 Subject: [PATCH 0852/2061] oprofile: add support for Core i7 and Atom The registers are about the same as other Family 6 CPUs so we only need to add detection. I'm not completely happy with calling Nehalem Core i7 because there will be undoubtedly other Nehalem based CPUs in the future with different marketing names, but it's the best we got for now. Requires updated oprofile userland for the new event files. If you don't want to update right now you can also use oprofile.force_arch_perfmon=1 (added in the next patch) with 0.9.4 Signed-off-by: Andi Kleen Signed-off-by: Robert Richter --- arch/x86/oprofile/nmi_int.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index f472c0c48a3..3308147182a 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -417,6 +417,13 @@ static int __init ppro_init(char **cpu_type) case 15: case 23: *cpu_type = "i386/core_2"; break; + case 26: + arch_perfmon_setup_counters(); + *cpu_type = "i386/core_i7"; + break; + case 28: + *cpu_type = "i386/atom"; + break; default: /* Unknown */ return 0; From 7e4e0bd50e80df2fe5501f48f872448376cdd997 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 6 May 2009 12:10:23 +0200 Subject: [PATCH 0853/2061] oprofile: introduce module_param oprofile.cpu_type This patch removes module_param oprofile.force_arch_perfmon and introduces oprofile.cpu_type=archperfmon instead. This new parameter can be reused for other models and architectures. Currently only archperfmon is supported. Cc: Andi Kleen Signed-off-by: Robert Richter --- Documentation/kernel-parameters.txt | 12 +++++++----- arch/x86/oprofile/nmi_int.c | 13 +++++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index 9b9566bf330..6ce5f48859c 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -1650,11 +1650,13 @@ and is between 256 and 4096 characters. It is defined in the file oprofile.timer= [HW] Use timer interrupt instead of performance counters - oprofile.force_arch_perfmon=1 [X86] - Force use of architectural perfmon instead of - the CPU specific event set. - This might be useful if you have older oprofile - userland or if you want common events over Intel CPUs. + oprofile.cpu_type= Force an oprofile cpu type + This might be useful if you have an older oprofile + userland or if you want common events. + Format: { archperfmon } + archperfmon: [X86] Force use of architectural + perfmon on Intel CPUs instead of the + CPU specific event set. osst= [HW,SCSI] SCSI Tape Driver Format: , diff --git a/arch/x86/oprofile/nmi_int.c b/arch/x86/oprofile/nmi_int.c index 3308147182a..3b285e656e2 100644 --- a/arch/x86/oprofile/nmi_int.c +++ b/arch/x86/oprofile/nmi_int.c @@ -386,8 +386,17 @@ static int __init p4_init(char **cpu_type) return 0; } -int force_arch_perfmon; -module_param(force_arch_perfmon, int, 0); +static int force_arch_perfmon; +static int force_cpu_type(const char *str, struct kernel_param *kp) +{ + if (!strcmp(str, "archperfmon")) { + force_arch_perfmon = 1; + printk(KERN_INFO "oprofile: forcing architectural perfmon\n"); + } + + return 0; +} +module_param_call(cpu_type, force_cpu_type, NULL, NULL, 0); static int __init ppro_init(char **cpu_type) { From 6eac1af01112d1919b1c432f6591e6368e8d538f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 19:48:47 +0900 Subject: [PATCH 0854/2061] sh: Always select RTC_LIB, not just for SUPERH32. The RTC_LIB helpers are used in arch/sh/kernel/time.c, which was previously only the case for the 32-bit variant. Now that this has become the common implementation, move the RTC_LIB select to reflect that. Fixes up the sh64 build. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index d88a61b11d3..6ed16c4548c 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -15,6 +15,7 @@ config SUPERH select HAVE_IOREMAP_PROT if MMU select HAVE_ARCH_TRACEHOOK select HAVE_DMA_API_DEBUG + select RTC_LIB help The SuperH is a RISC processor targeted for use in embedded systems and consumer electronics; it was also used in the Sega Dreamcast @@ -30,7 +31,6 @@ config SUPERH32 select HAVE_DYNAMIC_FTRACE select HAVE_ARCH_KGDB select ARCH_HIBERNATION_POSSIBLE if MMU - select RTC_LIB config SUPERH64 def_bool ARCH = "sh64" From 30d88cf52f229c3e0c90b8d71036960ccc2db4e2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 20:20:56 +0900 Subject: [PATCH 0855/2061] sh: Kill off extra cflags Kconfig entry. There is no real reason to use this anymore, as the build system generally knows what it is doing with regards to cflags mangling. Signed-off-by: Paul Mundt --- arch/sh/Kconfig.debug | 11 ----------- arch/sh/Makefile | 3 --- 2 files changed, 14 deletions(-) diff --git a/arch/sh/Kconfig.debug b/arch/sh/Kconfig.debug index 44627ef5254..8179cc9be9a 100644 --- a/arch/sh/Kconfig.debug +++ b/arch/sh/Kconfig.debug @@ -122,17 +122,6 @@ config SH_NO_BSS_INIT For all other cases, say N. If this option seems perplexing, or you aren't sure, say N. -config MORE_COMPILE_OPTIONS - bool "Add any additional compile options" - help - If you want to add additional CFLAGS to the kernel build, enable this - option and then enter what you would like to add in the next question. - Note however that -g is already appended with the selection of KGDB. - -config COMPILE_OPTIONS - string "Additional compile arguments" - depends on MORE_COMPILE_OPTIONS - config SH64_SR_WATCH bool "Debug: set SR.WATCH to enable hardware watchpoints and trace" depends on SUPERH64 diff --git a/arch/sh/Makefile b/arch/sh/Makefile index 493da979eff..68348e70028 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -70,9 +70,6 @@ cflags-$(CONFIG_CPU_LITTLE_ENDIAN) += -ml cflags-y += $(call cc-option,-mno-fdpic) cflags-y += $(isaflags-y) -ffreestanding -cflags-$(CONFIG_MORE_COMPILE_OPTIONS) += \ - $(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g') - OBJCOPYFLAGS := -O binary -R .note -R .note.gnu.build-id -R .comment \ -R .stab -R .stabstr -S From c29418c2ae15ee9171bc136ad261c497b95a46fc Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 20:32:56 +0900 Subject: [PATCH 0856/2061] sh: Always fixup unaligned userspace accesses on sh64. sh64 has traditionally had this configurable via a Kconfig option (CONFIG_SH64_USER_MISALIGNED_FIXUP). In practice it has never really been terribly useful to turn this off, so just get rid of the option entirely. We leave the sysctl around so we don't end up breaking existing root file systems, and to allow folks that really want this off to do so at their own risk. Signed-off-by: Paul Mundt --- arch/sh/Kconfig.cpu | 5 ----- arch/sh/kernel/traps_64.c | 35 +++++------------------------------ 2 files changed, 5 insertions(+), 35 deletions(-) diff --git a/arch/sh/Kconfig.cpu b/arch/sh/Kconfig.cpu index c7d704381a6..9eb1712372d 100644 --- a/arch/sh/Kconfig.cpu +++ b/arch/sh/Kconfig.cpu @@ -76,11 +76,6 @@ config SPECULATIVE_EXECUTION If unsure, say N. -config SH64_USER_MISALIGNED_FIXUP - def_bool y - prompt "Fixup misaligned loads/stores occurring in user mode" - depends on SUPERH64 - config SH64_ID2815_WORKAROUND bool "Include workaround for SH5-101 cut2 silicon defect ID2815" depends on CPU_SUBTYPE_SH5_101 diff --git a/arch/sh/kernel/traps_64.c b/arch/sh/kernel/traps_64.c index a85831cbf18..267e5ebbb47 100644 --- a/arch/sh/kernel/traps_64.c +++ b/arch/sh/kernel/traps_64.c @@ -370,7 +370,6 @@ static int generate_and_check_address(struct pt_regs *regs, return -1; } -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) /* Check accessible. For misaligned access in the kernel, assume the address is always accessible (and if not, just fault when the load/store gets done.) */ @@ -380,18 +379,13 @@ static int generate_and_check_address(struct pt_regs *regs, } /* Do access_ok check later - it depends on whether it's a load or a store. */ } -#endif *address = addr; return 0; } -/* Default value as for sh */ -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) static int user_mode_unaligned_fixup_count = 10; static int user_mode_unaligned_fixup_enable = 1; -#endif - static int kernel_mode_unaligned_fixup_count = 32; static void misaligned_kernel_word_load(__u64 address, int do_sign_extend, __u64 *result) @@ -440,7 +434,6 @@ static int misaligned_load(struct pt_regs *regs, } destreg = (opcode >> 4) & 0x3f; -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) if (user_mode(regs)) { __u64 buffer; @@ -470,9 +463,7 @@ static int misaligned_load(struct pt_regs *regs, width_shift, (unsigned long) regs->pc); break; } - } else -#endif - { + } else { /* kernel mode - we can take short cuts since if we fault, it's a genuine bug */ __u64 lo, hi; @@ -519,7 +510,6 @@ static int misaligned_store(struct pt_regs *regs, } srcreg = (opcode >> 4) & 0x3f; -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) if (user_mode(regs)) { __u64 buffer; @@ -546,9 +536,7 @@ static int misaligned_store(struct pt_regs *regs, if (__copy_user((void *)(int)address, &buffer, (1 << width_shift)) > 0) { return -1; /* fault */ } - } else -#endif - { + } else { /* kernel mode - we can take short cuts since if we fault, it's a genuine bug */ __u64 val = regs->regs[srcreg]; @@ -576,7 +564,6 @@ static int misaligned_store(struct pt_regs *regs, } -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) /* Never need to fix up misaligned FPU accesses within the kernel since that's a real error. */ static int misaligned_fpu_load(struct pt_regs *regs, @@ -727,7 +714,6 @@ static int misaligned_fpu_store(struct pt_regs *regs, return -1; } } -#endif static int misaligned_fixup(struct pt_regs *regs) { @@ -735,12 +721,8 @@ static int misaligned_fixup(struct pt_regs *regs) int error; int major, minor; -#if !defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) - /* Never fixup user mode misaligned accesses without this option enabled. */ - return -1; -#else - if (!user_mode_unaligned_fixup_enable) return -1; -#endif + if (!user_mode_unaligned_fixup_enable) + return -1; error = read_opcode(regs->pc, &opcode, user_mode(regs)); if (error < 0) { @@ -749,15 +731,12 @@ static int misaligned_fixup(struct pt_regs *regs) major = (opcode >> 26) & 0x3f; minor = (opcode >> 16) & 0xf; -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) if (user_mode(regs) && (user_mode_unaligned_fixup_count > 0)) { --user_mode_unaligned_fixup_count; /* Only do 'count' worth of these reports, to remove a potential DoS against syslog */ printk("Fixing up unaligned userspace access in \"%s\" pid=%d pc=0x%08x ins=0x%08lx\n", current->comm, task_pid_nr(current), (__u32)regs->pc, opcode); - } else -#endif - if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) { + } else if (!user_mode(regs) && (kernel_mode_unaligned_fixup_count > 0)) { --kernel_mode_unaligned_fixup_count; if (in_interrupt()) { printk("Fixing up unaligned kernelspace access in interrupt pc=0x%08x ins=0x%08lx\n", @@ -830,7 +809,6 @@ static int misaligned_fixup(struct pt_regs *regs) } break; -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) case (0x94>>2): /* FLD.S */ error = misaligned_fpu_load(regs, opcode, 1, 2, 0); break; @@ -881,7 +859,6 @@ static int misaligned_fixup(struct pt_regs *regs) break; } break; -#endif default: /* Fault */ @@ -907,7 +884,6 @@ static ctl_table unaligned_table[] = { .mode = 0644, .proc_handler = &proc_dointvec }, -#if defined(CONFIG_SH64_USER_MISALIGNED_FIXUP) { .ctl_name = CTL_UNNUMBERED, .procname = "user_reports", @@ -923,7 +899,6 @@ static ctl_table unaligned_table[] = { .maxlen = sizeof(int), .mode = 0644, .proc_handler = &proc_dointvec}, -#endif {} }; From 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:31:42 +0800 Subject: [PATCH 0857/2061] tracing/events: clean up for ftrace_set_clr_event() Add a helper function __ftrace_set_clr_event(), and replace some ftrace_set_clr_event() calls with this helper, thus we don't need any kstrdup() or kmalloc(). As a side effect, this patch fixes an issue in self tests code, which is similar to the one fixed in commit d6bf81ef0f7474434c2a049e8bf3c9146a14dd96 ("tracing: append ":*" to internal setting of system events") It's a small issue and won't cause any bug in fact, but we should do things right anyway. [ Impact: prevent spurious event-enabling in tracing self-tests ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A03998E.3020503@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 128 +++++++++++++----------------------- 1 file changed, 47 insertions(+), 81 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 8d0fae3af59..45f1099386b 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -111,35 +111,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } -static int ftrace_set_clr_event(char *buf, int set) +/* + * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. + */ +static int __ftrace_set_clr_event(const char *match, const char *sub, + const char *event, int set) { struct ftrace_event_call *call; - char *event = NULL, *sub = NULL, *match; - int ret = -EINVAL; - - /* - * The buf format can be : - * *: means any event by that name. - * : is the same. - * - * :* means all events in that subsystem - * : means the same. - * - * (no ':') means all events in a subsystem with - * the name or any event that matches - */ - - match = strsep(&buf, ":"); - if (buf) { - sub = match; - event = buf; - match = NULL; - - if (!strlen(sub) || strcmp(sub, "*") == 0) - sub = NULL; - if (!strlen(event) || strcmp(event, "*") == 0) - event = NULL; - } + int ret; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { @@ -167,6 +146,37 @@ static int ftrace_set_clr_event(char *buf, int set) return ret; } +static int ftrace_set_clr_event(char *buf, int set) +{ + char *event = NULL, *sub = NULL, *match; + + /* + * The buf format can be : + * *: means any event by that name. + * : is the same. + * + * :* means all events in that subsystem + * : means the same. + * + * (no ':') means all events in a subsystem with + * the name or any event that matches + */ + + match = strsep(&buf, ":"); + if (buf) { + sub = match; + event = buf; + match = NULL; + + if (!strlen(sub) || strcmp(sub, "*") == 0) + sub = NULL; + if (!strlen(event) || strcmp(event, "*") == 0) + event = NULL; + } + + return __ftrace_set_clr_event(match, sub, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 @@ -408,18 +418,14 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, struct ftrace_event_call *call; char buf[2]; int set = -1; - int all = 0; int ret; - if (system[0] == '*') - all = 1; - mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { if (!call->name || !call->regfunc) continue; - if (!all && strcmp(call->system, system) != 0) + if (system && strcmp(call->system, system) != 0) continue; /* @@ -480,7 +486,6 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, { const char *system = filp->private_data; unsigned long val; - char *command; char buf[64]; ssize_t ret; @@ -500,30 +505,16 @@ system_enable_write(struct file *filp, const char __user *ubuf, size_t cnt, if (ret < 0) return ret; - switch (val) { - case 0: - case 1: - break; - - default: + if (val != 0 && val != 1) return -EINVAL; - } - /* +3 for the ":*\0" */ - command = kmalloc(strlen(system)+3, GFP_KERNEL); - if (!command) - return -ENOMEM; - sprintf(command, "%s:*", system); - - ret = ftrace_set_clr_event(command, val); + ret = __ftrace_set_clr_event(NULL, system, NULL, val); if (ret) - goto out_free; + goto out; ret = cnt; - out_free: - kfree(command); - +out: *ppos += cnt; return ret; @@ -1181,7 +1172,7 @@ static __init int event_trace_init(void) &ftrace_show_header_fops); trace_create_file("enable", 0644, d_events, - "*", &ftrace_system_enable_fops); + NULL, &ftrace_system_enable_fops); for_each_event(call, __start_ftrace_events, __stop_ftrace_events) { /* The linker may leave blanks */ @@ -1259,7 +1250,6 @@ static __init void event_trace_self_tests(void) { struct ftrace_event_call *call; struct event_subsystem *system; - char *sysname; int ret; pr_info("Running tests on trace events:\n"); @@ -1305,14 +1295,7 @@ static __init void event_trace_self_tests(void) pr_info("Testing event system %s: ", system->name); - /* ftrace_set_clr_event can modify the name passed in. */ - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 1); - kfree(sysname); + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 1); if (WARN_ON_ONCE(ret)) { pr_warning("error enabling system %s\n", system->name); @@ -1321,14 +1304,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); - sysname = kstrdup(system->name, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, system->name, NULL, 0); if (WARN_ON_ONCE(ret)) pr_warning("error disabling system %s\n", system->name); @@ -1341,15 +1317,8 @@ static __init void event_trace_self_tests(void) pr_info("Running tests on all trace events:\n"); pr_info("Testing all events: "); - sysname = kmalloc(4, GFP_KERNEL); - if (WARN_ON(!sysname)) { - pr_warning("Can't allocate memory, giving up!\n"); - return; - } - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 1); + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 1); if (WARN_ON_ONCE(ret)) { - kfree(sysname); pr_warning("error enabling all events\n"); return; } @@ -1357,10 +1326,7 @@ static __init void event_trace_self_tests(void) event_test_stuff(); /* reset sysname */ - memcpy(sysname, "*:*", 4); - ret = ftrace_set_clr_event(sysname, 0); - kfree(sysname); - + ret = __ftrace_set_clr_event(NULL, NULL, NULL, 0); if (WARN_ON_ONCE(ret)) { pr_warning("error disabling all events\n"); return; From c142b15dc56ee6d55cb97a062e3c8e9c61e384c0 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 8 May 2009 10:32:05 +0800 Subject: [PATCH 0858/2061] tracing/events: simplify system_enable_read() A smarter way to figure out the output of an enable file. [ Impact: clean up ] Signed-off-by: Li Zefan Acked-by: Steven Rostedt Acked-by: Frederic Weisbecker LKML-Reference: <4A0399A5.2080603@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_events.c | 40 ++++++------------------------------- 1 file changed, 6 insertions(+), 34 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 45f1099386b..df394bc6d54 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -414,10 +414,11 @@ static ssize_t system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, loff_t *ppos) { + const char set_to_char[4] = { '?', '0', '1', 'X' }; const char *system = filp->private_data; struct ftrace_event_call *call; char buf[2]; - int set = -1; + int set = 0; int ret; mutex_lock(&event_mutex); @@ -433,47 +434,18 @@ system_enable_read(struct file *filp, char __user *ubuf, size_t cnt, * or if all events or cleared, or if we have * a mixture. */ - if (call->enabled) { - switch (set) { - case -1: - set = 1; - break; - case 0: - set = 2; - break; - } - } else { - switch (set) { - case -1: - set = 0; - break; - case 1: - set = 2; - break; - } - } + set |= (1 << !!call->enabled); + /* * If we have a mixture, no need to look further. */ - if (set == 2) + if (set == 3) break; } mutex_unlock(&event_mutex); + buf[0] = set_to_char[set]; buf[1] = '\n'; - switch (set) { - case 0: - buf[0] = '0'; - break; - case 1: - buf[0] = '1'; - break; - case 2: - buf[0] = 'X'; - break; - default: - buf[0] = '?'; - } ret = simple_read_from_buffer(ubuf, cnt, ppos, buf, 2); From c3d480ded1584dc17f6e82f49af4155380a51dda Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 21:57:04 +0900 Subject: [PATCH 0859/2061] sh: TMU platform data for SH7786. Wires up all 12 TMU channels, with TMU0 and 1 used as clockevent and clocksource respectively. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 393 +++++++++++++++++++++++++ 1 file changed, 393 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index 90e8cfff55f..2c464bf5a89 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -3,6 +3,7 @@ * * Copyright (C) 2009 Renesas Solutions Corp. * Kuninori Morimoto + * Paul Mundt * * Based on SH7785 Setup * @@ -19,6 +20,7 @@ #include #include #include +#include #include static struct plat_sci_port sci_platform_data[] = { @@ -69,6 +71,368 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffda0008, + .end = 0xffda0013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 20, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffda0014, + .end = 0xffda001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 21, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffda0020, + .end = 0xffda002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 22, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + +static struct sh_timer_config tmu6_platform_data = { + .name = "TMU6", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu6_resources[] = { + [0] = { + .name = "TMU6", + .start = 0xffdc0008, + .end = 0xffdc0013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 45, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu6_device = { + .name = "sh_tmu", + .id = 6, + .dev = { + .platform_data = &tmu6_platform_data, + }, + .resource = tmu6_resources, + .num_resources = ARRAY_SIZE(tmu6_resources), +}; + +static struct sh_timer_config tmu7_platform_data = { + .name = "TMU7", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu7_resources[] = { + [0] = { + .name = "TMU7", + .start = 0xffdc0014, + .end = 0xffdc001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 45, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu7_device = { + .name = "sh_tmu", + .id = 7, + .dev = { + .platform_data = &tmu7_platform_data, + }, + .resource = tmu7_resources, + .num_resources = ARRAY_SIZE(tmu7_resources), +}; + +static struct sh_timer_config tmu8_platform_data = { + .name = "TMU8", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu8_resources[] = { + [0] = { + .name = "TMU8", + .start = 0xffdc0020, + .end = 0xffdc002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 45, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu8_device = { + .name = "sh_tmu", + .id = 8, + .dev = { + .platform_data = &tmu8_platform_data, + }, + .resource = tmu8_resources, + .num_resources = ARRAY_SIZE(tmu8_resources), +}; + +static struct sh_timer_config tmu9_platform_data = { + .name = "TMU9", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu9_resources[] = { + [0] = { + .name = "TMU9", + .start = 0xffde0008, + .end = 0xffde0013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 46, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu9_device = { + .name = "sh_tmu", + .id = 9, + .dev = { + .platform_data = &tmu9_platform_data, + }, + .resource = tmu9_resources, + .num_resources = ARRAY_SIZE(tmu9_resources), +}; + +static struct sh_timer_config tmu10_platform_data = { + .name = "TMU10", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu10_resources[] = { + [0] = { + .name = "TMU10", + .start = 0xffde0014, + .end = 0xffde001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 46, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu10_device = { + .name = "sh_tmu", + .id = 10, + .dev = { + .platform_data = &tmu10_platform_data, + }, + .resource = tmu10_resources, + .num_resources = ARRAY_SIZE(tmu10_resources), +}; + +static struct sh_timer_config tmu11_platform_data = { + .name = "TMU11", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu11_resources[] = { + [0] = { + .name = "TMU11", + .start = 0xffde0020, + .end = 0xffde002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 46, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu11_device = { + .name = "sh_tmu", + .id = 11, + .dev = { + .platform_data = &tmu11_platform_data, + }, + .resource = tmu11_resources, + .num_resources = ARRAY_SIZE(tmu11_resources), +}; + static struct resource usb_ohci_resources[] = { [0] = { .start = 0xffe70400, @@ -94,6 +458,21 @@ static struct platform_device usb_ohci_device = { .resource = usb_ohci_resources, }; +static struct platform_device *sh7786_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, + &tmu6_device, + &tmu7_device, + &tmu8_device, + &tmu9_device, + &tmu10_device, + &tmu11_device, +}; + static struct platform_device *sh7786_devices[] __initdata = { &sci_device, &usb_ohci_device, @@ -156,12 +535,26 @@ static void __init sh7786_usb_setup(void) static int __init sh7786_devices_setup(void) { + int ret; + sh7786_usb_setup(); + + ret = platform_add_devices(sh7786_early_devices, + ARRAY_SIZE(sh7786_early_devices)); + if (unlikely(ret != 0)) + return ret; + return platform_add_devices(sh7786_devices, ARRAY_SIZE(sh7786_devices)); } device_initcall(sh7786_devices_setup); +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7786_early_devices, + ARRAY_SIZE(sh7786_early_devices)); +} + enum { UNUSED = 0, From ccdaeb4c8f982900a2abf722e34b60131394d8eb Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 22:09:30 +0900 Subject: [PATCH 0860/2061] sh: TMU platform data for SH-X3 proto CPU. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-shx3.c | 209 ++++++++++++++++++++++++++- 1 file changed, 207 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c index bd35f32534b..9d5185b42f1 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c @@ -1,7 +1,7 @@ /* - * SH-X3 Setup + * SH-X3 Prototype Setup * - * Copyright (C) 2007 Paul Mundt + * Copyright (C) 2007 - 2009 Paul Mundt * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -12,6 +12,7 @@ #include #include #include +#include #include static struct plat_sci_port sci_platform_data[] = { @@ -48,17 +49,221 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffc10008, + .end = 0xffc10013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffc10014, + .end = 0xffc1001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffc10020, + .end = 0xffc1002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffc20008, + .end = 0xffc20013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 19, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffc20014, + .end = 0xffc2001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 20, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffc20020, + .end = 0xffc2002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 21, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + +static struct platform_device *shx3_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, +}; + static struct platform_device *shx3_devices[] __initdata = { &sci_device, }; static int __init shx3_devices_setup(void) { + int ret; + + ret = platform_add_devices(shx3_early_devices, + ARRAY_SIZE(shx3_early_devices)); + if (unlikely(ret != 0)) + return ret; + return platform_add_devices(shx3_devices, ARRAY_SIZE(shx3_devices)); } __initcall(shx3_devices_setup); +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(shx3_early_devices, + ARRAY_SIZE(shx3_early_devices)); +} + enum { UNUSED = 0, From 7b551f9daa9bd9533ba4ce31622ed4be1dd97d3e Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 22:14:01 +0900 Subject: [PATCH 0861/2061] sh: Kill off the GENERIC_CALIBRATE_DELAY ifndef. Now that everyone is using the clock framework directly and we unconditionally provide our own calibrate_delay() function, having it wrapped in an ifndef is no longer useful. So, kill it off. Signed-off-by: Paul Mundt --- arch/sh/kernel/setup.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/arch/sh/kernel/setup.c b/arch/sh/kernel/setup.c index 38515a79592..00086b23427 100644 --- a/arch/sh/kernel/setup.c +++ b/arch/sh/kernel/setup.c @@ -156,7 +156,7 @@ static void __init reserve_crashkernel(void) &crash_size, &crash_base); if (ret == 0 && crash_size) { if (crash_base <= 0) { - vp = alloc_bootmem_nopanic(crash_size); + vp = alloc_bootmem_nopanic(crash_size); if (!vp) { printk(KERN_INFO "crashkernel allocation " "failed\n"); @@ -185,7 +185,6 @@ static inline void __init reserve_crashkernel(void) {} #endif -#ifndef CONFIG_GENERIC_CALIBRATE_DELAY void __cpuinit calibrate_delay(void) { struct clk *clk = clk_get(NULL, "cpu_clk"); @@ -201,7 +200,6 @@ void __cpuinit calibrate_delay(void) (loops_per_jiffy/(5000/HZ)) % 100, loops_per_jiffy); } -#endif void __init __add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn) From e552de2413edad1a7b0c7f82a2f2753e4f905d93 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:13:42 +0000 Subject: [PATCH 0862/2061] sh-sci: add platform device private data This patch adds per-platform private data to the sh-sci driver. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 127 +++++++++++++++++++++++++++------------- 1 file changed, 85 insertions(+), 42 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 728d6a062bf..af5da110d52 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -47,6 +47,7 @@ #include #include #include +#include #ifdef CONFIG_SUPERH #include @@ -77,6 +78,16 @@ struct sci_port { #ifdef CONFIG_HAVE_CLK /* Port clock */ struct clk *clk; +#endif + struct list_head node; +}; + +struct sh_sci_priv { + spinlock_t lock; + struct list_head ports; + +#ifdef CONFIG_HAVE_CLK + struct notifier_block clk_nb; #endif }; @@ -726,19 +737,22 @@ static irqreturn_t sci_mpxed_interrupt(int irq, void *ptr) static int sci_notifier(struct notifier_block *self, unsigned long phase, void *p) { - int i; + struct sh_sci_priv *priv = container_of(self, + struct sh_sci_priv, clk_nb); + struct sci_port *sci_port; + unsigned long flags; if ((phase == CPUFREQ_POSTCHANGE) || - (phase == CPUFREQ_RESUMECHANGE)) - for (i = 0; i < SCI_NPORTS; i++) { - struct sci_port *s = &sci_ports[i]; - s->port.uartclk = clk_get_rate(s->clk); - } + (phase == CPUFREQ_RESUMECHANGE)) { + spin_lock_irqsave(&priv->lock, flags); + list_for_each_entry(sci_port, &priv->ports, node) + sci_port->port.uartclk = clk_get_rate(sci_port->clk); + + spin_unlock_irqrestore(&priv->lock, flags); + } return NOTIFY_OK; } - -static struct notifier_block sci_nb = { &sci_notifier, NULL, 0 }; #endif static int sci_request_irq(struct sci_port *port) @@ -1201,6 +1215,27 @@ static struct uart_driver sci_uart_driver = { .cons = SCI_CONSOLE, }; + +static int __devexit sci_remove(struct platform_device *dev) +{ + struct sh_sci_priv *priv = platform_get_drvdata(dev); + struct sci_port *p; + unsigned long flags; + +#ifdef CONFIG_HAVE_CLK + cpufreq_unregister_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER); +#endif + + spin_lock_irqsave(&priv->lock, flags); + list_for_each_entry(p, &priv->ports, node) + uart_remove_one_port(&sci_uart_driver, &p->port); + + spin_unlock_irqrestore(&priv->lock, flags); + + kfree(priv); + return 0; +} + /* * Register a set of serial devices attached to a platform device. The * list is terminated with a zero flags entry, which means we expect @@ -1210,7 +1245,22 @@ static struct uart_driver sci_uart_driver = { static int __devinit sci_probe(struct platform_device *dev) { struct plat_sci_port *p = dev->dev.platform_data; + struct sh_sci_priv *priv; int i, ret = -EINVAL; + unsigned long flags; + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return -ENOMEM; + + INIT_LIST_HEAD(&priv->ports); + spin_lock_init(&priv->lock); + platform_set_drvdata(dev, priv); + +#ifdef CONFIG_HAVE_CLK + priv->clk_nb.notifier_call = sci_notifier; + cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER); +#endif for (i = 0; p && p->flags != 0; p++, i++) { struct sci_port *sciport = &sci_ports[i]; @@ -1254,12 +1304,19 @@ static int __devinit sci_probe(struct platform_device *dev) memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs)); - uart_add_one_port(&sci_uart_driver, &sciport->port); - } + ret = uart_add_one_port(&sci_uart_driver, &sciport->port); -#ifdef CONFIG_HAVE_CLK - cpufreq_register_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER); -#endif + if (ret && (p->flags & UPF_IOREMAP)) { + iounmap(p->membase); + goto err_unreg; + } + + INIT_LIST_HEAD(&sciport->node); + + spin_lock_irqsave(&priv->lock, flags); + list_add(&sciport->node, &priv->ports); + spin_unlock_irqrestore(&priv->lock, flags); + } #ifdef CONFIG_SH_STANDARD_BIOS sh_bios_gdb_detach(); @@ -1268,50 +1325,36 @@ static int __devinit sci_probe(struct platform_device *dev) return 0; err_unreg: - for (i = i - 1; i >= 0; i--) - uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port); - + sci_remove(dev); return ret; } -static int __devexit sci_remove(struct platform_device *dev) -{ - int i; - -#ifdef CONFIG_HAVE_CLK - cpufreq_unregister_notifier(&sci_nb, CPUFREQ_TRANSITION_NOTIFIER); -#endif - - for (i = 0; i < SCI_NPORTS; i++) - uart_remove_one_port(&sci_uart_driver, &sci_ports[i].port); - - return 0; -} - static int sci_suspend(struct platform_device *dev, pm_message_t state) { - int i; + struct sh_sci_priv *priv = platform_get_drvdata(dev); + struct sci_port *p; + unsigned long flags; - for (i = 0; i < SCI_NPORTS; i++) { - struct sci_port *p = &sci_ports[i]; + spin_lock_irqsave(&priv->lock, flags); + list_for_each_entry(p, &priv->ports, node) + uart_suspend_port(&sci_uart_driver, &p->port); - if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev) - uart_suspend_port(&sci_uart_driver, &p->port); - } + spin_unlock_irqrestore(&priv->lock, flags); return 0; } static int sci_resume(struct platform_device *dev) { - int i; + struct sh_sci_priv *priv = platform_get_drvdata(dev); + struct sci_port *p; + unsigned long flags; - for (i = 0; i < SCI_NPORTS; i++) { - struct sci_port *p = &sci_ports[i]; + spin_lock_irqsave(&priv->lock, flags); + list_for_each_entry(p, &priv->ports, node) + uart_resume_port(&sci_uart_driver, &p->port); - if (p->type != PORT_UNKNOWN && p->port.dev == &dev->dev) - uart_resume_port(&sci_uart_driver, &p->port); - } + spin_unlock_irqrestore(&priv->lock, flags); return 0; } From 9080b72819650c3a757d173a19bc930d603b79d6 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:13:58 +0000 Subject: [PATCH 0863/2061] sh-sci: remove early_sci_setup() Remove unused early_sci_setup() function from sh-sci. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 14 -------------- include/linux/serial_sci.h | 2 -- 2 files changed, 16 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index af5da110d52..5f37f7d3166 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1082,20 +1082,6 @@ static void __init sci_init_ports(void) } } -int __init early_sci_setup(struct uart_port *port) -{ - if (unlikely(port->line > SCI_NPORTS)) - return -ENODEV; - - sci_init_ports(); - - sci_ports[port->line].port.membase = port->membase; - sci_ports[port->line].port.mapbase = port->mapbase; - sci_ports[port->line].port.type = port->type; - - return 0; -} - #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE /* * Print a string to the serial port trying not to disturb diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 893cc53486b..717059d6791 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -27,6 +27,4 @@ struct plat_sci_port { upf_t flags; /* UPF_* flags */ }; -int early_sci_setup(struct uart_port *port); - #endif /* __LINUX_SERIAL_SCI_H */ From dc8e6f5bfcd6a307a8196d3e41fd9798be5a1c76 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:06 +0000 Subject: [PATCH 0864/2061] sh-sci: rework serial console support Rework sh-sci serial console code. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 47 ++++++++++++++++++++--------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 5f37f7d3166..d2cd1a400c1 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -91,10 +91,6 @@ struct sh_sci_priv { #endif }; -#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE -static struct sci_port *serial_console_port; -#endif - /* Function prototypes */ static void sci_stop_tx(struct uart_port *port); @@ -1083,6 +1079,18 @@ static void __init sci_init_ports(void) } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE +static struct tty_driver *serial_console_device(struct console *co, int *index) +{ + struct uart_driver *p = &sci_uart_driver; + *index = co->index; + return p->tty_driver; +} + +static void serial_console_putchar(struct uart_port *port, int ch) +{ + sci_poll_put_char(port, ch); +} + /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @@ -1090,16 +1098,10 @@ static void __init sci_init_ports(void) static void serial_console_write(struct console *co, const char *s, unsigned count) { - struct uart_port *port = &serial_console_port->port; + struct uart_port *port = co->data; unsigned short bits; - int i; - for (i = 0; i < count; i++) { - if (*s == 10) - sci_poll_put_char(port, '\r'); - - sci_poll_put_char(port, *s++); - } + uart_console_write(co->data, s, count, serial_console_putchar); /* wait until fifo is empty and last bit has been transmitted */ bits = SCxSR_TDxE(port) | SCxSR_TEND(port); @@ -1109,6 +1111,7 @@ static void serial_console_write(struct console *co, const char *s, static int __init serial_console_setup(struct console *co, char *options) { + struct sci_port *sci_port; struct uart_port *port; int baud = 115200; int bits = 8; @@ -1124,8 +1127,9 @@ static int __init serial_console_setup(struct console *co, char *options) if (co->index >= SCI_NPORTS) co->index = 0; - serial_console_port = &sci_ports[co->index]; - port = &serial_console_port->port; + sci_port = &sci_ports[co->index]; + port = &sci_port->port; + co->data = port; /* * Also need to check port->type, we don't actually have any @@ -1135,21 +1139,17 @@ static int __init serial_console_setup(struct console *co, char *options) */ if (!port->type) return -ENODEV; - if (!port->membase || !port->mapbase) - return -ENODEV; - - port->type = serial_console_port->type; #ifdef CONFIG_HAVE_CLK - if (!serial_console_port->clk) - serial_console_port->clk = clk_get(NULL, "module_clk"); + if (!sci_port->clk) + sci_port->clk = clk_get(NULL, "module_clk"); #endif if (port->flags & UPF_IOREMAP) sci_config_port(port, 0); - if (serial_console_port->enable) - serial_console_port->enable(port); + if (sci_port->enable) + sci_port->enable(port); if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); @@ -1165,12 +1165,11 @@ static int __init serial_console_setup(struct console *co, char *options) static struct console serial_console = { .name = "ttySC", - .device = uart_console_device, + .device = serial_console_device, .write = serial_console_write, .setup = serial_console_setup, .flags = CON_PRINTBUFFER, .index = -1, - .data = &sci_uart_driver, }; static int __init sci_console_init(void) From a5660adae85918f2ab6b10ab58e2f574c1bd5ce1 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:38 +0000 Subject: [PATCH 0865/2061] sh-sci: use to_sci_port() if possible Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index d2cd1a400c1..17fa7f17bbe 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -618,7 +618,7 @@ static inline int sci_handle_breaks(struct uart_port *port) int copied = 0; unsigned short status = sci_in(port, SCxSR); struct tty_struct *tty = port->info->port.tty; - struct sci_port *s = &sci_ports[port->line]; + struct sci_port *s = to_sci_port(port); if (uart_handle_break(port)) return 0; @@ -875,7 +875,7 @@ static void sci_break_ctl(struct uart_port *port, int break_state) static int sci_startup(struct uart_port *port) { - struct sci_port *s = &sci_ports[port->line]; + struct sci_port *s = to_sci_port(port); if (s->enable) s->enable(port); @@ -893,7 +893,7 @@ static int sci_startup(struct uart_port *port) static void sci_shutdown(struct uart_port *port) { - struct sci_port *s = &sci_ports[port->line]; + struct sci_port *s = to_sci_port(port); sci_stop_rx(port); sci_stop_tx(port); @@ -990,7 +990,7 @@ static int sci_request_port(struct uart_port *port) static void sci_config_port(struct uart_port *port, int flags) { - struct sci_port *s = &sci_ports[port->line]; + struct sci_port *s = to_sci_port(port); port->type = s->type; @@ -1002,7 +1002,7 @@ static void sci_config_port(struct uart_port *port, int flags) static int sci_verify_port(struct uart_port *port, struct serial_struct *ser) { - struct sci_port *s = &sci_ports[port->line]; + struct sci_port *s = to_sci_port(port); if (ser->irq != s->irqs[SCIx_TXI_IRQ] || ser->irq > nr_irqs) return -EINVAL; From 0ee70712922c15252183db8b50a7e369c96017c0 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:13:50 +0000 Subject: [PATCH 0866/2061] sh-sci: allow single port platform devices Allow registration of single port sh-sci platform devices. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 125 +++++++++++++++++++++++----------------- 1 file changed, 72 insertions(+), 53 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 17fa7f17bbe..e9b350c58ba 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1221,6 +1221,70 @@ static int __devexit sci_remove(struct platform_device *dev) return 0; } +static int __devinit sci_probe_single(struct platform_device *dev, + unsigned int index, + struct plat_sci_port *p, + struct sci_port *sciport) +{ + struct sh_sci_priv *priv = platform_get_drvdata(dev); + unsigned long flags; + int ret; + + /* Sanity check */ + if (unlikely(index >= SCI_NPORTS)) { + dev_notice(&dev->dev, "Attempting to register port " + "%d when only %d are available.\n", + index+1, SCI_NPORTS); + dev_notice(&dev->dev, "Consider bumping " + "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n"); + return 0; + } + + sciport->port.mapbase = p->mapbase; + + if (p->mapbase && !p->membase) { + if (p->flags & UPF_IOREMAP) { + p->membase = ioremap_nocache(p->mapbase, 0x40); + if (IS_ERR(p->membase)) + return PTR_ERR(p->membase); + } else { + /* + * For the simple (and majority of) cases + * where we don't need to do any remapping, + * just cast the cookie directly. + */ + p->membase = (void __iomem *)p->mapbase; + } + } + + sciport->port.membase = p->membase; + + sciport->port.irq = p->irqs[SCIx_TXI_IRQ]; + sciport->port.flags = p->flags; + sciport->port.dev = &dev->dev; + + sciport->type = sciport->port.type = p->type; + + memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs)); + + ret = uart_add_one_port(&sci_uart_driver, &sciport->port); + + if (ret) { + if (p->flags & UPF_IOREMAP) + iounmap(p->membase); + + return ret; + } + + INIT_LIST_HEAD(&sciport->node); + + spin_lock_irqsave(&priv->lock, flags); + list_add(&sciport->node, &priv->ports); + spin_unlock_irqrestore(&priv->lock, flags); + + return 0; +} + /* * Register a set of serial devices attached to a platform device. The * list is terminated with a zero flags entry, which means we expect @@ -1232,7 +1296,6 @@ static int __devinit sci_probe(struct platform_device *dev) struct plat_sci_port *p = dev->dev.platform_data; struct sh_sci_priv *priv; int i, ret = -EINVAL; - unsigned long flags; priv = kzalloc(sizeof(*priv), GFP_KERNEL); if (!priv) @@ -1247,60 +1310,16 @@ static int __devinit sci_probe(struct platform_device *dev) cpufreq_register_notifier(&priv->clk_nb, CPUFREQ_TRANSITION_NOTIFIER); #endif - for (i = 0; p && p->flags != 0; p++, i++) { - struct sci_port *sciport = &sci_ports[i]; - - /* Sanity check */ - if (unlikely(i == SCI_NPORTS)) { - dev_notice(&dev->dev, "Attempting to register port " - "%d when only %d are available.\n", - i+1, SCI_NPORTS); - dev_notice(&dev->dev, "Consider bumping " - "CONFIG_SERIAL_SH_SCI_NR_UARTS!\n"); - break; - } - - sciport->port.mapbase = p->mapbase; - - if (p->mapbase && !p->membase) { - if (p->flags & UPF_IOREMAP) { - p->membase = ioremap_nocache(p->mapbase, 0x40); - if (IS_ERR(p->membase)) { - ret = PTR_ERR(p->membase); - goto err_unreg; - } - } else { - /* - * For the simple (and majority of) cases - * where we don't need to do any remapping, - * just cast the cookie directly. - */ - p->membase = (void __iomem *)p->mapbase; - } - } - - sciport->port.membase = p->membase; - - sciport->port.irq = p->irqs[SCIx_TXI_IRQ]; - sciport->port.flags = p->flags; - sciport->port.dev = &dev->dev; - - sciport->type = sciport->port.type = p->type; - - memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs)); - - ret = uart_add_one_port(&sci_uart_driver, &sciport->port); - - if (ret && (p->flags & UPF_IOREMAP)) { - iounmap(p->membase); + if (dev->id != -1) { + ret = sci_probe_single(dev, dev->id, p, &sci_ports[dev->id]); + if (ret) goto err_unreg; + } else { + for (i = 0; p && p->flags != 0; p++, i++) { + ret = sci_probe_single(dev, i, p, &sci_ports[i]); + if (ret) + goto err_unreg; } - - INIT_LIST_HEAD(&sciport->node); - - spin_lock_irqsave(&priv->lock, flags); - list_add(&sciport->node, &priv->ports); - spin_unlock_irqrestore(&priv->lock, flags); } #ifdef CONFIG_SH_STANDARD_BIOS From 7ed7e0711b3ff85b3e15591081b42f2af96d584b Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:14 +0000 Subject: [PATCH 0867/2061] sh-sci: replace sci_init_ports() Replace sci_init_ports() with sci_init_single(). Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 107 +++++++++++++++++++--------------------- 1 file changed, 50 insertions(+), 57 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index e9b350c58ba..039c700ce1e 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1036,46 +1036,64 @@ static struct uart_ops sci_uart_ops = { #endif }; -static void __init sci_init_ports(void) +static int __devinit sci_init_single(struct sci_port *sci_port, + unsigned int index, + struct plat_sci_port *p) { - static int first = 1; - int i; - - if (!first) - return; - - first = 0; - - for (i = 0; i < SCI_NPORTS; i++) { - sci_ports[i].port.ops = &sci_uart_ops; - sci_ports[i].port.iotype = UPIO_MEM; - sci_ports[i].port.line = i; - sci_ports[i].port.fifosize = 1; + sci_port->port.ops = &sci_uart_ops; + sci_port->port.iotype = UPIO_MEM; + sci_port->port.line = index; + sci_port->port.fifosize = 1; #if defined(__H8300H__) || defined(__H8300S__) #ifdef __H8300S__ - sci_ports[i].enable = h8300_sci_enable; - sci_ports[i].disable = h8300_sci_disable; + sci_port->enable = h8300_sci_enable; + sci_port->disable = h8300_sci_disable; #endif - sci_ports[i].port.uartclk = CONFIG_CPU_CLOCK; + sci_port->port.uartclk = CONFIG_CPU_CLOCK; #elif defined(CONFIG_HAVE_CLK) - /* - * XXX: We should use a proper SCI/SCIF clock - */ - { - struct clk *clk = clk_get(NULL, "module_clk"); - sci_ports[i].port.uartclk = clk_get_rate(clk); - clk_put(clk); - } + /* + * XXX: We should use a proper SCI/SCIF clock + */ + { + struct clk *clk = clk_get(NULL, "module_clk"); + sci_port->port.uartclk = clk_get_rate(clk); + clk_put(clk); + } #else #error "Need a valid uartclk" #endif - sci_ports[i].break_timer.data = (unsigned long)&sci_ports[i]; - sci_ports[i].break_timer.function = sci_break_timer; + sci_port->break_timer.data = (unsigned long)sci_port; + sci_port->break_timer.function = sci_break_timer; + init_timer(&sci_port->break_timer); - init_timer(&sci_ports[i].break_timer); + sci_port->port.mapbase = p->mapbase; + + if (p->mapbase && !p->membase) { + if (p->flags & UPF_IOREMAP) { + p->membase = ioremap_nocache(p->mapbase, 0x40); + if (IS_ERR(p->membase)) + return PTR_ERR(p->membase); + } else { + /* + * For the simple (and majority of) cases + * where we don't need to do any remapping, + * just cast the cookie directly. + */ + p->membase = (void __iomem *)p->mapbase; + } } + + sci_port->port.membase = p->membase; + + sci_port->port.irq = p->irqs[SCIx_TXI_IRQ]; + sci_port->port.flags = p->flags; + sci_port->type = sci_port->port.type = p->type; + + memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs)); + + return 0; } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE @@ -1174,7 +1192,6 @@ static struct console serial_console = { static int __init sci_console_init(void) { - sci_init_ports(); register_console(&serial_console); return 0; } @@ -1240,32 +1257,10 @@ static int __devinit sci_probe_single(struct platform_device *dev, return 0; } - sciport->port.mapbase = p->mapbase; - - if (p->mapbase && !p->membase) { - if (p->flags & UPF_IOREMAP) { - p->membase = ioremap_nocache(p->mapbase, 0x40); - if (IS_ERR(p->membase)) - return PTR_ERR(p->membase); - } else { - /* - * For the simple (and majority of) cases - * where we don't need to do any remapping, - * just cast the cookie directly. - */ - p->membase = (void __iomem *)p->mapbase; - } - } - - sciport->port.membase = p->membase; - - sciport->port.irq = p->irqs[SCIx_TXI_IRQ]; - sciport->port.flags = p->flags; - sciport->port.dev = &dev->dev; - - sciport->type = sciport->port.type = p->type; - - memcpy(&sciport->irqs, &p->irqs, sizeof(p->irqs)); + sciport->port.dev = &dev->dev; + ret = sci_init_single(sciport, index, p); + if (ret) + return ret; ret = uart_add_one_port(&sci_uart_driver, &sciport->port); @@ -1380,8 +1375,6 @@ static int __init sci_init(void) printk(banner); - sci_init_ports(); - ret = uart_register_driver(&sci_uart_driver); if (likely(ret == 0)) { ret = platform_driver_register(&sci_driver); From 08f8cb315fdf9195b472aeb440ae65b189b151da Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:22 +0000 Subject: [PATCH 0868/2061] sh-sci: ioremap() in a single place Handle ioremap() in sci_config_port only. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 54 +++++++++++++++-------------------------- 1 file changed, 20 insertions(+), 34 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 039c700ce1e..408624ae7fe 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -994,9 +994,21 @@ static void sci_config_port(struct uart_port *port, int flags) port->type = s->type; - if (port->flags & UPF_IOREMAP && !port->membase) { + if (port->membase) + return; + + if (port->flags & UPF_IOREMAP) { port->membase = ioremap_nocache(port->mapbase, 0x40); - dev_err(port->dev, "can't remap port#%d\n", port->line); + + if (IS_ERR(port->membase)) + dev_err(port->dev, "can't remap port#%d\n", port->line); + } else { + /* + * For the simple (and majority of) cases where we don't + * need to do any remapping, just cast the cookie + * directly. + */ + port->membase = (void __iomem *)port->mapbase; } } @@ -1036,9 +1048,9 @@ static struct uart_ops sci_uart_ops = { #endif }; -static int __devinit sci_init_single(struct sci_port *sci_port, - unsigned int index, - struct plat_sci_port *p) +static void __devinit sci_init_single(struct sci_port *sci_port, + unsigned int index, + struct plat_sci_port *p) { sci_port->port.ops = &sci_uart_ops; sci_port->port.iotype = UPIO_MEM; @@ -1069,22 +1081,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port, init_timer(&sci_port->break_timer); sci_port->port.mapbase = p->mapbase; - - if (p->mapbase && !p->membase) { - if (p->flags & UPF_IOREMAP) { - p->membase = ioremap_nocache(p->mapbase, 0x40); - if (IS_ERR(p->membase)) - return PTR_ERR(p->membase); - } else { - /* - * For the simple (and majority of) cases - * where we don't need to do any remapping, - * just cast the cookie directly. - */ - p->membase = (void __iomem *)p->mapbase; - } - } - sci_port->port.membase = p->membase; sci_port->port.irq = p->irqs[SCIx_TXI_IRQ]; @@ -1092,8 +1088,6 @@ static int __devinit sci_init_single(struct sci_port *sci_port, sci_port->type = sci_port->port.type = p->type; memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs)); - - return 0; } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE @@ -1163,8 +1157,7 @@ static int __init serial_console_setup(struct console *co, char *options) sci_port->clk = clk_get(NULL, "module_clk"); #endif - if (port->flags & UPF_IOREMAP) - sci_config_port(port, 0); + sci_config_port(port, 0); if (sci_port->enable) sci_port->enable(port); @@ -1258,18 +1251,11 @@ static int __devinit sci_probe_single(struct platform_device *dev, } sciport->port.dev = &dev->dev; - ret = sci_init_single(sciport, index, p); - if (ret) - return ret; + sci_init_single(sciport, index, p); ret = uart_add_one_port(&sci_uart_driver, &sciport->port); - - if (ret) { - if (p->flags & UPF_IOREMAP) - iounmap(p->membase); - + if (ret) return ret; - } INIT_LIST_HEAD(&sciport->node); From 501b825d01efb93766c87d29f299851152cf4eb0 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 21 Jan 2009 15:14:30 +0000 Subject: [PATCH 0869/2061] sh-sci: improve clock framework support Use enable/disable hooks for clock framework integration. Make sure we control the clock for the serial console as well. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 77 +++++++++++++++++++++++--------------- include/linux/serial_sci.h | 1 + 2 files changed, 47 insertions(+), 31 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 408624ae7fe..3daf76725ac 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -76,8 +76,10 @@ struct sci_port { int break_flag; #ifdef CONFIG_HAVE_CLK - /* Port clock */ - struct clk *clk; + /* Interface clock */ + struct clk *iclk; + /* Data clock */ + struct clk *dclk; #endif struct list_head node; }; @@ -166,12 +168,12 @@ static void h8300_sci_config(struct uart_port *port, unsigned int ctrl) *mstpcrl &= ~mask; } -static inline void h8300_sci_enable(struct uart_port *port) +static void h8300_sci_enable(struct uart_port *port) { h8300_sci_config(port, sci_enable); } -static inline void h8300_sci_disable(struct uart_port *port) +static void h8300_sci_disable(struct uart_port *port) { h8300_sci_config(port, sci_disable); } @@ -742,13 +744,34 @@ static int sci_notifier(struct notifier_block *self, (phase == CPUFREQ_RESUMECHANGE)) { spin_lock_irqsave(&priv->lock, flags); list_for_each_entry(sci_port, &priv->ports, node) - sci_port->port.uartclk = clk_get_rate(sci_port->clk); + sci_port->port.uartclk = clk_get_rate(sci_port->dclk); spin_unlock_irqrestore(&priv->lock, flags); } return NOTIFY_OK; } + +static void sci_clk_enable(struct uart_port *port) +{ + struct sci_port *sci_port = to_sci_port(port); + + clk_enable(sci_port->dclk); + sci_port->port.uartclk = clk_get_rate(sci_port->dclk); + + if (sci_port->iclk) + clk_enable(sci_port->iclk); +} + +static void sci_clk_disable(struct uart_port *port) +{ + struct sci_port *sci_port = to_sci_port(port); + + if (sci_port->iclk) + clk_disable(sci_port->iclk); + + clk_disable(sci_port->dclk); +} #endif static int sci_request_irq(struct sci_port *port) @@ -880,10 +903,6 @@ static int sci_startup(struct uart_port *port) if (s->enable) s->enable(port); -#ifdef CONFIG_HAVE_CLK - s->clk = clk_get(NULL, "module_clk"); -#endif - sci_request_irq(s); sci_start_tx(port); sci_start_rx(port, 1); @@ -901,11 +920,6 @@ static void sci_shutdown(struct uart_port *port) if (s->disable) s->disable(port); - -#ifdef CONFIG_HAVE_CLK - clk_put(s->clk); - s->clk = NULL; -#endif } static void sci_set_termios(struct uart_port *port, struct ktermios *termios, @@ -1048,7 +1062,8 @@ static struct uart_ops sci_uart_ops = { #endif }; -static void __devinit sci_init_single(struct sci_port *sci_port, +static void __devinit sci_init_single(struct platform_device *dev, + struct sci_port *sci_port, unsigned int index, struct plat_sci_port *p) { @@ -1064,14 +1079,10 @@ static void __devinit sci_init_single(struct sci_port *sci_port, #endif sci_port->port.uartclk = CONFIG_CPU_CLOCK; #elif defined(CONFIG_HAVE_CLK) - /* - * XXX: We should use a proper SCI/SCIF clock - */ - { - struct clk *clk = clk_get(NULL, "module_clk"); - sci_port->port.uartclk = clk_get_rate(clk); - clk_put(clk); - } + sci_port->iclk = p->clk ? clk_get(&dev->dev, p->clk) : NULL; + sci_port->dclk = clk_get(&dev->dev, "module_clk"); + sci_port->enable = sci_clk_enable; + sci_port->disable = sci_clk_disable; #else #error "Need a valid uartclk" #endif @@ -1085,9 +1096,11 @@ static void __devinit sci_init_single(struct sci_port *sci_port, sci_port->port.irq = p->irqs[SCIx_TXI_IRQ]; sci_port->port.flags = p->flags; + sci_port->port.dev = &dev->dev; sci_port->type = sci_port->port.type = p->type; memcpy(&sci_port->irqs, &p->irqs, sizeof(p->irqs)); + } #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE @@ -1111,14 +1124,21 @@ static void serial_console_write(struct console *co, const char *s, unsigned count) { struct uart_port *port = co->data; + struct sci_port *sci_port = to_sci_port(port); unsigned short bits; - uart_console_write(co->data, s, count, serial_console_putchar); + if (sci_port->enable) + sci_port->enable(port); + + uart_console_write(port, s, count, serial_console_putchar); /* wait until fifo is empty and last bit has been transmitted */ bits = SCxSR_TDxE(port) | SCxSR_TEND(port); while ((sci_in(port, SCxSR) & bits) != bits) cpu_relax(); + + if (sci_port->disable); + sci_port->disable(port); } static int __init serial_console_setup(struct console *co, char *options) @@ -1152,11 +1172,6 @@ static int __init serial_console_setup(struct console *co, char *options) if (!port->type) return -ENODEV; -#ifdef CONFIG_HAVE_CLK - if (!sci_port->clk) - sci_port->clk = clk_get(NULL, "module_clk"); -#endif - sci_config_port(port, 0); if (sci_port->enable) @@ -1171,6 +1186,7 @@ static int __init serial_console_setup(struct console *co, char *options) if (ret == 0) sci_stop_rx(port); #endif + /* TODO: disable clock */ return ret; } @@ -1250,8 +1266,7 @@ static int __devinit sci_probe_single(struct platform_device *dev, return 0; } - sciport->port.dev = &dev->dev; - sci_init_single(sciport, index, p); + sci_init_single(dev, sciport, index, p); ret = uart_add_one_port(&sci_uart_driver, &sciport->port); if (ret) diff --git a/include/linux/serial_sci.h b/include/linux/serial_sci.h index 717059d6791..1c297ddc9d5 100644 --- a/include/linux/serial_sci.h +++ b/include/linux/serial_sci.h @@ -25,6 +25,7 @@ struct plat_sci_port { unsigned int irqs[SCIx_NR_IRQS]; /* ERI, RXI, TXI, BRI */ unsigned int type; /* SCI / SCIF / IRDA */ upf_t flags; /* UPF_* flags */ + char *clk; /* clock string */ }; #endif /* __LINUX_SERIAL_SCI_H */ From 3b226e15beb5ecf068738e796811afd1e5b3f81f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 23:28:54 +0900 Subject: [PATCH 0870/2061] sh: Add clock id to sh-sci platform data on SH-Mobile CPUs. This adds the clock specifier to all of the SH-Mobile sh-sci ports. Impacted CPUs are SH7343/SH7366/SH7722/SH7723/SH7724. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 4 ++++ arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 1 + arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 3 +++ arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 6 ++++++ arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 6 ++++++ 5 files changed, 20 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index f327b7eaf47..b9c361e8cc8 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -179,21 +179,25 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, + .clk = "scif0", }, { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, + .clk = "scif1", }, { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, + .clk = "scif2", }, { .mapbase = 0xffe30000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 83, 83, 83, 83 }, + .clk = "scif3", }, { .flags = 0, } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index 7361ea989b9..d4a994be4a7 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -186,6 +186,7 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, + .clk = "scif0", }, { .flags = 0, } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index b1c3e7f9284..5d6247fecd6 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -307,18 +307,21 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, + .clk = "scif0", }, { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, + .clk = "scif1", }, { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, + .clk = "scif2", }, { .flags = 0, diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index 484ca2dabde..1429fc5e428 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -321,31 +321,37 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, + .clk = "scif0", },{ .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, + .clk = "scif1", },{ .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, + .clk = "scif2", },{ .mapbase = 0xa4e30000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 56, 56, 56, 56 }, + .clk = "scif3", },{ .mapbase = 0xa4e40000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 88, 88, 88, 88 }, + .clk = "scif4", },{ .mapbase = 0xa4e50000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 109, 109, 109, 109 }, + .clk = "scif5", }, { .flags = 0, } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index e074951b88b..a3039ce1a6a 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -30,31 +30,37 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 80, 80, 80, 80 }, + .clk = "scif0", }, { .mapbase = 0xffe10000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 81, 81, 81, 81 }, + .clk = "scif1", }, { .mapbase = 0xffe20000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 82, 82, 82, 82 }, + .clk = "scif2", }, { .mapbase = 0xa4e30000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 56, 56, 56, 56 }, + .clk = "scif3", }, { .mapbase = 0xa4e40000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 88, 88, 88, 88 }, + .clk = "scif4", }, { .mapbase = 0xa4e50000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIFA, .irqs = { 109, 109, 109, 109 }, + .clk = "scif5", }, { .flags = 0, } From 54507f6ee99778a727ff1b38a1f4050fe6479835 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 8 May 2009 23:48:33 +0900 Subject: [PATCH 0871/2061] serial: sh-sci: Fix up section mismatch in error path. The sci_probe_single() path attempts to use sci_remove() for the error path, while sci_remove() is still flagged as __devexit. So, we simply discard the section annotation. Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 3daf76725ac..686e4a456e3 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1227,7 +1227,7 @@ static struct uart_driver sci_uart_driver = { }; -static int __devexit sci_remove(struct platform_device *dev) +static int sci_remove(struct platform_device *dev) { struct sh_sci_priv *priv = platform_get_drvdata(dev); struct sci_port *p; From 168f36237b16e2b3159e24c7d3b658e3c912d149 Mon Sep 17 00:00:00 2001 From: Yoshinori Sato Date: Tue, 28 Apr 2009 04:40:15 +0000 Subject: [PATCH 0872/2061] serial: sh-sci: Fix up h8300 support. - Dummy SCIF functions define. - h8300 specific header include. Signed-off-by: Yoshinori Sato Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.c | 4 ++++ drivers/serial/sh-sci.h | 17 +++++++++++++++-- 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 686e4a456e3..4e3248d5860 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -54,6 +54,10 @@ #include #endif +#ifdef CONFIG_H8300 +#include +#endif + #include "sh-sci.h" struct sci_port { diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h index 84cc6512f08..e3eae4eaadd 100644 --- a/drivers/serial/sh-sci.h +++ b/drivers/serial/sh-sci.h @@ -317,7 +317,18 @@ } \ } -#define CPU_SCIF_FNS(name, scif_offset, scif_size) \ +#ifdef CONFIG_H8300 +/* h8300 don't have SCIF */ +#define CPU_SCIF_FNS(name) \ + static inline unsigned int sci_##name##_in(struct uart_port *port) \ + { \ + return 0; \ + } \ + static inline void sci_##name##_out(struct uart_port *port, unsigned int value) \ + { \ + } +#else +#define CPU_SCIF_FNS(name, scif_offset, scif_size) \ static inline unsigned int sci_##name##_in(struct uart_port *port) \ { \ SCI_IN(scif_size, scif_offset); \ @@ -326,6 +337,7 @@ { \ SCI_OUT(scif_size, scif_offset, value); \ } +#endif #define CPU_SCI_FNS(name, sci_offset, sci_size) \ static inline unsigned int sci_##name##_in(struct uart_port* port) \ @@ -363,7 +375,8 @@ sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size, \ h8_sci_offset, h8_sci_size) \ CPU_SCI_FNS(name, h8_sci_offset, h8_sci_size) -#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size) +#define SCIF_FNS(name, sh3_scif_offset, sh3_scif_size, sh4_scif_offset, sh4_scif_size) \ + CPU_SCIF_FNS(name) #elif defined(CONFIG_CPU_SUBTYPE_SH7723) ||\ defined(CONFIG_CPU_SUBTYPE_SH7724) #define SCIx_FNS(name, sh4_scifa_offset, sh4_scifa_size, sh4_scif_offset, sh4_scif_size) \ From be6514c6295cc79498eeb9a8f933451082ca9e69 Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Fri, 8 May 2009 15:48:15 +0100 Subject: [PATCH 0873/2061] sh: Add in some ptrace definitions from GDB. Plugs in PT_TEXT_END_ADDR/PT_TEXT_ADDR/PT_DATA_ADDR/PT_TEXT_LEN definitions. Signed-off-by: Kieran Bingham Signed-off-by: Peter Griffin Signed-off-by: Paul Mundt --- arch/sh/include/asm/ptrace.h | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/sh/include/asm/ptrace.h b/arch/sh/include/asm/ptrace.h index 68e20ff9aa9..1dc12cb44a2 100644 --- a/arch/sh/include/asm/ptrace.h +++ b/arch/sh/include/asm/ptrace.h @@ -102,6 +102,11 @@ struct pt_dspregs { #define PTRACE_GETDSPREGS 55 /* DSP registers */ #define PTRACE_SETDSPREGS 56 +#define PT_TEXT_END_ADDR 240 +#define PT_TEXT_ADDR 244 /* &(struct user)->start_code */ +#define PT_DATA_ADDR 248 /* &(struct user)->start_data */ +#define PT_TEXT_LEN 252 + #ifdef __KERNEL__ #include From e73173dbe55e5b4c2306728aad50c8e42194f6d5 Mon Sep 17 00:00:00 2001 From: Kieran Bingham Date: Fri, 8 May 2009 15:49:50 +0100 Subject: [PATCH 0874/2061] sh: Fix UBC setup and registers for SH2A Signed-off-by: Kieran Bingham Signed-off-by: Peter Griffin Signed-off-by: Paul Mundt --- arch/sh/include/asm/ubc.h | 11 +++++++++++ arch/sh/include/cpu-sh2a/cpu/ubc.h | 29 ++++++++++++++++++++++++++++- arch/sh/kernel/process_32.c | 4 +++- 3 files changed, 42 insertions(+), 2 deletions(-) diff --git a/arch/sh/include/asm/ubc.h b/arch/sh/include/asm/ubc.h index a7b9028bbfb..4ca4b771737 100644 --- a/arch/sh/include/asm/ubc.h +++ b/arch/sh/include/asm/ubc.h @@ -42,12 +42,23 @@ #define BRCR_CMFA (1 << 15) #define BRCR_CMFB (1 << 14) + +#if defined CONFIG_CPU_SH2A +#define BRCR_CMFCA (1 << 15) +#define BRCR_CMFCB (1 << 14) +#define BRCR_CMFDA (1 << 13) +#define BRCR_CMFDB (1 << 12) +#define BRCR_PCBB (1 << 6) /* 1: after execution */ +#define BRCR_PCBA (1 << 5) /* 1: after execution */ +#define BRCR_PCTE 0 +#else #define BRCR_PCTE (1 << 11) #define BRCR_PCBA (1 << 10) /* 1: after execution */ #define BRCR_DBEB (1 << 7) #define BRCR_PCBB (1 << 6) #define BRCR_SEQ (1 << 3) #define BRCR_UBDE (1 << 0) +#endif #ifndef __ASSEMBLY__ /* arch/sh/kernel/cpu/ubc.S */ diff --git a/arch/sh/include/cpu-sh2a/cpu/ubc.h b/arch/sh/include/cpu-sh2a/cpu/ubc.h index 8ce2fc1cf62..1192e1c761a 100644 --- a/arch/sh/include/cpu-sh2a/cpu/ubc.h +++ b/arch/sh/include/cpu-sh2a/cpu/ubc.h @@ -1 +1,28 @@ -#include +/* + * SH-2A UBC definitions + * + * Copyright (C) 2008 Kieran Bingham + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#ifndef __ASM_CPU_SH2A_UBC_H +#define __ASM_CPU_SH2A_UBC_H + +#define UBC_BARA 0xfffc0400 +#define UBC_BAMRA 0xfffc0404 +#define UBC_BBRA 0xfffc04a0 /* 16 bit access */ +#define UBC_BDRA 0xfffc0408 +#define UBC_BDMRA 0xfffc040c + +#define UBC_BARB 0xfffc0410 +#define UBC_BAMRB 0xfffc0414 +#define UBC_BBRB 0xfffc04b0 /* 16 bit access */ +#define UBC_BDRB 0xfffc0418 +#define UBC_BDMRB 0xfffc041c + +#define UBC_BRCR 0xfffc04c0 + +#endif /* __ASM_CPU_SH2A_UBC_H */ diff --git a/arch/sh/kernel/process_32.c b/arch/sh/kernel/process_32.c index 6d94725d22f..9289ede29c7 100644 --- a/arch/sh/kernel/process_32.c +++ b/arch/sh/kernel/process_32.c @@ -251,7 +251,8 @@ static void ubc_set_tracing(int asid, unsigned long pc) if (current_cpu_data.type == CPU_SH7729 || current_cpu_data.type == CPU_SH7710 || - current_cpu_data.type == CPU_SH7712) { + current_cpu_data.type == CPU_SH7712 || + current_cpu_data.type == CPU_SH7203){ ctrl_outw(BBR_INST | BBR_READ | BBR_CPU, UBC_BBRA); ctrl_outl(BRCR_PCBA | BRCR_PCTE, UBC_BRCR); } else { @@ -407,6 +408,7 @@ asmlinkage void break_point_trap(void) #else ctrl_outw(0, UBC_BBRA); ctrl_outw(0, UBC_BBRB); + ctrl_outl(0, UBC_BRCR); #endif current->thread.ubc_pc = 0; ubc_usercnt -= 1; From ba0d474082dfa37fb0efd439b4d0c0234ceb1e80 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Fri, 8 May 2009 15:50:54 +0100 Subject: [PATCH 0875/2061] sh: Add ptrace support for NOMMU debugging Signed-off-by: Peter Griffin Signed-off-by: Paul Mundt --- arch/sh/kernel/ptrace_32.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/sh/kernel/ptrace_32.c b/arch/sh/kernel/ptrace_32.c index f7b22dd83b0..3392e835a37 100644 --- a/arch/sh/kernel/ptrace_32.c +++ b/arch/sh/kernel/ptrace_32.c @@ -334,6 +334,14 @@ long arch_ptrace(struct task_struct *child, long request, long addr, long data) [(addr - (long)&dummy->fpu) >> 2]; } else if (addr == (long) &dummy->u_fpvalid) tmp = !!tsk_used_math(child); + else if (addr == PT_TEXT_ADDR) + tmp = child->mm->start_code; + else if (addr == PT_DATA_ADDR) + tmp = child->mm->start_data; + else if (addr == PT_TEXT_END_ADDR) + tmp = child->mm->end_code; + else if (addr == PT_TEXT_LEN) + tmp = child->mm->end_code - child->mm->start_code; else tmp = 0; ret = put_user(tmp, datap); From cd89436e54b29a07a383ee82f2f718d8c9d24cc4 Mon Sep 17 00:00:00 2001 From: Peter Griffin Date: Fri, 8 May 2009 15:51:51 +0100 Subject: [PATCH 0876/2061] sh: Add UBC trap vector for SH2A Signed-off-by: Peter Griffin Signed-off-by: Paul Mundt --- arch/sh/kernel/traps_32.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 30ca9c51e52..67550d88c4e 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -34,6 +34,7 @@ # define TRAP_ILLEGAL_SLOT_INST 6 # define TRAP_ADDRESS_ERROR 9 # ifdef CONFIG_CPU_SH2A +# define TRAP_UBC 12 # define TRAP_FPU_ERROR 13 # define TRAP_DIVZERO_ERROR 17 # define TRAP_DIVOVF_ERROR 18 @@ -849,6 +850,10 @@ void __init trap_init(void) #endif #endif +#ifdef TRAP_UBC + set_exception_table_vec(TRAP_UBC, break_point_trap); +#endif + /* Setup VBR for boot cpu */ per_cpu_trap_init(); } From bf8b9a63c18a1a7777571650de0c9f4fd4368ca0 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Fri, 8 May 2009 20:53:58 +0530 Subject: [PATCH 0877/2061] x86: msr-index.h remove duplicate MSR C001_0015 declaration MSRC001_0015 Hardware Configuration Register (HWCR) is already defined as MSR_K7_HWCR. And HWCR is available for >= K7. So MSR_K8_HWCR is not required and no-one is using it. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput --- arch/x86/include/asm/msr-index.h | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h index ec41fc16c16..4d58d04fca8 100644 --- a/arch/x86/include/asm/msr-index.h +++ b/arch/x86/include/asm/msr-index.h @@ -121,7 +121,6 @@ #define MSR_K8_TOP_MEM1 0xc001001a #define MSR_K8_TOP_MEM2 0xc001001d #define MSR_K8_SYSCFG 0xc0010010 -#define MSR_K8_HWCR 0xc0010015 #define MSR_K8_INT_PENDING_MSG 0xc0010055 /* C1E active bits in int pending message */ #define K8_INTP_C1E_ACTIVE_MASK 0x18000000 From 7fc23a5380797012e92a9633169440f2f4a21253 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:21 +0200 Subject: [PATCH 0878/2061] perf_counter: optimize perf_counter_task_tick() perf_counter_task_tick() does way too much work to find out there's nothing to do. Provide an easy short-circuit for the normal case where there are no counters on the system. [ Impact: micro-optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.750619201@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 60e55f0b48f..fdb0d242127 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -39,6 +39,7 @@ int perf_max_counters __read_mostly = 1; static int perf_reserved_percpu __read_mostly; static int perf_overcommit __read_mostly = 1; +static atomic_t nr_counters __read_mostly; static atomic_t nr_mmap_tracking __read_mostly; static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; @@ -1076,8 +1077,14 @@ static void rotate_ctx(struct perf_counter_context *ctx) void perf_counter_task_tick(struct task_struct *curr, int cpu) { - struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_cpu_context *cpuctx; + struct perf_counter_context *ctx; + + if (!atomic_read(&nr_counters)) + return; + + cpuctx = &per_cpu(perf_cpu_context, cpu); + ctx = &curr->perf_counter_ctx; perf_counter_cpu_sched_out(cpuctx); perf_counter_task_sched_out(curr, cpu); @@ -1197,6 +1204,7 @@ static void free_counter(struct perf_counter *counter) { perf_pending_sync(counter); + atomic_dec(&nr_counters); if (counter->hw_event.mmap) atomic_dec(&nr_mmap_tracking); if (counter->hw_event.munmap) @@ -2861,6 +2869,7 @@ done: counter->pmu = pmu; + atomic_inc(&nr_counters); if (counter->hw_event.mmap) atomic_inc(&nr_mmap_tracking); if (counter->hw_event.munmap) From 3df5edad87a998273aa5a9a8c728c05d855ad00e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:22 +0200 Subject: [PATCH 0879/2061] perf_counter: rework ioctl()s Corey noticed that ioctl()s on grouped counters didn't work on the whole group. This extends the ioctl() interface to take a second argument that is interpreted as a flags field. We then provide PERF_IOC_FLAG_GROUP to toggle the behaviour. Having this flag gives the greatest flexibility, allowing you to individually enable/disable/reset counters in a group, or all together. [ Impact: fix group counter enable/disable semantics ] Reported-by: Corey Ashford Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <20090508170028.837558214@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++- kernel/perf_counter.c | 104 +++++++++++++++++++---------------- 2 files changed, 65 insertions(+), 49 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 00081d84169..88f863ec274 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -157,10 +157,14 @@ struct perf_counter_hw_event { /* * Ioctls that can be done on a perf counter fd: */ -#define PERF_COUNTER_IOC_ENABLE _IO ('$', 0) -#define PERF_COUNTER_IOC_DISABLE _IO ('$', 1) +#define PERF_COUNTER_IOC_ENABLE _IOW('$', 0, u32) +#define PERF_COUNTER_IOC_DISABLE _IOW('$', 1, u32) #define PERF_COUNTER_IOC_REFRESH _IOW('$', 2, u32) -#define PERF_COUNTER_IOC_RESET _IO ('$', 3) +#define PERF_COUNTER_IOC_RESET _IOW('$', 3, u32) + +enum perf_counter_ioc_flags { + PERF_IOC_FLAG_GROUP = 1U << 0, +}; /* * Structure of the page that can be mapped via mmap diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index fdb0d242127..f4883f1f47e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -82,7 +82,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) * add it straight to the context's counter list, or to the group * leader's sibling list: */ - if (counter->group_leader == counter) + if (group_leader == counter) list_add_tail(&counter->list_entry, &ctx->counter_list); else { list_add_tail(&counter->list_entry, &group_leader->sibling_list); @@ -385,24 +385,6 @@ static void perf_counter_disable(struct perf_counter *counter) spin_unlock_irq(&ctx->lock); } -/* - * Disable a counter and all its children. - */ -static void perf_counter_disable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_disable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_disable(child); - mutex_unlock(&counter->mutex); -} - static int counter_sched_in(struct perf_counter *counter, struct perf_cpu_context *cpuctx, @@ -753,24 +735,6 @@ static int perf_counter_refresh(struct perf_counter *counter, int refresh) return 0; } -/* - * Enable a counter and all its children. - */ -static void perf_counter_enable_family(struct perf_counter *counter) -{ - struct perf_counter *child; - - perf_counter_enable(counter); - - /* - * Lock the mutex to protect the list of children - */ - mutex_lock(&counter->mutex); - list_for_each_entry(child, &counter->child_list, child_list) - perf_counter_enable(child); - mutex_unlock(&counter->mutex); -} - void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { @@ -1307,31 +1271,79 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { + (void)perf_counter_read(counter); atomic_set(&counter->count, 0); + perf_counter_update_userpage(counter); +} + +static void perf_counter_for_each_sibling(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter_context *ctx = counter->ctx; + struct perf_counter *sibling; + + spin_lock_irq(&ctx->lock); + counter = counter->group_leader; + + func(counter); + list_for_each_entry(sibling, &counter->sibling_list, list_entry) + func(sibling); + spin_unlock_irq(&ctx->lock); +} + +static void perf_counter_for_each_child(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + func(counter); + list_for_each_entry(child, &counter->child_list, child_list) + func(child); + mutex_unlock(&counter->mutex); +} + +static void perf_counter_for_each(struct perf_counter *counter, + void (*func)(struct perf_counter *)) +{ + struct perf_counter *child; + + mutex_lock(&counter->mutex); + perf_counter_for_each_sibling(counter, func); + list_for_each_entry(child, &counter->child_list, child_list) + perf_counter_for_each_sibling(child, func); + mutex_unlock(&counter->mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct perf_counter *counter = file->private_data; - int err = 0; + void (*func)(struct perf_counter *); + u32 flags = arg; switch (cmd) { case PERF_COUNTER_IOC_ENABLE: - perf_counter_enable_family(counter); + func = perf_counter_enable; break; case PERF_COUNTER_IOC_DISABLE: - perf_counter_disable_family(counter); - break; - case PERF_COUNTER_IOC_REFRESH: - err = perf_counter_refresh(counter, arg); + func = perf_counter_disable; break; case PERF_COUNTER_IOC_RESET: - perf_counter_reset(counter); + func = perf_counter_reset; break; + + case PERF_COUNTER_IOC_REFRESH: + return perf_counter_refresh(counter, arg); default: - err = -ENOTTY; + return -ENOTTY; } - return err; + + if (flags & PERF_IOC_FLAG_GROUP) + perf_counter_for_each(counter, func); + else + perf_counter_for_each_child(counter, func); + + return 0; } /* From a85f61abe11a46553c4562e74edb27ebc782aeb7 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:23 +0200 Subject: [PATCH 0880/2061] perf_counter: add PERF_RECORD_CONFIG Much like CONFIG_RECORD_GROUP records the hw_event.config to identify the values, allow to record this for all counters. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170028.923228280@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 8 ++++++++ 2 files changed, 10 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 88f863ec274..0e6303d36c6 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -104,6 +104,7 @@ enum perf_counter_record_format { PERF_RECORD_ADDR = 1U << 3, PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, }; /* @@ -258,6 +259,7 @@ enum perf_event_type { * { u32 pid, tid; } && PERF_RECORD_TID * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index f4883f1f47e..c615f52aa40 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1994,6 +1994,11 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CONFIG) { + header.type |= PERF_RECORD_CONFIG; + header.size += sizeof(u64); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2029,6 +2034,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_ADDR) perf_output_put(&handle, addr); + if (record_type & PERF_RECORD_CONFIG) + perf_output_put(&handle, counter->hw_event.config); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ From f370e1e2f195ec1e6420e26fc83e0319595db578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 8 May 2009 18:52:24 +0200 Subject: [PATCH 0881/2061] perf_counter: add PERF_RECORD_CPU Allow recording the CPU number the event was generated on. RFC: this leaves a u32 as reserved, should we fill in the node_id() there, or leave this open for future extention, as userspace can already easily do the cpu->node mapping if needed. [ Impact: extend perfcounter output record format ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: <20090508170029.008627711@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ kernel/perf_counter.c | 13 +++++++++++++ 2 files changed, 15 insertions(+) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0e6303d36c6..614f921d616 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -105,6 +105,7 @@ enum perf_counter_record_format { PERF_RECORD_GROUP = 1U << 4, PERF_RECORD_CALLCHAIN = 1U << 5, PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -260,6 +261,7 @@ enum perf_event_type { * { u64 time; } && PERF_RECORD_TIME * { u64 addr; } && PERF_RECORD_ADDR * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * * { u64 nr; * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c615f52aa40..d850a1fb8d4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1956,6 +1956,9 @@ static void perf_counter_output(struct perf_counter *counter, struct perf_callchain_entry *callchain = NULL; int callchain_size = 0; u64 time; + struct { + u32 cpu, reserved; + } cpu_entry; header.type = 0; header.size = sizeof(header); @@ -1999,6 +2002,13 @@ static void perf_counter_output(struct perf_counter *counter, header.size += sizeof(u64); } + if (record_type & PERF_RECORD_CPU) { + header.type |= PERF_RECORD_CPU; + header.size += sizeof(cpu_entry); + + cpu_entry.cpu = raw_smp_processor_id(); + } + if (record_type & PERF_RECORD_GROUP) { header.type |= PERF_RECORD_GROUP; header.size += sizeof(u64) + @@ -2037,6 +2047,9 @@ static void perf_counter_output(struct perf_counter *counter, if (record_type & PERF_RECORD_CONFIG) perf_output_put(&handle, counter->hw_event.config); + if (record_type & PERF_RECORD_CPU) + perf_output_put(&handle, cpu_entry); + /* * XXX PERF_RECORD_GROUP vs inherited counters seems difficult. */ From 29f93943d1916d1a3faa3f10f4a06994347ac990 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:06:47 -0400 Subject: [PATCH 0882/2061] tracing: initialize return value for __ftrace_set_clr_event Commit 8f31bfe538ebafac187d2d4465a92e1d9ee6d8c2 tracing/events: clean up for ftrace_set_clr_event() Moved out the code for ftrace_set_clr_event into a helper funciton but did not initialize the return value. As a result, we do not warn about a typo in the echoing of events in set_event. This patch restores the old warning: # echo foobar > set_event -bash: echo: write error: Invalid argument [ Impact: restore warning of invalid entries to set_event ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index df394bc6d54..2eecb87e42d 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -118,7 +118,7 @@ static int __ftrace_set_clr_event(const char *match, const char *sub, const char *event, int set) { struct ftrace_event_call *call; - int ret; + int ret = -EINVAL; mutex_lock(&event_mutex); list_for_each_entry(call, &ftrace_events, list) { From 4671c79408a3f8a5a6a45e39c4c164dada3a5678 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 16:27:41 -0400 Subject: [PATCH 0883/2061] tracing: add trace_set_clr_event to export event enabling function Other parts of the kernel may need to be able to enable or disable specific events. Especially parts that create trace events. [ Impact: allow enabling of trace events by those that create the event ] Signed-off-by: Steven Rostedt --- include/linux/ftrace_event.h | 2 ++ kernel/trace/trace_events.c | 17 +++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/include/linux/ftrace_event.h b/include/linux/ftrace_event.h index 662c1becf36..bae51ddfabd 100644 --- a/include/linux/ftrace_event.h +++ b/include/linux/ftrace_event.h @@ -127,6 +127,8 @@ extern int trace_define_field(struct ftrace_event_call *call, char *type, #define is_signed_type(type) (((type)(-1)) < 0) +int trace_set_clr_event(const char *system, const char *event, int set); + /* * The double __builtin_constant_p is because gcc will give us an error * if we try to allocate the static variable to fmt if it is not a diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 2eecb87e42d..0eec0c55dd8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -177,6 +177,23 @@ static int ftrace_set_clr_event(char *buf, int set) return __ftrace_set_clr_event(match, sub, event, set); } +/** + * trace_set_clr_event - enable or disable an event + * @system: system name to match (NULL for any system) + * @event: event name to match (NULL for all events, within system) + * @set: 1 to enable, 0 to disable + * + * This is a way for other parts of the kernel to enable or disable + * event recording. + * + * Returns 0 on success, -EINVAL if the parameters do not match any + * registered events. + */ +int trace_set_clr_event(const char *system, const char *event, int set) +{ + return __ftrace_set_clr_event(NULL, system, event, set); +} + /* 128 should be much more than enough */ #define EVENT_BUF_SIZE 127 From 6cac5a924668a56c7ccefc345805f1fe0536a90e Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Sun, 29 Mar 2009 19:56:29 -0700 Subject: [PATCH 0884/2061] xen/x86-64: fix breakpoints and hardware watchpoints Native x86-64 uses the IST mechanism to run int3 and debug traps on an alternative stack. Xen does not do this, and so the frames were being misinterpreted by the ptrace code. This change special-cases these two exceptions by using Xen variants which run on the normal kernel stack properly. Impact: avoid crash or bad data when IST trap is invoked under Xen Signed-off-by: Jeremy Fitzhardinge --- arch/x86/include/asm/traps.h | 3 +++ arch/x86/kernel/entry_64.S | 5 +++++ arch/x86/xen/enlighten.c | 19 ++++++++++++++++++- 3 files changed, 26 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h index 0d5342515b8..c44e5002f2f 100644 --- a/arch/x86/include/asm/traps.h +++ b/arch/x86/include/asm/traps.h @@ -13,6 +13,9 @@ asmlinkage void divide_error(void); asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); +asmlinkage void xen_debug(void); +asmlinkage void xen_int3(void); +asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 38946c6e843..bb01ce080b8 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1379,6 +1379,11 @@ END(xen_failsafe_callback) paranoidzeroentry_ist debug do_debug DEBUG_STACK paranoidzeroentry_ist int3 do_int3 DEBUG_STACK paranoiderrorentry stack_segment do_stack_segment +#ifdef CONFIG_XEN +zeroentry xen_debug do_debug +zeroentry xen_int3 do_int3 +errorentry xen_stack_segment do_stack_segment +#endif errorentry general_protection do_general_protection errorentry page_fault do_page_fault #ifdef CONFIG_X86_MCE diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 12a3159333b..7566e13c0ca 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ #include #include #include +#include #include #include #include @@ -428,11 +430,26 @@ static void xen_write_ldt_entry(struct desc_struct *dt, int entrynum, static int cvt_gate_to_trap(int vector, const gate_desc *val, struct trap_info *info) { + unsigned long addr; + if (val->type != GATE_TRAP && val->type != GATE_INTERRUPT) return 0; info->vector = vector; - info->address = gate_offset(*val); + + addr = gate_offset(*val); +#ifdef CONFIG_X86_64 + if (addr == (unsigned long)debug) + addr = (unsigned long)xen_debug; + else if (addr == (unsigned long)int3) + addr = (unsigned long)xen_int3; + else if (addr == (unsigned long)stack_segment) + addr = (unsigned long)xen_stack_segment; + else + WARN_ON(val->ist != 0); +#endif /* CONFIG_X86_64 */ + info->address = addr; + info->cs = gate_segment(*val); info->flags = val->dpl; /* interrupt gates clear IF */ From b80119bb35a49a4e8dbfb9708872adfd5cf38dee Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:22:08 -0700 Subject: [PATCH 0885/2061] xen/x86-64: clean up warnings about IST-using traps Ignore known IST-using traps. Aside from the debugger traps, they're low-level faults which Xen will handle for us, so the kernel needn't worry about them. Keep warning in case unknown trap starts using IST. Impact: suppress spurious warnings Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 7566e13c0ca..e9df942aa14 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -439,14 +439,32 @@ static int cvt_gate_to_trap(int vector, const gate_desc *val, addr = gate_offset(*val); #ifdef CONFIG_X86_64 + /* + * Look for known traps using IST, and substitute them + * appropriately. The debugger ones are the only ones we care + * about. Xen will handle faults like double_fault and + * machine_check, so we should never see them. Warn if + * there's an unexpected IST-using fault handler. + */ if (addr == (unsigned long)debug) addr = (unsigned long)xen_debug; else if (addr == (unsigned long)int3) addr = (unsigned long)xen_int3; else if (addr == (unsigned long)stack_segment) addr = (unsigned long)xen_stack_segment; - else - WARN_ON(val->ist != 0); + else if (addr == (unsigned long)double_fault || + addr == (unsigned long)nmi) { + /* Don't need to handle these */ + return 0; +#ifdef CONFIG_X86_MCE + } else if (addr == (unsigned long)machine_check) { + return 0; +#endif + } else { + /* Some other trap using IST? */ + if (WARN_ON(val->ist != 0)) + return 0; + } #endif /* CONFIG_X86_64 */ info->address = addr; From a789ed5fb6d0256c4177c2cc27e06520ddbe4d4c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Fri, 24 Apr 2009 00:26:50 -0700 Subject: [PATCH 0886/2061] xen: cache cr0 value to avoid trap'n'emulate for read_cr0 stts() is implemented in terms of read_cr0/write_cr0 to update the state of the TS bit. This happens during context switch, and so is fairly performance critical. Rather than falling back to a trap-and-emulate native read_cr0, implement our own by caching the last-written value from write_cr0 (the TS bit is the only one we really care about). Impact: optimise Xen context switches Signed-off-by: Jeremy Fitzhardinge --- arch/x86/xen/enlighten.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index e9df942aa14..0a1700a2be9 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -658,10 +658,26 @@ static void xen_clts(void) xen_mc_issue(PARAVIRT_LAZY_CPU); } +static DEFINE_PER_CPU(unsigned long, xen_cr0_value); + +static unsigned long xen_read_cr0(void) +{ + unsigned long cr0 = percpu_read(xen_cr0_value); + + if (unlikely(cr0 == 0)) { + cr0 = native_read_cr0(); + percpu_write(xen_cr0_value, cr0); + } + + return cr0; +} + static void xen_write_cr0(unsigned long cr0) { struct multicall_space mcs; + percpu_write(xen_cr0_value, cr0); + /* Only pay attention to cr0.TS; everything else is ignored. */ mcs = xen_mc_entry(0); @@ -847,7 +863,7 @@ static const struct pv_cpu_ops xen_cpu_ops __initdata = { .clts = xen_clts, - .read_cr0 = native_read_cr0, + .read_cr0 = xen_read_cr0, .write_cr0 = xen_write_cr0, .read_cr4 = native_read_cr4, From 0b4eb462da10f832b28d518abffa4d77805928a0 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 30 Apr 2009 17:59:36 -0700 Subject: [PATCH 0887/2061] x86, boot: align the .bss section in the decompressor Aligning the .bss section makes it trivial to use large operation sizes for moving the initialized sections and clearing the .bss. The alignment chosen (L1 cache) is somewhat arbitrary, but should be large enough to avoid all known performance traps and small enough to not cause troubles. [ Impact: trivial performance enhancement, future patch prep ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/vmlinux.lds.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index 0d26c92d3c7..dbe515e13fe 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -42,6 +42,7 @@ SECTIONS *(.data.*) _edata = . ; } + . = ALIGN(CONFIG_X86_L1_CACHE_BYTES); .bss : { _bss = . ; *(.bss) From d3dd3b5a29bb9582957451531fed461628dfc834 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 21:17:15 -0700 Subject: [PATCH 0888/2061] kbuild: allow compressors (gzip, bzip2, lzma) to take multiple inputs Allow the compression commands in Kbuild (i.e. gzip, bzip2, lzma) to take multiple input files and emit the concatenated compressed output. This avoids an intermediate step when a kernel image is built from multiple components, such as the relocatable x86-32 kernel. Sam Ravnborg integrated the bin_size script into the Makefile. [ Impact: new build feature, not yet used ] Signed-off-by: H. Peter Anvin Acked-by: Sam Ravnborg --- scripts/Makefile.lib | 26 ++++++++++++++++++++------ scripts/bin_size | 10 ---------- 2 files changed, 20 insertions(+), 16 deletions(-) delete mode 100644 scripts/bin_size diff --git a/scripts/Makefile.lib b/scripts/Makefile.lib index 979619574f7..f8cf938dde9 100644 --- a/scripts/Makefile.lib +++ b/scripts/Makefile.lib @@ -183,20 +183,34 @@ cmd_objcopy = $(OBJCOPY) $(OBJCOPYFLAGS) $(OBJCOPYFLAGS_$(@F)) $< $@ # --------------------------------------------------------------------------- quiet_cmd_gzip = GZIP $@ -cmd_gzip = gzip -f -9 < $< > $@ +cmd_gzip = (cat $(filter-out FORCE,$^) | gzip -f -9 > $@) || \ + (rm -f $@ ; false) # Bzip2 # --------------------------------------------------------------------------- -# Bzip2 does not include size in file... so we have to fake that -size_append=$(CONFIG_SHELL) $(srctree)/scripts/bin_size +# Bzip2 and LZMA do not include size in file... so we have to fake that; +# append the size as a 32-bit littleendian number as gzip does. +size_append = echo -ne $(shell \ +dec_size=0; \ +for F in $1; do \ + fsize=$$(stat -c "%s" $$F); \ + dec_size=$$(expr $$dec_size + $$fsize); \ +done; \ +printf "%08x" $$dec_size | \ + sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g' \ +) -quiet_cmd_bzip2 = BZIP2 $@ -cmd_bzip2 = (bzip2 -9 < $< && $(size_append) $<) > $@ || (rm -f $@ ; false) +quiet_cmd_bzip2 = BZIP2 $@ +cmd_bzip2 = (cat $(filter-out FORCE,$^) | \ + bzip2 -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) # Lzma # --------------------------------------------------------------------------- quiet_cmd_lzma = LZMA $@ -cmd_lzma = (lzma -9 -c $< && $(size_append) $<) >$@ || (rm -f $@ ; false) +cmd_lzma = (cat $(filter-out FORCE,$^) | \ + lzma -9 && $(call size_append, $(filter-out FORCE,$^))) > $@ || \ + (rm -f $@ ; false) diff --git a/scripts/bin_size b/scripts/bin_size deleted file mode 100644 index 43e1b360cee..00000000000 --- a/scripts/bin_size +++ /dev/null @@ -1,10 +0,0 @@ -#!/bin/sh - -if [ $# = 0 ] ; then - echo Usage: $0 file -fi - -size_dec=`stat -c "%s" $1` -size_hex_echo_string=`printf "%08x" $size_dec | - sed 's/\(..\)\(..\)\(..\)\(..\)/\\\\x\4\\\\x\3\\\\x\2\\\\x\1/g'` -/bin/echo -ne $size_hex_echo_string From 845adf7266a7ba6970bf982ffd96abc60d2018ab Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 21:20:51 -0700 Subject: [PATCH 0889/2061] x86: add a Kconfig symbol for when relocations are needed We only need to build relocations when we are building a 32-bit relocatable kernel. Rather than unnecessarily complicating the Makefiles, make an explicit Kbuild symbol for this. [ Impact: permits future cleanup ] Signed-off-by: H. Peter Anvin Cc: Sam Ravnborg --- arch/x86/Kconfig | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 039c3f04aac..5aee45356b5 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1513,6 +1513,11 @@ config RELOCATABLE it has been loaded at and the compile time physical address (CONFIG_PHYSICAL_START) is ignored. +# Relocation on x86-32 needs some additional build support +config X86_NEED_RELOCS + def_bool y + depends on X86_32 && RELOCATABLE + config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 From 5f11e02019ef44f041e6e38a1363fa2fd4b8785d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 22:53:11 -0700 Subject: [PATCH 0890/2061] x86, boot: simplify arch/x86/boot/compressed/Makefile Simplify the arch/x86/boot/compressed/Makefile, by using the new capability of specifying multiple inputs to a compressor, and the CONFIG_X86_NEED_RELOCS Kconfig symbol. Signed-off-by: H. Peter Anvin Acked-by: Sam Ravnborg --- arch/x86/boot/compressed/Makefile | 39 +++++-------------------------- 1 file changed, 6 insertions(+), 33 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 0f4b5e2abd3..b35c3bb7090 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -29,7 +29,7 @@ $(obj)/vmlinux.bin: vmlinux FORCE targets += vmlinux.bin.all vmlinux.relocs relocs -hostprogs-$(CONFIG_X86_32) += relocs +hostprogs-$(CONFIG_X86_NEED_RELOCS) += relocs quiet_cmd_relocs = RELOCS $@ cmd_relocs = $(obj)/relocs $< > $@;$(obj)/relocs --abs-relocs $< @@ -37,46 +37,19 @@ $(obj)/vmlinux.relocs: vmlinux $(obj)/relocs FORCE $(call if_changed,relocs) vmlinux.bin.all-y := $(obj)/vmlinux.bin -vmlinux.bin.all-$(CONFIG_RELOCATABLE) += $(obj)/vmlinux.relocs -quiet_cmd_relocbin = BUILD $@ - cmd_relocbin = cat $(filter-out FORCE,$^) > $@ -$(obj)/vmlinux.bin.all: $(vmlinux.bin.all-y) FORCE - $(call if_changed,relocbin) +vmlinux.bin.all-$(CONFIG_X86_NEED_RELOCS) += $(obj)/vmlinux.relocs -ifeq ($(CONFIG_X86_32),y) - -ifdef CONFIG_RELOCATABLE -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.gz: $(vmlinux.bin.all-y) FORCE $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin.all FORCE +$(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -else -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) -endif -LDFLAGS_piggy.o := -r --format binary --oformat elf32-i386 -T - -else - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) -$(obj)/vmlinux.bin.bz2: $(obj)/vmlinux.bin FORCE - $(call if_changed,bzip2) -$(obj)/vmlinux.bin.lzma: $(obj)/vmlinux.bin FORCE - $(call if_changed,lzma) - -LDFLAGS_piggy.o := -r --format binary --oformat elf64-x86-64 -T -endif suffix_$(CONFIG_KERNEL_GZIP) = gz suffix_$(CONFIG_KERNEL_BZIP2) = bz2 suffix_$(CONFIG_KERNEL_LZMA) = lzma +LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T $(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE $(call if_changed,ld) From 283ab1c0bd462dd0b179393fb081a626f6687413 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 15:32:47 -0700 Subject: [PATCH 0891/2061] x86, boot: follow standard Kbuild style for compression suffix When generating the compression suffix in arch/x86/boot/compressed/Makefile, follow standard Kbuild conventions, that is: - Use a dash not underscore before y/m/n endings - Use := whenever possible. Requested-by: Sam Ravnborg Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index b35c3bb7090..7f24fdb584e 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -46,10 +46,10 @@ $(obj)/vmlinux.bin.bz2: $(vmlinux.bin.all-y) FORCE $(obj)/vmlinux.bin.lzma: $(vmlinux.bin.all-y) FORCE $(call if_changed,lzma) -suffix_$(CONFIG_KERNEL_GZIP) = gz -suffix_$(CONFIG_KERNEL_BZIP2) = bz2 -suffix_$(CONFIG_KERNEL_LZMA) = lzma +suffix-$(CONFIG_KERNEL_GZIP) := gz +suffix-$(CONFIG_KERNEL_BZIP2) := bz2 +suffix-$(CONFIG_KERNEL_LZMA) := lzma LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix_y) FORCE +$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE $(call if_changed,ld) From bd2a36984c50bb546a7d04cb395fddcf98a1092c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 5 May 2009 23:24:50 -0700 Subject: [PATCH 0892/2061] x86, boot: use BP_scratch in arch/x86/boot/compressed/head_*.S Use the BP_scratch symbol from asm-offsets.h instead of hard-coding the location. [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 2 +- arch/x86/boot/compressed/head_64.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 85bd3285706..e3398f3d1b3 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -53,7 +53,7 @@ ENTRY(startup_32) * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index ed4a8294800..06cc7e59352 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -56,7 +56,7 @@ ENTRY(startup_32) * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (0x1e4+4)(%esi), %esp + leal (BP_scratch+4)(%esi), %esp call 1f 1: popl %ebp subl $1b, %ebp From 5f64ec64e7f9b246c0a94f34cdf7782f98a6e55d Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 15:45:17 -0700 Subject: [PATCH 0893/2061] x86, boot: stylistic cleanups for boot/compressed/head_32.S Reformat arch/x86/boot/compressed/head_32.S to be closer to currently preferred kernel assembly style, that is: - opcode and operand separated by tab - operands separated by ", " - C-style comments This also makes it more similar to head_64.S. [ Impact: cleanup, no object code change ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 170 +++++++++++++++-------------- 1 file changed, 89 insertions(+), 81 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index e3398f3d1b3..7bd7766ffab 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -12,16 +12,16 @@ * the page directory. [According to comments etc elsewhere on a compressed * kernel it will end up at 0x1000 + 1Mb I hope so as I assume this. - AC] * - * Page 0 is deliberately kept safe, since System Management Mode code in + * Page 0 is deliberately kept safe, since System Management Mode code in * laptops may need to access the BIOS data stored there. This is also - * useful for future device drivers that either access the BIOS via VM86 + * useful for future device drivers that either access the BIOS via VM86 * mode. */ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.text + .text #include #include @@ -29,75 +29,80 @@ #include #include -.section ".text.head","ax",@progbits + .section ".text.head","ax",@progbits ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) - jnz 1f + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ + testb $(1<<6), BP_loadflags(%esi) + jnz 1f cli - movl $(__BOOT_DS),%eax - movl %eax,%ds - movl %eax,%es - movl %eax,%fs - movl %eax,%gs - movl %eax,%ss + movl $__BOOT_DS, %eax + movl %eax, %ds + movl %eax, %es + movl %eax, %fs + movl %eax, %gs + movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode * data at 0x1e4 (defined as a scratch field) are used as the stack * for this calculation. Only 4 bytes are needed. */ - leal (BP_scratch+4)(%esi), %esp - call 1f -1: popl %ebp - subl $1b, %ebp + leal (BP_scratch+4)(%esi), %esp + call 1f +1: popl %ebp + subl $1b, %ebp -/* %ebp contains the address we are loaded at by the boot loader and %ebx +/* + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ #ifdef CONFIG_RELOCATABLE - movl %ebp, %ebx + movl %ebp, %ebx addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx #else - movl $LOAD_PHYSICAL_ADDR, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx + subl input_len(%ebp), %ebx + movl output_len(%ebp), %eax + addl %eax, %ebx /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx + shrl $12, %eax + addl %eax, %ebx /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx + addl $(32768 + 18), %ebx /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx + addl $4095, %ebx + andl $~4095, %ebx -/* Copy the compressed kernel to the end of our buffer +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - pushl %esi - leal _ebss(%ebp), %esi - leal _ebss(%ebx), %edi - movl $(_ebss - startup_32), %ecx + pushl %esi + leal _ebss(%ebp), %esi + leal _ebss(%ebx), %edi + movl $(_ebss - startup_32), %ecx std - rep - movsb + rep movsb cld - popl %esi + popl %esi -/* Compute the kernel start address. +/* + * Compute the kernel start address. */ #ifdef CONFIG_RELOCATABLE addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp @@ -109,81 +114,84 @@ ENTRY(startup_32) /* * Jump to the relocated address. */ - leal relocated(%ebx), %eax - jmp *%eax + leal relocated(%ebx), %eax + jmp *%eax ENDPROC(startup_32) -.section ".text" + .text relocated: /* * Clear BSS */ - xorl %eax,%eax - leal _edata(%ebx),%edi - leal _ebss(%ebx), %ecx - subl %edi,%ecx + xorl %eax, %eax + leal _edata(%ebx), %edi + leal _ebss(%ebx), %ecx + subl %edi, %ecx cld - rep - stosb + rep stosb /* * Setup the stack for the decompressor */ - leal boot_stack_end(%ebx), %esp + leal boot_stack_end(%ebx), %esp /* * Do the decompression, and jump to the new kernel.. */ - movl output_len(%ebx), %eax - pushl %eax - # push arguments for decompress_kernel: - pushl %ebp # output address - movl input_len(%ebx), %eax - pushl %eax # input_len - leal input_data(%ebx), %eax - pushl %eax # input_data - leal boot_heap(%ebx), %eax - pushl %eax # heap area - pushl %esi # real mode pointer - call decompress_kernel - addl $20, %esp - popl %ecx + movl output_len(%ebx), %eax + pushl %eax + /* push arguments for decompress_kernel: */ + pushl %ebp /* output address */ + movl input_len(%ebx), %eax + pushl %eax /* input_len */ + leal input_data(%ebx), %eax + pushl %eax /* input_data */ + leal boot_heap(%ebx), %eax + pushl %eax /* heap area */ + pushl %esi /* real mode pointer */ + call decompress_kernel + addl $20, %esp + popl %ecx #if CONFIG_RELOCATABLE -/* Find the address of the relocations. +/* + * Find the address of the relocations. */ - movl %ebp, %edi - addl %ecx, %edi + movl %ebp, %edi + addl %ecx, %edi -/* Calculate the delta between where vmlinux was compiled to run +/* + * Calculate the delta between where vmlinux was compiled to run * and where it was actually loaded. */ - movl %ebp, %ebx - subl $LOAD_PHYSICAL_ADDR, %ebx - jz 2f /* Nothing to be done if loaded at compiled addr. */ + movl %ebp, %ebx + subl $LOAD_PHYSICAL_ADDR, %ebx + jz 2f /* Nothing to be done if loaded at compiled addr. */ /* * Process relocations. */ -1: subl $4, %edi - movl 0(%edi), %ecx - testl %ecx, %ecx - jz 2f - addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) - jmp 1b +1: subl $4, %edi + movl (%edi), %ecx + testl %ecx, %ecx + jz 2f + addl %ebx, -__PAGE_OFFSET(%ebx, %ecx) + jmp 1b 2: #endif /* * Jump to the decompressed kernel. */ - xorl %ebx,%ebx - jmp *%ebp + xorl %ebx, %ebx + jmp *%ebp -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: From b40d68d5b5b799caaf99d2e073e62962e6d917ce Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 15:59:13 -0700 Subject: [PATCH 0894/2061] x86, boot: stylistic cleanups for boot/compressed/head_64.S Clean up style issues in arch/x86/boot/compressed/head_64.S. This file had a lot fewer style issues than its 32-bit cousin, but the ones it has are worth fixing, especially since it makes the two files more similar. [ Impact: cleanup, no object code change ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_64.S | 52 ++++++++++++++++++------------ 1 file changed, 31 insertions(+), 21 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 06cc7e59352..26c3def43ac 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -21,8 +21,8 @@ /* * High loaded stuff by Hans Lermen & Werner Almesberger, Feb. 1996 */ -.code32 -.text + .code32 + .text #include #include @@ -33,12 +33,14 @@ #include #include -.section ".text.head" + .section ".text.head" .code32 ENTRY(startup_32) cld - /* test KEEP_SEGMENTS flag to see if the bootloader is asking - * us to not reload segments */ + /* + * Test KEEP_SEGMENTS flag to see if the bootloader is asking + * us to not reload segments + */ testb $(1<<6), BP_loadflags(%esi) jnz 1f @@ -49,7 +51,8 @@ ENTRY(startup_32) movl %eax, %ss 1: -/* Calculate the delta between where we were compiled to run +/* + * Calculate the delta between where we were compiled to run * at and where we were actually loaded at. This can only be done * with a short local call on x86. Nothing else will tell us what * address we are running at. The reserved chunk of the real-mode @@ -70,10 +73,11 @@ ENTRY(startup_32) testl %eax, %eax jnz no_longmode -/* Compute the delta between where we were compiled to run at +/* + * Compute the delta between where we were compiled to run at * and where the code will actually run at. - */ -/* %ebp contains the address we are loaded at by the boot loader and %ebx + * + * %ebp contains the address we are loaded at by the boot loader and %ebx * contains the address where we should move the kernel image temporarily * for safe in-place decompression. */ @@ -114,7 +118,7 @@ ENTRY(startup_32) /* * Build early 4G boot pagetable */ - /* Initialize Page tables to 0*/ + /* Initialize Page tables to 0 */ leal pgtable(%ebx), %edi xorl %eax, %eax movl $((4096*6)/4), %ecx @@ -155,7 +159,8 @@ ENTRY(startup_32) btsl $_EFER_LME, %eax wrmsr - /* Setup for the jump to 64bit mode + /* + * Setup for the jump to 64bit mode * * When the jump is performend we will be in long mode but * in 32bit compatibility mode with EFER.LME = 1, CS.L = 0, CS.D = 1 @@ -184,7 +189,8 @@ no_longmode: #include "../../kernel/verify_cpu_64.S" - /* Be careful here startup_64 needs to be at a predictable + /* + * Be careful here startup_64 needs to be at a predictable * address so I can export it in an ELF header. Bootloaders * should look at the ELF header to find this address, as * it may change in the future. @@ -192,7 +198,8 @@ no_longmode: .code64 .org 0x200 ENTRY(startup_64) - /* We come here either from startup_32 or directly from a + /* + * We come here either from startup_32 or directly from a * 64bit bootloader. If we come here from a bootloader we depend on * an identity mapped page table being provied that maps our * entire text+data+bss and hopefully all of memory. @@ -209,7 +216,8 @@ ENTRY(startup_64) movl $0x20, %eax ltr %ax - /* Compute the decompressed kernel start address. It is where + /* + * Compute the decompressed kernel start address. It is where * we were loaded at aligned to a 2M boundary. %rbp contains the * decompressed kernel start address. * @@ -241,7 +249,8 @@ ENTRY(startup_64) addq $(32768 + 18 + 4095), %rbx andq $~4095, %rbx -/* Copy the compressed kernel to the end of our buffer +/* + * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ leaq _end_before_pgt(%rip), %r8 @@ -260,7 +269,7 @@ ENTRY(startup_64) leaq relocated(%rbx), %rax jmp *%rax -.section ".text" + .text relocated: /* @@ -271,8 +280,7 @@ relocated: leaq _end_before_pgt(%rbx), %rcx subq %rdi, %rcx cld - rep - stosb + rep stosb /* Setup the stack */ leaq boot_stack_end(%rip), %rsp @@ -311,9 +319,11 @@ gdt: .quad 0x0000000000000000 /* TS continued */ gdt_end: -.bss -/* Stack and heap for uncompression */ -.balign 4 +/* + * Stack and heap for uncompression + */ + .bss + .balign 4 boot_heap: .fill BOOT_HEAP_SIZE, 1, 0 boot_stack: From 5b11f1cee5797b38d16b94d8745b12b6727a8373 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:20:34 -0700 Subject: [PATCH 0895/2061] x86, boot: straighten out ranges to copy/zero in compressed/head*.S Both on 32 and 64 bits, we copy all the way up to the end of bss, except that on 64 bits there is a hack to avoid copying on top of the page tables. There is no point in copying bss at all, especially since we are just about to zero it all anyway. To clean up and unify the handling, we now do: - copy from startup_32 to _bss. - zero from _bss to _ebss. - the _ebss symbol is aligned to an 8-byte boundary. - the page tables are moved to a separate section. Use _bss as the copy endpoint since _edata may be misaligned. [ Impact: cleanup, trivial performance improvement ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 8 ++++---- arch/x86/boot/compressed/head_64.S | 18 +++++++++++++----- arch/x86/boot/compressed/vmlinux.lds.S | 19 ++++++++++++------- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 7bd7766ffab..59425e157df 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -93,9 +93,9 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _ebss(%ebp), %esi - leal _ebss(%ebx), %edi - movl $(_ebss - startup_32), %ecx + leal _bss(%ebp), %esi + leal _bss(%ebx), %edi + movl $(_bss - startup_32), %ecx std rep movsb cld @@ -125,7 +125,7 @@ relocated: * Clear BSS */ xorl %eax, %eax - leal _edata(%ebx), %edi + leal _bss(%ebx), %edi leal _ebss(%ebx), %ecx subl %edi, %ecx cld diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 26c3def43ac..5bc9052615b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -253,9 +253,9 @@ ENTRY(startup_64) * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - leaq _end_before_pgt(%rip), %r8 - leaq _end_before_pgt(%rbx), %r9 - movq $_end_before_pgt /* - $startup_32 */, %rcx + leaq _bss(%rip), %r8 + leaq _bss(%rbx), %r9 + movq $_bss /* - $startup_32 */, %rcx 1: subq $8, %r8 subq $8, %r9 movq 0(%r8), %rax @@ -276,8 +276,8 @@ relocated: * Clear BSS */ xorq %rax, %rax - leaq _edata(%rbx), %rdi - leaq _end_before_pgt(%rbx), %rcx + leaq _bss(%rbx), %rdi + leaq _ebss(%rbx), %rcx subq %rdi, %rcx cld rep stosb @@ -329,3 +329,11 @@ boot_heap: boot_stack: .fill BOOT_STACK_SIZE, 1, 0 boot_stack_end: + +/* + * Space for page tables (not in .bss so not zeroed) + */ + .section ".pgtable","a",@nobits + .balign 4096 +pgtable: + .fill 6*4096, 1, 0 diff --git a/arch/x86/boot/compressed/vmlinux.lds.S b/arch/x86/boot/compressed/vmlinux.lds.S index dbe515e13fe..cc353e1b3ff 100644 --- a/arch/x86/boot/compressed/vmlinux.lds.S +++ b/arch/x86/boot/compressed/vmlinux.lds.S @@ -2,6 +2,8 @@ OUTPUT_FORMAT(CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT, CONFIG_OUTPUT_FORMAT) #undef i386 +#include + #ifdef CONFIG_X86_64 OUTPUT_ARCH(i386:x86-64) ENTRY(startup_64) @@ -48,13 +50,16 @@ SECTIONS *(.bss) *(.bss.*) *(COMMON) -#ifdef CONFIG_X86_64 - . = ALIGN(8); - _end_before_pgt = . ; - . = ALIGN(4096); - pgtable = . ; - . = . + 4096 * 6; -#endif + . = ALIGN(8); /* For convenience during zeroing */ _ebss = .; } +#ifdef CONFIG_X86_64 + . = ALIGN(PAGE_SIZE); + .pgtable : { + _pgtable = . ; + *(.pgtable) + _epgtable = . ; + } +#endif + _end = .; } From 0a137736704ef9af719409933b3c33e138461786 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:27:41 -0700 Subject: [PATCH 0896/2061] x86, boot: set up the decompression stack as early as possible Set up the decompression stack as soon as we know where it needs to go. That way we have a full-service stack as soon as possible, rather than relying on the BP_scratch field. Note that the stack does need to be empty during bss zeroing (or else the stack needs to be moved out of the bss segment, which is also an option.) [ Impact: cleanup, minor paranoia ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 10 ++++------ arch/x86/boot/compressed/head_64.S | 16 ++++++++-------- 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 59425e157df..d7245cf8026 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -88,6 +88,9 @@ ENTRY(startup_32) addl $4095, %ebx andl $~4095, %ebx + /* Set up the stack */ + leal boot_stack_end(%ebx), %esp + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -122,7 +125,7 @@ ENDPROC(startup_32) relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ xorl %eax, %eax leal _bss(%ebx), %edi @@ -131,11 +134,6 @@ relocated: cld rep stosb -/* - * Setup the stack for the decompressor - */ - leal boot_stack_end(%ebx), %esp - /* * Do the decompression, and jump to the new kernel.. */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 5bc9052615b..a0b18426069 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -249,6 +249,13 @@ ENTRY(startup_64) addq $(32768 + 18 + 4095), %rbx andq $~4095, %rbx + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + + /* Zero EFLAGS */ + pushq $0 + popfq + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. @@ -273,7 +280,7 @@ ENTRY(startup_64) relocated: /* - * Clear BSS + * Clear BSS (stack is currently empty) */ xorq %rax, %rax leaq _bss(%rbx), %rdi @@ -282,13 +289,6 @@ relocated: cld rep stosb - /* Setup the stack */ - leaq boot_stack_end(%rip), %rsp - - /* zero EFLAGS after setting rsp */ - pushq $0 - popfq - /* * Do the decompression, and jump to the new kernel.. */ From 97541912785369925723b6255438ad9fce2ddf04 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 6 May 2009 17:56:51 -0700 Subject: [PATCH 0897/2061] x86, boot: zero EFLAGS on 32 bits The 64-bit code already clears EFLAGS as soon as it has a stack. This seems like a reasonable precaution, so do it on 32 bits as well. [ Impact: extra paranoia ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d7245cf8026..d02a4f02be1 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -91,6 +91,10 @@ ENTRY(startup_32) /* Set up the stack */ leal boot_stack_end(%ebx), %esp + /* Zero EFLAGS */ + pushl $0 + popfl + /* * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. From 36d3793c947f1ef7ba3d24eeeddc1be41adc5ab4 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 16:45:15 -0700 Subject: [PATCH 0898/2061] x86, boot: use appropriate rep string for move and clear In the pre-decompression code, use the appropriate largest possible rep movs and rep stos to move code and clear bss, respectively. For reverse copy, do note that the initial values are supposed to be the address of the first (highest) copy datum, not one byte beyond the end of the buffer. rep strings are not necessarily the fastest way to perform these operations on all current processors, but are likely to be in the future, and perhaps more importantly, we want to encourage the architecturally right thing to do here. This also fixes a couple of trivial inefficiencies on 64 bits. [ Impact: trivial performance enhancement, increase code similarity ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 11 ++++++----- arch/x86/boot/compressed/head_64.S | 26 +++++++++++++------------- 2 files changed, 19 insertions(+), 18 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index d02a4f02be1..6710dc78ac5 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -100,11 +100,12 @@ ENTRY(startup_32) * where decompression in place becomes safe. */ pushl %esi - leal _bss(%ebp), %esi - leal _bss(%ebx), %edi + leal (_bss-4)(%ebp), %esi + leal (_bss-4)(%ebx), %edi movl $(_bss - startup_32), %ecx + shrl $2, %ecx std - rep movsb + rep movsl cld popl %esi @@ -135,8 +136,8 @@ relocated: leal _bss(%ebx), %edi leal _ebss(%ebx), %ecx subl %edi, %ecx - cld - rep stosb + shrl $2, %ecx + rep stosl /* * Do the decompression, and jump to the new kernel.. diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index a0b18426069..723c72dfd7b 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -260,15 +260,15 @@ ENTRY(startup_64) * Copy the compressed kernel to the end of our buffer * where decompression in place becomes safe. */ - leaq _bss(%rip), %r8 - leaq _bss(%rbx), %r9 + pushq %rsi + leaq (_bss-8)(%rip), %rsi + leaq (_bss-8)(%rbx), %rdi movq $_bss /* - $startup_32 */, %rcx -1: subq $8, %r8 - subq $8, %r9 - movq 0(%r8), %rax - movq %rax, 0(%r9) - subq $8, %rcx - jnz 1b + shrq $3, %rcx + std + rep movsq + cld + popq %rsi /* * Jump to the relocated address. @@ -282,12 +282,12 @@ relocated: /* * Clear BSS (stack is currently empty) */ - xorq %rax, %rax - leaq _bss(%rbx), %rdi - leaq _ebss(%rbx), %rcx + xorl %eax, %eax + leaq _bss(%rip), %rdi + leaq _ebss(%rip), %rcx subq %rdi, %rcx - cld - rep stosb + shrq $3, %rcx + rep stosq /* * Do the decompression, and jump to the new kernel.. From 02a884c0fe7ec8459d00d34b7d4101af21fc4a86 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 8 May 2009 17:42:16 -0700 Subject: [PATCH 0899/2061] x86, boot: determine compressed code offset at compile time Determine the compressed code offset (from the kernel runtime address) at compile time. This allows some minor optimizations in arch/x86/boot/compressed/head_*.S, but more importantly it makes this value available to the build process, which will enable a future patch to export the necessary linear memory footprint into the bzImage header. [ Impact: cleanup, future patch enabling ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/Makefile | 11 +++- arch/x86/boot/compressed/head_32.S | 24 ++----- arch/x86/boot/compressed/head_64.S | 41 ++++-------- arch/x86/boot/compressed/mkpiggy.c | 97 ++++++++++++++++++++++++++++ arch/x86/boot/compressed/vmlinux.scr | 10 --- 5 files changed, 123 insertions(+), 60 deletions(-) create mode 100644 arch/x86/boot/compressed/mkpiggy.c delete mode 100644 arch/x86/boot/compressed/vmlinux.scr diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile index 7f24fdb584e..49c8a4c37d7 100644 --- a/arch/x86/boot/compressed/Makefile +++ b/arch/x86/boot/compressed/Makefile @@ -19,6 +19,8 @@ KBUILD_AFLAGS := $(KBUILD_CFLAGS) -D__ASSEMBLY__ LDFLAGS := -m elf_$(UTS_MACHINE) LDFLAGS_vmlinux := -T +hostprogs-y := mkpiggy + $(obj)/vmlinux: $(obj)/vmlinux.lds $(obj)/head_$(BITS).o $(obj)/misc.o $(obj)/piggy.o FORCE $(call if_changed,ld) @: @@ -50,6 +52,9 @@ suffix-$(CONFIG_KERNEL_GZIP) := gz suffix-$(CONFIG_KERNEL_BZIP2) := bz2 suffix-$(CONFIG_KERNEL_LZMA) := lzma -LDFLAGS_piggy.o := -r --format binary --oformat $(CONFIG_OUTPUT_FORMAT) -T -$(obj)/piggy.o: $(obj)/vmlinux.scr $(obj)/vmlinux.bin.$(suffix-y) FORCE - $(call if_changed,ld) +quiet_cmd_mkpiggy = MKPIGGY $@ + cmd_mkpiggy = $(obj)/mkpiggy $< > $@ || ( rm -f $@ ; false ) + +targets += piggy.S +$(obj)/piggy.S: $(obj)/vmlinux.bin.$(suffix-y) $(obj)/mkpiggy FORCE + $(call if_changed,mkpiggy) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 6710dc78ac5..470474bafc4 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -75,18 +75,8 @@ ENTRY(startup_32) movl $LOAD_PHYSICAL_ADDR, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack */ - addl $(32768 + 18), %ebx - /* Align on a 4K boundary */ - addl $4095, %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* Set up the stack */ leal boot_stack_end(%ebx), %esp @@ -142,12 +132,10 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - movl output_len(%ebx), %eax - pushl %eax + leal z_extract_offset_negative(%ebx), %ebp /* push arguments for decompress_kernel: */ pushl %ebp /* output address */ - movl input_len(%ebx), %eax - pushl %eax /* input_len */ + pushl $z_input_len /* input_len */ leal input_data(%ebx), %eax pushl %eax /* input_data */ leal boot_heap(%ebx), %eax @@ -155,14 +143,12 @@ relocated: pushl %esi /* real mode pointer */ call decompress_kernel addl $20, %esp - popl %ecx #if CONFIG_RELOCATABLE /* * Find the address of the relocations. */ - movl %ebp, %edi - addl %ecx, %edi + leal z_output_len(%ebp), %edi /* * Calculate the delta between where vmlinux was compiled to run diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 723c72dfd7b..2b9f2510507 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -90,16 +90,8 @@ ENTRY(startup_32) movl $CONFIG_PHYSICAL_START, %ebx #endif - /* Replace the compressed data size with the uncompressed size */ - subl input_len(%ebp), %ebx - movl output_len(%ebp), %eax - addl %eax, %ebx - /* Add 8 bytes for every 32K input block */ - shrl $12, %eax - addl %eax, %ebx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addl $(32768 + 18 + 4095), %ebx - andl $~4095, %ebx + /* Target address to relocate to for decompression */ + addl $z_extract_offset, %ebx /* * Prepare for entering 64 bit mode @@ -224,6 +216,9 @@ ENTRY(startup_64) * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and * run the kernel from CONFIG_PHYSICAL_START + * + * We cannot rely on the calculation done in 32-bit mode, since we + * may have been invoked via the 64-bit entry point. */ /* Start with the delta to where the kernel will run at. */ @@ -237,17 +232,8 @@ ENTRY(startup_64) movq %rbp, %rbx #endif - /* Replace the compressed data size with the uncompressed size */ - movl input_len(%rip), %eax - subq %rax, %rbx - movl output_len(%rip), %eax - addq %rax, %rbx - /* Add 8 bytes for every 32K input block */ - shrq $12, %rax - addq %rax, %rbx - /* Add 32K + 18 bytes of extra slack and align on a 4K boundary */ - addq $(32768 + 18 + 4095), %rbx - andq $~4095, %rbx + /* Target address to relocate to for decompression */ + leaq z_extract_offset(%rbp), %rbx /* Set up the stack */ leaq boot_stack_end(%rbx), %rsp @@ -292,13 +278,12 @@ relocated: /* * Do the decompression, and jump to the new kernel.. */ - pushq %rsi # Save the real mode argument - movq %rsi, %rdi # real mode address - leaq boot_heap(%rip), %rsi # malloc area for uncompression - leaq input_data(%rip), %rdx # input_data - movl input_len(%rip), %eax - movq %rax, %rcx # input_len - movq %rbp, %r8 # output + pushq %rsi /* Save the real mode argument */ + movq %rsi, %rdi /* real mode address */ + leaq boot_heap(%rip), %rsi /* malloc area for uncompression */ + leaq input_data(%rip), %rdx /* input_data */ + movl $z_input_len, %ecx /* input_len */ + movq %rbp, %r8 /* output target address */ call decompress_kernel popq %rsi diff --git a/arch/x86/boot/compressed/mkpiggy.c b/arch/x86/boot/compressed/mkpiggy.c new file mode 100644 index 00000000000..bcbd36c4143 --- /dev/null +++ b/arch/x86/boot/compressed/mkpiggy.c @@ -0,0 +1,97 @@ +/* ----------------------------------------------------------------------- * + * + * Copyright (C) 2009 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + * + * H. Peter Anvin + * + * ----------------------------------------------------------------------- */ + +/* + * Compute the desired load offset from a compressed program; outputs + * a small assembly wrapper with the appropriate symbols defined. + */ + +#include +#include +#include +#include + +static uint32_t getle32(const void *p) +{ + const uint8_t *cp = p; + + return (uint32_t)cp[0] + ((uint32_t)cp[1] << 8) + + ((uint32_t)cp[2] << 16) + ((uint32_t)cp[3] << 24); +} + +int main(int argc, char *argv[]) +{ + uint32_t olen; + long ilen; + unsigned long offs; + FILE *f; + + if (argc < 2) { + fprintf(stderr, "Usage: %s compressed_file\n", argv[0]); + return 1; + } + + /* Get the information for the compressed kernel image first */ + + f = fopen(argv[1], "r"); + if (!f) { + perror(argv[1]); + return 1; + } + + + if (fseek(f, -4L, SEEK_END)) { + perror(argv[1]); + } + fread(&olen, sizeof olen, 1, f); + ilen = ftell(f); + olen = getle32(&olen); + fclose(f); + + /* + * Now we have the input (compressed) and output (uncompressed) + * sizes, compute the necessary decompression offset... + */ + + offs = (olen > ilen) ? olen - ilen : 0; + offs += olen >> 12; /* Add 8 bytes for each 32K block */ + offs += 32*1024 + 18; /* Add 32K + 18 bytes slack */ + offs = (offs+4095) & ~4095; /* Round to a 4K boundary */ + + printf(".section \".rodata.compressed\",\"a\",@progbits\n"); + printf(".globl z_input_len\n"); + printf("z_input_len = %lu\n", ilen); + printf(".globl z_output_len\n"); + printf("z_output_len = %lu\n", (unsigned long)olen); + printf(".globl z_extract_offset\n"); + printf("z_extract_offset = 0x%lx\n", offs); + /* z_extract_offset_negative allows simplification of head_32.S */ + printf(".globl z_extract_offset_negative\n"); + printf("z_extract_offset_negative = -0x%lx\n", offs); + + printf(".globl input_data, input_data_end\n"); + printf("input_data:\n"); + printf(".incbin \"%s\"\n", argv[1]); + printf("input_data_end:\n"); + + return 0; +} diff --git a/arch/x86/boot/compressed/vmlinux.scr b/arch/x86/boot/compressed/vmlinux.scr deleted file mode 100644 index f02382ae5c4..00000000000 --- a/arch/x86/boot/compressed/vmlinux.scr +++ /dev/null @@ -1,10 +0,0 @@ -SECTIONS -{ - .rodata.compressed : { - input_len = .; - LONG(input_data_end - input_data) input_data = .; - *(.data) - output_len = . - 4; - input_data_end = .; - } -} From 778dedae0cb76a441145f3a0c5d59fcb3ba296d5 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 9 May 2009 12:54:34 +0800 Subject: [PATCH 0900/2061] x86: mce: remove duplicated #include Remove duplicated #include in arch/x86/kernel/cpu/mcheck/mce_intel_64.c. [ Impact: cleanup ] Signed-off-by: Huang Weiyi Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/mcheck/mce_intel_64.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c index d6b72df89d6..aedf9766ee9 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_intel_64.c +++ b/arch/x86/kernel/cpu/mcheck/mce_intel_64.c @@ -15,7 +15,6 @@ #include #include #include -#include asmlinkage void smp_thermal_interrupt(void) { From b30505c81a9d4adea8b70ecff512b0216929b797 Mon Sep 17 00:00:00 2001 From: Darren Hart Date: Thu, 7 May 2009 15:40:14 -0700 Subject: [PATCH 0901/2061] futex: add requeue-pi documentation Add Documentation/futex-requeue-pi.txt describing the motivation for the newly added FUTEX_*REQUEUE_PI op codes and their implementation. [ Impact: add documentation ] Signed-off-by: Darren Hart Cc: Sripathi Kodi Cc: Peter Zijlstra Cc: John Stultz Cc: Steven Rostedt Cc: Dinakar Guniguntala Cc: Ulrich Drepper Cc: Eric Dumazet Cc: Jakub Jelinek LKML-Reference: <4A03634E.3080609@us.ibm.com> [ reformatted the file ] Signed-off-by: Ingo Molnar --- Documentation/futex-requeue-pi.txt | 131 +++++++++++++++++++++++++++++ 1 file changed, 131 insertions(+) create mode 100644 Documentation/futex-requeue-pi.txt diff --git a/Documentation/futex-requeue-pi.txt b/Documentation/futex-requeue-pi.txt new file mode 100644 index 00000000000..9dc1ff4fd53 --- /dev/null +++ b/Documentation/futex-requeue-pi.txt @@ -0,0 +1,131 @@ +Futex Requeue PI +---------------- + +Requeueing of tasks from a non-PI futex to a PI futex requires +special handling in order to ensure the underlying rt_mutex is never +left without an owner if it has waiters; doing so would break the PI +boosting logic [see rt-mutex-desgin.txt] For the purposes of +brevity, this action will be referred to as "requeue_pi" throughout +this document. Priority inheritance is abbreviated throughout as +"PI". + +Motivation +---------- + +Without requeue_pi, the glibc implementation of +pthread_cond_broadcast() must resort to waking all the tasks waiting +on a pthread_condvar and letting them try to sort out which task +gets to run first in classic thundering-herd formation. An ideal +implementation would wake the highest-priority waiter, and leave the +rest to the natural wakeup inherent in unlocking the mutex +associated with the condvar. + +Consider the simplified glibc calls: + +/* caller must lock mutex */ +pthread_cond_wait(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) + unlock(cond->__data.__lock); + lock(mutex); +} + +pthread_cond_broadcast(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue(cond->data.__futex, cond->mutex); +} + +Once pthread_cond_broadcast() requeues the tasks, the cond->mutex +has waiters. Note that pthread_cond_wait() attempts to lock the +mutex only after it has returned to user space. This will leave the +underlying rt_mutex with waiters, and no owner, breaking the +previously mentioned PI-boosting algorithms. + +In order to support PI-aware pthread_condvar's, the kernel needs to +be able to requeue tasks to PI futexes. This support implies that +upon a successful futex_wait system call, the caller would return to +user space already holding the PI futex. The glibc implementation +would be modified as follows: + + +/* caller must lock mutex */ +pthread_cond_wait_pi(cond, mutex) +{ + lock(cond->__data.__lock); + unlock(mutex); + do { + unlock(cond->__data.__lock); + futex_wait_requeue_pi(cond->__data.__futex); + lock(cond->__data.__lock); + } while(...) + unlock(cond->__data.__lock); + /* the kernel acquired the the mutex for us */ +} + +pthread_cond_broadcast_pi(cond) +{ + lock(cond->__data.__lock); + unlock(cond->__data.__lock); + futex_requeue_pi(cond->data.__futex, cond->mutex); +} + +The actual glibc implementation will likely test for PI and make the +necessary changes inside the existing calls rather than creating new +calls for the PI cases. Similar changes are needed for +pthread_cond_timedwait() and pthread_cond_signal(). + +Implementation +-------------- + +In order to ensure the rt_mutex has an owner if it has waiters, it +is necessary for both the requeue code, as well as the waiting code, +to be able to acquire the rt_mutex before returning to user space. +The requeue code cannot simply wake the waiter and leave it to +acquire the rt_mutex as it would open a race window between the +requeue call returning to user space and the waiter waking and +starting to run. This is especially true in the uncontended case. + +The solution involves two new rt_mutex helper routines, +rt_mutex_start_proxy_lock() and rt_mutex_finish_proxy_lock(), which +allow the requeue code to acquire an uncontended rt_mutex on behalf +of the waiter and to enqueue the waiter on a contended rt_mutex. +Two new system calls provide the kernel<->user interface to +requeue_pi: FUTEX_WAIT_REQUEUE_PI and FUTEX_REQUEUE_CMP_PI. + +FUTEX_WAIT_REQUEUE_PI is called by the waiter (pthread_cond_wait() +and pthread_cond_timedwait()) to block on the initial futex and wait +to be requeued to a PI-aware futex. The implementation is the +result of a high-speed collision between futex_wait() and +futex_lock_pi(), with some extra logic to check for the additional +wake-up scenarios. + +FUTEX_REQUEUE_CMP_PI is called by the waker +(pthread_cond_broadcast() and pthread_cond_signal()) to requeue and +possibly wake the waiting tasks. Internally, this system call is +still handled by futex_requeue (by passing requeue_pi=1). Before +requeueing, futex_requeue() attempts to acquire the requeue target +PI futex on behalf of the top waiter. If it can, this waiter is +woken. futex_requeue() then proceeds to requeue the remaining +nr_wake+nr_requeue tasks to the PI futex, calling +rt_mutex_start_proxy_lock() prior to each requeue to prepare the +task as a waiter on the underlying rt_mutex. It is possible that +the lock can be acquired at this stage as well, if so, the next +waiter is woken to finish the acquisition of the lock. + +FUTEX_REQUEUE_PI accepts nr_wake and nr_requeue as arguments, but +their sum is all that really matters. futex_requeue() will wake or +requeue up to nr_wake + nr_requeue tasks. It will wake only as many +tasks as it can acquire the lock for, which in the majority of cases +should be 0 as good programming practice dictates that the caller of +either pthread_cond_broadcast() or pthread_cond_signal() acquire the +mutex prior to making the call. FUTEX_REQUEUE_PI requires that +nr_wake=1. nr_requeue should be INT_MAX for broadcast and 0 for +signal. From e9e8b1fb995543c4cc3930f465923a552b2e48b7 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 14:31:37 +0900 Subject: [PATCH 0902/2061] sh: Fix up the sh64 earlyprintk build. sci_rxd_in() on SH-5 wants to be using SCSPTR, not SCSPTR2. Signed-off-by: Paul Mundt --- drivers/serial/sh-sci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/serial/sh-sci.h b/drivers/serial/sh-sci.h index e3eae4eaadd..38072c15b84 100644 --- a/drivers/serial/sh-sci.h +++ b/drivers/serial/sh-sci.h @@ -636,7 +636,7 @@ static inline int sci_rxd_in(struct uart_port *port) #elif defined(CONFIG_CPU_SUBTYPE_SH5_101) || defined(CONFIG_CPU_SUBTYPE_SH5_103) static inline int sci_rxd_in(struct uart_port *port) { - return sci_in(port, SCSPTR2)&0x0001 ? 1 : 0; /* SCIF */ + return sci_in(port, SCSPTR)&0x0001 ? 1 : 0; /* SCIF */ } #elif defined(__H8300H__) || defined(__H8300S__) static inline int sci_rxd_in(struct uart_port *port) From 2fedaacdc07e053d93e0607047a96d282f62aca2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 14:38:49 +0900 Subject: [PATCH 0903/2061] sh: Cleanup irqflags size mismatch on SH-5 build. Signed-off-by: Paul Mundt --- arch/sh/mm/cache-sh5.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sh/mm/cache-sh5.c b/arch/sh/mm/cache-sh5.c index 9e277ec7d53..86762092508 100644 --- a/arch/sh/mm/cache-sh5.c +++ b/arch/sh/mm/cache-sh5.c @@ -60,7 +60,7 @@ static inline void sh64_teardown_dtlb_cache_slot(void) static inline void sh64_icache_inv_all(void) { unsigned long long addr, flag, data; - unsigned int flags; + unsigned long flags; addr = ICCR0; flag = ICCR0_ICI; @@ -172,7 +172,7 @@ static void sh64_icache_inv_user_page_range(struct mm_struct *mm, unsigned long eaddr; unsigned long after_last_page_start; unsigned long mm_asid, current_asid; - unsigned long long flags = 0ULL; + unsigned long flags = 0; mm_asid = cpu_asid(smp_processor_id(), mm); current_asid = get_asid(); @@ -236,7 +236,7 @@ static void sh64_icache_inv_user_small_range(struct mm_struct *mm, unsigned long long eaddr = start; unsigned long long eaddr_end = start + len; unsigned long current_asid, mm_asid; - unsigned long long flags; + unsigned long flags; unsigned long long epage_start; /* @@ -342,7 +342,7 @@ static void inline sh64_dcache_purge_sets(int sets_to_purge_base, int n_sets) * alloco is a NOP if the cache is write-through. */ if (test_bit(SH_CACHE_MODE_WT, &(cpu_data->dcache.flags))) - ctrl_inb(eaddr); + __raw_readb((unsigned long)eaddr); } } From 6dbe47a170f80159f23c856ad4e02f2685c6f6cb Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 14:44:30 +0900 Subject: [PATCH 0904/2061] sh: Provide __read_{read,write}sl() definitions for sh64. These are presently only defined for sh32, use the plain unoptimized versions for sh64. Fixes up smsc911x build. Signed-off-by: Paul Mundt --- arch/sh/include/asm/io.h | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/arch/sh/include/asm/io.h b/arch/sh/include/asm/io.h index c7c360b5866..25348141674 100644 --- a/arch/sh/include/asm/io.h +++ b/arch/sh/include/asm/io.h @@ -123,10 +123,15 @@ static inline void __raw_reads##bwlq(volatile void __iomem *mem, \ __BUILD_MEMORY_STRING(b, u8) __BUILD_MEMORY_STRING(w, u16) -__BUILD_MEMORY_STRING(q, u64) +#ifdef CONFIG_SUPERH32 void __raw_writesl(void __iomem *addr, const void *data, int longlen); void __raw_readsl(const void __iomem *addr, void *data, int longlen); +#else +__BUILD_MEMORY_STRING(l, u32) +#endif + +__BUILD_MEMORY_STRING(q, u64) #define writesb __raw_writesb #define writesw __raw_writesw From 2bcfffa42309b6f73042c62459bf5207762a271d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 16:02:08 +0900 Subject: [PATCH 0905/2061] sh: Rename opcode_t to insn_size_t. This is now clashing with a driver, so just rename it. Signed-off-by: Paul Mundt --- arch/sh/include/asm/kprobes.h | 2 +- arch/sh/include/asm/system_32.h | 2 +- arch/sh/include/asm/types.h | 4 ++-- arch/sh/kernel/io_trapped.c | 2 +- arch/sh/kernel/kgdb.c | 4 ++-- arch/sh/kernel/traps.c | 6 +++--- arch/sh/kernel/traps_32.c | 10 +++++----- 7 files changed, 15 insertions(+), 15 deletions(-) diff --git a/arch/sh/include/asm/kprobes.h b/arch/sh/include/asm/kprobes.h index 613644a758e..036c3311233 100644 --- a/arch/sh/include/asm/kprobes.h +++ b/arch/sh/include/asm/kprobes.h @@ -6,7 +6,7 @@ #include #include -typedef u16 kprobe_opcode_t; +typedef insn_size_t kprobe_opcode_t; #define BREAKPOINT_INSTRUCTION 0xc33a #define MAX_INSN_SIZE 16 diff --git a/arch/sh/include/asm/system_32.h b/arch/sh/include/asm/system_32.h index 240b31e1142..6c68a51f1cc 100644 --- a/arch/sh/include/asm/system_32.h +++ b/arch/sh/include/asm/system_32.h @@ -198,7 +198,7 @@ do { \ }) #endif -int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs, +int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs, struct mem_access *ma); asmlinkage void do_address_error(struct pt_regs *regs, diff --git a/arch/sh/include/asm/types.h b/arch/sh/include/asm/types.h index beea4e6f8df..b13caca62a7 100644 --- a/arch/sh/include/asm/types.h +++ b/arch/sh/include/asm/types.h @@ -23,9 +23,9 @@ typedef unsigned short umode_t; typedef u32 dma_addr_t; #ifdef CONFIG_SUPERH32 -typedef u16 opcode_t; +typedef u16 insn_size_t; #else -typedef u32 opcode_t; +typedef u32 insn_size_t; #endif #endif /* __ASSEMBLY__ */ diff --git a/arch/sh/kernel/io_trapped.c b/arch/sh/kernel/io_trapped.c index c22853b059e..77dfecb6437 100644 --- a/arch/sh/kernel/io_trapped.c +++ b/arch/sh/kernel/io_trapped.c @@ -267,7 +267,7 @@ static struct mem_access trapped_io_access = { int handle_trapped_io(struct pt_regs *regs, unsigned long address) { mm_segment_t oldfs; - opcode_t instruction; + insn_size_t instruction; int tmp; if (!lookup_tiop(address)) diff --git a/arch/sh/kernel/kgdb.c b/arch/sh/kernel/kgdb.c index 7c747e7d71b..305aad742ae 100644 --- a/arch/sh/kernel/kgdb.c +++ b/arch/sh/kernel/kgdb.c @@ -47,7 +47,7 @@ char in_nmi = 0; /* Set during NMI to prevent re-entry */ /* Calculate the new address for after a step */ static short *get_step_address(struct pt_regs *linux_regs) { - opcode_t op = __raw_readw(linux_regs->pc); + insn_size_t op = __raw_readw(linux_regs->pc); long addr; /* BT */ @@ -134,7 +134,7 @@ static short *get_step_address(struct pt_regs *linux_regs) */ static unsigned long stepped_address; -static opcode_t stepped_opcode; +static insn_size_t stepped_opcode; static void do_single_step(struct pt_regs *linux_regs) { diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 438f1ebcc45..46348ed07cc 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -22,11 +22,11 @@ static void handle_BUG(struct pt_regs *regs) int is_valid_bugaddr(unsigned long addr) { - unsigned short opcode; + insn_size_t opcode; if (addr < PAGE_OFFSET) return 0; - if (probe_kernel_address((u16 *)addr, opcode)) + if (probe_kernel_address((insn_size_t *)addr, opcode)) return 0; return opcode == TRAPA_BUG_OPCODE; @@ -66,7 +66,7 @@ BUILD_TRAP_HANDLER(bug) #ifdef CONFIG_BUG if (__kernel_text_address(instruction_pointer(regs))) { - opcode_t insn = *(opcode_t *)instruction_pointer(regs); + insn_size_t insn = *(insn_size_t *)instruction_pointer(regs); if (insn == TRAPA_BUG_OPCODE) handle_BUG(regs); } diff --git a/arch/sh/kernel/traps_32.c b/arch/sh/kernel/traps_32.c index 67550d88c4e..2b772776fcd 100644 --- a/arch/sh/kernel/traps_32.c +++ b/arch/sh/kernel/traps_32.c @@ -177,7 +177,7 @@ static struct mem_access user_mem_access = { * (if that instruction is in a branch delay slot) * - return 0 if emulation okay, -EFAULT on existential error */ -static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs, +static int handle_unaligned_ins(insn_size_t instruction, struct pt_regs *regs, struct mem_access *ma) { int ret, index, count; @@ -322,10 +322,10 @@ static int handle_unaligned_ins(opcode_t instruction, struct pt_regs *regs, * - fetches the instruction from PC+2 */ static inline int handle_delayslot(struct pt_regs *regs, - opcode_t old_instruction, + insn_size_t old_instruction, struct mem_access *ma) { - opcode_t instruction; + insn_size_t instruction; void __user *addr = (void __user *)(regs->pc + instruction_size(old_instruction)); @@ -365,7 +365,7 @@ static inline int handle_delayslot(struct pt_regs *regs, static int handle_unaligned_notify_count = 10; -int handle_unaligned_access(opcode_t instruction, struct pt_regs *regs, +int handle_unaligned_access(insn_size_t instruction, struct pt_regs *regs, struct mem_access *ma) { u_int rm; @@ -523,7 +523,7 @@ asmlinkage void do_address_error(struct pt_regs *regs, unsigned long error_code = 0; mm_segment_t oldfs; siginfo_t info; - opcode_t instruction; + insn_size_t instruction; int tmp; /* Intentional ifdef */ From f15b18d0755b3ee4b29991fc2fde535ee41df53c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 9 May 2009 10:04:22 +0200 Subject: [PATCH 0906/2061] perf_counter tools: remove debug code from builtin-stat.c Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-stat.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index e2fa117eab5..cf575c305a6 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -331,8 +331,6 @@ static void create_perfstat_counter(int counter) hw_event.exclude_kernel = event_mask[counter] & EVENT_MASK_KERNEL; hw_event.exclude_user = event_mask[counter] & EVENT_MASK_USER; -printf("exclude: %d\n", event_mask[counter]); - if (scale) hw_event.read_format = PERF_FORMAT_TOTAL_TIME_ENABLED | PERF_FORMAT_TOTAL_TIME_RUNNING; From 1031a3a57ed76f5cb8754a89ddb22f3d9ebe8861 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 17:57:21 +0900 Subject: [PATCH 0907/2061] sh: Provide an __sdivsi3_2 export for sh64. Newer code paths that are heavier in 64-bit math manage to get this generated by newer compilers, provide a definition and export accordingly. This is trivially wrapped around the existing __sdivsi3 code. Signed-off-by: Paul Mundt --- arch/sh/kernel/sh_ksyms_64.c | 2 ++ arch/sh/lib64/sdivsi3.S | 6 +++++- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/sh_ksyms_64.c b/arch/sh/kernel/sh_ksyms_64.c index 0d74d6b8774..8f54ef0cfbc 100644 --- a/arch/sh/kernel/sh_ksyms_64.c +++ b/arch/sh/kernel/sh_ksyms_64.c @@ -76,5 +76,7 @@ EXPORT_SYMBOL(strcpy); #define DECLARE_EXPORT(name) extern void name(void);EXPORT_SYMBOL(name) DECLARE_EXPORT(__sdivsi3); +DECLARE_EXPORT(__sdivsi3_1); +DECLARE_EXPORT(__sdivsi3_2); DECLARE_EXPORT(__udivsi3); DECLARE_EXPORT(__div_table); diff --git a/arch/sh/lib64/sdivsi3.S b/arch/sh/lib64/sdivsi3.S index 6a800c6a490..1963bbd4228 100644 --- a/arch/sh/lib64/sdivsi3.S +++ b/arch/sh/lib64/sdivsi3.S @@ -1,4 +1,6 @@ .global __sdivsi3 + .global __sdivsi3_1 + .global __sdivsi3_2 .section .text..SHmedia32,"ax" .align 2 @@ -6,13 +8,15 @@ /* clobbered: r1,r18,r19,r20,r21,r25,tr0 */ /* result in r0 */ __sdivsi3: +__sdivsi3_1: ptb __div_table,tr0 + gettr tr0,r20 +__sdivsi3_2: nsb r5, r1 shlld r5, r1, r25 /* normalize; [-2 ..1, 1..2) in s2.62 */ shari r25, 58, r21 /* extract 5(6) bit index (s2.4 with hole -1..1) */ /* bubble */ - gettr tr0,r20 ldx.ub r20, r21, r19 /* u0.8 */ shari r25, 32, r25 /* normalize to s2.30 */ shlli r21, 1, r21 From 7cd0378ef4c0d7b05e7dcb1daa115b841ffdc31a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 18:03:37 +0900 Subject: [PATCH 0908/2061] sh: Fix up SHmedia module ELF relocations. This fixes up the LSB setting for SHmedia branching in updated symbols when processing module relocations. Signed-off-by: Paul Mundt --- arch/sh/kernel/module.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/module.c b/arch/sh/kernel/module.c index c43081039dd..c19b0f7d2cc 100644 --- a/arch/sh/kernel/module.c +++ b/arch/sh/kernel/module.c @@ -90,7 +90,7 @@ int apply_relocate_add(Elf32_Shdr *sechdrs, * SHmedia, the LSB of the symbol needs to be asserted * for the CPU to be in SHmedia mode when it starts executing * the branch target. */ - relocation |= (sym->st_other & 4); + relocation |= !!(sym->st_other & 4); #endif switch (ELF32_R_TYPE(rel[i].r_info)) { From c3e2586b794b12ffcdf69b4e547030b51e18e6d9 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 23:33:02 +0900 Subject: [PATCH 0909/2061] sh: Integrate sh64 bits in vmlinux_32.lds.S. This adds all of the requisite bits from vmlinux_64.lds.S in to the _32 variant, resulting in a unified and generic linker script that can be shared across both. Signed-off-by: Paul Mundt --- arch/sh/kernel/vmlinux_32.lds.S | 117 ++++++++++++++++++++------------ 1 file changed, 72 insertions(+), 45 deletions(-) diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S index dd9b2ee1312..8d97f52ce32 100644 --- a/arch/sh/kernel/vmlinux_32.lds.S +++ b/arch/sh/kernel/vmlinux_32.lds.S @@ -1,17 +1,23 @@ /* * ld script to make SuperH Linux kernel - * Written by Niibe Yutaka + * Written by Niibe Yutaka and Paul Mundt */ -#include -#include -#include - +#ifdef CONFIG_SUPERH64 +#define LOAD_OFFSET CONFIG_PAGE_OFFSET +OUTPUT_ARCH(sh:sh5) +#else +OUTPUT_ARCH(sh) #ifdef CONFIG_CPU_LITTLE_ENDIAN OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux") #else OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux") #endif -OUTPUT_ARCH(sh) +#endif + +#include +#include +#include + ENTRY(_start) SECTIONS { @@ -24,28 +30,35 @@ SECTIONS . = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET; #endif - _text = .; /* Text and read-only data */ - - .empty_zero_page : { + .empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) { *(.empty_zero_page) } = 0 - .text : { + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ HEAD_TEXT TEXT_TEXT + +#ifdef CONFIG_SUPERH64 + *(.text64) + *(.text..SHmedia32) +#endif + SCHED_TEXT LOCK_TEXT KPROBES_TEXT + IRQENTRY_TEXT *(.fixup) *(.gnu.warning) + _etext = .; /* End of text section */ } = 0x0009 . = ALIGN(16); /* Exception table */ - __start___ex_table = .; - __ex_table : { *(__ex_table) } - __stop___ex_table = .; - - _etext = .; /* End of text section */ + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } NOTES RO_DATA(PAGE_SIZE) @@ -54,13 +67,15 @@ SECTIONS * Code which must be executed uncached and the associated data */ . = ALIGN(PAGE_SIZE); - __uncached_start = .; - .uncached.text : { *(.uncached.text) } - .uncached.data : { *(.uncached.data) } - __uncached_end = .; + .uncached : AT(ADDR(.uncached) - LOAD_OFFSET) { + __uncached_start = .; + *(.uncached.text) + *(.uncached.data) + __uncached_end = .; + } . = ALIGN(THREAD_SIZE); - .data : { /* Data */ + .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ *(.data.init_task) . = ALIGN(L1_CACHE_BYTES); @@ -84,39 +99,51 @@ SECTIONS _edata = .; /* End of data section */ . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; - _sinittext = .; - .init.text : { INIT_TEXT } - _einittext = .; - .init.data : { INIT_DATA } + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA } . = ALIGN(16); - __setup_start = .; - .init.setup : { *(.init.setup) } - __setup_end = .; - - __initcall_start = .; - .initcall.init : { - INITCALLS + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; } - __initcall_end = .; - __con_initcall_start = .; - .con_initcall.init : { *(.con_initcall.init) } - __con_initcall_end = .; SECURITY_INIT #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); - __initramfs_start = .; - .init.ramfs : { *(.init.ramfs) } - __initramfs_end = .; + .init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } #endif . = ALIGN(4); - __machvec_start = .; - .machvec.init : { *(.machvec.init) } - __machvec_end = .; + .machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) { + __machvec_start = .; + *(.machvec.init) + __machvec_end = .; + } PERCPU(PAGE_SIZE) @@ -124,11 +151,11 @@ SECTIONS * .exit.text is discarded at runtime, not link time, to deal with * references from __bug_table */ - .exit.text : { EXIT_TEXT } - .exit.data : { EXIT_DATA } + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA } . = ALIGN(PAGE_SIZE); - .bss : { + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { __init_end = .; __bss_start = .; /* BSS */ *(.bss.page_aligned) From dce97c8cb2ad365fa87dc6924d99528c21c63912 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sat, 9 May 2009 23:36:10 +0900 Subject: [PATCH 0910/2061] sh: Move the unified linker script in place, kill off old _64 one. Just forcefully rename the _32 variant overtop, and kill off the now unused _64 version. Signed-off-by: Paul Mundt --- arch/sh/kernel/vmlinux.lds.S | 182 +++++++++++++++++++++++++++++++- arch/sh/kernel/vmlinux_32.lds.S | 181 ------------------------------- arch/sh/kernel/vmlinux_64.lds.S | 163 ---------------------------- 3 files changed, 179 insertions(+), 347 deletions(-) delete mode 100644 arch/sh/kernel/vmlinux_32.lds.S delete mode 100644 arch/sh/kernel/vmlinux_64.lds.S diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index d7d4991f32a..8d97f52ce32 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -1,5 +1,181 @@ -#ifdef CONFIG_SUPERH32 -# include "vmlinux_32.lds.S" +/* + * ld script to make SuperH Linux kernel + * Written by Niibe Yutaka and Paul Mundt + */ +#ifdef CONFIG_SUPERH64 +#define LOAD_OFFSET CONFIG_PAGE_OFFSET +OUTPUT_ARCH(sh:sh5) #else -# include "vmlinux_64.lds.S" +OUTPUT_ARCH(sh) +#ifdef CONFIG_CPU_LITTLE_ENDIAN +OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux") +#else +OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux") #endif +#endif + +#include +#include +#include + +ENTRY(_start) +SECTIONS +{ +#ifdef CONFIG_PMB_FIXED + . = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) + + CONFIG_ZERO_PAGE_OFFSET; +#elif defined(CONFIG_32BIT) + . = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET; +#else + . = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET; +#endif + + .empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) { + *(.empty_zero_page) + } = 0 + + .text : AT(ADDR(.text) - LOAD_OFFSET) { + _text = .; /* Text and read-only data */ + HEAD_TEXT + TEXT_TEXT + +#ifdef CONFIG_SUPERH64 + *(.text64) + *(.text..SHmedia32) +#endif + + SCHED_TEXT + LOCK_TEXT + KPROBES_TEXT + IRQENTRY_TEXT + *(.fixup) + *(.gnu.warning) + _etext = .; /* End of text section */ + } = 0x0009 + + . = ALIGN(16); /* Exception table */ + __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { + __start___ex_table = .; + *(__ex_table) + __stop___ex_table = .; + } + + NOTES + RO_DATA(PAGE_SIZE) + + /* + * Code which must be executed uncached and the associated data + */ + . = ALIGN(PAGE_SIZE); + .uncached : AT(ADDR(.uncached) - LOAD_OFFSET) { + __uncached_start = .; + *(.uncached.text) + *(.uncached.data) + __uncached_end = .; + } + + . = ALIGN(THREAD_SIZE); + .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ + *(.data.init_task) + + . = ALIGN(L1_CACHE_BYTES); + *(.data.cacheline_aligned) + + . = ALIGN(L1_CACHE_BYTES); + *(.data.read_mostly) + + . = ALIGN(PAGE_SIZE); + *(.data.page_aligned) + + __nosave_begin = .; + *(.data.nosave) + . = ALIGN(PAGE_SIZE); + __nosave_end = .; + + DATA_DATA + CONSTRUCTORS + } + + _edata = .; /* End of data section */ + + . = ALIGN(PAGE_SIZE); /* Init code and data */ + .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { + __init_begin = .; + _sinittext = .; + INIT_TEXT + _einittext = .; + } + + .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA } + + . = ALIGN(16); + .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { + __setup_start = .; + *(.init.setup) + __setup_end = .; + } + + .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { + __initcall_start = .; + INITCALLS + __initcall_end = .; + } + + .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { + __con_initcall_start = .; + *(.con_initcall.init) + __con_initcall_end = .; + } + + SECURITY_INIT + +#ifdef CONFIG_BLK_DEV_INITRD + . = ALIGN(PAGE_SIZE); + .init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) { + __initramfs_start = .; + *(.init.ramfs) + __initramfs_end = .; + } +#endif + + . = ALIGN(4); + .machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) { + __machvec_start = .; + *(.machvec.init) + __machvec_end = .; + } + + PERCPU(PAGE_SIZE) + + /* + * .exit.text is discarded at runtime, not link time, to deal with + * references from __bug_table + */ + .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT } + .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA } + + . = ALIGN(PAGE_SIZE); + .bss : AT(ADDR(.bss) - LOAD_OFFSET) { + __init_end = .; + __bss_start = .; /* BSS */ + *(.bss.page_aligned) + *(.bss) + *(COMMON) + . = ALIGN(4); + _ebss = .; /* uClinux MTD sucks */ + _end = . ; + } + + /* + * When something in the kernel is NOT compiled as a module, the + * module cleanup code and data are put into these segments. Both + * can then be thrown away, as cleanup code is never called unless + * it's a module. + */ + /DISCARD/ : { + *(.exitcall.exit) + } + + STABS_DEBUG + DWARF_DEBUG +} diff --git a/arch/sh/kernel/vmlinux_32.lds.S b/arch/sh/kernel/vmlinux_32.lds.S deleted file mode 100644 index 8d97f52ce32..00000000000 --- a/arch/sh/kernel/vmlinux_32.lds.S +++ /dev/null @@ -1,181 +0,0 @@ -/* - * ld script to make SuperH Linux kernel - * Written by Niibe Yutaka and Paul Mundt - */ -#ifdef CONFIG_SUPERH64 -#define LOAD_OFFSET CONFIG_PAGE_OFFSET -OUTPUT_ARCH(sh:sh5) -#else -OUTPUT_ARCH(sh) -#ifdef CONFIG_CPU_LITTLE_ENDIAN -OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux") -#else -OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux") -#endif -#endif - -#include -#include -#include - -ENTRY(_start) -SECTIONS -{ -#ifdef CONFIG_PMB_FIXED - . = CONFIG_PAGE_OFFSET + (CONFIG_MEMORY_START & 0x1fffffff) + - CONFIG_ZERO_PAGE_OFFSET; -#elif defined(CONFIG_32BIT) - . = CONFIG_PAGE_OFFSET + CONFIG_ZERO_PAGE_OFFSET; -#else - . = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET; -#endif - - .empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) { - *(.empty_zero_page) - } = 0 - - .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ - HEAD_TEXT - TEXT_TEXT - -#ifdef CONFIG_SUPERH64 - *(.text64) - *(.text..SHmedia32) -#endif - - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - IRQENTRY_TEXT - *(.fixup) - *(.gnu.warning) - _etext = .; /* End of text section */ - } = 0x0009 - - . = ALIGN(16); /* Exception table */ - __ex_table : AT(ADDR(__ex_table) - LOAD_OFFSET) { - __start___ex_table = .; - *(__ex_table) - __stop___ex_table = .; - } - - NOTES - RO_DATA(PAGE_SIZE) - - /* - * Code which must be executed uncached and the associated data - */ - . = ALIGN(PAGE_SIZE); - .uncached : AT(ADDR(.uncached) - LOAD_OFFSET) { - __uncached_start = .; - *(.uncached.text) - *(.uncached.data) - __uncached_end = .; - } - - . = ALIGN(THREAD_SIZE); - .data : AT(ADDR(.data) - LOAD_OFFSET) { /* Data */ - *(.data.init_task) - - . = ALIGN(L1_CACHE_BYTES); - *(.data.cacheline_aligned) - - . = ALIGN(L1_CACHE_BYTES); - *(.data.read_mostly) - - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - - DATA_DATA - CONSTRUCTORS - } - - _edata = .; /* End of data section */ - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - .init.text : AT(ADDR(.init.text) - LOAD_OFFSET) { - __init_begin = .; - _sinittext = .; - INIT_TEXT - _einittext = .; - } - - .init.data : AT(ADDR(.init.data) - LOAD_OFFSET) { INIT_DATA } - - . = ALIGN(16); - .init.setup : AT(ADDR(.init.setup) - LOAD_OFFSET) { - __setup_start = .; - *(.init.setup) - __setup_end = .; - } - - .initcall.init : AT(ADDR(.initcall.init) - LOAD_OFFSET) { - __initcall_start = .; - INITCALLS - __initcall_end = .; - } - - .con_initcall.init : AT(ADDR(.con_initcall.init) - LOAD_OFFSET) { - __con_initcall_start = .; - *(.con_initcall.init) - __con_initcall_end = .; - } - - SECURITY_INIT - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) { - __initramfs_start = .; - *(.init.ramfs) - __initramfs_end = .; - } -#endif - - . = ALIGN(4); - .machvec.init : AT(ADDR(.machvec.init) - LOAD_OFFSET) { - __machvec_start = .; - *(.machvec.init) - __machvec_end = .; - } - - PERCPU(PAGE_SIZE) - - /* - * .exit.text is discarded at runtime, not link time, to deal with - * references from __bug_table - */ - .exit.text : AT(ADDR(.exit.text) - LOAD_OFFSET) { EXIT_TEXT } - .exit.data : AT(ADDR(.exit.data) - LOAD_OFFSET) { EXIT_DATA } - - . = ALIGN(PAGE_SIZE); - .bss : AT(ADDR(.bss) - LOAD_OFFSET) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - *(COMMON) - . = ALIGN(4); - _ebss = .; /* uClinux MTD sucks */ - _end = . ; - } - - /* - * When something in the kernel is NOT compiled as a module, the - * module cleanup code and data are put into these segments. Both - * can then be thrown away, as cleanup code is never called unless - * it's a module. - */ - /DISCARD/ : { - *(.exitcall.exit) - } - - STABS_DEBUG - DWARF_DEBUG -} diff --git a/arch/sh/kernel/vmlinux_64.lds.S b/arch/sh/kernel/vmlinux_64.lds.S deleted file mode 100644 index 69664460c68..00000000000 --- a/arch/sh/kernel/vmlinux_64.lds.S +++ /dev/null @@ -1,163 +0,0 @@ -/* - * ld script to make SH64 Linux kernel - * - * Copyright (C) 2000, 2001 Paolo Alberelli - * - * benedict.gaster@superh.com: 2nd May 2002 - * Add definition of empty_zero_page to be the first page of kernel image. - * - * benedict.gaster@superh.com: 3rd May 2002 - * Added support for ramdisk, removing statically linked romfs at the - * same time. - * - * lethal@linux-sh.org: 9th May 2003 - * Kill off GLOBAL_NAME() usage and other CDC-isms. - * - * lethal@linux-sh.org: 19th May 2003 - * Remove support for ancient toolchains. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include - -#define LOAD_OFFSET CONFIG_PAGE_OFFSET -#include - -OUTPUT_ARCH(sh:sh5) - -#define C_PHYS(x) AT (ADDR(x) - LOAD_OFFSET) - -ENTRY(__start) -SECTIONS -{ - . = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + PAGE_SIZE; - _text = .; /* Text and read-only data */ - - .empty_zero_page : C_PHYS(.empty_zero_page) { - *(.empty_zero_page) - } = 0 - - .text : C_PHYS(.text) { - HEAD_TEXT - TEXT_TEXT - *(.text64) - *(.text..SHmedia32) - SCHED_TEXT - LOCK_TEXT - KPROBES_TEXT - *(.fixup) - *(.gnu.warning) -#ifdef CONFIG_CPU_LITTLE_ENDIAN - } = 0x6ff0fff0 -#else - } = 0xf0fff06f -#endif - - /* We likely want __ex_table to be Cache Line aligned */ - . = ALIGN(L1_CACHE_BYTES); /* Exception table */ - __start___ex_table = .; - __ex_table : C_PHYS(__ex_table) { *(__ex_table) } - __stop___ex_table = .; - - _etext = .; /* End of text section */ - - NOTES - RO_DATA(PAGE_SIZE) - - . = ALIGN(THREAD_SIZE); - .data : C_PHYS(.data) { /* Data */ - *(.data.init_task) - - . = ALIGN(L1_CACHE_BYTES); - *(.data.cacheline_aligned) - - . = ALIGN(L1_CACHE_BYTES); - *(.data.read_mostly) - - . = ALIGN(PAGE_SIZE); - *(.data.page_aligned) - - __nosave_begin = .; - *(.data.nosave) - . = ALIGN(PAGE_SIZE); - __nosave_end = .; - - DATA_DATA - CONSTRUCTORS - } - - _edata = .; /* End of data section */ - - . = ALIGN(PAGE_SIZE); /* Init code and data */ - __init_begin = .; - _sinittext = .; - .init.text : C_PHYS(.init.text) { INIT_TEXT } - _einittext = .; - .init.data : C_PHYS(.init.data) { INIT_DATA } - . = ALIGN(L1_CACHE_BYTES); /* Better if Cache Line aligned */ - __setup_start = .; - .init.setup : C_PHYS(.init.setup) { *(.init.setup) } - __setup_end = .; - __initcall_start = .; - .initcall.init : C_PHYS(.initcall.init) { - INITCALLS - } - __initcall_end = .; - __con_initcall_start = .; - .con_initcall.init : C_PHYS(.con_initcall.init) { - *(.con_initcall.init) - } - __con_initcall_end = .; - - SECURITY_INIT - -#ifdef CONFIG_BLK_DEV_INITRD - . = ALIGN(PAGE_SIZE); - __initramfs_start = .; - .init.ramfs : C_PHYS(.init.ramfs) { *(.init.ramfs) } - __initramfs_end = .; -#endif - - . = ALIGN(8); - __machvec_start = .; - .machvec.init : C_PHYS(.machvec.init) { *(.machvec.init) } - __machvec_end = .; - - PERCPU(PAGE_SIZE) - - /* - * .exit.text is discarded at runtime, not link time, to deal with - * references from __bug_table - */ - .exit.text : C_PHYS(.exit.text) { EXIT_TEXT } - .exit.data : C_PHYS(.exit.data) { EXIT_DATA } - - . = ALIGN(PAGE_SIZE); - .bss : C_PHYS(.bss) { - __init_end = .; - __bss_start = .; /* BSS */ - *(.bss.page_aligned) - *(.bss) - *(COMMON) - . = ALIGN(4); - _ebss = .; /* uClinux MTD sucks */ - _end = . ; - } - - /* - * When something in the kernel is NOT compiled as a module, the - * module cleanup code and data are put into these segments. Both - * can then be thrown away, as cleanup code is never called unless - * it's a module. - */ - /DISCARD/ : { - *(.exitcall.exit) - } - - STABS_DEBUG - DWARF_DEBUG -} From 7b022d07a0fd2ce02d4456b732c674ff1d16f5ce Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 00:25:08 +0900 Subject: [PATCH 0911/2061] sh: Tidy up the ldscript output format specifier. Tie this in to the Makefile directly, where we already know what we are running on. This tidies up the linker script a bit, and is prep work for unifying the arch/sh/boot/compressed linker scripts. Signed-off-by: Paul Mundt --- arch/sh/Makefile | 8 ++++++-- arch/sh/boot/compressed/Makefile_32 | 3 ++- arch/sh/boot/compressed/Makefile_64 | 4 ++-- arch/sh/kernel/vmlinux.lds.S | 6 +----- 4 files changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/sh/Makefile b/arch/sh/Makefile index 68348e70028..b941dc9b20f 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -100,13 +100,17 @@ LDFLAGS_vmlinux += --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \ endif ifdef CONFIG_CPU_LITTLE_ENDIAN -LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64' +ld-bfd := elf32-$(UTS_MACHINE)-linux +LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd) LDFLAGS += -EL else -LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64+4' +ld-bfd := elf32-$(UTS_MACHINE)big-linux +LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd) LDFLAGS += -EB endif +export ld-bfd + head-y := arch/sh/kernel/init_task.o head-$(CONFIG_SUPERH32) += arch/sh/kernel/head_32.o head-$(CONFIG_SUPERH64) += arch/sh/kernel/head_64.o diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32 index b96a055b053..249255729d7 100644 --- a/arch/sh/boot/compressed/Makefile_32 +++ b/arch/sh/boot/compressed/Makefile_32 @@ -28,7 +28,8 @@ ORIG_CFLAGS := $(KBUILD_CFLAGS) KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) endif -LDFLAGS_vmlinux := -Ttext $(IMAGE_OFFSET) -e startup -T $(obj)/../../kernel/vmlinux.lds +LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \ + -T $(obj)/../../kernel/vmlinux.lds $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE $(call if_changed,ld) diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64 index 658d4f91555..541a529d066 100644 --- a/arch/sh/boot/compressed/Makefile_64 +++ b/arch/sh/boot/compressed/Makefile_64 @@ -24,8 +24,8 @@ OBJECTS := $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \ ZIMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]') -LDFLAGS_vmlinux := -Ttext $(ZIMAGE_OFFSET) -e startup \ - -T $(obj)/../../kernel/vmlinux.lds +LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \ + -T $(obj)/../../kernel/vmlinux.lds $(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE $(call if_changed,ld) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 8d97f52ce32..1a2c8db5cb2 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -6,12 +6,8 @@ #define LOAD_OFFSET CONFIG_PAGE_OFFSET OUTPUT_ARCH(sh:sh5) #else +#define LOAD_OFFSET 0 OUTPUT_ARCH(sh) -#ifdef CONFIG_CPU_LITTLE_ENDIAN -OUTPUT_FORMAT("elf32-sh-linux", "elf32-sh-linux", "elf32-sh-linux") -#else -OUTPUT_FORMAT("elf32-shbig-linux", "elf32-shbig-linux", "elf32-shbig-linux") -#endif #endif #include From 20b27fa33743c6ef77a1248421fab51e8bf21a25 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 00:36:22 +0900 Subject: [PATCH 0912/2061] sh: Fix up the sh64 zImage build. Signed-off-by: Paul Mundt --- arch/sh/boot/compressed/Makefile_64 | 3 +- arch/sh/boot/compressed/head_64.S | 5 +- arch/sh/boot/compressed/vmlinux_64.lds | 64 -------------------------- 3 files changed, 2 insertions(+), 70 deletions(-) delete mode 100644 arch/sh/boot/compressed/vmlinux_64.lds diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64 index 541a529d066..7093255d345 100644 --- a/arch/sh/boot/compressed/Makefile_64 +++ b/arch/sh/boot/compressed/Makefile_64 @@ -14,8 +14,7 @@ targets := vmlinux vmlinux.bin vmlinux.bin.gz \ head_64.o misc_64.o cache.o piggy.o -OBJECTS := $(obj)/vmlinux_64.lds $(obj)/head_64.o $(obj)/misc_64.o \ - $(obj)/cache.o +OBJECTS := $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o # # ZIMAGE_OFFSET is the load offset of the compression loader diff --git a/arch/sh/boot/compressed/head_64.S b/arch/sh/boot/compressed/head_64.S index 622eac3cf55..9993113c671 100644 --- a/arch/sh/boot/compressed/head_64.S +++ b/arch/sh/boot/compressed/head_64.S @@ -14,6 +14,7 @@ * Copyright (C) 2002 Stuart Menefy (stuart.menefy@st.com) */ #include +#include #include #include @@ -33,11 +34,7 @@ #define ICCR0_INIT_VAL ICCR0_ON | ICCR0_ICI /* ICE + ICI */ #define ICCR1_INIT_VAL ICCR1_NOLOCK /* No locking */ -#if 1 #define OCCR0_INIT_VAL OCCR0_ON | OCCR0_OCI | OCCR0_WB /* OCE + OCI + WB */ -#else -#define OCCR0_INIT_VAL OCCR0_OFF -#endif #define OCCR1_INIT_VAL OCCR1_NOLOCK /* No locking */ .text diff --git a/arch/sh/boot/compressed/vmlinux_64.lds b/arch/sh/boot/compressed/vmlinux_64.lds deleted file mode 100644 index 59c2ef4aeda..00000000000 --- a/arch/sh/boot/compressed/vmlinux_64.lds +++ /dev/null @@ -1,64 +0,0 @@ -/* - * ld script to make compressed SuperH/shmedia Linux kernel+decompression - * bootstrap - * Modified by Stuart Menefy from arch/sh/vmlinux.lds.S written by Niibe Yutaka - */ - - -#ifdef CONFIG_LITTLE_ENDIAN -/* OUTPUT_FORMAT("elf32-sh64l-linux", "elf32-sh64l-linux", "elf32-sh64l-linux") */ -#define NOP 0x6ff0fff0 -#else -/* OUTPUT_FORMAT("elf32-sh64", "elf32-sh64", "elf32-sh64") */ -#define NOP 0xf0fff06f -#endif - -OUTPUT_FORMAT("elf32-sh64-linux") -OUTPUT_ARCH(sh) -ENTRY(_start) - -#define ALIGNED_GAP(section, align) (((ADDR(section)+SIZEOF(section)+(align)-1) & ~((align)-1))-ADDR(section)) -#define FOLLOWING(section, align) AT (LOADADDR(section) + ALIGNED_GAP(section,align)) - -SECTIONS -{ - _text = .; /* Text and read-only data */ - - .text : { - *(.text) - *(.text64) - *(.text..SHmedia32) - *(.fixup) - *(.gnu.warning) - } = NOP - . = ALIGN(4); - .rodata : { *(.rodata) } - - /* There is no 'real' reason for eight byte alignment, four would work - * as well, but gdb downloads much (*4) faster with this. - */ - . = ALIGN(8); - .image : { *(.image) } - . = ALIGN(4); - _etext = .; /* End of text section */ - - .data : /* Data */ - FOLLOWING(.image, 4) - { - _data = .; - *(.data) - } - _data_image = LOADADDR(.data);/* Address of data section in ROM */ - - _edata = .; /* End of data section */ - - .stack : { stack = .; _stack = .; } - - . = ALIGN(4); - __bss_start = .; /* BSS */ - .bss : { - *(.bss) - } - . = ALIGN(4); - _end = . ; -} From a2e76c80d93ec3c59a030b6ca37b9087033565c1 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 00:54:39 +0900 Subject: [PATCH 0913/2061] sh: Provide a tighter BOOT_LINK_OFFSET definition for the Cayman board. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 6ed16c4548c..bca6a2a0759 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -704,6 +704,7 @@ config ZERO_PAGE_OFFSET config BOOT_LINK_OFFSET hex "Link address offset for booting" + default "0x00400000" if SH_CAYMAN default "0x00800000" help This option allows you to set the link address offset of the zImage. From b20883562455060272126c36563a7d8edafc30d3 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 00:55:45 +0900 Subject: [PATCH 0914/2061] sh: Provide a BITS definition, use it in the arch/sh/boot/ Makefiles. This introduces a BITS export that can handily be picked up by Makefiles for cleaner sharing. Reflect its use in arch/sh/boot/compressed/ in preparation for unifying the Makefiles. Signed-off-by: Paul Mundt --- arch/sh/Makefile | 8 ++++---- arch/sh/boot/Makefile | 6 +++--- arch/sh/boot/compressed/Makefile | 4 ---- arch/sh/boot/compressed/Makefile_32 | 6 +++--- 4 files changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/sh/Makefile b/arch/sh/Makefile index b941dc9b20f..c1bbae1f65e 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -91,9 +91,11 @@ KBUILD_IMAGE := $(defaultimage-y) # ifdef CONFIG_SUPERH32 UTS_MACHINE := sh +BITS := 32 LDFLAGS_vmlinux += -e _stext else UTS_MACHINE := sh64 +BITS := 64 LDFLAGS_vmlinux += --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \ --defsym phys_stext_shmedia=phys_stext+1 \ -e phys_stext_shmedia @@ -109,11 +111,9 @@ LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64+4' --oformat $(ld-bfd) LDFLAGS += -EB endif -export ld-bfd +export ld-bfd BITS -head-y := arch/sh/kernel/init_task.o -head-$(CONFIG_SUPERH32) += arch/sh/kernel/head_32.o -head-$(CONFIG_SUPERH64) += arch/sh/kernel/head_64.o +head-y := arch/sh/kernel/init_task.o arch/sh/kernel/head_$(BITS).o core-y += arch/sh/kernel/ arch/sh/mm/ arch/sh/boards/ core-$(CONFIG_SH_FPU_EMU) += arch/sh/math-emu/ diff --git a/arch/sh/boot/Makefile b/arch/sh/boot/Makefile index 95483d16125..78efb04c28f 100644 --- a/arch/sh/boot/Makefile +++ b/arch/sh/boot/Makefile @@ -20,9 +20,6 @@ CONFIG_BOOT_LINK_OFFSET ?= 0x00800000 CONFIG_ZERO_PAGE_OFFSET ?= 0x00001000 CONFIG_ENTRY_OFFSET ?= 0x00001000 -export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \ - CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET - targets := zImage vmlinux.srec uImage uImage.srec subdir- := compressed @@ -43,6 +40,9 @@ KERNEL_MEMORY := $(shell /bin/bash -c 'printf "0x%08x" \ $$[$(CONFIG_MEMORY_START)]') endif +export CONFIG_PAGE_OFFSET CONFIG_MEMORY_START CONFIG_BOOT_LINK_OFFSET \ + CONFIG_ZERO_PAGE_OFFSET CONFIG_ENTRY_OFFSET KERNEL_MEMORY + KERNEL_LOAD := $(shell /bin/bash -c 'printf "0x%08x" \ $$[$(CONFIG_PAGE_OFFSET) + \ $(KERNEL_MEMORY) + \ diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index efb01dc3c8c..f0a71df2609 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -1,5 +1 @@ -ifeq ($(CONFIG_SUPERH32),y) include ${srctree}/arch/sh/boot/compressed/Makefile_32 -else -include ${srctree}/arch/sh/boot/compressed/Makefile_64 -endif diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32 index 249255729d7..9531bf1b7c2 100644 --- a/arch/sh/boot/compressed/Makefile_32 +++ b/arch/sh/boot/compressed/Makefile_32 @@ -5,9 +5,9 @@ # targets := vmlinux vmlinux.bin vmlinux.bin.gz \ - head_32.o misc_32.o piggy.o + head_$(BITS).o misc_$(BITS).o piggy.o -OBJECTS = $(obj)/head_32.o $(obj)/misc_32.o +OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o ifdef CONFIG_SH_STANDARD_BIOS OBJECTS += $(obj)/../../kernel/sh_bios.o @@ -18,7 +18,7 @@ endif # IMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ $$[$(CONFIG_PAGE_OFFSET) + \ - $(CONFIG_MEMORY_START) + \ + $(KERNEL_MEMORY) + \ $(CONFIG_BOOT_LINK_OFFSET)]') LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) From 1eca133cc9f978a8c44788fc5b2fe54219f9425c Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 00:58:21 +0900 Subject: [PATCH 0915/2061] sh: Merge the split arch/sh/boot/compressed/ Makefiles. This kills off the _64 variant and moves the _32 one over as the generic one to use. Signed-off-by: Paul Mundt --- arch/sh/boot/compressed/Makefile | 48 ++++++++++++++++++++++++++++- arch/sh/boot/compressed/Makefile_32 | 47 ---------------------------- arch/sh/boot/compressed/Makefile_64 | 42 ------------------------- 3 files changed, 47 insertions(+), 90 deletions(-) delete mode 100644 arch/sh/boot/compressed/Makefile_32 delete mode 100644 arch/sh/boot/compressed/Makefile_64 diff --git a/arch/sh/boot/compressed/Makefile b/arch/sh/boot/compressed/Makefile index f0a71df2609..9531bf1b7c2 100644 --- a/arch/sh/boot/compressed/Makefile +++ b/arch/sh/boot/compressed/Makefile @@ -1 +1,47 @@ -include ${srctree}/arch/sh/boot/compressed/Makefile_32 +# +# linux/arch/sh/boot/compressed/Makefile +# +# create a compressed vmlinux image from the original vmlinux +# + +targets := vmlinux vmlinux.bin vmlinux.bin.gz \ + head_$(BITS).o misc_$(BITS).o piggy.o + +OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o + +ifdef CONFIG_SH_STANDARD_BIOS +OBJECTS += $(obj)/../../kernel/sh_bios.o +endif + +# +# IMAGE_OFFSET is the load offset of the compression loader +# +IMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ + $$[$(CONFIG_PAGE_OFFSET) + \ + $(KERNEL_MEMORY) + \ + $(CONFIG_BOOT_LINK_OFFSET)]') + +LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) + +ifeq ($(CONFIG_FUNCTION_TRACER),y) +ORIG_CFLAGS := $(KBUILD_CFLAGS) +KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) +endif + +LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \ + -T $(obj)/../../kernel/vmlinux.lds + +$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE + $(call if_changed,ld) + @: + +$(obj)/vmlinux.bin: vmlinux FORCE + $(call if_changed,objcopy) + +$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE + $(call if_changed,gzip) + +OBJCOPYFLAGS += -R .empty_zero_page + +$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE + $(call if_changed,as_o_S) diff --git a/arch/sh/boot/compressed/Makefile_32 b/arch/sh/boot/compressed/Makefile_32 deleted file mode 100644 index 9531bf1b7c2..00000000000 --- a/arch/sh/boot/compressed/Makefile_32 +++ /dev/null @@ -1,47 +0,0 @@ -# -# linux/arch/sh/boot/compressed/Makefile -# -# create a compressed vmlinux image from the original vmlinux -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz \ - head_$(BITS).o misc_$(BITS).o piggy.o - -OBJECTS = $(obj)/head_$(BITS).o $(obj)/misc_$(BITS).o $(obj)/cache.o - -ifdef CONFIG_SH_STANDARD_BIOS -OBJECTS += $(obj)/../../kernel/sh_bios.o -endif - -# -# IMAGE_OFFSET is the load offset of the compression loader -# -IMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ - $$[$(CONFIG_PAGE_OFFSET) + \ - $(KERNEL_MEMORY) + \ - $(CONFIG_BOOT_LINK_OFFSET)]') - -LIBGCC := $(shell $(CC) $(KBUILD_CFLAGS) -print-libgcc-file-name) - -ifeq ($(CONFIG_FUNCTION_TRACER),y) -ORIG_CFLAGS := $(KBUILD_CFLAGS) -KBUILD_CFLAGS = $(subst -pg, , $(ORIG_CFLAGS)) -endif - -LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(IMAGE_OFFSET) -e startup \ - -T $(obj)/../../kernel/vmlinux.lds - -$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o $(LIBGCC) FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) - -OBJCOPYFLAGS += -R .empty_zero_page - -$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,as_o_S) diff --git a/arch/sh/boot/compressed/Makefile_64 b/arch/sh/boot/compressed/Makefile_64 deleted file mode 100644 index 7093255d345..00000000000 --- a/arch/sh/boot/compressed/Makefile_64 +++ /dev/null @@ -1,42 +0,0 @@ -# -# arch/sh/boot/compressed/Makefile_64 -# -# create a compressed vmlinux image from the original vmlinux -# -# Copyright (C) 2002 Stuart Menefy -# Copyright (C) 2004 Paul Mundt -# -# This file is subject to the terms and conditions of the GNU General Public -# License. See the file "COPYING" in the main directory of this archive -# for more details. -# - -targets := vmlinux vmlinux.bin vmlinux.bin.gz \ - head_64.o misc_64.o cache.o piggy.o - -OBJECTS := $(obj)/head_64.o $(obj)/misc_64.o $(obj)/cache.o - -# -# ZIMAGE_OFFSET is the load offset of the compression loader -# (4M for the kernel plus 64K for this loader) -# -ZIMAGE_OFFSET := $(shell /bin/bash -c 'printf "0x%08x" \ - $$[$(CONFIG_PAGE_OFFSET)+0x400000+0x10000]') - -LDFLAGS_vmlinux := --oformat $(ld-bfd) -Ttext $(ZIMAGE_OFFSET) -e startup \ - -T $(obj)/../../kernel/vmlinux.lds - -$(obj)/vmlinux: $(OBJECTS) $(obj)/piggy.o FORCE - $(call if_changed,ld) - @: - -$(obj)/vmlinux.bin: vmlinux FORCE - $(call if_changed,objcopy) - -$(obj)/vmlinux.bin.gz: $(obj)/vmlinux.bin FORCE - $(call if_changed,gzip) - -OBJCOPYFLAGS += -R .empty_zero_page - -$(obj)/piggy.o: $(obj)/piggy.S $(obj)/vmlinux.bin.gz FORCE - $(call if_changed,as_o_S) From b412a49af970e703124fa3259bf24165c0c74024 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 01:23:25 +0900 Subject: [PATCH 0916/2061] sh: Consolidate the boot link and entry offset definitions. Consolidate these in a single place in the Kconfig menus. At the same time, disable their interactivity and set them according to the board config defaults. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 22 ++++++++++++++++++---- arch/sh/mm/Kconfig | 8 -------- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index bca6a2a0759..a9dee13ddd8 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -694,23 +694,37 @@ endmenu menu "Boot options" config ZERO_PAGE_OFFSET - hex "Zero page offset" - default "0x00004000" if SH_SH03 - default "0x00010000" if PAGE_SIZE_64KB + hex + default "0x00010000" if PAGE_SIZE_64KB || SH_RTS7751R2D || \ + SH_7751_SOLUTION_ENGINE + default "0x00004000" if PAGE_SIZE_16KB || SH_SH03 default "0x00002000" if PAGE_SIZE_8KB default "0x00001000" help This sets the default offset of zero page. config BOOT_LINK_OFFSET - hex "Link address offset for booting" + hex + default "0x00210000" if SH_SHMIN default "0x00400000" if SH_CAYMAN + default "0x00810000" if SH_7780_SOLUTION_ENGINE + default "0x009e0000" if SH_TITAN + default "0x01800000" if SH_SDK7780 + default "0x02000000" if SH_EDOSK7760 default "0x00800000" help This option allows you to set the link address offset of the zImage. This can be useful if you are on a board which has a small amount of memory. +config ENTRY_OFFSET + hex + default "0x00001000" if PAGE_SIZE_4KB + default "0x00002000" if PAGE_SIZE_8KB + default "0x00004000" if PAGE_SIZE_16KB + default "0x00010000" if PAGE_SIZE_64KB + default "0x00000000" + config UBC_WAKEUP bool "Wakeup UBC on startup" depends on CPU_SH4 && !CPU_SH4A diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index d4079cab2d5..b900d2cd18f 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -201,14 +201,6 @@ config PAGE_SIZE_64KB endchoice -config ENTRY_OFFSET - hex - default "0x00001000" if PAGE_SIZE_4KB - default "0x00002000" if PAGE_SIZE_8KB - default "0x00004000" if PAGE_SIZE_16KB - default "0x00010000" if PAGE_SIZE_64KB - default "0x00000000" - choice prompt "HugeTLB page size" depends on HUGETLB_PAGE && (CPU_SH4 || CPU_SH5) && MMU From 457daa2b66e07bbd2280b9f8d2b03e800f357243 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 01:28:01 +0900 Subject: [PATCH 0917/2061] sh: Hook up cc-cross-prefix support. This implements a simple case that just iterates through the common cases, looking at UTS_MACHINE for hints. Signed-off-by: Paul Mundt --- arch/sh/Makefile | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/sh/Makefile b/arch/sh/Makefile index c1bbae1f65e..a47fbd260ba 100644 --- a/arch/sh/Makefile +++ b/arch/sh/Makefile @@ -101,6 +101,12 @@ LDFLAGS_vmlinux += --defsym phys_stext=_stext-$(CONFIG_PAGE_OFFSET) \ -e phys_stext_shmedia endif +ifneq ($(SUBARCH),$(ARCH)) + ifeq ($(CROSS_COMPILE),) + CROSS_COMPILE := $(call cc-cross-prefix, $(UTS_MACHINE)-linux- $(UTS_MACHINE)-linux-gnu- $(UTS_MACHINE)-unknown-linux-gnu-) + endif +endif + ifdef CONFIG_CPU_LITTLE_ENDIAN ld-bfd := elf32-$(UTS_MACHINE)-linux LDFLAGS_vmlinux += --defsym 'jiffies=jiffies_64' --oformat $(ld-bfd) From 567bb8fd47624cb9f894c64ce9530d43d5862a71 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Sun, 10 May 2009 14:25:39 +0900 Subject: [PATCH 0918/2061] sh: Fix up R0 dependence in __arch_swab16/32. There is nothing in these routines that inherently depends on R0 use. Given that these routines are inlined, it is rather easy to blow up the compiler by exhausting the spill class when performing a 64-bit swab. This presently manifests itself as the following: CC fs/ocfs2/suballoc.o fs/ocfs2/suballoc.c: In function 'ocfs2_reserve_suballoc_bits': fs/ocfs2/suballoc.c:638: error: unrecognizable insn: (insn 2793 1230 1231 103 arch/sh/include/asm/swab.h:33 (set (reg:HI 853) (subreg:HI (reg:SI 149 macl) 2)) -1 (expr_list:REG_DEAD (reg:SI 149 macl) (nil))) fs/ocfs2/suballoc.c:638: internal compiler error: in extract_insn, at recog.c:1991 This patch switches over to using an arbitrarily assigned register instead. While the same issue does not exist in the SH-5 case, there is likewise no harm in having an alternate register used for the byterev/shari pair. Signed-off-by: Paul Mundt --- arch/sh/include/asm/swab.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/sh/include/asm/swab.h b/arch/sh/include/asm/swab.h index e6931593510..0e08fe54ad7 100644 --- a/arch/sh/include/asm/swab.h +++ b/arch/sh/include/asm/swab.h @@ -14,15 +14,15 @@ static inline __attribute_const__ __u32 __arch_swab32(__u32 x) { __asm__( #ifdef __SH5__ - "byterev %0, %0\n\t" + "byterev %1, %0\n\t" "shari %0, 32, %0" #else - "swap.b %0, %0\n\t" + "swap.b %1, %0\n\t" "swap.w %0, %0\n\t" "swap.b %0, %0" #endif : "=r" (x) - : "0" (x)); + : "r" (x)); return x; } @@ -32,13 +32,13 @@ static inline __attribute_const__ __u16 __arch_swab16(__u16 x) { __asm__( #ifdef __SH5__ - "byterev %0, %0\n\t" + "byterev %1, %0\n\t" "shari %0, 32, %0" #else - "swap.b %0, %0" + "swap.b %1, %0" #endif : "=r" (x) - : "0" (x)); + : "r" (x)); return x; } From 82afae6016b672acb90ceb8e773bba0bd977d2a3 Mon Sep 17 00:00:00 2001 From: Erdem Aktas Date: Sun, 10 May 2009 02:13:19 -0400 Subject: [PATCH 0919/2061] perf_counter tools: fix buffer overwrite problem for perf top command There is a buffer overwrite problem in builtin-top.c line 526, When I tried to use ./perf top command, it was giving memory corruption problem. [ Impact: fix 'perf top' crash ] LKML-Reference: <3fee128b0905092313x608e65e0l7b1116d86914114f@mail.gmail.com> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-top.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index cd6f61d7341..b1549dd2772 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -523,7 +523,7 @@ static int read_symbol(FILE *in, struct sym_entry *s) if (strstr(sym, "_text_start") || strstr(sym, "_text_end")) return 1; - s->sym = malloc(strlen(str)); + s->sym = malloc(strlen(str)+1); assert(s->sym); strcpy((char *)s->sym, str); From 5e751e992f3fb08ba35e1ca8095ec8fbf9eda523 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:22 +0100 Subject: [PATCH 0920/2061] CRED: Rename cred_exec_mutex to reflect that it's a guard against ptrace Rename cred_exec_mutex to reflect that it's a guard against foreign intervention on a process's credential state, such as is made by ptrace(). The attachment of a debugger to a process affects execve()'s calculation of the new credential state - _and_ also setprocattr()'s calculation of that state. Signed-off-by: David Howells Signed-off-by: James Morris --- fs/compat.c | 6 +++--- fs/exec.c | 10 +++++----- include/linux/init_task.h | 4 ++-- include/linux/sched.h | 4 +++- kernel/cred.c | 4 ++-- kernel/ptrace.c | 9 +++++---- 6 files changed, 20 insertions(+), 17 deletions(-) diff --git a/fs/compat.c b/fs/compat.c index 681ed81e6be..bb2a9b2e817 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1488,7 +1488,7 @@ int compat_do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1550,7 +1550,7 @@ int compat_do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1573,7 +1573,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/fs/exec.c b/fs/exec.c index 639177b0eea..998e856c307 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1045,7 +1045,7 @@ void install_exec_creds(struct linux_binprm *bprm) commit_creds(bprm->cred); bprm->cred = NULL; - /* cred_exec_mutex must be held at least to this point to prevent + /* cred_guard_mutex must be held at least to this point to prevent * ptrace_attach() from altering our determination of the task's * credentials; any time after this it may be unlocked */ @@ -1055,7 +1055,7 @@ EXPORT_SYMBOL(install_exec_creds); /* * determine how safe it is to execute the proposed program - * - the caller must hold current->cred_exec_mutex to protect against + * - the caller must hold current->cred_guard_mutex to protect against * PTRACE_ATTACH */ int check_unsafe_exec(struct linux_binprm *bprm) @@ -1297,7 +1297,7 @@ int do_execve(char * filename, if (!bprm) goto out_files; - retval = mutex_lock_interruptible(¤t->cred_exec_mutex); + retval = mutex_lock_interruptible(¤t->cred_guard_mutex); if (retval < 0) goto out_free; current->in_execve = 1; @@ -1360,7 +1360,7 @@ int do_execve(char * filename, /* execve succeeded */ current->fs->in_exec = 0; current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); acct_update_integrals(current); free_bprm(bprm); if (displaced) @@ -1383,7 +1383,7 @@ out_unmark: out_unlock: current->in_execve = 0; - mutex_unlock(¤t->cred_exec_mutex); + mutex_unlock(¤t->cred_guard_mutex); out_free: free_bprm(bprm); diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..7f54ba94242 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -145,8 +145,8 @@ extern struct cred init_cred; .group_leader = &tsk, \ .real_cred = &init_cred, \ .cred = &init_cred, \ - .cred_exec_mutex = \ - __MUTEX_INITIALIZER(tsk.cred_exec_mutex), \ + .cred_guard_mutex = \ + __MUTEX_INITIALIZER(tsk.cred_guard_mutex), \ .comm = "swapper", \ .thread = INIT_THREAD, \ .fs = &init_fs, \ diff --git a/include/linux/sched.h b/include/linux/sched.h index 3fa82b353c9..5932ace2240 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1247,7 +1247,9 @@ struct task_struct { * credentials (COW) */ const struct cred *cred; /* effective (overridable) subjective task * credentials (COW) */ - struct mutex cred_exec_mutex; /* execve vs ptrace cred calculation mutex */ + struct mutex cred_guard_mutex; /* guard against foreign influences on + * credential calculations + * (notably. ptrace) */ char comm[TASK_COMM_LEN]; /* executable name excluding path - access with [gs]et_task_comm (which lock diff --git a/kernel/cred.c b/kernel/cred.c index 3a039189d70..1bb4d7e5d61 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -167,7 +167,7 @@ EXPORT_SYMBOL(prepare_creds); /* * Prepare credentials for current to perform an execve() - * - The caller must hold current->cred_exec_mutex + * - The caller must hold current->cred_guard_mutex */ struct cred *prepare_exec_creds(void) { @@ -276,7 +276,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) struct cred *new; int ret; - mutex_init(&p->cred_exec_mutex); + mutex_init(&p->cred_guard_mutex); if ( #ifdef CONFIG_KEYS diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0692ab5a0d6..27ac80298bf 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -185,10 +185,11 @@ int ptrace_attach(struct task_struct *task) if (same_thread_group(task, current)) goto out; - /* Protect exec's credential calculations against our interference; - * SUID, SGID and LSM creds get determined differently under ptrace. + /* Protect the target's credential calculations against our + * interference; SUID, SGID and LSM creds get determined differently + * under ptrace. */ - retval = mutex_lock_interruptible(&task->cred_exec_mutex); + retval = mutex_lock_interruptible(&task->cred_guard_mutex); if (retval < 0) goto out; @@ -232,7 +233,7 @@ repeat: bad: write_unlock_irqrestore(&tasklist_lock, flags); task_unlock(task); - mutex_unlock(&task->cred_exec_mutex); + mutex_unlock(&task->cred_guard_mutex); out: return retval; } From 107db7c7dd137aeb7361b8c2606ac936c0be58ff Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 8 May 2009 13:55:27 +0100 Subject: [PATCH 0921/2061] CRED: Guard the setprocattr security hook against ptrace Guard the setprocattr security hook against ptrace by taking the target task's cred_guard_mutex around it. The problem is that setprocattr() may otherwise note the lack of a debugger, and then perform an action on that basis whilst letting a debugger attach between the two points. Holding cred_guard_mutex across the test and the action prevents ptrace_attach() from doing that. Signed-off-by: David Howells Signed-off-by: James Morris --- fs/proc/base.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/proc/base.c b/fs/proc/base.c index fb45615943c..23342e188a6 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -2128,9 +2128,15 @@ static ssize_t proc_pid_attr_write(struct file * file, const char __user * buf, if (copy_from_user(page, buf, count)) goto out_free; + /* Guard against adverse ptrace interaction */ + length = mutex_lock_interruptible(&task->cred_guard_mutex); + if (length < 0) + goto out_free; + length = security_setprocattr(task, (char*)file->f_path.dentry->d_name.name, (void*)page, count); + mutex_unlock(&task->cred_guard_mutex); out_free: free_page((unsigned long) page); out: From d9d674e50007362567edb5df65c05dd56a5964bf Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 11 May 2009 12:12:38 +0900 Subject: [PATCH 0922/2061] sh: Fix up typo in arch/sh/kernel/vmlinux.lds.S .init_ramfs ought to be .init.ramfs, fix it up. Signed-off-by: Paul Mundt --- arch/sh/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index 1a2c8db5cb2..d73723080fd 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -127,7 +127,7 @@ SECTIONS #ifdef CONFIG_BLK_DEV_INITRD . = ALIGN(PAGE_SIZE); - .init.ramfs : AT(ADDR(.init_ramfs) - LOAD_OFFSET) { + .init.ramfs : AT(ADDR(.init.ramfs) - LOAD_OFFSET) { __initramfs_start = .; *(.init.ramfs) __initramfs_end = .; From 780f98ff1fa9cfcab177f6b5ab09b11321f1e5c8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Mon, 11 May 2009 12:15:14 +0900 Subject: [PATCH 0923/2061] sh: Account for INITIAL_JIFFIES when using jiffies clocksource. In the case where we fall back on the generic jiffies clocksource, INITIAL_JIFFIES needs to be accounted for so that printk times aren't completely skewed. Signed-off-by: Paul Mundt --- arch/sh/kernel/time.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index e0aa769481f..a77838f539f 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -104,7 +104,7 @@ unsigned long long sched_clock(void) /* jiffies based sched_clock if no clocksource is installed */ if (!clocksource_sh.rating) - return (unsigned long long)jiffies * (NSEC_PER_SEC / HZ); + return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); cycles = clocksource_sh.read(&clocksource_sh); return cyc2ns(&clocksource_sh, cycles); From 45fbe3ee01b8e463b28c2751b5dcc0cbdc142d90 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Wed, 6 May 2009 08:06:44 -0700 Subject: [PATCH 0924/2061] x86, e820, pci: reserve extra free space near end of RAM The point is to take all RAM resources we have, and _after_ we've added all the resources we've seen in the E820 tree, we then _also_ try to add fake reserved entries for any "round up to X" at the end of the RAM resources. [ Impact: improve PCI mem-resource allocation robustness, protect "stolen RAM" ] Reported-by: Yannick Roehlly Acked-by: Jesse Barnes Signed-off-by: Yinghai Lu Cc: Ivan Kokshaysky Cc: Andrew Morton Cc: yannick.roehlly@free.fr LKML-Reference: <4A01A784.2050407@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 35 +++++++++++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 00628130292..a2335d9de05 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1371,6 +1371,23 @@ void __init e820_reserve_resources(void) } } +/* How much should we pad RAM ending depending on where it is? */ +static unsigned long ram_alignment(resource_size_t pos) +{ + unsigned long mb = pos >> 20; + + /* To 64kB in the first megabyte */ + if (!mb) + return 64*1024; + + /* To 1MB in the first 16MB */ + if (mb < 16) + return 1024*1024; + + /* To 32MB for anything above that */ + return 32*1024*1024; +} + void __init e820_reserve_resources_late(void) { int i; @@ -1382,6 +1399,24 @@ void __init e820_reserve_resources_late(void) insert_resource_expand_to_fit(&iomem_resource, res); res++; } + + /* + * Try to bump up RAM regions to reasonable boundaries to + * avoid stolen RAM: + */ + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *entry = &e820_saved.map[i]; + resource_size_t start, end; + + if (entry->type != E820_RAM) + continue; + start = entry->addr + entry->size; + end = round_up(start, ram_alignment(start)); + if (start == end) + continue; + reserve_region_with_split(&iomem_resource, start, + end - 1, "RAM buffer"); + } } char *__init default_machine_specific_memory_setup(void) From 5d423ccd7ba4285f1084e91b26805e1d0ae978ed Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Wed, 6 May 2009 08:07:52 -0700 Subject: [PATCH 0925/2061] x86/pci: remove rounding quirk from e820_setup_gap() Now that the e820 code explicitly reserves 'potentially dangerous' free physical memory address space to protect ACPI stolen RAM, there's no need for the rounding quirk in the PCI allocator anymore. Also, this quirk was open-ended iteration that could end up reserving a lot of free space and potentially breaking drivers - such as the one reported by Yannick Roehlly where there's a PCI device with a large memory resource. So remove it. [ Impact: make more of the PCI hole available for assigning pci devices ] Reported-by: Yannick Roehlly Signed-off-by: Yinghai Lu Acked-by: Jesse Barnes Cc: Ivan Kokshaysky Cc: Linus Torvalds Cc: Andrew Morton LKML-Reference: <4A01A7C8.5090701@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index a2335d9de05..7271fa33d79 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -617,7 +617,7 @@ __init int e820_search_gap(unsigned long *gapstart, unsigned long *gapsize, */ __init void e820_setup_gap(void) { - unsigned long gapstart, gapsize, round; + unsigned long gapstart, gapsize; int found; gapstart = 0x10000000; @@ -635,14 +635,9 @@ __init void e820_setup_gap(void) #endif /* - * See how much we want to round up: start off with - * rounding to the next 1MB area. + * e820_reserve_resources_late protect stolen RAM already */ - round = 0x100000; - while ((gapsize >> 4) > round) - round += round; - /* Fun with two's complement */ - pci_mem_start = (gapstart + round) & -round; + pci_mem_start = gapstart; printk(KERN_INFO "Allocating PCI resources starting at %lx (gap: %lx:%lx)\n", From 53d6979ab6747e758207e8ac861b96d0da0d3332 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:35 +0900 Subject: [PATCH 0926/2061] nbd: don't clear rq->sector and nr_sectors unnecessarily There's no reason to clear rq->sector and nr_sectors after calling blk_rq_init(). They're guaranteed to be clear. Drop unnecessary clearing. [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Paul Clements Signed-off-by: Jens Axboe --- drivers/block/nbd.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 4d6de4f15cc..a9ab8be9d92 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -580,13 +580,6 @@ static int __nbd_ioctl(struct block_device *bdev, struct nbd_device *lo, blk_rq_init(NULL, &sreq); sreq.cmd_type = REQ_TYPE_SPECIAL; nbd_cmd(&sreq) = NBD_CMD_DISC; - /* - * Set these to sane values in case server implementation - * fails to check the request type first and also to keep - * debugging output cleaner. - */ - sreq.sector = 0; - sreq.nr_sectors = 0; if (!lo->sock) return -EINVAL; nbd_send_req(lo, &sreq); From 9720aef2539c10e3a872e9a92beec225030d99db Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:36 +0900 Subject: [PATCH 0927/2061] ide-tape: don't initialize rq->sector for rw requests rq->sector is set to the tape->first_frame but it's never actually used and not even in the correct unit (512 byte sectors). Don't set it. [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Borislav Petkov Acked-by: Bartlomiej Zolnierkiewicz Signed-off-by: Jens Axboe --- drivers/ide/ide-tape.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8dfc68892d6..7149224d1fe 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -892,7 +892,6 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) rq->cmd_type = REQ_TYPE_SPECIAL; rq->cmd[13] = cmd; rq->rq_disk = tape->disk; - rq->sector = tape->first_frame; if (size) { ret = blk_rq_map_kern(drive->queue, rq, tape->buf, size, From c3a4d78c580de4edc9ef0f7c59812fb02ceb037f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:37 +0900 Subject: [PATCH 0928/2061] block: add rq->resid_len rq->data_len served two purposes - the length of data buffer on issue and the residual count on completion. This duality creates some headaches. First of all, block layer and low level drivers can't really determine what rq->data_len contains while a request is executing. It could be the total request length or it coulde be anything else one of the lower layers is using to keep track of residual count. This complicates things because blk_rq_bytes() and thus [__]blk_end_request_all() relies on rq->data_len for PC commands. Drivers which want to report residual count should first cache the total request length, update rq->data_len and then complete the request with the cached data length. Secondly, it makes requests default to reporting full residual count, ie. reporting that no data transfer occurred. The residual count is an exception not the norm; however, the driver should clear rq->data_len to zero to signify the normal cases while leaving it alone means no data transfer occurred at all. This reverse default behavior complicates code unnecessarily and renders block PC on some drivers (ide-tape/floppy) unuseable. This patch adds rq->resid_len which is used only for residual count. While at it, remove now unnecessasry blk_rq_bytes() caching in ide_pc_intr() as rq->data_len is not changed anymore. Boaz : spotted missing conversion in osd Sergei : spotted too early conversion to blk_rq_bytes() in ide-tape [ Impact: cleanup residual count handling, report 0 resid by default ] Signed-off-by: Tejun Heo Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Mike Miller Cc: Eric Moore Cc: Alan Stern Cc: FUJITA Tomonori Cc: Doug Gilbert Cc: Mike Miller Cc: Eric Moore Cc: Darrick J. Wong Cc: Pete Zaitcev Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- block/bsg.c | 8 ++--- block/scsi_ioctl.c | 2 +- drivers/block/cciss.c | 13 +++----- drivers/block/ub.c | 6 ++-- drivers/ide/ide-atapi.c | 9 +----- drivers/ide/ide-cd.c | 13 ++++---- drivers/ide/ide-tape.c | 4 +-- drivers/message/fusion/mptsas.c | 3 +- drivers/scsi/libsas/sas_expander.c | 6 +--- drivers/scsi/libsas/sas_host_smp.c | 38 +++++++++++++----------- drivers/scsi/mpt2sas/mpt2sas_transport.c | 4 +-- drivers/scsi/scsi_lib.c | 24 +++++++-------- drivers/scsi/sg.c | 2 +- drivers/scsi/st.c | 2 +- fs/exofs/osd.c | 4 +-- include/linux/blkdev.h | 1 + 16 files changed, 59 insertions(+), 80 deletions(-) diff --git a/block/bsg.c b/block/bsg.c index 206060e795d..2d746e34f4c 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -445,14 +445,14 @@ static int blk_complete_sgv4_hdr_rq(struct request *rq, struct sg_io_v4 *hdr, } if (rq->next_rq) { - hdr->dout_resid = rq->data_len; - hdr->din_resid = rq->next_rq->data_len; + hdr->dout_resid = rq->resid_len; + hdr->din_resid = rq->next_rq->resid_len; blk_rq_unmap_user(bidi_bio); blk_put_request(rq->next_rq); } else if (rq_data_dir(rq) == READ) - hdr->din_resid = rq->data_len; + hdr->din_resid = rq->resid_len; else - hdr->dout_resid = rq->data_len; + hdr->dout_resid = rq->resid_len; /* * If the request generated a negative error number, return it diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index 58cf4560f74..a9670dd4b5d 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -230,7 +230,7 @@ static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, hdr->info = 0; if (hdr->masked_status || hdr->host_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; - hdr->resid = rq->data_len; + hdr->resid = rq->resid_len; hdr->sb_len_wr = 0; if (rq->sense_len && hdr->sbp) { diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index 4d4d5e0d3fa..f22d4932433 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1299,7 +1299,6 @@ static void cciss_softirq_done(struct request *rq) { CommandList_struct *cmd = rq->completion_data; ctlr_info_t *h = hba[cmd->ctlr]; - unsigned int nr_bytes; unsigned long flags; u64bit temp64; int i, ddir; @@ -1321,15 +1320,11 @@ static void cciss_softirq_done(struct request *rq) printk("Done with %p\n", rq); #endif /* CCISS_DEBUG */ - /* - * Store the full size and set the residual count for pc requests - */ - nr_bytes = blk_rq_bytes(rq); + /* set the residual count for pc requests */ if (blk_pc_request(rq)) - rq->data_len = cmd->err_info->ResidualCnt; + rq->resid_len = cmd->err_info->ResidualCnt; - if (blk_end_request(rq, (rq->errors == 0) ? 0 : -EIO, nr_bytes)) - BUG(); + blk_end_request_all(rq, (rq->errors == 0) ? 0 : -EIO); spin_lock_irqsave(&h->lock, flags); cmd_free(h, cmd, 1); @@ -2691,7 +2686,7 @@ static inline void complete_command(ctlr_info_t *h, CommandList_struct *cmd, printk(KERN_WARNING "cciss: cmd %p has" " completed with data underrun " "reported\n", cmd); - cmd->rq->data_len = cmd->err_info->ResidualCnt; + cmd->rq->resid_len = cmd->err_info->ResidualCnt; } break; case CMD_DATA_OVERRUN: diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 689cd27ac89..8c2cc71327e 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -783,10 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) if (cmd->error == 0) { if (blk_pc_request(rq)) { - if (cmd->act_len >= rq->data_len) - rq->data_len = 0; - else - rq->data_len -= cmd->act_len; + if (cmd->act_len < rq->data_len) + rq->resid_len = rq->data_len - cmd->act_len; scsi_status = 0; } else { if (cmd->act_len != cmd->len) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index afe5a432387..e4a02a05fc8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -367,7 +367,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) /* No more interrupts */ if ((stat & ATA_DRQ) == 0) { int uptodate, error; - unsigned int done; debug_log("Packet command completed, %d bytes transferred\n", pc->xferred); @@ -406,12 +405,6 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) if ((pc->flags & PC_FLAG_WAIT_FOR_DSC) && (stat & ATA_DSC) == 0) dsc = 1; - /* - * ->pc_callback() might change rq->data_len for - * residual count, cache total length. - */ - done = blk_rq_bytes(rq); - /* Command finished - Call the callback function */ uptodate = drive->pc_callback(drive, dsc); @@ -431,7 +424,7 @@ static ide_startstop_t ide_pc_intr(ide_drive_t *drive) error = uptodate ? 0 : -EIO; } - ide_complete_rq(drive, error, done); + ide_complete_rq(drive, error, blk_rq_bytes(rq)); return ide_stopped; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 673628790f1..8bbe222c5e4 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -519,7 +519,7 @@ int ide_cd_queue_pc(ide_drive_t *drive, const unsigned char *cmd, error = blk_execute_rq(drive->queue, info->disk, rq, 0); if (buffer) - *bufflen = rq->data_len; + *bufflen = rq->resid_len; flags = rq->cmd_flags; blk_put_request(rq); @@ -707,11 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_pc_request(rq) && rc == 0) { - unsigned int dlen = rq->data_len; - - rq->data_len = 0; - - if (blk_end_request(rq, 0, dlen)) + if (blk_end_request(rq, 0, rq->data_len)) BUG(); hwif->rq = NULL; @@ -740,9 +736,10 @@ out_end: nsectors = 1; if (blk_fs_request(rq) == 0) { - rq->data_len -= (cmd->nbytes - cmd->nleft); + rq->resid_len = rq->data_len - + (cmd->nbytes - cmd->nleft); if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE)) - rq->data_len += cmd->last_xfer_len; + rq->resid_len += cmd->last_xfer_len; } ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 7149224d1fe..65c5b705883 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->data_len -= blocks * tape->blk_size; + rq->resid_len = rq->data_len - blocks * tape->blk_size; if (pc->error) { uptodate = 0; @@ -903,7 +903,7 @@ static int idetape_queue_rw_tail(ide_drive_t *drive, int cmd, int size) blk_execute_rq(drive->queue, tape->disk, rq, 0); /* calculate the number of transferred bytes and update buffer state */ - size -= rq->data_len; + size -= rq->resid_len; tape->cur = tape->buf; if (cmd == REQ_IDETAPE_READ) tape->valid = size; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index a9019f081b9..5d5f34715de 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1357,8 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply; memcpy(req->sense, smprep, sizeof(*smprep)); req->sense_len = sizeof(*smprep); - req->data_len = 0; - rsp->data_len -= smprep->ResponseDataLength; + rsp->resid_len = rsp->data_len - smprep->ResponseDataLength; } else { printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n", ioc->name, __func__); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 3da02e43678..6605ec905cc 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1936,12 +1936,8 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, bio_data(rsp->bio), rsp->data_len); if (ret > 0) { /* positive number is the untransferred residual */ - rsp->data_len = ret; - req->data_len = 0; + rsp->resid_len = ret; ret = 0; - } else if (ret == 0) { - rsp->data_len = 0; - req->data_len = 0; } return ret; diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c index d110a366c48..89952edd0be 100644 --- a/drivers/scsi/libsas/sas_host_smp.c +++ b/drivers/scsi/libsas/sas_host_smp.c @@ -134,7 +134,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, { u8 *req_data = NULL, *resp_data = NULL, *buf; struct sas_ha_struct *sas_ha = SHOST_TO_SAS_HA(shost); - int error = -EINVAL, resp_data_len = rsp->data_len; + int error = -EINVAL; /* eight is the minimum size for request and response frames */ if (req->data_len < 8 || rsp->data_len < 8) @@ -176,17 +176,20 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, resp_data[1] = req_data[1]; resp_data[2] = SMP_RESP_FUNC_UNK; + req->resid_len = req->data_len; + rsp->resid_len = rsp->data_len; + switch (req_data[1]) { case SMP_REPORT_GENERAL: - req->data_len -= 8; - resp_data_len -= 32; + req->resid_len -= 8; + rsp->resid_len -= 32; resp_data[2] = SMP_RESP_FUNC_ACC; resp_data[9] = sas_ha->num_phys; break; case SMP_REPORT_MANUF_INFO: - req->data_len -= 8; - resp_data_len -= 64; + req->resid_len -= 8; + rsp->resid_len -= 64; resp_data[2] = SMP_RESP_FUNC_ACC; memcpy(resp_data + 12, shost->hostt->name, SAS_EXPANDER_VENDOR_ID_LEN); @@ -199,13 +202,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_DISCOVER: - req->data_len -= 16; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 16; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 56; + rsp->resid_len -= 56; sas_host_smp_discover(sas_ha, resp_data, req_data[9]); break; @@ -215,13 +218,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_REPORT_PHY_SATA: - req->data_len -= 16; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 16; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 60; + rsp->resid_len -= 60; sas_report_phy_sata(sas_ha, resp_data, req_data[9]); break; @@ -238,13 +241,13 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, break; case SMP_PHY_CONTROL: - req->data_len -= 44; - if ((int)req->data_len < 0) { - req->data_len = 0; + req->resid_len -= 44; + if ((int)req->resid_len < 0) { + req->resid_len = 0; error = -EINVAL; goto out; } - resp_data_len -= 8; + rsp->resid_len -= 8; sas_phy_control(sas_ha, req_data[9], req_data[10], req_data[32] >> 4, req_data[33] >> 4, resp_data); @@ -265,7 +268,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, flush_kernel_dcache_page(bio_page(rsp->bio)); kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0); local_irq_enable(); - rsp->data_len = resp_data_len; out: kfree(req_data); diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index e03dc0b1e1a..53759c566bf 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1170,9 +1170,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, memcpy(req->sense, mpi_reply, sizeof(*mpi_reply)); req->sense_len = sizeof(*mpi_reply); - req->data_len = 0; - rsp->data_len -= mpi_reply->ResponseDataLength; - + rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength; } else { dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - no reply\n", ioc->name, __func__)); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index aa9fc572e45..7d49ef589f3 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -240,11 +240,11 @@ int scsi_execute(struct scsi_device *sdev, const unsigned char *cmd, * is invalid. Prevent the garbage from being misinterpreted * and prevent security leaks by zeroing out the excess data. */ - if (unlikely(req->data_len > 0 && req->data_len <= bufflen)) - memset(buffer + (bufflen - req->data_len), 0, req->data_len); + if (unlikely(req->resid_len > 0 && req->resid_len <= bufflen)) + memset(buffer + (bufflen - req->resid_len), 0, req->resid_len); if (resid) - *resid = req->data_len; + *resid = req->resid_len; ret = req->errors; out: blk_put_request(req); @@ -549,7 +549,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, int leftover = (req->hard_nr_sectors << 9); if (blk_pc_request(req)) - leftover = req->data_len; + leftover = req->resid_len; /* kill remainder if no retrys */ if (error && scsi_noretry_cmd(cmd)) @@ -673,11 +673,11 @@ void scsi_release_buffers(struct scsi_cmnd *cmd) EXPORT_SYMBOL(scsi_release_buffers); /* - * Bidi commands Must be complete as a whole, both sides at once. - * If part of the bytes were written and lld returned - * scsi_in()->resid and/or scsi_out()->resid this information will be left - * in req->data_len and req->next_rq->data_len. The upper-layer driver can - * decide what to do with this information. + * Bidi commands Must be complete as a whole, both sides at once. If + * part of the bytes were written and lld returned scsi_in()->resid + * and/or scsi_out()->resid this information will be left in + * req->resid_len and req->next_rq->resid_len. The upper-layer driver + * can decide what to do with this information. */ static void scsi_end_bidi_request(struct scsi_cmnd *cmd) { @@ -685,8 +685,8 @@ static void scsi_end_bidi_request(struct scsi_cmnd *cmd) unsigned int dlen = req->data_len; unsigned int next_dlen = req->next_rq->data_len; - req->data_len = scsi_out(cmd)->resid; - req->next_rq->data_len = scsi_in(cmd)->resid; + req->resid_len = scsi_out(cmd)->resid; + req->next_rq->resid_len = scsi_in(cmd)->resid; /* The req and req->next_rq have not been completed */ BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); @@ -778,7 +778,7 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) scsi_end_bidi_request(cmd); return; } - req->data_len = scsi_get_resid(cmd); + req->resid_len = scsi_get_resid(cmd); } BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 82312df9b0b..dec4c70677d 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1260,7 +1260,7 @@ static void sg_rq_end_io(struct request *rq, int uptodate) sense = rq->sense; result = rq->errors; - resid = rq->data_len; + resid = rq->resid_len; SCSI_LOG_TIMEOUT(4, printk("sg_cmd_done: %s, pack_id=%d, res=0x%x\n", sdp->disk->disk_name, srp->header.pack_id, result)); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index eb24efea8f1..8681b708344 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -463,7 +463,7 @@ static void st_scsi_execute_end(struct request *req, int uptodate) struct scsi_tape *STp = SRpnt->stp; STp->buffer->cmdstat.midlevel_result = SRpnt->result = req->errors; - STp->buffer->cmdstat.residual = req->data_len; + STp->buffer->cmdstat.residual = req->resid_len; if (SRpnt->waiting) complete(SRpnt->waiting); diff --git a/fs/exofs/osd.c b/fs/exofs/osd.c index b249ae97fb1..06ca92672eb 100644 --- a/fs/exofs/osd.c +++ b/fs/exofs/osd.c @@ -50,10 +50,10 @@ int exofs_check_ok_resid(struct osd_request *or, u64 *in_resid, u64 *out_resid) /* FIXME: should be include in osd_sense_info */ if (in_resid) - *in_resid = or->in.req ? or->in.req->data_len : 0; + *in_resid = or->in.req ? or->in.req->resid_len : 0; if (out_resid) - *out_resid = or->out.req ? or->out.req->data_len : 0; + *out_resid = or->out.req ? or->out.req->resid_len : 0; return ret; } diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3a5b1bd6582..6a967cad89f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -229,6 +229,7 @@ struct request { unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; + unsigned int resid_len; /* residual count */ void *sense; unsigned long deadline; From 5b93629b4509c03ffa87a9316412fedf6f58cb37 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:38 +0900 Subject: [PATCH 0929/2061] block: implement blk_rq_pos/[cur_]sectors() and convert obvious ones Implement accessors - blk_rq_pos(), blk_rq_sectors() and blk_rq_cur_sectors() which return rq->hard_sector, rq->hard_nr_sectors and rq->hard_cur_sectors respectively and convert direct references of the said fields to the accessors. This is in preparation of request data length handling cleanup. Geert : suggested adding const to struct request * parameter to accessors Sergei : spotted error in patch description [ Impact: cleanup ] Signed-off-by: Tejun Heo Acked-by: Geert Uytterhoeven Acked-by: Stephen Rothwell Tested-by: Grant Likely Acked-by: Grant Likely Ackec-by: Sergei Shtylyov Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: James Bottomley Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- block/blk-core.c | 2 +- block/cfq-iosched.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/viodasd.c | 6 +++--- drivers/block/xsysace.c | 10 +++++----- drivers/ide/ide-cd.c | 8 ++++---- drivers/ide/ide-io.c | 4 ++-- drivers/message/i2o/i2o_block.c | 2 +- drivers/scsi/scsi_lib.c | 2 +- include/linux/blkdev.h | 23 ++++++++++++++++++++--- kernel/trace/blktrace.c | 4 ++-- 12 files changed, 42 insertions(+), 25 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c8d087655ef..c167de5b9ef 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -163,7 +163,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) * For an empty barrier, there's no actual BAR request, which * in turn makes POSTFLUSH unnecessary. Mask them off. */ - if (!rq->hard_nr_sectors) { + if (!blk_rq_sectors(rq)) { q->ordered &= ~(QUEUE_ORDERED_DO_BAR | QUEUE_ORDERED_DO_POSTFLUSH); /* diff --git a/block/blk-core.c b/block/blk-core.c index 394c5bd8127..895e55b74a4 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1683,7 +1683,7 @@ static void blk_account_io_done(struct request *req) unsigned int blk_rq_bytes(struct request *rq) { if (blk_fs_request(rq)) - return rq->hard_nr_sectors << 9; + return blk_rq_sectors(rq) << 9; return rq->data_len; } diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index def0c698a4b..575083a9ffe 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -760,7 +760,7 @@ static void cfq_activate_request(struct request_queue *q, struct request *rq) cfq_log_cfqq(cfqd, RQ_CFQQ(rq), "activate rq, drv=%d", cfqd->rq_in_driver); - cfqd->last_position = rq->hard_sector + rq->hard_nr_sectors; + cfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); } static void cfq_deactivate_request(struct request_queue *q, struct request *rq) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f6586e4d351..c2388673684 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -136,7 +136,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n", __func__, __LINE__, op, n, req->nr_sectors, - req->hard_nr_sectors); + blk_rq_sectors(req)); #endif start_sector = req->sector * priv->blocking_factor; diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index ecccf65dce2..e821eed7132 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -368,12 +368,12 @@ static void do_viodasd_request(struct request_queue *q) blkdev_dequeue_request(req); /* check that request contains a valid command */ if (!blk_fs_request(req)) { - viodasd_end_request(req, -EIO, req->hard_nr_sectors); + viodasd_end_request(req, -EIO, blk_rq_sectors(req)); continue; } /* Try sending the request */ if (send_request(req) != 0) - viodasd_end_request(req, -EIO, req->hard_nr_sectors); + viodasd_end_request(req, -EIO, blk_rq_sectors(req)); } } @@ -590,7 +590,7 @@ static int viodasd_handle_read_write(struct vioblocklpevent *bevent) err = vio_lookup_rc(viodasd_err_table, bevent->sub_result); printk(VIOD_KERN_WARNING "read/write error %d:0x%04x (%s)\n", event->xRc, bevent->sub_result, err->msg); - num_sect = req->hard_nr_sectors; + num_sect = blk_rq_sectors(req); } qlock = req->q->queue_lock; spin_lock_irqsave(qlock, irq_flags); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index b1e1d7e5ab1..5722931d14c 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -645,8 +645,8 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, - "request: sec=%llx hcnt=%lx, ccnt=%x, dir=%i\n", - (unsigned long long) req->sector, req->hard_nr_sectors, + "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n", + (unsigned long long) req->sector, blk_rq_sectors(req), req->current_nr_sectors, rq_data_dir(req)); ace->req = req; @@ -654,7 +654,7 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR; ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF); - count = req->hard_nr_sectors; + count = blk_rq_sectors(req); if (rq_data_dir(req)) { /* Kick off write request */ dev_dbg(ace->dev, "write data\n"); @@ -719,8 +719,8 @@ static void ace_fsm_dostate(struct ace_device *ace) /* bio finished; is there another one? */ if (__blk_end_request(ace->req, 0, blk_rq_cur_bytes(ace->req))) { - /* dev_dbg(ace->dev, "next block; h=%li c=%i\n", - * ace->req->hard_nr_sectors, + /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", + * blk_rq_sectors(ace->req), * ace->req->current_nr_sectors); */ ace->data_ptr = ace->req->buffer; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 8bbe222c5e4..182320dd6ea 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -730,7 +730,7 @@ out_end: if (blk_pc_request(rq)) nsectors = (rq->data_len + 511) >> 9; else - nsectors = rq->hard_nr_sectors; + nsectors = blk_rq_sectors(rq); if (nsectors == 0) nsectors = 1; @@ -875,7 +875,7 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, return ide_issue_pc(drive, &cmd); out_end: - nsectors = rq->hard_nr_sectors; + nsectors = blk_rq_sectors(rq); if (nsectors == 0) nsectors = 1; @@ -1359,8 +1359,8 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) { int hard_sect = queue_hardsect_size(q); - long block = (long)rq->hard_sector / (hard_sect >> 9); - unsigned long blocks = rq->hard_nr_sectors / (hard_sect >> 9); + long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); + unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); memset(rq->cmd, 0, BLK_MAX_CDB); diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index a0309ea661a..df23bcbd94b 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -118,7 +118,7 @@ unsigned int ide_rq_bytes(struct request *rq) if (blk_pc_request(rq)) return rq->data_len; else - return rq->hard_cur_sectors << 9; + return blk_rq_cur_sectors(rq) << 9; } EXPORT_SYMBOL_GPL(ide_rq_bytes); @@ -133,7 +133,7 @@ int ide_complete_rq(ide_drive_t *drive, int error, unsigned int nr_bytes) * and complete the whole request right now */ if (blk_noretry_request(rq) && error <= 0) - nr_bytes = rq->hard_nr_sectors << 9; + nr_bytes = blk_rq_sectors(rq) << 9; rc = ide_end_rq(drive, rq, error, nr_bytes); if (rc == 0) diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 221317e6a00..56e60f0d531 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -427,7 +427,7 @@ static void i2o_block_end_request(struct request *req, int error, unsigned long flags; if (blk_end_request(req, error, nr_bytes)) { - int leftover = (req->hard_nr_sectors << KERNEL_SECTOR_SHIFT); + int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); if (blk_pc_request(req)) leftover = req->data_len; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 7d49ef589f3..9ff0ca9988a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, * to queue the remainder of them. */ if (blk_end_request(req, error, bytes)) { - int leftover = (req->hard_nr_sectors << 9); + int leftover = blk_rq_sectors(req) << 9; if (blk_pc_request(req)) leftover = req->resid_len; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6a967cad89f..4e5f8559872 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -832,13 +832,30 @@ static inline void blk_run_address_space(struct address_space *mapping) extern void blkdev_dequeue_request(struct request *req); /* - * blk_end_request() takes bytes instead of sectors as a complete size. - * blk_rq_bytes() returns bytes left to complete in the entire request. - * blk_rq_cur_bytes() returns bytes left to complete in the current segment. + * blk_rq_pos() : the current sector + * blk_rq_bytes() : bytes left in the entire request + * blk_rq_cur_bytes() : bytes left in the current segment + * blk_rq_sectors() : sectors left in the entire request + * blk_rq_cur_sectors() : sectors left in the current segment */ +static inline sector_t blk_rq_pos(const struct request *rq) +{ + return rq->hard_sector; +} + extern unsigned int blk_rq_bytes(struct request *rq); extern unsigned int blk_rq_cur_bytes(struct request *rq); +static inline unsigned int blk_rq_sectors(const struct request *rq) +{ + return rq->hard_nr_sectors; +} + +static inline unsigned int blk_rq_cur_sectors(const struct request *rq) +{ + return rq->hard_cur_sectors; +} + /* * Request completion related functions. * diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 921ef5d1f0b..42f1c11e754 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -646,7 +646,7 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, rw, what, rq->errors, 0, NULL); } } @@ -857,7 +857,7 @@ void blk_add_driver_data(struct request_queue *q, __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, rq->hard_sector, rq->hard_nr_sectors << 9, + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, 0, BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); From 83096ebf1263b2c1ee5e653ba37d993d02e3eb7b Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:39 +0900 Subject: [PATCH 0930/2061] block: convert to pos and nr_sectors accessors With recent cleanups, there is no place where low level driver directly manipulates request fields. This means that the 'hard' request fields always equal the !hard fields. Convert all rq->sectors, nr_sectors and current_nr_sectors references to accessors. While at it, drop superflous blk_rq_pos() < 0 test in swim.c. [ Impact: use pos and nr_sectors accessors ] Signed-off-by: Tejun Heo Acked-by: Geert Uytterhoeven Tested-by: Grant Likely Acked-by: Grant Likely Tested-by: Adrian McMenamin Acked-by: Adrian McMenamin Acked-by: Mike Miller Cc: James Bottomley Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Eric Moore Cc: Alan Stern Cc: FUJITA Tomonori Cc: Pete Zaitcev Cc: Stephen Rothwell Cc: Paul Clements Cc: Tim Waugh Cc: Jeff Garzik Cc: Jeremy Fitzhardinge Cc: Alex Dubov Cc: David Woodhouse Cc: Martin Schwidefsky Cc: Dario Ballabio Cc: David S. Miller Cc: Rusty Russell Cc: unsik Kim Cc: Laurent Vivier Signed-off-by: Jens Axboe --- arch/um/drivers/ubd_kern.c | 2 +- block/as-iosched.c | 18 +++++----- block/blk-barrier.c | 2 +- block/blk-core.c | 17 +++++---- block/blk-merge.c | 10 +++--- block/cfq-iosched.c | 18 +++++----- block/deadline-iosched.c | 2 +- block/elevator.c | 22 ++++++------ drivers/block/DAC960.c | 6 ++-- drivers/block/amiflop.c | 6 ++-- drivers/block/ataflop.c | 10 +++--- drivers/block/cciss.c | 22 ++++++------ drivers/block/cpqarray.c | 9 ++--- drivers/block/floppy.c | 53 +++++++++++++++-------------- drivers/block/hd.c | 14 ++++---- drivers/block/mg_disk.c | 25 +++++++------- drivers/block/nbd.c | 12 +++---- drivers/block/paride/pcd.c | 4 +-- drivers/block/paride/pd.c | 8 ++--- drivers/block/paride/pf.c | 8 ++--- drivers/block/ps3disk.c | 9 +++-- drivers/block/sunvdc.c | 2 +- drivers/block/swim.c | 6 ++-- drivers/block/swim3.c | 34 +++++++++--------- drivers/block/sx8.c | 6 ++-- drivers/block/ub.c | 6 ++-- drivers/block/viodasd.c | 2 +- drivers/block/virtio_blk.c | 2 +- drivers/block/xd.c | 4 +-- drivers/block/xen-blkfront.c | 11 +++--- drivers/block/xsysace.c | 17 ++++----- drivers/block/z2ram.c | 6 ++-- drivers/cdrom/gdrom.c | 6 ++-- drivers/cdrom/viocd.c | 2 +- drivers/memstick/core/mspro_block.c | 6 ++-- drivers/message/i2o/i2o_block.c | 20 ++++++----- drivers/mmc/card/block.c | 10 +++--- drivers/mtd/mtd_blkdevs.c | 7 ++-- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dasd_diag.c | 5 +-- drivers/s390/block/dasd_eckd.c | 6 ++-- drivers/s390/block/dasd_fba.c | 7 ++-- drivers/s390/char/tape_34xx.c | 2 +- drivers/s390/char/tape_3590.c | 2 +- drivers/s390/char/tape_block.c | 2 +- drivers/sbus/char/jsflash.c | 4 +-- drivers/scsi/eata.c | 24 ++++++------- drivers/scsi/lpfc/lpfc_scsi.c | 22 ++++++------ drivers/scsi/scsi_lib.c | 6 ++-- drivers/scsi/sd.c | 24 ++++++------- drivers/scsi/sd_dif.c | 2 +- drivers/scsi/sr.c | 15 ++++---- drivers/scsi/u14-34f.c | 22 ++++++------ include/scsi/scsi_cmnd.h | 2 +- 54 files changed, 292 insertions(+), 279 deletions(-) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 433012764a3..402ba8f70fc 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1239,7 +1239,7 @@ static void do_ubd_request(struct request_queue *q) } req = dev->request; - sector = req->sector; + sector = blk_rq_pos(req); while(dev->start_sg < dev->end_sg){ struct scatterlist *sg = &dev->sg[dev->start_sg]; diff --git a/block/as-iosched.c b/block/as-iosched.c index 45bd07059c2..7a12cf6ee1d 100644 --- a/block/as-iosched.c +++ b/block/as-iosched.c @@ -306,8 +306,8 @@ as_choose_req(struct as_data *ad, struct request *rq1, struct request *rq2) data_dir = rq_is_sync(rq1); last = ad->last_sector[data_dir]; - s1 = rq1->sector; - s2 = rq2->sector; + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); BUG_ON(data_dir != rq_is_sync(rq2)); @@ -566,13 +566,15 @@ static void as_update_iohist(struct as_data *ad, struct as_io_context *aic, as_update_thinktime(ad, aic, thinktime); /* Calculate read -> read seek distance */ - if (aic->last_request_pos < rq->sector) - seek_dist = rq->sector - aic->last_request_pos; + if (aic->last_request_pos < blk_rq_pos(rq)) + seek_dist = blk_rq_pos(rq) - + aic->last_request_pos; else - seek_dist = aic->last_request_pos - rq->sector; + seek_dist = aic->last_request_pos - + blk_rq_pos(rq); as_update_seekdist(ad, aic, seek_dist); } - aic->last_request_pos = rq->sector + rq->nr_sectors; + aic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); set_bit(AS_TASK_IOSTARTED, &aic->state); spin_unlock(&aic->lock); } @@ -587,7 +589,7 @@ static int as_close_req(struct as_data *ad, struct as_io_context *aic, { unsigned long delay; /* jiffies */ sector_t last = ad->last_sector[ad->batch_data_dir]; - sector_t next = rq->sector; + sector_t next = blk_rq_pos(rq); sector_t delta; /* acceptable close offset (in sectors) */ sector_t s; @@ -981,7 +983,7 @@ static void as_move_to_dispatch(struct as_data *ad, struct request *rq) * This has to be set in order to be correctly updated by * as_find_next_rq */ - ad->last_sector[data_dir] = rq->sector + rq->nr_sectors; + ad->last_sector[data_dir] = blk_rq_pos(rq) + blk_rq_sectors(rq); if (data_dir == BLK_RW_SYNC) { struct io_context *ioc = RQ_IOC(rq); diff --git a/block/blk-barrier.c b/block/blk-barrier.c index c167de5b9ef..8713c2fbc4f 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -324,7 +324,7 @@ int blkdev_issue_flush(struct block_device *bdev, sector_t *error_sector) /* * The driver must store the error location in ->bi_sector, if * it supports it. For non-stacked drivers, this should be copied - * from rq->sector. + * from blk_rq_pos(rq). */ if (error_sector) *error_sector = bio->bi_sector; diff --git a/block/blk-core.c b/block/blk-core.c index 895e55b74a4..82dc20621c0 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -72,7 +72,7 @@ static void drive_stat_acct(struct request *rq, int new_io) return; cpu = part_stat_lock(); - part = disk_map_sector_rcu(rq->rq_disk, rq->sector); + part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); if (!new_io) part_stat_inc(cpu, part, merges[rw]); @@ -185,10 +185,9 @@ void blk_dump_rq_flags(struct request *rq, char *msg) rq->rq_disk ? rq->rq_disk->disk_name : "?", rq->cmd_type, rq->cmd_flags); - printk(KERN_INFO " sector %llu, nr/cnr %lu/%u\n", - (unsigned long long)rq->sector, - rq->nr_sectors, - rq->current_nr_sectors); + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", rq->bio, rq->biotail, rq->buffer, rq->data_len); @@ -1557,7 +1556,7 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (rq->nr_sectors > q->max_sectors || + if (blk_rq_sectors(rq) > q->max_sectors || rq->data_len > q->max_hw_sectors << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; @@ -1645,7 +1644,7 @@ static void blk_account_io_completion(struct request *req, unsigned int bytes) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_add(cpu, part, sectors[rw], bytes >> 9); part_stat_unlock(); } @@ -1665,7 +1664,7 @@ static void blk_account_io_done(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_stat_inc(cpu, part, ios[rw]); part_stat_add(cpu, part, ticks[rw], duration); @@ -1846,7 +1845,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) if (error && (blk_fs_request(req) && !(req->cmd_flags & REQ_QUIET))) { printk(KERN_ERR "end_request: I/O error, dev %s, sector %llu\n", req->rq_disk ? req->rq_disk->disk_name : "?", - (unsigned long long)req->sector); + (unsigned long long)blk_rq_pos(req)); } blk_account_io_completion(req, nr_bytes); diff --git a/block/blk-merge.c b/block/blk-merge.c index 23d2a6fe34a..bf62a87a9da 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -259,7 +259,7 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, else max_sectors = q->max_sectors; - if (req->nr_sectors + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -284,7 +284,7 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, max_sectors = q->max_sectors; - if (req->nr_sectors + bio_sectors(bio) > max_sectors) { + if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -315,7 +315,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((req->nr_sectors + next->nr_sectors) > q->max_sectors) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -345,7 +345,7 @@ static void blk_account_io_merge(struct request *req) int cpu; cpu = part_stat_lock(); - part = disk_map_sector_rcu(req->rq_disk, req->sector); + part = disk_map_sector_rcu(req->rq_disk, blk_rq_pos(req)); part_round_stats(cpu, part); part_dec_in_flight(part); @@ -366,7 +366,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, /* * not contiguous */ - if (req->sector + req->nr_sectors != next->sector) + if (blk_rq_pos(req) + blk_rq_sectors(req) != blk_rq_pos(next)) return 0; if (rq_data_dir(req) != rq_data_dir(next) diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 575083a9ffe..db4d990a66b 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -349,8 +349,8 @@ cfq_choose_req(struct cfq_data *cfqd, struct request *rq1, struct request *rq2) else if (rq_is_meta(rq2) && !rq_is_meta(rq1)) return rq2; - s1 = rq1->sector; - s2 = rq2->sector; + s1 = blk_rq_pos(rq1); + s2 = blk_rq_pos(rq2); last = cfqd->last_position; @@ -949,10 +949,10 @@ static struct cfq_queue *cfq_set_active_queue(struct cfq_data *cfqd, static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, struct request *rq) { - if (rq->sector >= cfqd->last_position) - return rq->sector - cfqd->last_position; + if (blk_rq_pos(rq) >= cfqd->last_position) + return blk_rq_pos(rq) - cfqd->last_position; else - return cfqd->last_position - rq->sector; + return cfqd->last_position - blk_rq_pos(rq); } #define CIC_SEEK_THR 8 * 1024 @@ -1918,10 +1918,10 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_io_context *cic, if (!cic->last_request_pos) sdist = 0; - else if (cic->last_request_pos < rq->sector) - sdist = rq->sector - cic->last_request_pos; + else if (cic->last_request_pos < blk_rq_pos(rq)) + sdist = blk_rq_pos(rq) - cic->last_request_pos; else - sdist = cic->last_request_pos - rq->sector; + sdist = cic->last_request_pos - blk_rq_pos(rq); /* * Don't allow the seek distance to get too large from the @@ -2071,7 +2071,7 @@ cfq_rq_enqueued(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_update_io_seektime(cfqd, cic, rq); cfq_update_idle_window(cfqd, cfqq, cic); - cic->last_request_pos = rq->sector + rq->nr_sectors; + cic->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); if (cfqq == cfqd->active_queue) { /* diff --git a/block/deadline-iosched.c b/block/deadline-iosched.c index c4d991d4ade..b547cbca7b2 100644 --- a/block/deadline-iosched.c +++ b/block/deadline-iosched.c @@ -138,7 +138,7 @@ deadline_merge(struct request_queue *q, struct request **req, struct bio *bio) __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); if (__rq) { - BUG_ON(sector != __rq->sector); + BUG_ON(sector != blk_rq_pos(__rq)); if (elv_rq_merge_ok(__rq, bio)) { ret = ELEVATOR_FRONT_MERGE; diff --git a/block/elevator.c b/block/elevator.c index 1af5d9f04af..918920056e4 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -52,7 +52,7 @@ static const int elv_hash_shift = 6; #define ELV_HASH_FN(sec) \ (hash_long(ELV_HASH_BLOCK((sec)), elv_hash_shift)) #define ELV_HASH_ENTRIES (1 << elv_hash_shift) -#define rq_hash_key(rq) ((rq)->sector + (rq)->nr_sectors) +#define rq_hash_key(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) DEFINE_TRACE(block_rq_insert); DEFINE_TRACE(block_rq_issue); @@ -119,9 +119,9 @@ static inline int elv_try_merge(struct request *__rq, struct bio *bio) * we can merge and sequence is ok, check if it's possible */ if (elv_rq_merge_ok(__rq, bio)) { - if (__rq->sector + __rq->nr_sectors == bio->bi_sector) + if (blk_rq_pos(__rq) + blk_rq_sectors(__rq) == bio->bi_sector) ret = ELEVATOR_BACK_MERGE; - else if (__rq->sector - bio_sectors(bio) == bio->bi_sector) + else if (blk_rq_pos(__rq) - bio_sectors(bio) == bio->bi_sector) ret = ELEVATOR_FRONT_MERGE; } @@ -370,9 +370,9 @@ struct request *elv_rb_add(struct rb_root *root, struct request *rq) parent = *p; __rq = rb_entry(parent, struct request, rb_node); - if (rq->sector < __rq->sector) + if (blk_rq_pos(rq) < blk_rq_pos(__rq)) p = &(*p)->rb_left; - else if (rq->sector > __rq->sector) + else if (blk_rq_pos(rq) > blk_rq_pos(__rq)) p = &(*p)->rb_right; else return __rq; @@ -400,9 +400,9 @@ struct request *elv_rb_find(struct rb_root *root, sector_t sector) while (n) { rq = rb_entry(n, struct request, rb_node); - if (sector < rq->sector) + if (sector < blk_rq_pos(rq)) n = n->rb_left; - else if (sector > rq->sector) + else if (sector > blk_rq_pos(rq)) n = n->rb_right; else return rq; @@ -441,14 +441,14 @@ void elv_dispatch_sort(struct request_queue *q, struct request *rq) break; if (pos->cmd_flags & stop_flags) break; - if (rq->sector >= boundary) { - if (pos->sector < boundary) + if (blk_rq_pos(rq) >= boundary) { + if (blk_rq_pos(pos) < boundary) continue; } else { - if (pos->sector >= boundary) + if (blk_rq_pos(pos) >= boundary) break; } - if (rq->sector >= pos->sector) + if (blk_rq_pos(rq) >= blk_rq_pos(pos)) break; } diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index f22ed6cc69f..774ab05973a 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3338,8 +3338,8 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_ } Command->Completion = Request->end_io_data; Command->LogicalDriveNumber = (long)Request->rq_disk->private_data; - Command->BlockNumber = Request->sector; - Command->BlockCount = Request->nr_sectors; + Command->BlockNumber = blk_rq_pos(Request); + Command->BlockCount = blk_rq_sectors(Request); Command->Request = Request; blkdev_dequeue_request(Request); Command->SegmentCount = blk_rq_map_sg(req_q, @@ -3431,7 +3431,7 @@ static void DAC960_queue_partial_rw(DAC960_Command_T *Command) * successfully as possible. */ Command->SegmentCount = 1; - Command->BlockNumber = Request->sector; + Command->BlockNumber = blk_rq_pos(Request); Command->BlockCount = 1; DAC960_QueueReadWriteCommand(Command); return; diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 8ff95f2c0ed..e4a14b94828 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1351,13 +1351,13 @@ static void redo_fd_request(void) drive = floppy - unit; /* Here someone could investigate to be more efficient */ - for (cnt = 0; cnt < CURRENT->current_nr_sectors; cnt++) { + for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) { #ifdef DEBUG printk("fd: sector %ld + %d requested for %s\n", - CURRENT->sector,cnt, + blk_rq_pos(CURRENT), cnt, (rq_data_dir(CURRENT) == READ) ? "read" : "write"); #endif - block = CURRENT->sector + cnt; + block = blk_rq_pos(CURRENT) + cnt; if ((int)block > floppy->blocks) { __blk_end_request_cur(CURRENT, -EIO); goto repeat; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 25067287211..234024cda5e 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -725,7 +725,7 @@ static void do_fd_action( int drive ) if (IS_BUFFERED( drive, ReqSide, ReqTrack )) { if (ReqCmd == READ) { copy_buffer( SECTOR_BUFFER(ReqSector), ReqData ); - if (++ReqCnt < CURRENT->current_nr_sectors) { + if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) { /* read next sector */ setup_req_params( drive ); goto repeat; @@ -1130,7 +1130,7 @@ static void fd_rwsec_done1(int status) } } - if (++ReqCnt < CURRENT->current_nr_sectors) { + if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) { /* read next sector */ setup_req_params( SelectedDrive ); do_fd_action( SelectedDrive ); @@ -1394,7 +1394,7 @@ static void redo_fd_request(void) DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n", CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "", - CURRENT ? CURRENT->sector : 0 )); + CURRENT ? blk_rq_pos(CURRENT) : 0 )); IsFormatting = 0; @@ -1440,7 +1440,7 @@ repeat: UD.autoprobe = 0; } - if (CURRENT->sector + 1 > UDT->blocks) { + if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) { __blk_end_request_cur(CURRENT, -EIO); goto repeat; } @@ -1450,7 +1450,7 @@ repeat: ReqCnt = 0; ReqCmd = rq_data_dir(CURRENT); - ReqBlock = CURRENT->sector; + ReqBlock = blk_rq_pos(CURRENT); ReqBuffer = CURRENT->buffer; setup_req_params( drive ); do_fd_action( drive ); diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index f22d4932433..ab7b04c0db7 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2835,10 +2835,10 @@ static void do_cciss_request(struct request_queue *q) c->Request.Timeout = 0; // Don't time out c->Request.CDB[0] = (rq_data_dir(creq) == READ) ? h->cciss_read : h->cciss_write; - start_blk = creq->sector; + start_blk = blk_rq_pos(creq); #ifdef CCISS_DEBUG - printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n", (int)creq->sector, - (int)creq->nr_sectors); + printk(KERN_DEBUG "ciss: sector =%d nr_sectors=%d\n", + (int)blk_rq_pos(creq), (int)blk_rq_sectors(creq)); #endif /* CCISS_DEBUG */ sg_init_table(tmp_sg, MAXSGENTRIES); @@ -2864,8 +2864,8 @@ static void do_cciss_request(struct request_queue *q) h->maxSG = seg; #ifdef CCISS_DEBUG - printk(KERN_DEBUG "cciss: Submitting %lu sectors in %d segments\n", - creq->nr_sectors, seg); + printk(KERN_DEBUG "cciss: Submitting %u sectors in %d segments\n", + blk_rq_sectors(creq), seg); #endif /* CCISS_DEBUG */ c->Header.SGList = c->Header.SGTotal = seg; @@ -2877,8 +2877,8 @@ static void do_cciss_request(struct request_queue *q) c->Request.CDB[4] = (start_blk >> 8) & 0xff; c->Request.CDB[5] = start_blk & 0xff; c->Request.CDB[6] = 0; // (sect >> 24) & 0xff; MSB - c->Request.CDB[7] = (creq->nr_sectors >> 8) & 0xff; - c->Request.CDB[8] = creq->nr_sectors & 0xff; + c->Request.CDB[7] = (blk_rq_sectors(creq) >> 8) & 0xff; + c->Request.CDB[8] = blk_rq_sectors(creq) & 0xff; c->Request.CDB[9] = c->Request.CDB[11] = c->Request.CDB[12] = 0; } else { u32 upper32 = upper_32_bits(start_blk); @@ -2893,10 +2893,10 @@ static void do_cciss_request(struct request_queue *q) c->Request.CDB[7]= (start_blk >> 16) & 0xff; c->Request.CDB[8]= (start_blk >> 8) & 0xff; c->Request.CDB[9]= start_blk & 0xff; - c->Request.CDB[10]= (creq->nr_sectors >> 24) & 0xff; - c->Request.CDB[11]= (creq->nr_sectors >> 16) & 0xff; - c->Request.CDB[12]= (creq->nr_sectors >> 8) & 0xff; - c->Request.CDB[13]= creq->nr_sectors & 0xff; + c->Request.CDB[10]= (blk_rq_sectors(creq) >> 24) & 0xff; + c->Request.CDB[11]= (blk_rq_sectors(creq) >> 16) & 0xff; + c->Request.CDB[12]= (blk_rq_sectors(creq) >> 8) & 0xff; + c->Request.CDB[13]= blk_rq_sectors(creq) & 0xff; c->Request.CDB[14] = c->Request.CDB[15] = 0; } } else if (blk_pc_request(creq)) { diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index 488a8f4a60a..a5caeff4718 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -919,10 +919,11 @@ queue_next: c->hdr.size = sizeof(rblk_t) >> 2; c->size += sizeof(rblk_t); - c->req.hdr.blk = creq->sector; + c->req.hdr.blk = blk_rq_pos(creq); c->rq = creq; DBGPX( - printk("sector=%d, nr_sectors=%d\n", creq->sector, creq->nr_sectors); + printk("sector=%d, nr_sectors=%u\n", + blk_rq_pos(creq), blk_rq_sectors(creq)); ); sg_init_table(tmp_sg, SG_MAX); seg = blk_rq_map_sg(q, creq, tmp_sg); @@ -940,9 +941,9 @@ DBGPX( tmp_sg[i].offset, tmp_sg[i].length, dir); } -DBGPX( printk("Submitting %d sectors in %d segments\n", creq->nr_sectors, seg); ); +DBGPX( printk("Submitting %u sectors in %d segments\n", blk_rq_sectors(creq), seg); ); c->req.hdr.sg_cnt = seg; - c->req.hdr.blk_cnt = creq->nr_sectors; + c->req.hdr.blk_cnt = blk_rq_sectors(creq); c->req.hdr.cmd = (rq_data_dir(creq) == READ) ? IDA_READ : IDA_WRITE; c->type = CMD_RWREQ; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 1300df6f164..45248628338 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2303,7 +2303,7 @@ static void floppy_end_request(struct request *req, int error) /* current_count_sectors can be zero if transfer failed */ if (error) - nr_sectors = req->current_nr_sectors; + nr_sectors = blk_rq_cur_sectors(req); if (__blk_end_request(req, error, nr_sectors << 9)) return; @@ -2332,7 +2332,7 @@ static void request_done(int uptodate) if (uptodate) { /* maintain values for invalidation on geometry * change */ - block = current_count_sectors + req->sector; + block = current_count_sectors + blk_rq_pos(req); INFBOUND(DRS->maxblock, block); if (block > _floppy->sect) DRS->maxtrack = 1; @@ -2346,10 +2346,10 @@ static void request_done(int uptodate) /* record write error information */ DRWE->write_errors++; if (DRWE->write_errors == 1) { - DRWE->first_error_sector = req->sector; + DRWE->first_error_sector = blk_rq_pos(req); DRWE->first_error_generation = DRS->generation; } - DRWE->last_error_sector = req->sector; + DRWE->last_error_sector = blk_rq_pos(req); DRWE->last_error_generation = DRS->generation; } spin_lock_irqsave(q->queue_lock, flags); @@ -2503,24 +2503,24 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) max_sector = transfer_size(ssize, min(max_sector, max_sector_2), - current_req->nr_sectors); + blk_rq_sectors(current_req)); if (current_count_sectors <= 0 && CT(COMMAND) == FD_WRITE && - buffer_max > fsector_t + current_req->nr_sectors) + buffer_max > fsector_t + blk_rq_sectors(current_req)) current_count_sectors = min_t(int, buffer_max - fsector_t, - current_req->nr_sectors); + blk_rq_sectors(current_req)); remaining = current_count_sectors << 9; #ifdef FLOPPY_SANITY_CHECK - if ((remaining >> 9) > current_req->nr_sectors && + if ((remaining >> 9) > blk_rq_sectors(current_req) && CT(COMMAND) == FD_WRITE) { DPRINT("in copy buffer\n"); printk("current_count_sectors=%ld\n", current_count_sectors); printk("remaining=%d\n", remaining >> 9); - printk("current_req->nr_sectors=%ld\n", - current_req->nr_sectors); + printk("current_req->nr_sectors=%u\n", + blk_rq_sectors(current_req)); printk("current_req->current_nr_sectors=%u\n", - current_req->current_nr_sectors); + blk_rq_cur_sectors(current_req)); printk("max_sector=%d\n", max_sector); printk("ssize=%d\n", ssize); } @@ -2530,7 +2530,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9); - size = current_req->current_nr_sectors << 9; + size = blk_rq_cur_sectors(current_req) << 9; rq_for_each_segment(bv, current_req, iter) { if (!remaining) @@ -2648,10 +2648,10 @@ static int make_raw_rw_request(void) max_sector = _floppy->sect * _floppy->head; - TRACK = (int)current_req->sector / max_sector; - fsector_t = (int)current_req->sector % max_sector; + TRACK = (int)blk_rq_pos(current_req) / max_sector; + fsector_t = (int)blk_rq_pos(current_req) % max_sector; if (_floppy->track && TRACK >= _floppy->track) { - if (current_req->current_nr_sectors & 1) { + if (blk_rq_cur_sectors(current_req) & 1) { current_count_sectors = 1; return 1; } else @@ -2669,7 +2669,7 @@ static int make_raw_rw_request(void) if (fsector_t >= max_sector) { current_count_sectors = min_t(int, _floppy->sect - fsector_t, - current_req->nr_sectors); + blk_rq_sectors(current_req)); return 1; } SIZECODE = 2; @@ -2720,7 +2720,7 @@ static int make_raw_rw_request(void) in_sector_offset = (fsector_t % _floppy->sect) % ssize; aligned_sector_t = fsector_t - in_sector_offset; - max_size = current_req->nr_sectors; + max_size = blk_rq_sectors(current_req); if ((raw_cmd->track == buffer_track) && (current_drive == buffer_drive) && (fsector_t >= buffer_min) && (fsector_t < buffer_max)) { @@ -2729,10 +2729,10 @@ static int make_raw_rw_request(void) copy_buffer(1, max_sector, buffer_max); return 1; } - } else if (in_sector_offset || current_req->nr_sectors < ssize) { + } else if (in_sector_offset || blk_rq_sectors(current_req) < ssize) { if (CT(COMMAND) == FD_WRITE) { - if (fsector_t + current_req->nr_sectors > ssize && - fsector_t + current_req->nr_sectors < ssize + ssize) + if (fsector_t + blk_rq_sectors(current_req) > ssize && + fsector_t + blk_rq_sectors(current_req) < ssize + ssize) max_size = ssize + ssize; else max_size = ssize; @@ -2776,7 +2776,7 @@ static int make_raw_rw_request(void) (indirect * 2 > direct * 3 && *errors < DP->max_errors.read_track && ((!probing || (DP->read_track & (1 << DRS->probed_format)))))) { - max_size = current_req->nr_sectors; + max_size = blk_rq_sectors(current_req); } else { raw_cmd->kernel_data = current_req->buffer; raw_cmd->length = current_count_sectors << 9; @@ -2801,7 +2801,7 @@ static int make_raw_rw_request(void) fsector_t > buffer_max || fsector_t < buffer_min || ((CT(COMMAND) == FD_READ || - (!in_sector_offset && current_req->nr_sectors >= ssize)) && + (!in_sector_offset && blk_rq_sectors(current_req) >= ssize)) && max_sector > 2 * max_buffer_sectors + buffer_min && max_size + fsector_t > 2 * max_buffer_sectors + buffer_min) /* not enough space */ @@ -2879,8 +2879,8 @@ static int make_raw_rw_request(void) printk("write\n"); return 0; } - } else if (raw_cmd->length > current_req->nr_sectors << 9 || - current_count_sectors > current_req->nr_sectors) { + } else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 || + current_count_sectors > blk_rq_sectors(current_req)) { DPRINT("buffer overrun in direct transfer\n"); return 0; } else if (raw_cmd->length < current_count_sectors << 9) { @@ -2990,8 +2990,9 @@ static void do_fd_request(struct request_queue * q) if (usage_count == 0) { printk("warning: usage count=0, current_req=%p exiting\n", current_req); - printk("sect=%ld type=%x flags=%x\n", (long)current_req->sector, - current_req->cmd_type, current_req->cmd_flags); + printk("sect=%ld type=%x flags=%x\n", + (long)blk_rq_pos(current_req), current_req->cmd_type, + current_req->cmd_flags); return; } if (test_bit(0, &fdc_busy)) { diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 75b9ca95c4e..a3b39940ce0 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -228,7 +228,7 @@ static void dump_status(const char *msg, unsigned int stat) printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL), inb(HD_CURRENT) & 0xf, inb(HD_SECTOR)); if (CURRENT) - printk(", sector=%ld", CURRENT->sector); + printk(", sector=%ld", blk_rq_pos(CURRENT)); } printk("\n"); } @@ -457,9 +457,9 @@ ok_to_read: req = CURRENT; insw(HD_DATA, req->buffer, 256); #ifdef DEBUG - printk("%s: read: sector %ld, remaining = %ld, buffer=%p\n", - req->rq_disk->disk_name, req->sector + 1, req->nr_sectors - 1, - req->buffer+512); + printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", + req->rq_disk->disk_name, blk_rq_pos(req) + 1, + blk_rq_sectors(req) - 1, req->buffer+512); #endif if (__blk_end_request(req, 0, 512)) { SET_HANDLER(&read_intr); @@ -485,7 +485,7 @@ static void write_intr(void) continue; if (!OK_STATUS(i)) break; - if ((req->nr_sectors <= 1) || (i & DRQ_STAT)) + if ((blk_rq_sectors(req) <= 1) || (i & DRQ_STAT)) goto ok_to_write; } while (--retries > 0); dump_status("write_intr", i); @@ -589,8 +589,8 @@ repeat: return; } disk = req->rq_disk->private_data; - block = req->sector; - nsect = req->nr_sectors; + block = blk_rq_pos(req); + nsect = blk_rq_sectors(req); if (block >= get_capacity(req->rq_disk) || ((block+nsect) > get_capacity(req->rq_disk))) { printk("%s: bad access: block=%d, count=%d\n", diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 71e56cc28ca..826c3492b9f 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -220,7 +220,8 @@ static void mg_dump_status(const char *msg, unsigned int stat, if (host->breq) { req = elv_next_request(host->breq); if (req) - printk(", sector=%u", (u32)req->sector); + printk(", sector=%u", + (unsigned int)blk_rq_pos(req)); } } @@ -493,12 +494,12 @@ static void mg_read(struct request *req) u32 j; struct mg_host *host = req->rq_disk->private_data; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_RD, NULL) != - MG_ERR_NONE) + if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), + MG_CMD_RD, NULL) != MG_ERR_NONE) mg_bad_rw_intr(host); MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - req->nr_sectors, req->sector, req->buffer); + blk_rq_sectors(req), blk_rq_pos(req), req->buffer); do { u16 *buff = (u16 *)req->buffer; @@ -522,14 +523,14 @@ static void mg_write(struct request *req) u32 j; struct mg_host *host = req->rq_disk->private_data; - if (mg_out(host, req->sector, req->nr_sectors, MG_CMD_WR, NULL) != - MG_ERR_NONE) { + if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), + MG_CMD_WR, NULL) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - req->nr_sectors, req->sector, req->buffer); + blk_rq_sectors(req), blk_rq_pos(req), req->buffer); do { u16 *buff = (u16 *)req->buffer; @@ -579,7 +580,7 @@ ok_to_read: (i << 1)); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - req->sector, req->nr_sectors - 1, req->buffer); + blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); @@ -609,7 +610,7 @@ static void mg_write_intr(struct mg_host *host) break; if (!MG_READY_OK(i)) break; - if ((req->nr_sectors <= 1) || (i & ATA_DRQ)) + if ((blk_rq_sectors(req) <= 1) || (i & ATA_DRQ)) goto ok_to_write; } while (0); mg_dump_status("mg_write_intr", i, host); @@ -627,7 +628,7 @@ ok_to_write: buff++; } MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", - req->sector, req->nr_sectors, req->buffer); + blk_rq_pos(req), blk_rq_sectors(req), req->buffer); host->mg_do_intr = mg_write_intr; mod_timer(&host->timer, jiffies + 3 * HZ); } @@ -749,9 +750,9 @@ static void mg_request(struct request_queue *q) del_timer(&host->timer); - sect_num = req->sector; + sect_num = blk_rq_pos(req); /* deal whole segments */ - sect_cnt = req->nr_sectors; + sect_cnt = blk_rq_sectors(req); /* sanity check */ if (sect_num >= get_capacity(req->rq_disk) || diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index a9ab8be9d92..977a5737793 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req) req, error ? "failed" : "done"); spin_lock_irqsave(q->queue_lock, flags); - __blk_end_request(req, error, req->nr_sectors << 9); + __blk_end_request(req, error, blk_rq_sectors(req) << 9); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -231,19 +231,19 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) { int result, flags; struct nbd_request request; - unsigned long size = req->nr_sectors << 9; + unsigned long size = blk_rq_sectors(req) << 9; request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(nbd_cmd(req)); - request.from = cpu_to_be64((u64) req->sector << 9); + request.from = cpu_to_be64((u64)blk_rq_pos(req) << 9); request.len = htonl(size); memcpy(request.handle, &req, sizeof(req)); - dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%luB)\n", + dprintk(DBG_TX, "%s: request %p: sending control (%s@%llu,%uB)\n", lo->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), - (unsigned long long)req->sector << 9, - req->nr_sectors << 9); + (unsigned long long)blk_rq_pos(req) << 9, + blk_rq_sectors(req) << 9); result = sock_xmit(lo, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 9fd57c2aa46..2d5dc0af55e 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -728,8 +728,8 @@ static void do_pcd_request(struct request_queue * q) if (cd != pcd_current) pcd_bufblk = -1; pcd_current = cd; - pcd_sector = pcd_req->sector; - pcd_count = pcd_req->current_nr_sectors; + pcd_sector = blk_rq_pos(pcd_req); + pcd_count = blk_rq_cur_sectors(pcd_req); pcd_buf = pcd_req->buffer; pcd_busy = 1; ps_set_intr(do_pcd_read, NULL, 0, nice); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 0732df4e901..9ec5d4ac0b6 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -444,11 +444,11 @@ static enum action do_pd_io_start(void) pd_cmd = rq_data_dir(pd_req); if (pd_cmd == READ || pd_cmd == WRITE) { - pd_block = pd_req->sector; - pd_count = pd_req->current_nr_sectors; + pd_block = blk_rq_pos(pd_req); + pd_count = blk_rq_cur_sectors(pd_req); if (pd_block + pd_count > get_capacity(pd_req->rq_disk)) return Fail; - pd_run = pd_req->nr_sectors; + pd_run = blk_rq_sectors(pd_req); pd_buf = pd_req->buffer; pd_retries = 0; if (pd_cmd == READ) @@ -479,7 +479,7 @@ static int pd_next_buf(void) return 0; spin_lock_irqsave(&pd_lock, saved_flags); __blk_end_request_cur(pd_req, 0); - pd_count = pd_req->current_nr_sectors; + pd_count = blk_rq_cur_sectors(pd_req); pd_buf = pd_req->buffer; spin_unlock_irqrestore(&pd_lock, saved_flags); return 0; diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index 3871e3586d6..e88c889aa7f 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -768,9 +768,9 @@ repeat: return; pf_current = pf_req->rq_disk->private_data; - pf_block = pf_req->sector; - pf_run = pf_req->nr_sectors; - pf_count = pf_req->current_nr_sectors; + pf_block = blk_rq_pos(pf_req); + pf_run = blk_rq_sectors(pf_req); + pf_count = blk_rq_cur_sectors(pf_req); if (pf_block + pf_count > get_capacity(pf_req->rq_disk)) { pf_end_request(-EIO); @@ -810,7 +810,7 @@ static int pf_next_buf(void) spin_unlock_irqrestore(&pf_spin_lock, saved_flags); if (!pf_req) return 1; - pf_count = pf_req->current_nr_sectors; + pf_count = blk_rq_cur_sectors(pf_req); pf_buf = pf_req->buffer; } return 0; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index c2388673684..8d583081b50 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -134,13 +134,12 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, rq_for_each_segment(bv, req, iter) n++; dev_dbg(&dev->sbd.core, - "%s:%u: %s req has %u bvecs for %lu sectors %lu hard sectors\n", - __func__, __LINE__, op, n, req->nr_sectors, - blk_rq_sectors(req)); + "%s:%u: %s req has %u bvecs for %u sectors\n", + __func__, __LINE__, op, n, blk_rq_sectors(req)); #endif - start_sector = req->sector * priv->blocking_factor; - sectors = req->nr_sectors * priv->blocking_factor; + start_sector = blk_rq_pos(req) * priv->blocking_factor; + sectors = blk_rq_sectors(req) * priv->blocking_factor; dev_dbg(&dev->sbd.core, "%s:%u: %s %llu sectors starting at %llu\n", __func__, __LINE__, op, sectors, start_sector); diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index f59887c5ffb..9f351bfa15e 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -416,7 +416,7 @@ static int __send_request(struct request *req) desc->slice = 0; } desc->status = ~0; - desc->offset = (req->sector << 9) / port->vdisk_block_size; + desc->offset = (blk_rq_pos(req) << 9) / port->vdisk_block_size; desc->size = len; desc->ncookies = err; diff --git a/drivers/block/swim.c b/drivers/block/swim.c index 97ef4266c4c..fc6a1c32293 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -531,7 +531,7 @@ static void redo_fd_request(struct request_queue *q) while ((req = elv_next_request(q))) { fs = req->rq_disk->private_data; - if (req->sector < 0 || req->sector >= fs->total_secs) { + if (blk_rq_pos(req) >= fs->total_secs) { __blk_end_request_cur(req, -EIO); continue; } @@ -551,8 +551,8 @@ static void redo_fd_request(struct request_queue *q) __blk_end_request_cur(req, -EIO); break; case READ: - if (floppy_read_sectors(fs, req->sector, - req->current_nr_sectors, + if (floppy_read_sectors(fs, blk_rq_pos(req), + blk_rq_cur_sectors(req), req->buffer)) { __blk_end_request_cur(req, -EIO); continue; diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index 424855945b9..c1b9a4dc11b 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -312,14 +312,14 @@ static void start_request(struct floppy_state *fs) } while (fs->state == idle && (req = elv_next_request(swim3_queue))) { #if 0 - printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%ld buf=%p\n", + printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", req->rq_disk->disk_name, req->cmd, - (long)req->sector, req->nr_sectors, req->buffer); - printk(" errors=%d current_nr_sectors=%ld\n", - req->errors, req->current_nr_sectors); + (long)blk_rq_pos(req), blk_rq_sectors(req), req->buffer); + printk(" errors=%d current_nr_sectors=%u\n", + req->errors, blk_rq_cur_sectors(req)); #endif - if (req->sector >= fs->total_secs) { + if (blk_rq_pos(req) >= fs->total_secs) { __blk_end_request_cur(req, -EIO); continue; } @@ -337,13 +337,14 @@ static void start_request(struct floppy_state *fs) } } - /* Do not remove the cast. req->sector is now a sector_t and - * can be 64 bits, but it will never go past 32 bits for this - * driver anyway, so we can safely cast it down and not have - * to do a 64/32 division + /* Do not remove the cast. blk_rq_pos(req) is now a + * sector_t and can be 64 bits, but it will never go + * past 32 bits for this driver anyway, so we can + * safely cast it down and not have to do a 64/32 + * division */ - fs->req_cyl = ((long)req->sector) / fs->secpercyl; - x = ((long)req->sector) % fs->secpercyl; + fs->req_cyl = ((long)blk_rq_pos(req)) / fs->secpercyl; + x = ((long)blk_rq_pos(req)) % fs->secpercyl; fs->head = x / fs->secpertrack; fs->req_sector = x % fs->secpertrack + 1; fd_req = req; @@ -420,7 +421,7 @@ static inline void setup_transfer(struct floppy_state *fs) struct dbdma_cmd *cp = fs->dma_cmd; struct dbdma_regs __iomem *dr = fs->dma; - if (fd_req->current_nr_sectors <= 0) { + if (blk_rq_cur_sectors(fd_req) <= 0) { printk(KERN_ERR "swim3: transfer 0 sectors?\n"); return; } @@ -428,8 +429,8 @@ static inline void setup_transfer(struct floppy_state *fs) n = 1; else { n = fs->secpertrack - fs->req_sector + 1; - if (n > fd_req->current_nr_sectors) - n = fd_req->current_nr_sectors; + if (n > blk_rq_cur_sectors(fd_req)) + n = blk_rq_cur_sectors(fd_req); } fs->scount = n; swim3_select(fs, fs->head? READ_DATA_1: READ_DATA_0); @@ -600,7 +601,8 @@ static void xfer_timeout(unsigned long data) out_8(&sw->control_bic, WRITE_SECTORS | DO_ACTION); out_8(&sw->select, RELAX); printk(KERN_ERR "swim3: timeout %sing sector %ld\n", - (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)fd_req->sector); + (rq_data_dir(fd_req)==WRITE? "writ": "read"), + (long)blk_rq_pos(fd_req)); __blk_end_request_cur(fd_req, -EIO); fs->state = idle; start_request(fs); @@ -714,7 +716,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) } else { printk("swim3: error %sing block %ld (err=%x)\n", rq_data_dir(fd_req) == WRITE? "writ": "read", - (long)fd_req->sector, err); + (long)blk_rq_pos(fd_req), err); __blk_end_request_cur(fd_req, -EIO); fs->state = idle; } diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 60e85bb6f79..087c94c8b2d 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -903,10 +903,10 @@ queue_one_request: msg->sg_count = n_elem; msg->sg_type = SGT_32BIT; msg->handle = cpu_to_le32(TAG_ENCODE(crq->tag)); - msg->lba = cpu_to_le32(rq->sector & 0xffffffff); - tmp = (rq->sector >> 16) >> 16; + msg->lba = cpu_to_le32(blk_rq_pos(rq) & 0xffffffff); + tmp = (blk_rq_pos(rq) >> 16) >> 16; msg->lba_high = cpu_to_le16( (u16) tmp ); - msg->lba_count = cpu_to_le16(rq->nr_sectors); + msg->lba_count = cpu_to_le16(blk_rq_sectors(rq)); msg_size = sizeof(struct carm_msg_rw) - sizeof(msg->sg); for (i = 0; i < n_elem; i++) { diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 8c2cc71327e..dc3b899ad2b 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -726,8 +726,8 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, * The call to blk_queue_hardsect_size() guarantees that request * is aligned, but it is given in terms of 512 byte units, always. */ - block = rq->sector >> lun->capacity.bshift; - nblks = rq->nr_sectors >> lun->capacity.bshift; + block = blk_rq_pos(rq) >> lun->capacity.bshift; + nblks = blk_rq_sectors(rq) >> lun->capacity.bshift; cmd->cdb[0] = (cmd->dir == UB_DIR_READ)? READ_10: WRITE_10; /* 10-byte uses 4 bytes of LBA: 2147483648KB, 2097152MB, 2048GB */ @@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, cmd->cdb[8] = nblks; cmd->cdb_len = 10; - cmd->len = rq->nr_sectors * 512; + cmd->len = blk_rq_sectors(rq) * 512; } static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index e821eed7132..2086cb12d3e 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -252,7 +252,7 @@ static int send_request(struct request *req) struct viodasd_device *d; unsigned long flags; - start = (u64)req->sector << 9; + start = (u64)blk_rq_pos(req) << 9; if (rq_data_dir(req) == READ) { direction = DMA_FROM_DEVICE; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 50745e64414..1980ab45635 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -85,7 +85,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, vbr->req = req; if (blk_fs_request(vbr->req)) { vbr->out_hdr.type = 0; - vbr->out_hdr.sector = vbr->req->sector; + vbr->out_hdr.sector = blk_rq_pos(vbr->req); vbr->out_hdr.ioprio = req_get_ioprio(vbr->req); } else if (blk_pc_request(vbr->req)) { vbr->out_hdr.type = VIRTIO_BLK_T_SCSI_CMD; diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 14be4c1ed1a..4ef88018bcd 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -306,8 +306,8 @@ static void do_xd_request (struct request_queue * q) return; while ((req = elv_next_request(q)) != NULL) { - unsigned block = req->sector; - unsigned count = req->nr_sectors; + unsigned block = blk_rq_pos(req); + unsigned count = blk_rq_sectors(req); XD_INFO *disk = req->rq_disk->private_data; int res = 0; int retry; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index b4564479f64..91fc56597e9 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -231,7 +231,7 @@ static int blkif_queue_request(struct request *req) info->shadow[id].request = (unsigned long)req; ring_req->id = id; - ring_req->sector_number = (blkif_sector_t)req->sector; + ring_req->sector_number = (blkif_sector_t)blk_rq_pos(req); ring_req->handle = info->handle; ring_req->operation = rq_data_dir(req) ? @@ -310,11 +310,10 @@ static void do_blkif_request(struct request_queue *rq) goto wait; pr_debug("do_blk_req %p: cmd %p, sec %lx, " - "(%u/%li) buffer:%p [%s]\n", - req, req->cmd, (unsigned long)req->sector, - req->current_nr_sectors, - req->nr_sectors, req->buffer, - rq_data_dir(req) ? "write" : "read"); + "(%u/%u) buffer:%p [%s]\n", + req, req->cmd, (unsigned long)blk_rq_pos(req), + blk_rq_cur_sectors(req), blk_rq_sectors(req), + req->buffer, rq_data_dir(req) ? "write" : "read"); blkdev_dequeue_request(req); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 5722931d14c..97c99b43f88 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -646,13 +646,14 @@ static void ace_fsm_dostate(struct ace_device *ace) /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, "request: sec=%llx hcnt=%x, ccnt=%x, dir=%i\n", - (unsigned long long) req->sector, blk_rq_sectors(req), - req->current_nr_sectors, rq_data_dir(req)); + (unsigned long long)blk_rq_pos(req), + blk_rq_sectors(req), blk_rq_cur_sectors(req), + rq_data_dir(req)); ace->req = req; ace->data_ptr = req->buffer; - ace->data_count = req->current_nr_sectors * ACE_BUF_PER_SECTOR; - ace_out32(ace, ACE_MPULBA, req->sector & 0x0FFFFFFF); + ace->data_count = blk_rq_cur_sectors(req) * ACE_BUF_PER_SECTOR; + ace_out32(ace, ACE_MPULBA, blk_rq_pos(req) & 0x0FFFFFFF); count = blk_rq_sectors(req); if (rq_data_dir(req)) { @@ -688,7 +689,7 @@ static void ace_fsm_dostate(struct ace_device *ace) dev_dbg(ace->dev, "CFBSY set; t=%i iter=%i c=%i dc=%i irq=%i\n", ace->fsm_task, ace->fsm_iter_num, - ace->req->current_nr_sectors * 16, + blk_rq_cur_sectors(ace->req) * 16, ace->data_count, ace->in_irq); ace_fsm_yield(ace); /* need to poll CFBSY bit */ break; @@ -697,7 +698,7 @@ static void ace_fsm_dostate(struct ace_device *ace) dev_dbg(ace->dev, "DATABUF not set; t=%i iter=%i c=%i dc=%i irq=%i\n", ace->fsm_task, ace->fsm_iter_num, - ace->req->current_nr_sectors * 16, + blk_rq_cur_sectors(ace->req) * 16, ace->data_count, ace->in_irq); ace_fsm_yieldirq(ace); break; @@ -721,10 +722,10 @@ static void ace_fsm_dostate(struct ace_device *ace) blk_rq_cur_bytes(ace->req))) { /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", * blk_rq_sectors(ace->req), - * ace->req->current_nr_sectors); + * blk_rq_cur_sectors(ace->req)); */ ace->data_ptr = ace->req->buffer; - ace->data_count = ace->req->current_nr_sectors * 16; + ace->data_count = blk_rq_cur_sectors(ace->req) * 16; ace_fsm_yieldirq(ace); break; } diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index b66ad58a3c3..d4e6b71f514 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -71,12 +71,12 @@ static void do_z2_request(struct request_queue *q) { struct request *req; while ((req = elv_next_request(q)) != NULL) { - unsigned long start = req->sector << 9; - unsigned long len = req->current_nr_sectors << 9; + unsigned long start = blk_rq_pos(req) << 9; + unsigned long len = blk_rq_cur_sectors(req) << 9; if (start + len > z2ram_size) { printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", - req->sector, req->current_nr_sectors); + blk_rq_pos(req), blk_rq_cur_sectors(req)); __blk_end_request_cur(req, -EIO); continue; } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index cab2b1fb2fe..488423cab51 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -584,8 +584,8 @@ static void gdrom_readdisk_dma(struct work_struct *work) list_for_each_safe(elem, next, &gdrom_deferred) { req = list_entry(elem, struct request, queuelist); spin_unlock(&gdrom_lock); - block = req->sector/GD_TO_BLK + GD_SESSION_OFFSET; - block_cnt = req->nr_sectors/GD_TO_BLK; + block = blk_rq_pos(req)/GD_TO_BLK + GD_SESSION_OFFSET; + block_cnt = blk_rq_sectors(req)/GD_TO_BLK; ctrl_outl(PHYSADDR(req->buffer), GDROM_DMA_STARTADDR_REG); ctrl_outl(block_cnt * GDROM_HARD_SECTOR, GDROM_DMA_LENGTH_REG); ctrl_outl(1, GDROM_DMA_DIRECTION_REG); @@ -661,7 +661,7 @@ static void gdrom_request(struct request_queue *rq) printk(" write request ignored\n"); __blk_end_request_cur(req, -EIO); } - if (req->nr_sectors) + if (blk_rq_sectors(req)) gdrom_request_handler_dma(req); else __blk_end_request_cur(req, -EIO); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index cc3efa096e1..6e190a93d8d 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -282,7 +282,7 @@ static int send_request(struct request *req) viopath_targetinst(viopath_hostLp), (u64)req, VIOVERSION << 16, ((u64)DEVICE_NR(diskinfo) << 48) | dmaaddr, - (u64)req->sector * 512, len, 0); + (u64)blk_rq_pos(req) * 512, len, 0); if (hvrc != HvLpEvent_Rc_Good) { printk(VIOCD_KERN_WARNING "hv error on op %d\n", (int)hvrc); return -1; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index a41634699f8..9e600d22f40 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -677,10 +677,10 @@ try_again: continue; } - t_sec = msb->block_req->sector << 9; + t_sec = blk_rq_pos(msb->block_req) << 9; sector_div(t_sec, msb->page_size); - count = msb->block_req->nr_sectors << 9; + count = blk_rq_sectors(msb->block_req) << 9; count /= msb->page_size; param.system = msb->system; @@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) t_len *= msb->page_size; } } else - t_len = msb->block_req->nr_sectors << 9; + t_len = blk_rq_sectors(msb->block_req) << 9; dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 56e60f0d531..510eb472637 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -761,7 +761,7 @@ static int i2o_block_transfer(struct request *req) break; case CACHE_SMARTFETCH: - if (req->nr_sectors > 16) + if (blk_rq_sectors(req) > 16) ctl_flags = 0x201F0008; else ctl_flags = 0x001F0000; @@ -781,13 +781,13 @@ static int i2o_block_transfer(struct request *req) ctl_flags = 0x001F0010; break; case CACHE_SMARTBACK: - if (req->nr_sectors > 16) + if (blk_rq_sectors(req) > 16) ctl_flags = 0x001F0004; else ctl_flags = 0x001F0010; break; case CACHE_SMARTTHROUGH: - if (req->nr_sectors > 16) + if (blk_rq_sectors(req) > 16) ctl_flags = 0x001F0004; else ctl_flags = 0x001F0010; @@ -827,22 +827,24 @@ static int i2o_block_transfer(struct request *req) *mptr++ = cpu_to_le32(scsi_flags); - *((u32 *) & cmd[2]) = cpu_to_be32(req->sector * hwsec); - *((u16 *) & cmd[7]) = cpu_to_be16(req->nr_sectors * hwsec); + *((u32 *) & cmd[2]) = cpu_to_be32(blk_rq_pos(req) * hwsec); + *((u16 *) & cmd[7]) = cpu_to_be16(blk_rq_sectors(req) * hwsec); memcpy(mptr, cmd, 10); mptr += 4; - *mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT); + *mptr++ = + cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); } else #endif { msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid); *mptr++ = cpu_to_le32(ctl_flags); - *mptr++ = cpu_to_le32(req->nr_sectors << KERNEL_SECTOR_SHIFT); *mptr++ = - cpu_to_le32((u32) (req->sector << KERNEL_SECTOR_SHIFT)); + cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); *mptr++ = - cpu_to_le32(req->sector >> (32 - KERNEL_SECTOR_SHIFT)); + cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT)); + *mptr++ = + cpu_to_le32(blk_rq_pos(req) >> (32 - KERNEL_SECTOR_SHIFT)); } if (!i2o_block_sglist_alloc(c, ireq, &mptr)) { diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index fe8041e619e..949e99770ad 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -243,7 +243,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) brq.mrq.cmd = &brq.cmd; brq.mrq.data = &brq.data; - brq.cmd.arg = req->sector; + brq.cmd.arg = blk_rq_pos(req); if (!mmc_card_blockaddr(card)) brq.cmd.arg <<= 9; brq.cmd.flags = MMC_RSP_SPI_R1 | MMC_RSP_R1 | MMC_CMD_ADTC; @@ -251,7 +251,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) brq.stop.opcode = MMC_STOP_TRANSMISSION; brq.stop.arg = 0; brq.stop.flags = MMC_RSP_SPI_R1B | MMC_RSP_R1B | MMC_CMD_AC; - brq.data.blocks = req->nr_sectors; + brq.data.blocks = blk_rq_sectors(req); /* * After a read error, we redo the request one sector at a time @@ -293,7 +293,7 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) * Adjust the sg list so it is the same size as the * request. */ - if (brq.data.blocks != req->nr_sectors) { + if (brq.data.blocks != blk_rq_sectors(req)) { int i, data_size = brq.data.blocks << 9; struct scatterlist *sg; @@ -344,8 +344,8 @@ static int mmc_blk_issue_rq(struct mmc_queue *mq, struct request *req) printk(KERN_ERR "%s: error %d transferring data," " sector %u, nr %u, card status %#x\n", req->rq_disk->disk_name, brq.data.error, - (unsigned)req->sector, - (unsigned)req->nr_sectors, status); + (unsigned)blk_rq_pos(req), + (unsigned)blk_rq_sectors(req), status); } if (brq.stop.error) { diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 76c4c8d1307..4ea2e67ac97 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -47,8 +47,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, unsigned long block, nsect; char *buf; - block = req->sector << 9 >> tr->blkshift; - nsect = req->current_nr_sectors << 9 >> tr->blkshift; + block = blk_rq_pos(req) << 9 >> tr->blkshift; + nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift; buf = req->buffer; @@ -59,7 +59,8 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, if (!blk_fs_request(req)) return -EIO; - if (req->sector + req->current_nr_sectors > get_capacity(req->rq_disk)) + if (blk_rq_pos(req) + blk_rq_cur_sectors(req) > + get_capacity(req->rq_disk)) return -EIO; switch(rq_data_dir(req)) { diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index fabec95686b..7df03c7aea0 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -603,7 +603,7 @@ static void dasd_profile_end(struct dasd_block *block, if (dasd_profile_level != DASD_PROFILE_ON) return; - sectors = req->nr_sectors; + sectors = blk_rq_sectors(req); if (!cqr->buildclk || !cqr->startclk || !cqr->stopclk || !cqr->endclk || !sectors) diff --git a/drivers/s390/block/dasd_diag.c b/drivers/s390/block/dasd_diag.c index b9a7f773344..2efaddfae56 100644 --- a/drivers/s390/block/dasd_diag.c +++ b/drivers/s390/block/dasd_diag.c @@ -505,8 +505,9 @@ static struct dasd_ccw_req *dasd_diag_build_cp(struct dasd_device *memdev, return ERR_PTR(-EINVAL); blksize = block->bp_block; /* Calculate record id of first and last block. */ - first_rec = req->sector >> block->s2b_shift; - last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift; + first_rec = blk_rq_pos(req) >> block->s2b_shift; + last_rec = + (blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift; /* Check struct bio and count the number of blocks for the request. */ count = 0; rq_for_each_segment(bv, req, iter) { diff --git a/drivers/s390/block/dasd_eckd.c b/drivers/s390/block/dasd_eckd.c index cb52da033f0..a41c94053e6 100644 --- a/drivers/s390/block/dasd_eckd.c +++ b/drivers/s390/block/dasd_eckd.c @@ -2354,10 +2354,10 @@ static struct dasd_ccw_req *dasd_eckd_build_cp(struct dasd_device *startdev, blksize = block->bp_block; blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize); /* Calculate record id of first and last block. */ - first_rec = first_trk = req->sector >> block->s2b_shift; + first_rec = first_trk = blk_rq_pos(req) >> block->s2b_shift; first_offs = sector_div(first_trk, blk_per_trk); last_rec = last_trk = - (req->sector + req->nr_sectors - 1) >> block->s2b_shift; + (blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift; last_offs = sector_div(last_trk, blk_per_trk); cdlspecial = (private->uses_cdl && first_rec < 2*blk_per_trk); @@ -2420,7 +2420,7 @@ dasd_eckd_free_cp(struct dasd_ccw_req *cqr, struct request *req) private = (struct dasd_eckd_private *) cqr->block->base->private; blksize = cqr->block->bp_block; blk_per_trk = recs_per_track(&private->rdc_data, 0, blksize); - recid = req->sector >> cqr->block->s2b_shift; + recid = blk_rq_pos(req) >> cqr->block->s2b_shift; ccw = cqr->cpaddr; /* Skip over define extent & locate record. */ ccw++; diff --git a/drivers/s390/block/dasd_fba.c b/drivers/s390/block/dasd_fba.c index a3eb6fd1467..8912358daa2 100644 --- a/drivers/s390/block/dasd_fba.c +++ b/drivers/s390/block/dasd_fba.c @@ -270,8 +270,9 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, return ERR_PTR(-EINVAL); blksize = block->bp_block; /* Calculate record id of first and last block. */ - first_rec = req->sector >> block->s2b_shift; - last_rec = (req->sector + req->nr_sectors - 1) >> block->s2b_shift; + first_rec = blk_rq_pos(req) >> block->s2b_shift; + last_rec = + (blk_rq_pos(req) + blk_rq_sectors(req) - 1) >> block->s2b_shift; /* Check struct bio and count the number of blocks for the request. */ count = 0; cidaw = 0; @@ -309,7 +310,7 @@ static struct dasd_ccw_req *dasd_fba_build_cp(struct dasd_device * memdev, ccw = cqr->cpaddr; /* First ccw is define extent. */ define_extent(ccw++, cqr->data, rq_data_dir(req), - block->bp_block, req->sector, req->nr_sectors); + block->bp_block, blk_rq_pos(req), blk_rq_sectors(req)); /* Build locate_record + read/write ccws. */ idaws = (unsigned long *) (cqr->data + sizeof(struct DE_fba_data)); LO_data = (struct LO_fba_data *) (idaws + cidaw); diff --git a/drivers/s390/char/tape_34xx.c b/drivers/s390/char/tape_34xx.c index 5f8e8ef43dd..2d00a383a47 100644 --- a/drivers/s390/char/tape_34xx.c +++ b/drivers/s390/char/tape_34xx.c @@ -1134,7 +1134,7 @@ tape_34xx_bread(struct tape_device *device, struct request *req) /* Setup ccws. */ request->op = TO_BLOCK; start_block = (struct tape_34xx_block_id *) request->cpdata; - start_block->block = req->sector >> TAPEBLOCK_HSEC_S2B; + start_block->block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B; DBF_EVENT(6, "start_block = %i\n", start_block->block); ccw = request->cpaddr; diff --git a/drivers/s390/char/tape_3590.c b/drivers/s390/char/tape_3590.c index 823b05bd0dd..c453b2f3e9f 100644 --- a/drivers/s390/char/tape_3590.c +++ b/drivers/s390/char/tape_3590.c @@ -633,7 +633,7 @@ tape_3590_bread(struct tape_device *device, struct request *req) struct req_iterator iter; DBF_EVENT(6, "xBREDid:"); - start_block = req->sector >> TAPEBLOCK_HSEC_S2B; + start_block = blk_rq_pos(req) >> TAPEBLOCK_HSEC_S2B; DBF_EVENT(6, "start_block = %i\n", start_block); rq_for_each_segment(bv, req, iter) diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 86596d3813b..5d035e4939d 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -87,7 +87,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data) if (ccw_req->rc == 0) /* Update position. */ device->blk_data.block_position = - (req->sector + req->nr_sectors) >> TAPEBLOCK_HSEC_S2B; + (blk_rq_pos(req) + blk_rq_sectors(req)) >> TAPEBLOCK_HSEC_S2B; else /* We lost the position information due to an error. */ device->blk_data.block_position = -1; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 09617884a50..2132c906e53 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -188,8 +188,8 @@ static void jsfd_do_request(struct request_queue *q) while ((req = elv_next_request(q)) != NULL) { struct jsfd_part *jdp = req->rq_disk->private_data; - unsigned long offset = req->sector << 9; - size_t len = req->current_nr_sectors << 9; + unsigned long offset = blk_rq_pos(req) << 9; + size_t len = blk_rq_cur_sectors(req) << 9; if ((offset + len) > jdp->dsize) { __blk_end_request_cur(req, -EIO); diff --git a/drivers/scsi/eata.c b/drivers/scsi/eata.c index be5099dd94b..c7076ce25e2 100644 --- a/drivers/scsi/eata.c +++ b/drivers/scsi/eata.c @@ -1825,7 +1825,7 @@ static int eata2x_queuecommand(struct scsi_cmnd *SCpnt, if (linked_comm && SCpnt->device->queue_depth > 2 && TLDEV(SCpnt->device->type)) { ha->cp_stat[i] = READY; - flush_dev(SCpnt->device, SCpnt->request->sector, ha, 0); + flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 0); return 0; } @@ -2144,13 +2144,13 @@ static int reorder(struct hostdata *ha, unsigned long cursec, if (!cpp->din) input_only = 0; - if (SCpnt->request->sector < minsec) - minsec = SCpnt->request->sector; - if (SCpnt->request->sector > maxsec) - maxsec = SCpnt->request->sector; + if (blk_rq_pos(SCpnt->request) < minsec) + minsec = blk_rq_pos(SCpnt->request); + if (blk_rq_pos(SCpnt->request) > maxsec) + maxsec = blk_rq_pos(SCpnt->request); - sl[n] = SCpnt->request->sector; - ioseek += SCpnt->request->nr_sectors; + sl[n] = blk_rq_pos(SCpnt->request); + ioseek += blk_rq_sectors(SCpnt->request); if (!n) continue; @@ -2190,7 +2190,7 @@ static int reorder(struct hostdata *ha, unsigned long cursec, k = il[n]; cpp = &ha->cp[k]; SCpnt = cpp->SCpnt; - ll[n] = SCpnt->request->nr_sectors; + ll[n] = blk_rq_sectors(SCpnt->request); pl[n] = SCpnt->serial_number; if (!n) @@ -2236,12 +2236,12 @@ static int reorder(struct hostdata *ha, unsigned long cursec, cpp = &ha->cp[k]; SCpnt = cpp->SCpnt; scmd_printk(KERN_INFO, SCpnt, - "%s pid %ld mb %d fc %d nr %d sec %ld ns %ld" + "%s pid %ld mb %d fc %d nr %d sec %ld ns %u" " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n", (ihdlr ? "ihdlr" : "qcomm"), SCpnt->serial_number, k, flushcount, - n_ready, SCpnt->request->sector, - SCpnt->request->nr_sectors, cursec, YESNO(s), + n_ready, blk_rq_pos(SCpnt->request), + blk_rq_sectors(SCpnt->request), cursec, YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only), YESNO(overlap), cpp->din); } @@ -2408,7 +2408,7 @@ static irqreturn_t ihdlr(struct Scsi_Host *shost) if (linked_comm && SCpnt->device->queue_depth > 2 && TLDEV(SCpnt->device->type)) - flush_dev(SCpnt->device, SCpnt->request->sector, ha, 1); + flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), ha, 1); tstatus = status_byte(spp->target_status); diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c index 36fd2e75da1..a8fab397711 100644 --- a/drivers/scsi/lpfc/lpfc_scsi.c +++ b/drivers/scsi/lpfc/lpfc_scsi.c @@ -1313,10 +1313,10 @@ lpfc_parse_bg_err(struct lpfc_hba *phba, struct lpfc_scsi_buf *lpfc_cmd, uint32_t bgstat = bgf->bgstat; uint64_t failing_sector = 0; - printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%lx " + printk(KERN_ERR "BG ERROR in cmd 0x%x lba 0x%llx blk cnt 0x%x " "bgstat=0x%x bghm=0x%x\n", cmd->cmnd[0], (unsigned long long)scsi_get_lba(cmd), - cmd->request->nr_sectors, bgstat, bghm); + blk_rq_sectors(cmd->request), bgstat, bghm); spin_lock(&_dump_buf_lock); if (!_dump_buf_done) { @@ -2375,15 +2375,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *)) if (cmnd->cmnd[0] == READ_10) lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG, "9035 BLKGRD: READ @ sector %llu, " - "count %lu\n", - (unsigned long long)scsi_get_lba(cmnd), - cmnd->request->nr_sectors); + "count %u\n", + (unsigned long long)scsi_get_lba(cmnd), + blk_rq_sectors(cmnd->request)); else if (cmnd->cmnd[0] == WRITE_10) lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG, "9036 BLKGRD: WRITE @ sector %llu, " - "count %lu cmd=%p\n", + "count %u cmd=%p\n", (unsigned long long)scsi_get_lba(cmnd), - cmnd->request->nr_sectors, + blk_rq_sectors(cmnd->request), cmnd); err = lpfc_bg_scsi_prep_dma_buf(phba, lpfc_cmd); @@ -2403,15 +2403,15 @@ lpfc_queuecommand(struct scsi_cmnd *cmnd, void (*done) (struct scsi_cmnd *)) if (cmnd->cmnd[0] == READ_10) lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG, "9040 dbg: READ @ sector %llu, " - "count %lu\n", + "count %u\n", (unsigned long long)scsi_get_lba(cmnd), - cmnd->request->nr_sectors); + blk_rq_sectors(cmnd->request)); else if (cmnd->cmnd[0] == WRITE_10) lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG, "9041 dbg: WRITE @ sector %llu, " - "count %lu cmd=%p\n", + "count %u cmd=%p\n", (unsigned long long)scsi_get_lba(cmnd), - cmnd->request->nr_sectors, cmnd); + blk_rq_sectors(cmnd->request), cmnd); else lpfc_printf_vlog(vport, KERN_WARNING, LOG_BG, "9042 dbg: parser not implemented\n"); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 9ff0ca9988a..39b3acfc0dd 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -787,9 +787,9 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) * Next deal with any sectors which we were able to correctly * handle. */ - SCSI_LOG_HLCOMPLETE(1, printk("%ld sectors total, " + SCSI_LOG_HLCOMPLETE(1, printk("%u sectors total, " "%d bytes done.\n", - req->nr_sectors, good_bytes)); + blk_rq_sectors(req), good_bytes)); /* * Recovered errors need reporting, but they're always treated @@ -968,7 +968,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, if (blk_pc_request(req)) sdb->length = req->data_len; else - sdb->length = req->nr_sectors << 9; + sdb->length = blk_rq_sectors(req) << 9; return BLKPREP_OK; } diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 3fcb64b91c4..70c4dd99bbf 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -383,9 +383,9 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) struct scsi_device *sdp = q->queuedata; struct gendisk *disk = rq->rq_disk; struct scsi_disk *sdkp; - sector_t block = rq->sector; + sector_t block = blk_rq_pos(rq); sector_t threshold; - unsigned int this_count = rq->nr_sectors; + unsigned int this_count = blk_rq_sectors(rq); int ret, host_dif; if (rq->cmd_type == REQ_TYPE_BLOCK_PC) { @@ -412,10 +412,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) this_count)); if (!sdp || !scsi_device_online(sdp) || - block + rq->nr_sectors > get_capacity(disk)) { + block + blk_rq_sectors(rq) > get_capacity(disk)) { SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, - "Finishing %ld sectors\n", - rq->nr_sectors)); + "Finishing %u sectors\n", + blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, "Retry with 0x%p\n", SCpnt)); goto out; @@ -462,7 +462,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) * for this. */ if (sdp->sector_size == 1024) { - if ((block & 1) || (rq->nr_sectors & 1)) { + if ((block & 1) || (blk_rq_sectors(rq) & 1)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; @@ -472,7 +472,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) } } if (sdp->sector_size == 2048) { - if ((block & 3) || (rq->nr_sectors & 3)) { + if ((block & 3) || (blk_rq_sectors(rq) & 3)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; @@ -482,7 +482,7 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) } } if (sdp->sector_size == 4096) { - if ((block & 7) || (rq->nr_sectors & 7)) { + if ((block & 7) || (blk_rq_sectors(rq) & 7)) { scmd_printk(KERN_ERR, SCpnt, "Bad block number requested\n"); goto out; @@ -511,10 +511,10 @@ static int sd_prep_fn(struct request_queue *q, struct request *rq) } SCSI_LOG_HLQUEUE(2, scmd_printk(KERN_INFO, SCpnt, - "%s %d/%ld 512 byte blocks.\n", + "%s %d/%u 512 byte blocks.\n", (rq_data_dir(rq) == WRITE) ? "writing" : "reading", this_count, - rq->nr_sectors)); + blk_rq_sectors(rq))); /* Set RDPROTECT/WRPROTECT if disk is formatted with DIF */ host_dif = scsi_host_dif_capable(sdp->host, sdkp->protection_type); @@ -970,8 +970,8 @@ static struct block_device_operations sd_fops = { static unsigned int sd_completed_bytes(struct scsi_cmnd *scmd) { - u64 start_lba = scmd->request->sector; - u64 end_lba = scmd->request->sector + (scsi_bufflen(scmd) / 512); + u64 start_lba = blk_rq_pos(scmd->request); + u64 end_lba = blk_rq_pos(scmd->request) + (scsi_bufflen(scmd) / 512); u64 bad_lba; int info_valid; diff --git a/drivers/scsi/sd_dif.c b/drivers/scsi/sd_dif.c index 184dff49279..82f14a9482d 100644 --- a/drivers/scsi/sd_dif.c +++ b/drivers/scsi/sd_dif.c @@ -507,7 +507,7 @@ void sd_dif_complete(struct scsi_cmnd *scmd, unsigned int good_bytes) sector_sz = scmd->device->sector_size; sectors = good_bytes / sector_sz; - phys = scmd->request->sector & 0xffffffff; + phys = blk_rq_pos(scmd->request) & 0xffffffff; if (sector_sz == 4096) phys >>= 3; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index 0e1a0f2d2ad..fddba53c7fe 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -292,7 +292,8 @@ static int sr_done(struct scsi_cmnd *SCpnt) if (cd->device->sector_size == 2048) error_sector <<= 2; error_sector &= ~(block_sectors - 1); - good_bytes = (error_sector - SCpnt->request->sector) << 9; + good_bytes = (error_sector - + blk_rq_pos(SCpnt->request)) << 9; if (good_bytes < 0 || good_bytes >= this_count) good_bytes = 0; /* @@ -349,8 +350,8 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) cd->disk->disk_name, block)); if (!cd->device || !scsi_device_online(cd->device)) { - SCSI_LOG_HLQUEUE(2, printk("Finishing %ld sectors\n", - rq->nr_sectors)); + SCSI_LOG_HLQUEUE(2, printk("Finishing %u sectors\n", + blk_rq_sectors(rq))); SCSI_LOG_HLQUEUE(2, printk("Retry with 0x%p\n", SCpnt)); goto out; } @@ -413,7 +414,7 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) /* * request doesn't start on hw block boundary, add scatter pads */ - if (((unsigned int)rq->sector % (s_size >> 9)) || + if (((unsigned int)blk_rq_pos(rq) % (s_size >> 9)) || (scsi_bufflen(SCpnt) % s_size)) { scmd_printk(KERN_NOTICE, SCpnt, "unaligned transfer\n"); goto out; @@ -422,14 +423,14 @@ static int sr_prep_fn(struct request_queue *q, struct request *rq) this_count = (scsi_bufflen(SCpnt) >> 9) / (s_size >> 9); - SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%ld 512 byte blocks.\n", + SCSI_LOG_HLQUEUE(2, printk("%s : %s %d/%u 512 byte blocks.\n", cd->cdi.name, (rq_data_dir(rq) == WRITE) ? "writing" : "reading", - this_count, rq->nr_sectors)); + this_count, blk_rq_sectors(rq))); SCpnt->cmnd[1] = 0; - block = (unsigned int)rq->sector / (s_size >> 9); + block = (unsigned int)blk_rq_pos(rq) / (s_size >> 9); if (this_count > 0xffff) { this_count = 0xffff; diff --git a/drivers/scsi/u14-34f.c b/drivers/scsi/u14-34f.c index 601e95141cb..54023d41fd1 100644 --- a/drivers/scsi/u14-34f.c +++ b/drivers/scsi/u14-34f.c @@ -1306,7 +1306,7 @@ static int u14_34f_queuecommand(struct scsi_cmnd *SCpnt, void (*done)(struct scs if (linked_comm && SCpnt->device->queue_depth > 2 && TLDEV(SCpnt->device->type)) { HD(j)->cp_stat[i] = READY; - flush_dev(SCpnt->device, SCpnt->request->sector, j, FALSE); + flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, FALSE); return 0; } @@ -1610,11 +1610,13 @@ static int reorder(unsigned int j, unsigned long cursec, if (!(cpp->xdir == DTD_IN)) input_only = FALSE; - if (SCpnt->request->sector < minsec) minsec = SCpnt->request->sector; - if (SCpnt->request->sector > maxsec) maxsec = SCpnt->request->sector; + if (blk_rq_pos(SCpnt->request) < minsec) + minsec = blk_rq_pos(SCpnt->request); + if (blk_rq_pos(SCpnt->request) > maxsec) + maxsec = blk_rq_pos(SCpnt->request); - sl[n] = SCpnt->request->sector; - ioseek += SCpnt->request->nr_sectors; + sl[n] = blk_rq_pos(SCpnt->request); + ioseek += blk_rq_sectors(SCpnt->request); if (!n) continue; @@ -1642,7 +1644,7 @@ static int reorder(unsigned int j, unsigned long cursec, if (!input_only) for (n = 0; n < n_ready; n++) { k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt; - ll[n] = SCpnt->request->nr_sectors; pl[n] = SCpnt->serial_number; + ll[n] = blk_rq_sectors(SCpnt->request); pl[n] = SCpnt->serial_number; if (!n) continue; @@ -1666,12 +1668,12 @@ static int reorder(unsigned int j, unsigned long cursec, if (link_statistics && (overlap || !(flushcount % link_statistics))) for (n = 0; n < n_ready; n++) { k = il[n]; cpp = &HD(j)->cp[k]; SCpnt = cpp->SCpnt; - printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %ld"\ + printk("%s %d.%d:%d pid %ld mb %d fc %d nr %d sec %ld ns %u"\ " cur %ld s:%c r:%c rev:%c in:%c ov:%c xd %d.\n", (ihdlr ? "ihdlr" : "qcomm"), SCpnt->channel, SCpnt->target, SCpnt->lun, SCpnt->serial_number, k, flushcount, n_ready, - SCpnt->request->sector, SCpnt->request->nr_sectors, cursec, - YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only), + blk_rq_pos(SCpnt->request), blk_rq_sectors(SCpnt->request), + cursec, YESNO(s), YESNO(r), YESNO(rev), YESNO(input_only), YESNO(overlap), cpp->xdir); } #endif @@ -1799,7 +1801,7 @@ static irqreturn_t ihdlr(unsigned int j) if (linked_comm && SCpnt->device->queue_depth > 2 && TLDEV(SCpnt->device->type)) - flush_dev(SCpnt->device, SCpnt->request->sector, j, TRUE); + flush_dev(SCpnt->device, blk_rq_pos(SCpnt->request), j, TRUE); tstatus = status_byte(spp->target_status); diff --git a/include/scsi/scsi_cmnd.h b/include/scsi/scsi_cmnd.h index 43b50d36925..3878d1dc7f5 100644 --- a/include/scsi/scsi_cmnd.h +++ b/include/scsi/scsi_cmnd.h @@ -270,7 +270,7 @@ static inline unsigned char scsi_get_prot_type(struct scsi_cmnd *scmd) static inline sector_t scsi_get_lba(struct scsi_cmnd *scmd) { - return scmd->request->sector; + return blk_rq_pos(scmd->request); } static inline unsigned scsi_prot_sg_count(struct scsi_cmnd *cmd) From 9780e2dd8254351f6cbe11304849126b51dbd561 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:40 +0900 Subject: [PATCH 0931/2061] ide: convert to rq pos and nr_sectors accessors ide doesn't manipulate request fields anymore and thus all hard and their soft equivalents are always equal. Convert all references to accessors. [ Impact: use pos and nr_sectors accessors ] Signed-off-by: Tejun Heo Acked-by: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Signed-off-by: Jens Axboe --- drivers/ide/ide-cd.c | 8 ++++---- drivers/ide/ide-disk.c | 8 ++++---- drivers/ide/ide-dma.c | 2 +- drivers/ide/ide-floppy.c | 6 +++--- drivers/ide/ide-io.c | 4 ++-- drivers/ide/ide-lib.c | 2 +- drivers/ide/ide-tape.c | 6 +++--- drivers/ide/ide-taskfile.c | 2 +- drivers/ide/pdc202xx_old.c | 2 +- drivers/ide/tc86c001.c | 2 +- drivers/ide/tx4939ide.c | 2 +- 11 files changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 182320dd6ea..eb4f3dc9f18 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -775,8 +775,8 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) } /* fs requests *must* be hardware frame aligned */ - if ((rq->nr_sectors & (sectors_per_frame - 1)) || - (rq->sector & (sectors_per_frame - 1))) + if ((blk_rq_sectors(rq) & (sectors_per_frame - 1)) || + (blk_rq_pos(rq) & (sectors_per_frame - 1))) return ide_stopped; /* use DMA, if possible */ @@ -868,8 +868,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; if (blk_fs_request(rq) || rq->data_len) { - ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? (rq->nr_sectors << 9) - : rq->data_len); + ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? + (blk_rq_sectors(rq) << 9) : rq->data_len); ide_map_sg(drive, &cmd); } diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index c2438804d3c..ad18e14043c 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -82,7 +82,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, sector_t block) { ide_hwif_t *hwif = drive->hwif; - u16 nsectors = (u16)rq->nr_sectors; + u16 nsectors = (u16)blk_rq_sectors(rq); u8 lba48 = !!(drive->dev_flags & IDE_DFLAG_LBA48); u8 dma = !!(drive->dev_flags & IDE_DFLAG_USING_DMA); struct ide_cmd cmd; @@ -90,7 +90,7 @@ static ide_startstop_t __ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ide_startstop_t rc; if ((hwif->host_flags & IDE_HFLAG_NO_LBA48_DMA) && lba48 && dma) { - if (block + rq->nr_sectors > 1ULL << 28) + if (block + blk_rq_sectors(rq) > 1ULL << 28) dma = 0; else lba48 = 0; @@ -195,9 +195,9 @@ static ide_startstop_t ide_do_rw_disk(ide_drive_t *drive, struct request *rq, ledtrig_ide_activity(); - pr_debug("%s: %sing: block=%llu, sectors=%lu, buffer=0x%08lx\n", + pr_debug("%s: %sing: block=%llu, sectors=%u, buffer=0x%08lx\n", drive->name, rq_data_dir(rq) == READ ? "read" : "writ", - (unsigned long long)block, rq->nr_sectors, + (unsigned long long)block, blk_rq_sectors(rq), (unsigned long)rq->buffer); if (hwif->rw_disk) diff --git a/drivers/ide/ide-dma.c b/drivers/ide/ide-dma.c index d9123ecae4a..001f68f0bb2 100644 --- a/drivers/ide/ide-dma.c +++ b/drivers/ide/ide-dma.c @@ -103,7 +103,7 @@ ide_startstop_t ide_dma_intr(ide_drive_t *drive) ide_finish_cmd(drive, cmd, stat); else ide_complete_rq(drive, 0, - cmd->rq->nr_sectors << 9); + blk_rq_sectors(cmd->rq) << 9); return ide_stopped; } printk(KERN_ERR "%s: %s: bad DMA status (0x%02x)\n", diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 537b7c55803..1c460bd5637 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -194,7 +194,7 @@ static void idefloppy_create_rw_cmd(ide_drive_t *drive, { struct ide_disk_obj *floppy = drive->driver_data; int block = sector / floppy->bs_factor; - int blocks = rq->nr_sectors / floppy->bs_factor; + int blocks = blk_rq_sectors(rq) / floppy->bs_factor; int cmd = rq_data_dir(rq); ide_debug_log(IDE_DBG_FUNC, "block: %d, blocks: %d", block, blocks); @@ -259,8 +259,8 @@ static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, goto out_end; } if (blk_fs_request(rq)) { - if (((long)rq->sector % floppy->bs_factor) || - (rq->nr_sectors % floppy->bs_factor)) { + if (((long)blk_rq_pos(rq) % floppy->bs_factor) || + (blk_rq_sectors(rq) % floppy->bs_factor)) { printk(KERN_ERR PFX "%s: unsupported r/w rq size\n", drive->name); goto out_end; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index df23bcbd94b..59799ca5524 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -279,7 +279,7 @@ static ide_startstop_t execute_drive_cmd (ide_drive_t *drive, if (cmd) { if (cmd->protocol == ATA_PROT_PIO) { - ide_init_sg_cmd(cmd, rq->nr_sectors << 9); + ide_init_sg_cmd(cmd, blk_rq_sectors(rq) << 9); ide_map_sg(drive, cmd); } @@ -387,7 +387,7 @@ static ide_startstop_t start_request (ide_drive_t *drive, struct request *rq) drv = *(struct ide_driver **)rq->rq_disk->private_data; - return drv->do_request(drive, rq, rq->sector); + return drv->do_request(drive, rq, blk_rq_pos(rq)); } return do_special(drive); kill_rq: diff --git a/drivers/ide/ide-lib.c b/drivers/ide/ide-lib.c index 56ff8c46c7d..05b7fbc7ead 100644 --- a/drivers/ide/ide-lib.c +++ b/drivers/ide/ide-lib.c @@ -114,7 +114,7 @@ static void ide_dump_ata_error(ide_drive_t *drive, u8 err) if (rq) printk(KERN_CONT ", sector=%llu", - (unsigned long long)rq->sector); + (unsigned long long)blk_rq_pos(rq)); } printk(KERN_CONT "\n"); } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 65c5b705883..c6cf3a951f2 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -586,7 +586,7 @@ static void ide_tape_create_rw_cmd(idetape_tape_t *tape, struct ide_atapi_pc *pc, struct request *rq, u8 opcode) { - unsigned int length = rq->nr_sectors; + unsigned int length = blk_rq_sectors(rq); ide_init_pc(pc); put_unaligned(cpu_to_be32(length), (unsigned int *) &pc->c[1]); @@ -617,8 +617,8 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n" - (unsigned long long)rq->sector, rq->nr_sectors); + debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %u\n" + (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq)); if (!(blk_special_request(rq) || blk_sense_request(rq))) { /* We do not support buffer cache originated requests. */ diff --git a/drivers/ide/ide-taskfile.c b/drivers/ide/ide-taskfile.c index f400eb4d4af..a0c3e1b2f73 100644 --- a/drivers/ide/ide-taskfile.c +++ b/drivers/ide/ide-taskfile.c @@ -385,7 +385,7 @@ out_end: if ((cmd->tf_flags & IDE_TFLAG_FS) == 0) ide_finish_cmd(drive, cmd, stat); else - ide_complete_rq(drive, 0, cmd->rq->nr_sectors << 9); + ide_complete_rq(drive, 0, blk_rq_sectors(cmd->rq) << 9); return ide_stopped; out_err: ide_error_cmd(drive, cmd); diff --git a/drivers/ide/pdc202xx_old.c b/drivers/ide/pdc202xx_old.c index 248a54bd238..c2a16a8f486 100644 --- a/drivers/ide/pdc202xx_old.c +++ b/drivers/ide/pdc202xx_old.c @@ -177,7 +177,7 @@ static void pdc202xx_dma_start(ide_drive_t *drive) u8 clock = inb(high_16 + 0x11); outb(clock | (hwif->channel ? 0x08 : 0x02), high_16 + 0x11); - word_count = (rq->nr_sectors << 8); + word_count = (blk_rq_sectors(rq) << 8); word_count = (rq_data_dir(rq) == READ) ? word_count | 0x05000000 : word_count | 0x06000000; diff --git a/drivers/ide/tc86c001.c b/drivers/ide/tc86c001.c index b4cf42dc8a6..05a93d6baec 100644 --- a/drivers/ide/tc86c001.c +++ b/drivers/ide/tc86c001.c @@ -112,7 +112,7 @@ static void tc86c001_dma_start(ide_drive_t *drive) ide_hwif_t *hwif = drive->hwif; unsigned long sc_base = hwif->config_data; unsigned long twcr_port = sc_base + (drive->dn ? 0x06 : 0x04); - unsigned long nsectors = hwif->rq->nr_sectors; + unsigned long nsectors = blk_rq_sectors(hwif->rq); /* * We have to manually load the sector count and size into diff --git a/drivers/ide/tx4939ide.c b/drivers/ide/tx4939ide.c index 564422d2397..5ca76224f6d 100644 --- a/drivers/ide/tx4939ide.c +++ b/drivers/ide/tx4939ide.c @@ -307,7 +307,7 @@ static int tx4939ide_dma_setup(ide_drive_t *drive, struct ide_cmd *cmd) tx4939ide_writew(SECTOR_SIZE / 2, base, drive->dn ? TX4939IDE_Xfer_Cnt_2 : TX4939IDE_Xfer_Cnt_1); - tx4939ide_writew(cmd->rq->nr_sectors, base, TX4939IDE_Sec_Cnt); + tx4939ide_writew(blk_rq_sectors(cmd->rq), base, TX4939IDE_Sec_Cnt); return 0; } From 2e46e8b27aa57c6bd34b3102b40ee4d0144b4fab Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:41 +0900 Subject: [PATCH 0932/2061] block: drop request->hard_* and *nr_sectors struct request has had a few different ways to represent some properties of a request. ->hard_* represent block layer's view of the request progress (completion cursor) and the ones without the prefix are supposed to represent the issue cursor and allowed to be updated as necessary by the low level drivers. The thing is that as block layer supports partial completion, the two cursors really aren't necessary and only cause confusion. In addition, manual management of request detail from low level drivers is cumbersome and error-prone at the very least. Another interesting duplicate fields are rq->[hard_]nr_sectors and rq->{hard_cur|current}_nr_sectors against rq->data_len and rq->bio->bi_size. This is more convoluted than the hard_ case. rq->[hard_]nr_sectors are initialized for requests with bio but blk_rq_bytes() uses it only for !pc requests. rq->data_len is initialized for all request but blk_rq_bytes() uses it only for pc requests. This causes good amount of confusion throughout block layer and its drivers and determining the request length has been a bit of black magic which may or may not work depending on circumstances and what the specific LLD is actually doing. rq->{hard_cur|current}_nr_sectors represent the number of sectors in the contiguous data area at the front. This is mainly used by drivers which transfers data by walking request segment-by-segment. This value always equals rq->bio->bi_size >> 9. However, data length for pc requests may not be multiple of 512 bytes and using this field becomes a bit confusing. In general, having multiple fields to represent the same property leads only to confusion and subtle bugs. With recent block low level driver cleanups, no driver is accessing or manipulating these duplicate fields directly. Drop all the duplicates. Now rq->sector means the current sector, rq->data_len the current total length and rq->bio->bi_size the current segment length. Everything else is defined in terms of these three and available only through accessors. * blk_recalc_rq_sectors() is collapsed into blk_update_request() and now handles pc and fs requests equally other than rq->sector update. This means that now pc requests can use partial completion too (no in-kernel user yet tho). * bio_cur_sectors() is replaced with bio_cur_bytes() as block layer now uses byte count as the primary data length. * blk_rq_pos() is now guranteed to be always correct. In-block users converted. * blk_rq_bytes() is now guaranteed to be always valid as is blk_rq_sectors(). In-block users converted. * blk_rq_sectors() is now guaranteed to equal blk_rq_bytes() >> 9. More convenient one is used. * blk_rq_bytes() and blk_rq_cur_bytes() are now inlined and take const pointer to request. [ Impact: API cleanup, single way to represent one property of a request ] Signed-off-by: Tejun Heo Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-core.c | 81 +++++++++++++++------------------------- block/blk-merge.c | 36 ++---------------- block/blk.h | 1 - block/cfq-iosched.c | 10 ++--- include/linux/bio.h | 6 +-- include/linux/blkdev.h | 37 ++++++++---------- include/linux/elevator.h | 2 +- kernel/trace/blktrace.c | 16 ++++---- 8 files changed, 67 insertions(+), 122 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 82dc20621c0..3596ca71909 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; - rq->sector = rq->hard_sector = (sector_t) -1; + rq->sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; @@ -189,8 +189,7 @@ void blk_dump_rq_flags(struct request *rq, char *msg) (unsigned long long)blk_rq_pos(rq), blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); printk(KERN_INFO " bio %p, biotail %p, buffer %p, len %u\n", - rq->bio, rq->biotail, - rq->buffer, rq->data_len); + rq->bio, rq->biotail, rq->buffer, blk_rq_bytes(rq)); if (blk_pc_request(rq)) { printk(KERN_INFO " cdb: "); @@ -1096,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; - req->hard_sector = req->sector = bio->bi_sector; + req->sector = bio->bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } @@ -1113,14 +1112,13 @@ static inline bool queue_should_plug(struct request_queue *q) static int __make_request(struct request_queue *q, struct bio *bio) { struct request *req; - int el_ret, nr_sectors; + int el_ret; + unsigned int bytes = bio->bi_size; const unsigned short prio = bio_prio(bio); const int sync = bio_sync(bio); const int unplug = bio_unplug(bio); int rw_flags; - nr_sectors = bio_sectors(bio); - /* * low level driver can indicate that it wants pages above a * certain limit bounced to low memory (ie for highmem, or even @@ -1145,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) req->biotail->bi_next = bio; req->biotail = bio; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1171,10 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) * not touch req->buffer either... */ req->buffer = bio_data(bio); - req->current_nr_sectors = bio_cur_sectors(bio); - req->hard_cur_sectors = req->current_nr_sectors; - req->sector = req->hard_sector = bio->bi_sector; - req->nr_sectors = req->hard_nr_sectors += nr_sectors; + req->sector = bio->bi_sector; + req->data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1557,7 +1553,7 @@ EXPORT_SYMBOL(submit_bio); int blk_rq_check_limits(struct request_queue *q, struct request *rq) { if (blk_rq_sectors(rq) > q->max_sectors || - rq->data_len > q->max_hw_sectors << 9) { + blk_rq_bytes(rq) > q->max_hw_sectors << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1675,35 +1671,6 @@ static void blk_account_io_done(struct request *req) } } -/** - * blk_rq_bytes - Returns bytes left to complete in the entire request - * @rq: the request being processed - **/ -unsigned int blk_rq_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return blk_rq_sectors(rq) << 9; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_bytes); - -/** - * blk_rq_cur_bytes - Returns bytes left to complete in the current segment - * @rq: the request being processed - **/ -unsigned int blk_rq_cur_bytes(struct request *rq) -{ - if (blk_fs_request(rq)) - return rq->current_nr_sectors << 9; - - if (rq->bio) - return rq->bio->bi_size; - - return rq->data_len; -} -EXPORT_SYMBOL_GPL(blk_rq_cur_bytes); - struct request *elv_next_request(struct request_queue *q) { struct request *rq; @@ -1736,7 +1703,7 @@ struct request *elv_next_request(struct request_queue *q) if (rq->cmd_flags & REQ_DONTPREP) break; - if (q->dma_drain_size && rq->data_len) { + if (q->dma_drain_size && blk_rq_bytes(rq)) { /* * make sure space for the drain appears we * know we can do this because max_hw_segments @@ -1759,7 +1726,7 @@ struct request *elv_next_request(struct request_queue *q) * avoid resource deadlock. REQ_STARTED will * prevent other fs requests from passing this one. */ - if (q->dma_drain_size && rq->data_len && + if (q->dma_drain_size && blk_rq_bytes(rq) && !(rq->cmd_flags & REQ_DONTPREP)) { /* * remove the space for the drain we added @@ -1911,8 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * can find how many bytes remain in the request * later. */ - req->nr_sectors = req->hard_nr_sectors = 0; - req->current_nr_sectors = req->hard_cur_sectors = 0; + req->data_len = 0; return false; } @@ -1926,8 +1892,25 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) bio_iovec(bio)->bv_len -= nr_bytes; } - blk_recalc_rq_sectors(req, total_bytes >> 9); + req->data_len -= total_bytes; + req->buffer = bio_data(req->bio); + + /* update sector only for requests with clear definition of sector */ + if (blk_fs_request(req) || blk_discard_rq(req)) + req->sector += total_bytes >> 9; + + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. + */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + printk(KERN_ERR "blk: request botched\n"); + req->data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ blk_recalc_rq_segments(req); + return true; } EXPORT_SYMBOL_GPL(blk_update_request); @@ -2049,11 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } - rq->current_nr_sectors = bio_cur_sectors(bio); - rq->hard_cur_sectors = rq->current_nr_sectors; - rq->hard_nr_sectors = rq->nr_sectors = bio_sectors(bio); rq->data_len = bio->bi_size; - rq->bio = rq->biotail = bio; if (bio->bi_bdev) diff --git a/block/blk-merge.c b/block/blk-merge.c index bf62a87a9da..b8df66aef0f 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -9,35 +9,6 @@ #include "blk.h" -void blk_recalc_rq_sectors(struct request *rq, int nsect) -{ - if (blk_fs_request(rq) || blk_discard_rq(rq)) { - rq->hard_sector += nsect; - rq->hard_nr_sectors -= nsect; - - /* - * Move the I/O submission pointers ahead if required. - */ - if ((rq->nr_sectors >= rq->hard_nr_sectors) && - (rq->sector <= rq->hard_sector)) { - rq->sector = rq->hard_sector; - rq->nr_sectors = rq->hard_nr_sectors; - rq->hard_cur_sectors = bio_cur_sectors(rq->bio); - rq->current_nr_sectors = rq->hard_cur_sectors; - rq->buffer = bio_data(rq->bio); - } - - /* - * if total number of sectors is less than the first segment - * size, something has gone terribly wrong - */ - if (rq->nr_sectors < rq->current_nr_sectors) { - printk(KERN_ERR "blk: request botched\n"); - rq->nr_sectors = rq->current_nr_sectors; - } - } -} - static unsigned int __blk_recalc_rq_segments(struct request_queue *q, struct bio *bio) { @@ -199,8 +170,9 @@ new_segment: if (unlikely(rq->cmd_flags & REQ_COPY_USER) && - (rq->data_len & q->dma_pad_mask)) { - unsigned int pad_len = (q->dma_pad_mask & ~rq->data_len) + 1; + (blk_rq_bytes(rq) & q->dma_pad_mask)) { + unsigned int pad_len = + (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1; sg->length += pad_len; rq->extra_len += pad_len; @@ -398,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = next->bio; req->biotail = next->biotail; - req->nr_sectors = req->hard_nr_sectors += next->hard_nr_sectors; + req->data_len += blk_rq_bytes(next); elv_merge_requests(q, req, next); diff --git a/block/blk.h b/block/blk.h index 51115599df9..ab54529103c 100644 --- a/block/blk.h +++ b/block/blk.h @@ -101,7 +101,6 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, int attempt_back_merge(struct request_queue *q, struct request *rq); int attempt_front_merge(struct request_queue *q, struct request *rq); void blk_recalc_rq_segments(struct request *rq); -void blk_recalc_rq_sectors(struct request *rq, int nsect); void blk_queue_congestion_threshold(struct request_queue *q); diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index db4d990a66b..99ac4304d71 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -579,9 +579,9 @@ cfq_prio_tree_lookup(struct cfq_data *cfqd, struct rb_root *root, * Sort strictly based on sector. Smallest to the left, * largest to the right. */ - if (sector > cfqq->next_rq->sector) + if (sector > blk_rq_pos(cfqq->next_rq)) n = &(*p)->rb_right; - else if (sector < cfqq->next_rq->sector) + else if (sector < blk_rq_pos(cfqq->next_rq)) n = &(*p)->rb_left; else break; @@ -611,8 +611,8 @@ static void cfq_prio_tree_add(struct cfq_data *cfqd, struct cfq_queue *cfqq) return; cfqq->p_root = &cfqd->prio_trees[cfqq->org_ioprio]; - __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, cfqq->next_rq->sector, - &parent, &p); + __cfqq = cfq_prio_tree_lookup(cfqd, cfqq->p_root, + blk_rq_pos(cfqq->next_rq), &parent, &p); if (!__cfqq) { rb_link_node(&cfqq->p_node, parent, p); rb_insert_color(&cfqq->p_node, cfqq->p_root); @@ -996,7 +996,7 @@ static struct cfq_queue *cfqq_close(struct cfq_data *cfqd, if (cfq_rq_close(cfqd, __cfqq->next_rq)) return __cfqq; - if (__cfqq->next_rq->sector < sector) + if (blk_rq_pos(__cfqq->next_rq) < sector) node = rb_next(&__cfqq->p_node); else node = rb_prev(&__cfqq->p_node); diff --git a/include/linux/bio.h b/include/linux/bio.h index f37ca8c726b..d30ec6f30dd 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -218,12 +218,12 @@ struct bio { #define bio_sectors(bio) ((bio)->bi_size >> 9) #define bio_empty_barrier(bio) (bio_barrier(bio) && !bio_has_data(bio) && !bio_discard(bio)) -static inline unsigned int bio_cur_sectors(struct bio *bio) +static inline unsigned int bio_cur_bytes(struct bio *bio) { if (bio->bi_vcnt) - return bio_iovec(bio)->bv_len >> 9; + return bio_iovec(bio)->bv_len; else /* dataless requests such as discard */ - return bio->bi_size >> 9; + return bio->bi_size; } static inline void *bio_data(struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 4e5f8559872..ce2bf5efa9b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -166,19 +166,8 @@ struct request { enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; - /* Maintain bio traversal state for part by part I/O submission. - * hard_* are block layer internals, no driver should touch them! - */ - - sector_t sector; /* next sector to submit */ - sector_t hard_sector; /* next sector to complete */ - unsigned long nr_sectors; /* no. of sectors left to submit */ - unsigned long hard_nr_sectors; /* no. of sectors left to complete */ - /* no. of sectors left to submit in the current segment */ - unsigned int current_nr_sectors; - - /* no. of sectors left to complete in the current segment */ - unsigned int hard_cur_sectors; + sector_t sector; /* sector cursor */ + unsigned int data_len; /* total data len, don't access directly */ struct bio *bio; struct bio *biotail; @@ -226,7 +215,6 @@ struct request { unsigned char __cmd[BLK_MAX_CDB]; unsigned char *cmd; - unsigned int data_len; unsigned int extra_len; /* length of alignment and padding */ unsigned int sense_len; unsigned int resid_len; /* residual count */ @@ -840,20 +828,27 @@ extern void blkdev_dequeue_request(struct request *req); */ static inline sector_t blk_rq_pos(const struct request *rq) { - return rq->hard_sector; + return rq->sector; } -extern unsigned int blk_rq_bytes(struct request *rq); -extern unsigned int blk_rq_cur_bytes(struct request *rq); +static inline unsigned int blk_rq_bytes(const struct request *rq) +{ + return rq->data_len; +} + +static inline int blk_rq_cur_bytes(const struct request *rq) +{ + return rq->bio ? bio_cur_bytes(rq->bio) : 0; +} static inline unsigned int blk_rq_sectors(const struct request *rq) { - return rq->hard_nr_sectors; + return blk_rq_bytes(rq) >> 9; } static inline unsigned int blk_rq_cur_sectors(const struct request *rq) { - return rq->hard_cur_sectors; + return blk_rq_cur_bytes(rq) >> 9; } /* @@ -928,7 +923,7 @@ static inline void blk_end_request_all(struct request *rq, int error) */ static inline bool blk_end_request_cur(struct request *rq, int error) { - return blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } /** @@ -981,7 +976,7 @@ static inline void __blk_end_request_all(struct request *rq, int error) */ static inline bool __blk_end_request_cur(struct request *rq, int error) { - return __blk_end_request(rq, error, rq->hard_cur_sectors << 9); + return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); } extern void blk_complete_request(struct request *); diff --git a/include/linux/elevator.h b/include/linux/elevator.h index c59b769f62b..4e462878c9c 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -171,7 +171,7 @@ enum { ELV_MQUEUE_MUST, }; -#define rq_end_sector(rq) ((rq)->sector + (rq)->nr_sectors) +#define rq_end_sector(rq) (blk_rq_pos(rq) + blk_rq_sectors(rq)) #define rb_entry_rq(node) rb_entry((node), struct request, rb_node) /* diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 42f1c11e754..5708a14bee5 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -642,12 +642,12 @@ static void blk_add_trace_rq(struct request_queue *q, struct request *rq, if (blk_pc_request(rq)) { what |= BLK_TC_ACT(BLK_TC_PC); - __blk_add_trace(bt, 0, rq->data_len, rw, what, rq->errors, - rq->cmd_len, rq->cmd); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), rw, + what, rq->errors, rq->cmd_len, rq->cmd); } else { what |= BLK_TC_ACT(BLK_TC_FS); - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - rw, what, rq->errors, 0, NULL); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), rw, + what, rq->errors, 0, NULL); } } @@ -854,11 +854,11 @@ void blk_add_driver_data(struct request_queue *q, return; if (blk_pc_request(rq)) - __blk_add_trace(bt, 0, rq->data_len, 0, BLK_TA_DRV_DATA, - rq->errors, len, data); + __blk_add_trace(bt, 0, blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); else - __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_sectors(rq) << 9, - 0, BLK_TA_DRV_DATA, rq->errors, len, data); + __blk_add_trace(bt, blk_rq_pos(rq), blk_rq_bytes(rq), 0, + BLK_TA_DRV_DATA, rq->errors, len, data); } EXPORT_SYMBOL_GPL(blk_add_driver_data); From b0790410300abaaf4f25f702803beff701baebf1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:42 +0900 Subject: [PATCH 0933/2061] block: cleanup rq->data_len usages With recent unification of fields, it's now guaranteed that rq->data_len always equals blk_rq_bytes(). Convert all non-IDE direct users to accessors. IDE will be converted in a separate patch. Boaz: spotted incorrect data_len/resid_len conversion in osd. [ Impact: convert direct rq->data_len usages to blk_rq_bytes() ] Signed-off-by: Tejun Heo Acked-by: Sergei Shtylyov Cc: Pete Zaitcev Cc: Eric Moore Cc: Markus Lidel Cc: Darrick J. Wong Cc: James Bottomley Cc: Eric Moore Cc: Boaz Harrosh Cc: FUJITA Tomonori Signed-off-by: Jens Axboe --- drivers/ata/libata-scsi.c | 2 +- drivers/block/ub.c | 8 ++++---- drivers/message/fusion/mptsas.c | 20 ++++++++++---------- drivers/message/i2o/i2o_block.c | 2 +- drivers/scsi/libsas/sas_expander.c | 8 ++++---- drivers/scsi/libsas/sas_host_smp.c | 18 +++++++++--------- drivers/scsi/mpt2sas/mpt2sas_transport.c | 21 +++++++++++---------- drivers/scsi/osd/osd_initiator.c | 4 ++-- drivers/scsi/scsi_lib.c | 13 ++++++------- drivers/scsi/scsi_tgt_lib.c | 2 +- 10 files changed, 49 insertions(+), 49 deletions(-) diff --git a/drivers/ata/libata-scsi.c b/drivers/ata/libata-scsi.c index 2733b0c90b7..6e4c600f5a1 100644 --- a/drivers/ata/libata-scsi.c +++ b/drivers/ata/libata-scsi.c @@ -1084,7 +1084,7 @@ static int atapi_drain_needed(struct request *rq) if (likely(!blk_pc_request(rq))) return 0; - if (!rq->data_len || (rq->cmd_flags & REQ_RW)) + if (!blk_rq_bytes(rq) || (rq->cmd_flags & REQ_RW)) return 0; return atapi_cmd_type(rq->cmd[0]) == ATAPI_MISC; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index dc3b899ad2b..1591f61b6c6 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -747,7 +747,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, { struct request *rq = urq->rq; - if (rq->data_len == 0) { + if (blk_rq_bytes(rq) == 0) { cmd->dir = UB_DIR_NONE; } else { if (rq_data_dir(rq) == WRITE) @@ -762,7 +762,7 @@ static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, memcpy(&cmd->cdb, rq->cmd, rq->cmd_len); cmd->cdb_len = rq->cmd_len; - cmd->len = rq->data_len; + cmd->len = blk_rq_bytes(rq); /* * To reapply this to every URB is not as incorrect as it looks. @@ -783,8 +783,8 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) if (cmd->error == 0) { if (blk_pc_request(rq)) { - if (cmd->act_len < rq->data_len) - rq->resid_len = rq->data_len - cmd->act_len; + if (cmd->act_len < blk_rq_bytes(rq)) + rq->resid_len = blk_rq_bytes(rq) - cmd->act_len; scsi_status = 0; } else { if (cmd->act_len != cmd->len) { diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 5d5f34715de..4e6fcf06a6f 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1277,8 +1277,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* do we need to support multiple segments? */ if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { printk(MYIOC_s_ERR_FMT "%s: multiple segments req %u %u, rsp %u %u\n", - ioc->name, __func__, req->bio->bi_vcnt, req->data_len, - rsp->bio->bi_vcnt, rsp->data_len); + ioc->name, __func__, req->bio->bi_vcnt, blk_rq_bytes(req), + rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); return -EINVAL; } @@ -1295,7 +1295,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smpreq = (SmpPassthroughRequest_t *)mf; memset(smpreq, 0, sizeof(*smpreq)); - smpreq->RequestDataLength = cpu_to_le16(req->data_len - 4); + smpreq->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4); smpreq->Function = MPI_FUNCTION_SMP_PASSTHROUGH; if (rphy) @@ -1321,10 +1321,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI_SGE_FLAGS_END_OF_BUFFER | MPI_SGE_FLAGS_DIRECTION | mpt_addr_size()) << MPI_SGE_FLAGS_SHIFT; - flagsLength |= (req->data_len - 4); + flagsLength |= (blk_rq_bytes(req) - 4); dma_addr_out = pci_map_single(ioc->pcidev, bio_data(req->bio), - req->data_len, PCI_DMA_BIDIRECTIONAL); + blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); if (!dma_addr_out) goto put_mf; mpt_add_sge(psge, flagsLength, dma_addr_out); @@ -1332,9 +1332,9 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* response */ flagsLength = MPT_SGE_FLAGS_SSIMPLE_READ; - flagsLength |= rsp->data_len + 4; + flagsLength |= blk_rq_bytes(rsp) + 4; dma_addr_in = pci_map_single(ioc->pcidev, bio_data(rsp->bio), - rsp->data_len, PCI_DMA_BIDIRECTIONAL); + blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); if (!dma_addr_in) goto unmap; mpt_add_sge(psge, flagsLength, dma_addr_in); @@ -1357,7 +1357,7 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply; memcpy(req->sense, smprep, sizeof(*smprep)); req->sense_len = sizeof(*smprep); - rsp->resid_len = rsp->data_len - smprep->ResponseDataLength; + rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength; } else { printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n", ioc->name, __func__); @@ -1365,10 +1365,10 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, } unmap: if (dma_addr_out) - pci_unmap_single(ioc->pcidev, dma_addr_out, req->data_len, + pci_unmap_single(ioc->pcidev, dma_addr_out, blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); if (dma_addr_in) - pci_unmap_single(ioc->pcidev, dma_addr_in, rsp->data_len, + pci_unmap_single(ioc->pcidev, dma_addr_in, blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); put_mf: if (mf) diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 510eb472637..6b61d289d6c 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -430,7 +430,7 @@ static void i2o_block_end_request(struct request *req, int error, int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); if (blk_pc_request(req)) - leftover = req->data_len; + leftover = blk_rq_bytes(req); if (error) blk_end_request(req, -EIO, leftover); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 6605ec905cc..531af9ed719 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1927,13 +1927,13 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, /* do we need to support multiple segments? */ if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { printk("%s: multiple segments req %u %u, rsp %u %u\n", - __func__, req->bio->bi_vcnt, req->data_len, - rsp->bio->bi_vcnt, rsp->data_len); + __func__, req->bio->bi_vcnt, blk_rq_bytes(req), + rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); return -EINVAL; } - ret = smp_execute_task(dev, bio_data(req->bio), req->data_len, - bio_data(rsp->bio), rsp->data_len); + ret = smp_execute_task(dev, bio_data(req->bio), blk_rq_bytes(req), + bio_data(rsp->bio), blk_rq_bytes(rsp)); if (ret > 0) { /* positive number is the untransferred residual */ rsp->resid_len = ret; diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c index 89952edd0be..be9a951b977 100644 --- a/drivers/scsi/libsas/sas_host_smp.c +++ b/drivers/scsi/libsas/sas_host_smp.c @@ -137,21 +137,21 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, int error = -EINVAL; /* eight is the minimum size for request and response frames */ - if (req->data_len < 8 || rsp->data_len < 8) + if (blk_rq_bytes(req) < 8 || blk_rq_bytes(rsp) < 8) goto out; - if (bio_offset(req->bio) + req->data_len > PAGE_SIZE || - bio_offset(rsp->bio) + rsp->data_len > PAGE_SIZE) { + if (bio_offset(req->bio) + blk_rq_bytes(req) > PAGE_SIZE || + bio_offset(rsp->bio) + blk_rq_bytes(rsp) > PAGE_SIZE) { shost_printk(KERN_ERR, shost, "SMP request/response frame crosses page boundary"); goto out; } - req_data = kzalloc(req->data_len, GFP_KERNEL); + req_data = kzalloc(blk_rq_bytes(req), GFP_KERNEL); /* make sure frame can always be built ... we copy * back only the requested length */ - resp_data = kzalloc(max(rsp->data_len, 128U), GFP_KERNEL); + resp_data = kzalloc(max(blk_rq_bytes(rsp), 128U), GFP_KERNEL); if (!req_data || !resp_data) { error = -ENOMEM; @@ -160,7 +160,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, local_irq_disable(); buf = kmap_atomic(bio_page(req->bio), KM_USER0) + bio_offset(req->bio); - memcpy(req_data, buf, req->data_len); + memcpy(req_data, buf, blk_rq_bytes(req)); kunmap_atomic(buf - bio_offset(req->bio), KM_USER0); local_irq_enable(); @@ -176,8 +176,8 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, resp_data[1] = req_data[1]; resp_data[2] = SMP_RESP_FUNC_UNK; - req->resid_len = req->data_len; - rsp->resid_len = rsp->data_len; + req->resid_len = blk_rq_bytes(req); + rsp->resid_len = blk_rq_bytes(rsp); switch (req_data[1]) { case SMP_REPORT_GENERAL: @@ -264,7 +264,7 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, local_irq_disable(); buf = kmap_atomic(bio_page(rsp->bio), KM_USER0) + bio_offset(rsp->bio); - memcpy(buf, resp_data, rsp->data_len); + memcpy(buf, resp_data, blk_rq_bytes(rsp)); flush_kernel_dcache_page(bio_page(rsp->bio)); kunmap_atomic(buf - bio_offset(rsp->bio), KM_USER0); local_irq_enable(); diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index 53759c566bf..af95a449930 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1041,7 +1041,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, if (req->bio->bi_vcnt > 1 || rsp->bio->bi_vcnt > 1) { printk(MPT2SAS_ERR_FMT "%s: multiple segments req %u %u, " "rsp %u %u\n", ioc->name, __func__, req->bio->bi_vcnt, - req->data_len, rsp->bio->bi_vcnt, rsp->data_len); + blk_rq_bytes(req), rsp->bio->bi_vcnt, blk_rq_bytes(rsp)); return -EINVAL; } @@ -1104,7 +1104,7 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, *((u64 *)&mpi_request->SASAddress) = (rphy) ? cpu_to_le64(rphy->identify.sas_address) : cpu_to_le64(ioc->sas_hba.sas_address); - mpi_request->RequestDataLength = cpu_to_le16(req->data_len - 4); + mpi_request->RequestDataLength = cpu_to_le16(blk_rq_bytes(req) - 4); psge = &mpi_request->SGL; /* WRITE sgel first */ @@ -1112,13 +1112,13 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_HOST_TO_IOC); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; dma_addr_out = pci_map_single(ioc->pdev, bio_data(req->bio), - req->data_len, PCI_DMA_BIDIRECTIONAL); + blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); if (!dma_addr_out) { mpt2sas_base_free_smid(ioc, le16_to_cpu(smid)); goto unmap; } - ioc->base_add_sg_single(psge, sgl_flags | (req->data_len - 4), + ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(req) - 4), dma_addr_out); /* incr sgel */ @@ -1129,14 +1129,14 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, MPI2_SGE_FLAGS_LAST_ELEMENT | MPI2_SGE_FLAGS_END_OF_BUFFER | MPI2_SGE_FLAGS_END_OF_LIST); sgl_flags = sgl_flags << MPI2_SGE_FLAGS_SHIFT; - dma_addr_in = pci_map_single(ioc->pdev, bio_data(rsp->bio), - rsp->data_len, PCI_DMA_BIDIRECTIONAL); + dma_addr_in = pci_map_single(ioc->pdev, bio_data(rsp->bio), + blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); if (!dma_addr_in) { mpt2sas_base_free_smid(ioc, le16_to_cpu(smid)); goto unmap; } - ioc->base_add_sg_single(psge, sgl_flags | (rsp->data_len + 4), + ioc->base_add_sg_single(psge, sgl_flags | (blk_rq_bytes(rsp) + 4), dma_addr_in); dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - " @@ -1170,7 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, memcpy(req->sense, mpi_reply, sizeof(*mpi_reply)); req->sense_len = sizeof(*mpi_reply); - rsp->resid_len = rsp->data_len - mpi_reply->ResponseDataLength; + rsp->resid_len = blk_rq_bytes(rsp) - + mpi_reply->ResponseDataLength; } else { dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - no reply\n", ioc->name, __func__)); @@ -1186,10 +1187,10 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, unmap: if (dma_addr_out) - pci_unmap_single(ioc->pdev, dma_addr_out, req->data_len, + pci_unmap_single(ioc->pdev, dma_addr_out, blk_rq_bytes(req), PCI_DMA_BIDIRECTIONAL); if (dma_addr_in) - pci_unmap_single(ioc->pdev, dma_addr_in, rsp->data_len, + pci_unmap_single(ioc->pdev, dma_addr_in, blk_rq_bytes(rsp), PCI_DMA_BIDIRECTIONAL); out: diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index 2a5f0777148..d178a8b799e 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1299,7 +1299,7 @@ int osd_finalize_request(struct osd_request *or, return ret; } OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n", - _LLU(or->out.total_bytes), or->out.req->data_len); + _LLU(or->out.total_bytes), blk_rq_bytes(or->out.req)); } if (or->in.bio) { ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio); @@ -1308,7 +1308,7 @@ int osd_finalize_request(struct osd_request *or, return ret; } OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n", - _LLU(or->in.total_bytes), or->in.req->data_len); + _LLU(or->in.total_bytes), blk_rq_bytes(or->in.req)); } or->out.pad_buff = sg_out_pad_buffer; diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 39b3acfc0dd..3d16c70fbde 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -682,14 +682,13 @@ EXPORT_SYMBOL(scsi_release_buffers); static void scsi_end_bidi_request(struct scsi_cmnd *cmd) { struct request *req = cmd->request; - unsigned int dlen = req->data_len; - unsigned int next_dlen = req->next_rq->data_len; req->resid_len = scsi_out(cmd)->resid; req->next_rq->resid_len = scsi_in(cmd)->resid; /* The req and req->next_rq have not been completed */ - BUG_ON(blk_end_bidi_request(req, 0, dlen, next_dlen)); + BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req), + blk_rq_bytes(req->next_rq))); scsi_release_buffers(cmd); @@ -966,7 +965,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, BUG_ON(count > sdb->table.nents); sdb->table.nents = count; if (blk_pc_request(req)) - sdb->length = req->data_len; + sdb->length = blk_rq_bytes(req); else sdb->length = blk_rq_sectors(req) << 9; return BLKPREP_OK; @@ -1087,21 +1086,21 @@ int scsi_setup_blk_pc_cmnd(struct scsi_device *sdev, struct request *req) if (unlikely(ret)) return ret; } else { - BUG_ON(req->data_len); + BUG_ON(blk_rq_bytes(req)); memset(&cmd->sdb, 0, sizeof(cmd->sdb)); req->buffer = NULL; } cmd->cmd_len = req->cmd_len; - if (!req->data_len) + if (!blk_rq_bytes(req)) cmd->sc_data_direction = DMA_NONE; else if (rq_data_dir(req) == WRITE) cmd->sc_data_direction = DMA_TO_DEVICE; else cmd->sc_data_direction = DMA_FROM_DEVICE; - cmd->transfersize = req->data_len; + cmd->transfersize = blk_rq_bytes(req); cmd->allowed = req->retries; return BLKPREP_OK; } diff --git a/drivers/scsi/scsi_tgt_lib.c b/drivers/scsi/scsi_tgt_lib.c index 48ba413f7f6..10303272ba4 100644 --- a/drivers/scsi/scsi_tgt_lib.c +++ b/drivers/scsi/scsi_tgt_lib.c @@ -387,7 +387,7 @@ static int scsi_map_user_pages(struct scsi_tgt_cmd *tcmd, struct scsi_cmnd *cmd, * we use REQ_TYPE_BLOCK_PC so scsi_init_io doesn't set the * length for us. */ - cmd->sdb.length = rq->data_len; + cmd->sdb.length = blk_rq_bytes(rq); return 0; From 34b7d2c957199834c474c9d46739265643f4d9c7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:43 +0900 Subject: [PATCH 0934/2061] ide: cleanup rq->data_len usages With recent unification of fields, it's now guaranteed that rq->data_len always equals blk_rq_bytes(). Convert all direct users to accessors. [ Impact: convert direct rq->data_len usages to blk_rq_bytes() ] Signed-off-by: Tejun Heo Acked-by: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 4 ++-- drivers/ide/ide-cd.c | 25 +++++++------------------ drivers/ide/ide-floppy.c | 4 ++-- drivers/ide/ide-io.c | 2 +- drivers/ide/ide-tape.c | 2 +- 5 files changed, 13 insertions(+), 24 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index e4a02a05fc8..792534db8f8 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -255,7 +255,7 @@ void ide_retry_pc(ide_drive_t *drive) ide_init_pc(pc); memcpy(pc->c, sense_rq->cmd, 12); pc->buf = bio_data(sense_rq->bio); /* pointer to mapped address */ - pc->req_xfer = sense_rq->data_len; + pc->req_xfer = blk_rq_bytes(sense_rq); if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); @@ -303,7 +303,7 @@ int ide_cd_get_xferlen(struct request *rq) return 32768; else if (blk_sense_request(rq) || blk_pc_request(rq) || rq->cmd_type == REQ_TYPE_ATA_PC) - return rq->data_len; + return blk_rq_bytes(rq); else return 0; } diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index eb4f3dc9f18..2eadc9d2e96 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -577,7 +577,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) struct request *rq = hwif->rq; ide_expiry_t *expiry = NULL; int dma_error = 0, dma, thislen, uptodate = 0; - int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0, nsectors; + int write = (rq_data_dir(rq) == WRITE) ? 1 : 0, rc = 0; int sense = blk_sense_request(rq); unsigned int timeout; u16 len; @@ -707,9 +707,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_pc_request(rq) && rc == 0) { - if (blk_end_request(rq, 0, rq->data_len)) - BUG(); - + blk_end_request_all(rq, 0); hwif->rq = NULL; } else { if (sense && uptodate) @@ -727,22 +725,14 @@ out_end: ide_cd_error_cmd(drive, cmd); /* make sure it's fully ended */ - if (blk_pc_request(rq)) - nsectors = (rq->data_len + 511) >> 9; - else - nsectors = blk_rq_sectors(rq); - - if (nsectors == 0) - nsectors = 1; - if (blk_fs_request(rq) == 0) { - rq->resid_len = rq->data_len - + rq->resid_len = blk_rq_bytes(rq) - (cmd->nbytes - cmd->nleft); if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE)) rq->resid_len += cmd->last_xfer_len; } - ide_complete_rq(drive, uptodate ? 0 : -EIO, nsectors << 9); + ide_complete_rq(drive, uptodate ? 0 : -EIO, blk_rq_bytes(rq)); if (sense && rc == 2) ide_error(drive, "request sense failure", stat); @@ -819,7 +809,7 @@ static void cdrom_do_block_pc(ide_drive_t *drive, struct request *rq) */ alignment = queue_dma_alignment(q) | q->dma_pad_mask; if ((unsigned long)buf & alignment - || rq->data_len & q->dma_pad_mask + || blk_rq_bytes(rq) & q->dma_pad_mask || object_is_on_stack(buf)) drive->dma = 0; } @@ -867,9 +857,8 @@ static ide_startstop_t ide_cd_do_request(ide_drive_t *drive, struct request *rq, cmd.rq = rq; - if (blk_fs_request(rq) || rq->data_len) { - ide_init_sg_cmd(&cmd, blk_fs_request(rq) ? - (blk_rq_sectors(rq) << 9) : rq->data_len); + if (blk_fs_request(rq) || blk_rq_bytes(rq)) { + ide_init_sg_cmd(&cmd, blk_rq_bytes(rq)); ide_map_sg(drive, &cmd); } diff --git a/drivers/ide/ide-floppy.c b/drivers/ide/ide-floppy.c index 1c460bd5637..650981758f1 100644 --- a/drivers/ide/ide-floppy.c +++ b/drivers/ide/ide-floppy.c @@ -220,14 +220,14 @@ static void idefloppy_blockpc_cmd(struct ide_disk_obj *floppy, ide_init_pc(pc); memcpy(pc->c, rq->cmd, sizeof(pc->c)); pc->rq = rq; - if (rq->data_len) { + if (blk_rq_bytes(rq)) { pc->flags |= PC_FLAG_DMA_OK; if (rq_data_dir(rq) == WRITE) pc->flags |= PC_FLAG_WRITING; } /* pio will be performed by ide_pio_bytes() which handles sg fine */ pc->buf = NULL; - pc->req_xfer = pc->buf_size = rq->data_len; + pc->req_xfer = pc->buf_size = blk_rq_bytes(rq); } static ide_startstop_t ide_floppy_do_request(ide_drive_t *drive, diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index 59799ca5524..ca2519d7b99 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -116,7 +116,7 @@ void ide_complete_cmd(ide_drive_t *drive, struct ide_cmd *cmd, u8 stat, u8 err) unsigned int ide_rq_bytes(struct request *rq) { if (blk_pc_request(rq)) - return rq->data_len; + return blk_rq_bytes(rq); else return blk_rq_cur_sectors(rq) << 9; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index c6cf3a951f2..e16604562f2 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->resid_len = rq->data_len - blocks * tape->blk_size; + rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size; if (pc->error) { uptodate = 0; From a2dec7b36364a5cc564c4d76cf16d2e7d33f5c05 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:44 +0900 Subject: [PATCH 0935/2061] block: hide request sector and data_len Block low level drivers for some reason have been pretty good at abusing block layer API. Especially struct request's fields tend to get violated in all possible ways. Make it clear that low level drivers MUST NOT access or manipulate rq->sector and rq->data_len directly by prefixing them with double underscores. This change is also necessary to break build of out-of-tree codes which assume the previous block API where internal fields can be manipulated and rq->data_len carries residual count on completion. [ Impact: hide internal fields, block API change ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- block/blk-core.c | 20 ++++++++++---------- block/blk-map.c | 2 +- block/blk-merge.c | 2 +- include/linux/blkdev.h | 9 +++++---- 4 files changed, 17 insertions(+), 16 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 3596ca71909..6226a380fb6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -127,7 +127,7 @@ void blk_rq_init(struct request_queue *q, struct request *rq) INIT_LIST_HEAD(&rq->timeout_list); rq->cpu = -1; rq->q = q; - rq->sector = (sector_t) -1; + rq->__sector = (sector_t) -1; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); rq->cmd = rq->__cmd; @@ -1095,7 +1095,7 @@ void init_request_from_bio(struct request *req, struct bio *bio) req->cmd_flags |= REQ_NOIDLE; req->errors = 0; - req->sector = bio->bi_sector; + req->__sector = bio->bi_sector; req->ioprio = bio_prio(bio); blk_rq_bio_prep(req->q, req, bio); } @@ -1143,7 +1143,7 @@ static int __make_request(struct request_queue *q, struct bio *bio) req->biotail->bi_next = bio; req->biotail = bio; - req->data_len += bytes; + req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1169,8 +1169,8 @@ static int __make_request(struct request_queue *q, struct bio *bio) * not touch req->buffer either... */ req->buffer = bio_data(bio); - req->sector = bio->bi_sector; - req->data_len += bytes; + req->__sector = bio->bi_sector; + req->__data_len += bytes; req->ioprio = ioprio_best(req->ioprio, prio); if (!blk_rq_cpu_valid(req)) req->cpu = bio->bi_comp_cpu; @@ -1878,7 +1878,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) * can find how many bytes remain in the request * later. */ - req->data_len = 0; + req->__data_len = 0; return false; } @@ -1892,12 +1892,12 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) bio_iovec(bio)->bv_len -= nr_bytes; } - req->data_len -= total_bytes; + req->__data_len -= total_bytes; req->buffer = bio_data(req->bio); /* update sector only for requests with clear definition of sector */ if (blk_fs_request(req) || blk_discard_rq(req)) - req->sector += total_bytes >> 9; + req->__sector += total_bytes >> 9; /* * If total number of sectors is less than the first segment @@ -1905,7 +1905,7 @@ bool blk_update_request(struct request *req, int error, unsigned int nr_bytes) */ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { printk(KERN_ERR "blk: request botched\n"); - req->data_len = blk_rq_cur_bytes(req); + req->__data_len = blk_rq_cur_bytes(req); } /* recalculate the number of segments */ @@ -2032,7 +2032,7 @@ void blk_rq_bio_prep(struct request_queue *q, struct request *rq, rq->nr_phys_segments = bio_phys_segments(q, bio); rq->buffer = bio_data(bio); } - rq->data_len = bio->bi_size; + rq->__data_len = bio->bi_size; rq->bio = rq->biotail = bio; if (bio->bi_bdev) diff --git a/block/blk-map.c b/block/blk-map.c index 694fefad34e..56082bea450 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -20,7 +20,7 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, rq->biotail->bi_next = bio; rq->biotail = bio; - rq->data_len += bio->bi_size; + rq->__data_len += bio->bi_size; } return 0; } diff --git a/block/blk-merge.c b/block/blk-merge.c index b8df66aef0f..4974dd5767e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -370,7 +370,7 @@ static int attempt_merge(struct request_queue *q, struct request *req, req->biotail->bi_next = next->bio; req->biotail = next->biotail; - req->data_len += blk_rq_bytes(next); + req->__data_len += blk_rq_bytes(next); elv_merge_requests(q, req, next); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index ce2bf5efa9b..c7558034570 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -166,8 +166,9 @@ struct request { enum rq_cmd_type_bits cmd_type; unsigned long atomic_flags; - sector_t sector; /* sector cursor */ - unsigned int data_len; /* total data len, don't access directly */ + /* the following two fields are internal, NEVER access directly */ + sector_t __sector; /* sector cursor */ + unsigned int __data_len; /* total data len */ struct bio *bio; struct bio *biotail; @@ -828,12 +829,12 @@ extern void blkdev_dequeue_request(struct request *req); */ static inline sector_t blk_rq_pos(const struct request *rq) { - return rq->sector; + return rq->__sector; } static inline unsigned int blk_rq_bytes(const struct request *rq) { - return rq->data_len; + return rq->__data_len; } static inline int blk_rq_cur_bytes(const struct request *rq) From 1011c1b9f2e45ce7c6e38888d2b83936aec38771 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 May 2009 22:24:45 +0900 Subject: [PATCH 0936/2061] block: blk_rq_[cur_]_{sectors|bytes}() usage cleanup With the previous changes, the followings are now guaranteed for all requests in any valid state. * blk_rq_sectors() == blk_rq_bytes() >> 9 * blk_rq_cur_sectors() == blk_rq_cur_bytes() >> 9 Clean up accessor usages. Notable changes are * nbd,i2o_block: end_all used instead of explicit byte count * scsi_lib: unnecessary conditional on request type removed [ Impact: cleanup ] Signed-off-by: Tejun Heo Cc: Paul Clements Cc: Pete Zaitcev Cc: Alex Dubov Cc: Markus Lidel Cc: David Woodhouse Cc: James Bottomley Cc: Boaz Harrosh Signed-off-by: Jens Axboe --- drivers/block/floppy.c | 7 +++---- drivers/block/nbd.c | 6 +++--- drivers/block/ub.c | 2 +- drivers/block/z2ram.c | 2 +- drivers/memstick/core/mspro_block.c | 4 ++-- drivers/message/i2o/i2o_block.c | 16 ++++------------ drivers/mtd/mtd_blkdevs.c | 2 +- drivers/sbus/char/jsflash.c | 2 +- drivers/scsi/scsi_lib.c | 7 ++----- 9 files changed, 18 insertions(+), 30 deletions(-) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 45248628338..1e27ed9208b 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -2512,8 +2512,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) remaining = current_count_sectors << 9; #ifdef FLOPPY_SANITY_CHECK - if ((remaining >> 9) > blk_rq_sectors(current_req) && - CT(COMMAND) == FD_WRITE) { + if (remaining > blk_rq_bytes(current_req) && CT(COMMAND) == FD_WRITE) { DPRINT("in copy buffer\n"); printk("current_count_sectors=%ld\n", current_count_sectors); printk("remaining=%d\n", remaining >> 9); @@ -2530,7 +2529,7 @@ static void copy_buffer(int ssize, int max_sector, int max_sector_2) dma_buffer = floppy_track_buffer + ((fsector_t - buffer_min) << 9); - size = blk_rq_cur_sectors(current_req) << 9; + size = blk_rq_cur_bytes(current_req); rq_for_each_segment(bv, current_req, iter) { if (!remaining) @@ -2879,7 +2878,7 @@ static int make_raw_rw_request(void) printk("write\n"); return 0; } - } else if (raw_cmd->length > blk_rq_sectors(current_req) << 9 || + } else if (raw_cmd->length > blk_rq_bytes(current_req) || current_count_sectors > blk_rq_sectors(current_req)) { DPRINT("buffer overrun in direct transfer\n"); return 0; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index 977a5737793..fad167de23b 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -110,7 +110,7 @@ static void nbd_end_request(struct request *req) req, error ? "failed" : "done"); spin_lock_irqsave(q->queue_lock, flags); - __blk_end_request(req, error, blk_rq_sectors(req) << 9); + __blk_end_request_all(req, error); spin_unlock_irqrestore(q->queue_lock, flags); } @@ -231,7 +231,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) { int result, flags; struct nbd_request request; - unsigned long size = blk_rq_sectors(req) << 9; + unsigned long size = blk_rq_bytes(req); request.magic = htonl(NBD_REQUEST_MAGIC); request.type = htonl(nbd_cmd(req)); @@ -243,7 +243,7 @@ static int nbd_send_req(struct nbd_device *lo, struct request *req) lo->disk->disk_name, req, nbdcmd_to_ascii(nbd_cmd(req)), (unsigned long long)blk_rq_pos(req) << 9, - blk_rq_sectors(req) << 9); + blk_rq_bytes(req)); result = sock_xmit(lo, 1, &request, sizeof(request), (nbd_cmd(req) == NBD_CMD_WRITE) ? MSG_MORE : 0); if (result <= 0) { diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 1591f61b6c6..40d03cf63f2 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -739,7 +739,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, cmd->cdb[8] = nblks; cmd->cdb_len = 10; - cmd->len = blk_rq_sectors(rq) * 512; + cmd->len = blk_rq_bytes(rq); } static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index d4e6b71f514..6a1383834ec 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -72,7 +72,7 @@ static void do_z2_request(struct request_queue *q) struct request *req; while ((req = elv_next_request(q)) != NULL) { unsigned long start = blk_rq_pos(req) << 9; - unsigned long len = blk_rq_cur_sectors(req) << 9; + unsigned long len = blk_rq_cur_bytes(req); if (start + len > z2ram_size) { printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 9e600d22f40..93b2c618565 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -680,7 +680,7 @@ try_again: t_sec = blk_rq_pos(msb->block_req) << 9; sector_div(t_sec, msb->page_size); - count = blk_rq_sectors(msb->block_req) << 9; + count = blk_rq_bytes(msb->block_req); count /= msb->page_size; param.system = msb->system; @@ -745,7 +745,7 @@ static int mspro_block_complete_req(struct memstick_dev *card, int error) t_len *= msb->page_size; } } else - t_len = blk_rq_sectors(msb->block_req) << 9; + t_len = blk_rq_bytes(msb->block_req); dev_dbg(&card->dev, "transferred %x (%d)\n", t_len, error); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 6b61d289d6c..e153f5d5237 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -426,15 +426,9 @@ static void i2o_block_end_request(struct request *req, int error, struct request_queue *q = req->q; unsigned long flags; - if (blk_end_request(req, error, nr_bytes)) { - int leftover = (blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); - - if (blk_pc_request(req)) - leftover = blk_rq_bytes(req); - + if (blk_end_request(req, error, nr_bytes)) if (error) - blk_end_request(req, -EIO, leftover); - } + blk_end_request_all(req, -EIO); spin_lock_irqsave(q->queue_lock, flags); @@ -832,15 +826,13 @@ static int i2o_block_transfer(struct request *req) memcpy(mptr, cmd, 10); mptr += 4; - *mptr++ = - cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); + *mptr++ = cpu_to_le32(blk_rq_bytes(req)); } else #endif { msg->u.head[1] = cpu_to_le32(cmd | HOST_TID << 12 | tid); *mptr++ = cpu_to_le32(ctl_flags); - *mptr++ = - cpu_to_le32(blk_rq_sectors(req) << KERNEL_SECTOR_SHIFT); + *mptr++ = cpu_to_le32(blk_rq_bytes(req)); *mptr++ = cpu_to_le32((u32) (blk_rq_pos(req) << KERNEL_SECTOR_SHIFT)); *mptr++ = diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 4ea2e67ac97..50c76a2ca76 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -48,7 +48,7 @@ static int do_blktrans_request(struct mtd_blktrans_ops *tr, char *buf; block = blk_rq_pos(req) << 9 >> tr->blkshift; - nsect = blk_rq_cur_sectors(req) << 9 >> tr->blkshift; + nsect = blk_rq_cur_bytes(req) >> tr->blkshift; buf = req->buffer; diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index 2132c906e53..d56ddaa7703 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -189,7 +189,7 @@ static void jsfd_do_request(struct request_queue *q) while ((req = elv_next_request(q)) != NULL) { struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; - size_t len = blk_rq_cur_sectors(req) << 9; + size_t len = blk_rq_cur_bytes(req); if ((offset + len) > jdp->dsize) { __blk_end_request_cur(req, -EIO); diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 3d16c70fbde..ee308f6f798 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -546,7 +546,7 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, * to queue the remainder of them. */ if (blk_end_request(req, error, bytes)) { - int leftover = blk_rq_sectors(req) << 9; + int leftover = blk_rq_bytes(req); if (blk_pc_request(req)) leftover = req->resid_len; @@ -964,10 +964,7 @@ static int scsi_init_sgtable(struct request *req, struct scsi_data_buffer *sdb, count = blk_rq_map_sg(req->q, req, sdb->table.sgl); BUG_ON(count > sdb->table.nents); sdb->table.nents = count; - if (blk_pc_request(req)) - sdb->length = blk_rq_bytes(req); - else - sdb->length = blk_rq_sectors(req) << 9; + sdb->length = blk_rq_bytes(req); return BLKPREP_OK; } From 8f6205cd572fece673da0255d74843680f67f879 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:53:59 +0900 Subject: [PATCH 0937/2061] ide: dequeue in-flight request ide generally has single request in flight and tracks it using hwif->rq and all state handlers follow the following convention. * ide_started is returned if the request is in flight. * ide_stopped is returned if the queue needs to be restarted. The request might or might not have been processed fully or partially. * hwif->rq is set to NULL, when an issued request completes. So, dequeueing model can be implemented by dequeueing after fetch, requeueing if hwif->rq isn't NULL on ide_stopped return and doing about the same thing on completion / port unlock paths. These changes can be made in ide-io proper. In addition to the above main changes, the following updates are necessary. * ide-cd shouldn't dequeue a request when issuing REQUEST SENSE for it as the request is already dequeued. * ide-atapi uses request queue as stack when issuing REQUEST SENSE to put the REQUEST SENSE in front of the failed request. This now needs to be done using requeueing. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Signed-off-by: Jens Axboe --- drivers/ide/ide-atapi.c | 14 ++++++++++++-- drivers/ide/ide-cd.c | 8 -------- drivers/ide/ide-io.c | 34 +++++++++++++++++++++++++++------- 3 files changed, 39 insertions(+), 17 deletions(-) diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 792534db8f8..2874c3d703a 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -246,6 +246,7 @@ EXPORT_SYMBOL_GPL(ide_queue_sense_rq); */ void ide_retry_pc(ide_drive_t *drive) { + struct request *failed_rq = drive->hwif->rq; struct request *sense_rq = &drive->sense_rq; struct ide_atapi_pc *pc = &drive->request_sense_pc; @@ -260,8 +261,17 @@ void ide_retry_pc(ide_drive_t *drive) if (drive->media == ide_tape) set_bit(IDE_AFLAG_IGNORE_DSC, &drive->atapi_flags); - if (ide_queue_sense_rq(drive, pc)) - ide_complete_rq(drive, -EIO, blk_rq_bytes(drive->hwif->rq)); + /* + * Push back the failed request and put request sense on top + * of it. The failed command will be retried after sense data + * is acquired. + */ + blk_requeue_request(failed_rq->q, failed_rq); + drive->hwif->rq = NULL; + if (ide_queue_sense_rq(drive, pc)) { + blkdev_dequeue_request(failed_rq); + ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq)); + } } EXPORT_SYMBOL_GPL(ide_retry_pc); diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 2eadc9d2e96..4c7792fd5f9 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -405,15 +405,7 @@ static int cdrom_decode_status(ide_drive_t *drive, u8 stat) end_request: if (stat & ATA_ERR) { - struct request_queue *q = drive->queue; - unsigned long flags; - - spin_lock_irqsave(q->queue_lock, flags); - blkdev_dequeue_request(rq); - spin_unlock_irqrestore(q->queue_lock, flags); - hwif->rq = NULL; - return ide_queue_sense_rq(drive, rq) ? 2 : 1; } else return 2; diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index ca2519d7b99..abda7337b3f 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -487,10 +487,10 @@ void do_ide_request(struct request_queue *q) if (!ide_lock_port(hwif)) { ide_hwif_t *prev_port; + + WARN_ON_ONCE(hwif->rq); repeat: prev_port = hwif->host->cur_port; - hwif->rq = NULL; - if (drive->dev_flags & IDE_DFLAG_SLEEPING && time_after(drive->sleep, jiffies)) { ide_unlock_port(hwif); @@ -519,7 +519,12 @@ repeat: * we know that the queue isn't empty, but this can happen * if the q->prep_rq_fn() decides to kill a request */ - rq = elv_next_request(drive->queue); + if (!rq) { + rq = elv_next_request(drive->queue); + if (rq) + blkdev_dequeue_request(rq); + } + spin_unlock_irq(q->queue_lock); spin_lock_irq(&hwif->lock); @@ -555,8 +560,11 @@ repeat: startstop = start_request(drive, rq); spin_lock_irq(&hwif->lock); - if (startstop == ide_stopped) + if (startstop == ide_stopped) { + rq = hwif->rq; + hwif->rq = NULL; goto repeat; + } } else goto plug_device; out: @@ -572,18 +580,24 @@ plug_device: plug_device_2: spin_lock_irq(q->queue_lock); + if (rq) + blk_requeue_request(q, rq); if (!elv_queue_empty(q)) blk_plug_device(q); } -static void ide_plug_device(ide_drive_t *drive) +static void ide_requeue_and_plug(ide_drive_t *drive, struct request *rq) { struct request_queue *q = drive->queue; unsigned long flags; spin_lock_irqsave(q->queue_lock, flags); + + if (rq) + blk_requeue_request(q, rq); if (!elv_queue_empty(q)) blk_plug_device(q); + spin_unlock_irqrestore(q->queue_lock, flags); } @@ -632,6 +646,7 @@ void ide_timer_expiry (unsigned long data) unsigned long flags; int wait = -1; int plug_device = 0; + struct request *uninitialized_var(rq_in_flight); spin_lock_irqsave(&hwif->lock, flags); @@ -693,6 +708,8 @@ void ide_timer_expiry (unsigned long data) spin_lock_irq(&hwif->lock); enable_irq(hwif->irq); if (startstop == ide_stopped) { + rq_in_flight = hwif->rq; + hwif->rq = NULL; ide_unlock_port(hwif); plug_device = 1; } @@ -701,7 +718,7 @@ void ide_timer_expiry (unsigned long data) if (plug_device) { ide_unlock_host(hwif->host); - ide_plug_device(drive); + ide_requeue_and_plug(drive, rq_in_flight); } } @@ -787,6 +804,7 @@ irqreturn_t ide_intr (int irq, void *dev_id) ide_startstop_t startstop; irqreturn_t irq_ret = IRQ_NONE; int plug_device = 0; + struct request *uninitialized_var(rq_in_flight); if (host->host_flags & IDE_HFLAG_SERIALIZE) { if (hwif != host->cur_port) @@ -866,6 +884,8 @@ irqreturn_t ide_intr (int irq, void *dev_id) */ if (startstop == ide_stopped) { BUG_ON(hwif->handler); + rq_in_flight = hwif->rq; + hwif->rq = NULL; ide_unlock_port(hwif); plug_device = 1; } @@ -875,7 +895,7 @@ out: out_early: if (plug_device) { ide_unlock_host(hwif->host); - ide_plug_device(drive); + ide_requeue_and_plug(drive, rq_in_flight); } return irq_ret; From 9a8d23d8855e554fc5887f14cb008b55c4300ccc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:00 +0900 Subject: [PATCH 0938/2061] mg_disk: fix queue hang / infinite retry on !fs requests Both request functions in mg_disk simply return when they encounter a !fs request, which means the request will never be cleared from the queue causing queue hang and indefinite retry of the request. Fix it. While at it, flatten condition checks and add unlikely to !fs tests. [ Impact: fix possible queue hang / infinite retry of !fs requests ] Signed-off-by: Tejun Heo Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 24 +++++++++++++----------- 1 file changed, 13 insertions(+), 11 deletions(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 826c3492b9f..be323880f24 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -672,16 +672,16 @@ static void mg_request_poll(struct request_queue *q) while ((req = elv_next_request(q)) != NULL) { host = req->rq_disk->private_data; - if (blk_fs_request(req)) { - switch (rq_data_dir(req)) { - case READ: - mg_read(req); - break; - case WRITE: - mg_write(req); - break; - } + + if (unlikely(!blk_fs_request(req))) { + __blk_end_request_cur(req, -EIO); + continue; } + + if (rq_data_dir(req) == READ) + mg_read(req); + else + mg_write(req); } } @@ -766,8 +766,10 @@ static void mg_request(struct request_queue *q) continue; } - if (!blk_fs_request(req)) - return; + if (unlikely(!blk_fs_request(req))) { + __blk_end_request_cur(req, -EIO); + continue; + } if (!mg_issue_req(req, host, sect_num, sect_cnt)) return; From 5b36ad6000ddea390aca3c3b67ead7616ace2ffc Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:01 +0900 Subject: [PATCH 0939/2061] mg_disk: dequeue and track in-flight request mg_disk has at most single request in flight per device. Till now, whenever it needs to access the in-flight request it called elv_next_request(). This patch makes mg_disk track the in-flight request directly using mg_host->req and dequeue it when processing starts. q->queuedata is set to mg_host so that mg_host can be determined without fetching request from the queue. [ Impact: dequeue in-flight request, one elv_next_request() per request ] Signed-off-by: Tejun Heo Cc: unsik Kim Signed-off-by: Jens Axboe --- drivers/block/mg_disk.c | 109 ++++++++++++++++++++++------------------ 1 file changed, 59 insertions(+), 50 deletions(-) diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index be323880f24..1ca5d1423fa 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -135,6 +135,7 @@ struct mg_host { struct device *dev; struct request_queue *breq; + struct request *req; spinlock_t lock; struct gendisk *gd; @@ -171,17 +172,27 @@ struct mg_host { static void mg_request(struct request_queue *); +static bool mg_end_request(struct mg_host *host, int err, unsigned int nr_bytes) +{ + if (__blk_end_request(host->req, err, nr_bytes)) + return true; + + host->req = NULL; + return false; +} + +static bool mg_end_request_cur(struct mg_host *host, int err) +{ + return mg_end_request(host, err, blk_rq_cur_bytes(host->req)); +} + static void mg_dump_status(const char *msg, unsigned int stat, struct mg_host *host) { char *name = MG_DISK_NAME; - struct request *req; - if (host->breq) { - req = elv_next_request(host->breq); - if (req) - name = req->rq_disk->disk_name; - } + if (host->req) + name = host->req->rq_disk->disk_name; printk(KERN_ERR "%s: %s: status=0x%02x { ", name, msg, stat & 0xff); if (stat & ATA_BUSY) @@ -217,13 +228,9 @@ static void mg_dump_status(const char *msg, unsigned int stat, printk("AddrMarkNotFound "); printk("}"); if (host->error & (ATA_BBK | ATA_UNC | ATA_IDNF | ATA_AMNF)) { - if (host->breq) { - req = elv_next_request(host->breq); - if (req) - printk(", sector=%u", - (unsigned int)blk_rq_pos(req)); - } - + if (host->req) + printk(", sector=%u", + (unsigned int)blk_rq_pos(host->req)); } printk("\n"); } @@ -453,11 +460,10 @@ static int mg_disk_init(struct mg_host *host) static void mg_bad_rw_intr(struct mg_host *host) { - struct request *req = elv_next_request(host->breq); - if (req != NULL) - if (++req->errors >= MG_MAX_ERRORS || - host->error == MG_ERR_TIMEOUT) - __blk_end_request_cur(req, -EIO); + if (host->req) + if (++host->req->errors >= MG_MAX_ERRORS || + host->error == MG_ERR_TIMEOUT) + mg_end_request_cur(host, -EIO); } static unsigned int mg_out(struct mg_host *host, @@ -515,7 +521,7 @@ static void mg_read(struct request *req) outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - } while (__blk_end_request(req, 0, MG_SECTOR_SIZE)); + } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } static void mg_write(struct request *req) @@ -545,14 +551,14 @@ static void mg_write(struct request *req) outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - } while (__blk_end_request(req, 0, MG_SECTOR_SIZE)); + } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } static void mg_read_intr(struct mg_host *host) { + struct request *req = host->req; u32 i; u16 *buff; - struct request *req; /* check status */ do { @@ -571,7 +577,6 @@ static void mg_read_intr(struct mg_host *host) ok_to_read: /* get current segment of request */ - req = elv_next_request(host->breq); buff = (u16 *)req->buffer; /* read 1 sector */ @@ -585,7 +590,7 @@ ok_to_read: /* send read confirm */ outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); - if (__blk_end_request(req, 0, MG_SECTOR_SIZE)) { + if (mg_end_request(host, 0, MG_SECTOR_SIZE)) { /* set handler if read remains */ host->mg_do_intr = mg_read_intr; mod_timer(&host->timer, jiffies + 3 * HZ); @@ -595,14 +600,11 @@ ok_to_read: static void mg_write_intr(struct mg_host *host) { + struct request *req = host->req; u32 i, j; u16 *buff; - struct request *req; bool rem; - /* get current segment of request */ - req = elv_next_request(host->breq); - /* check status */ do { i = inb((unsigned long)host->dev_base + MG_REG_STATUS); @@ -619,7 +621,7 @@ static void mg_write_intr(struct mg_host *host) return; ok_to_write: - if ((rem = __blk_end_request(req, 0, MG_SECTOR_SIZE))) { + if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { /* write 1 sector and set handler if remains */ buff = (u16 *)req->buffer; for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) { @@ -644,44 +646,47 @@ void mg_times_out(unsigned long data) { struct mg_host *host = (struct mg_host *)data; char *name; - struct request *req; spin_lock_irq(&host->lock); - req = elv_next_request(host->breq); - if (!req) + if (!host->req) goto out_unlock; host->mg_do_intr = NULL; - name = req->rq_disk->disk_name; + name = host->req->rq_disk->disk_name; printk(KERN_DEBUG "%s: timeout\n", name); host->error = MG_ERR_TIMEOUT; mg_bad_rw_intr(host); - mg_request(host->breq); out_unlock: + mg_request(host->breq); spin_unlock_irq(&host->lock); } static void mg_request_poll(struct request_queue *q) { - struct request *req; - struct mg_host *host; + struct mg_host *host = q->queuedata; - while ((req = elv_next_request(q)) != NULL) { - host = req->rq_disk->private_data; + while (1) { + if (!host->req) { + host->req = elv_next_request(q); + if (host->req) + blkdev_dequeue_request(host->req); + else + break; + } - if (unlikely(!blk_fs_request(req))) { - __blk_end_request_cur(req, -EIO); + if (unlikely(!blk_fs_request(host->req))) { + mg_end_request_cur(host, -EIO); continue; } - if (rq_data_dir(req) == READ) - mg_read(req); + if (rq_data_dir(host->req) == READ) + mg_read(host->req); else - mg_write(req); + mg_write(host->req); } } @@ -733,16 +738,19 @@ static unsigned int mg_issue_req(struct request *req, /* This function also called from IRQ context */ static void mg_request(struct request_queue *q) { + struct mg_host *host = q->queuedata; struct request *req; - struct mg_host *host; u32 sect_num, sect_cnt; while (1) { - req = elv_next_request(q); - if (!req) - return; - - host = req->rq_disk->private_data; + if (!host->req) { + host->req = elv_next_request(q); + if (host->req) + blkdev_dequeue_request(host->req); + else + break; + } + req = host->req; /* check unwanted request call */ if (host->mg_do_intr) @@ -762,12 +770,12 @@ static void mg_request(struct request_queue *q) "%s: bad access: sector=%d, count=%d\n", req->rq_disk->disk_name, sect_num, sect_cnt); - __blk_end_request_cur(req, -EIO); + mg_end_request_cur(host, -EIO); continue; } if (unlikely(!blk_fs_request(req))) { - __blk_end_request_cur(req, -EIO); + mg_end_request_cur(host, -EIO); continue; } @@ -981,6 +989,7 @@ static int mg_probe(struct platform_device *plat_dev) __func__, __LINE__); goto probe_err_5; } + host->breq->queuedata = host; /* mflash is random device, thanx for the noop */ elevator_exit(host->breq->elevator); From 8a12c4a456c7ee261a85c2cbec835601e6aae05a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:02 +0900 Subject: [PATCH 0940/2061] hd: dequeue and track in-flight request hd has at most single request in flight. Till now, whenever it needs to access the in-flight request it called elv_next_request(). This patch makes hd track the in-flight request directly and dequeue it when processing starts. The added complexity is minimal and this will help future block layer changes. [ Impact: dequeue in-flight request, one elv_next_request() per request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/hd.c | 63 +++++++++++++++++++++++++++++----------------- 1 file changed, 40 insertions(+), 23 deletions(-) diff --git a/drivers/block/hd.c b/drivers/block/hd.c index a3b39940ce0..288ab63c102 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -98,10 +98,9 @@ static DEFINE_SPINLOCK(hd_lock); static struct request_queue *hd_queue; +static struct request *hd_req; #define MAJOR_NR HD_MAJOR -#define QUEUE (hd_queue) -#define CURRENT elv_next_request(hd_queue) #define TIMEOUT_VALUE (6*HZ) #define HD_DELAY 0 @@ -195,11 +194,24 @@ static void __init hd_setup(char *str, int *ints) NR_HD = hdind+1; } +static bool hd_end_request(int err, unsigned int bytes) +{ + if (__blk_end_request(hd_req, err, bytes)) + return true; + hd_req = NULL; + return false; +} + +static bool hd_end_request_cur(int err) +{ + return hd_end_request(err, blk_rq_cur_bytes(hd_req)); +} + static void dump_status(const char *msg, unsigned int stat) { char *name = "hd?"; - if (CURRENT) - name = CURRENT->rq_disk->disk_name; + if (hd_req) + name = hd_req->rq_disk->disk_name; #ifdef VERBOSE_ERRORS printk("%s: %s: status=0x%02x { ", name, msg, stat & 0xff); @@ -227,8 +239,8 @@ static void dump_status(const char *msg, unsigned int stat) if (hd_error & (BBD_ERR|ECC_ERR|ID_ERR|MARK_ERR)) { printk(", CHS=%d/%d/%d", (inb(HD_HCYL)<<8) + inb(HD_LCYL), inb(HD_CURRENT) & 0xf, inb(HD_SECTOR)); - if (CURRENT) - printk(", sector=%ld", blk_rq_pos(CURRENT)); + if (hd_req) + printk(", sector=%ld", blk_rq_pos(hd_req)); } printk("\n"); } @@ -406,11 +418,12 @@ static void unexpected_hd_interrupt(void) */ static void bad_rw_intr(void) { - struct request *req = CURRENT; + struct request *req = hd_req; + if (req != NULL) { struct hd_i_struct *disk = req->rq_disk->private_data; if (++req->errors >= MAX_ERRORS || (hd_error & BBD_ERR)) { - __blk_end_request_cur(req, -EIO); + hd_end_request_cur(-EIO); disk->special_op = disk->recalibrate = 1; } else if (req->errors % RESET_FREQ == 0) reset = 1; @@ -454,14 +467,14 @@ static void read_intr(void) return; ok_to_read: - req = CURRENT; + req = hd_req; insw(HD_DATA, req->buffer, 256); #ifdef DEBUG printk("%s: read: sector %ld, remaining = %u, buffer=%p\n", req->rq_disk->disk_name, blk_rq_pos(req) + 1, blk_rq_sectors(req) - 1, req->buffer+512); #endif - if (__blk_end_request(req, 0, 512)) { + if (hd_end_request(0, 512)) { SET_HANDLER(&read_intr); return; } @@ -475,7 +488,7 @@ ok_to_read: static void write_intr(void) { - struct request *req = CURRENT; + struct request *req = hd_req; int i; int retries = 100000; @@ -494,7 +507,7 @@ static void write_intr(void) return; ok_to_write: - if (__blk_end_request(req, 0, 512)) { + if (hd_end_request(0, 512)) { SET_HANDLER(&write_intr); outsw(HD_DATA, req->buffer, 256); return; @@ -525,18 +538,18 @@ static void hd_times_out(unsigned long dummy) do_hd = NULL; - if (!CURRENT) + if (!hd_req) return; spin_lock_irq(hd_queue->queue_lock); reset = 1; - name = CURRENT->rq_disk->disk_name; + name = hd_req->rq_disk->disk_name; printk("%s: timeout\n", name); - if (++CURRENT->errors >= MAX_ERRORS) { + if (++hd_req->errors >= MAX_ERRORS) { #ifdef DEBUG printk("%s: too many errors\n", name); #endif - __blk_end_request_cur(CURRENT, -EIO); + hd_end_request_cur(-EIO); } hd_request(); spin_unlock_irq(hd_queue->queue_lock); @@ -551,7 +564,7 @@ static int do_special_op(struct hd_i_struct *disk, struct request *req) } if (disk->head > 16) { printk("%s: cannot handle device with more than 16 heads - giving up\n", req->rq_disk->disk_name); - __blk_end_request_cur(req, -EIO); + hd_end_request_cur(-EIO); } disk->special_op = 0; return 1; @@ -578,11 +591,15 @@ static void hd_request(void) repeat: del_timer(&device_timer); - req = CURRENT; - if (!req) { - do_hd = NULL; - return; + if (!hd_req) { + hd_req = elv_next_request(hd_queue); + if (!hd_req) { + do_hd = NULL; + return; + } + blkdev_dequeue_request(hd_req); } + req = hd_req; if (reset) { reset_hd(); @@ -595,7 +612,7 @@ repeat: ((block+nsect) > get_capacity(req->rq_disk))) { printk("%s: bad access: block=%d, count=%d\n", req->rq_disk->disk_name, block, nsect); - __blk_end_request_cur(req, -EIO); + hd_end_request_cur(-EIO); goto repeat; } @@ -635,7 +652,7 @@ repeat: break; default: printk("unknown hd-command\n"); - __blk_end_request_cur(req, -EIO); + hd_end_request_cur(-EIO); break; } } From a336ca6fe6e0c46b2eef0e520951acb4e6cb2976 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:03 +0900 Subject: [PATCH 0941/2061] ataflop: dequeue and track in-flight request ataflop has single request in flight. Till now, whenever it needs to access the in-flight request it called elv_next_request(). This patch makes ataflop track the in-flight request directly and dequeue it when processing starts. The added complexity is minimal and this will help future block layer changes. [ Impact: dequeue in-flight request, one elv_next_request() per request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/ataflop.c | 63 +++++++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 28 deletions(-) diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 234024cda5e..89a591d9c83 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -79,9 +79,7 @@ #undef DEBUG static struct request_queue *floppy_queue; - -#define QUEUE (floppy_queue) -#define CURRENT elv_next_request(floppy_queue) +static struct request *fd_request; /* Disk types: DD, HD, ED */ static struct atari_disk_type { @@ -376,6 +374,12 @@ static DEFINE_TIMER(readtrack_timer, fd_readtrack_check, 0, 0); static DEFINE_TIMER(timeout_timer, fd_times_out, 0, 0); static DEFINE_TIMER(fd_timer, check_change, 0, 0); +static void fd_end_request_cur(int err) +{ + if (!__blk_end_request_cur(fd_request, err)) + fd_request = NULL; +} + static inline void start_motor_off_timer(void) { mod_timer(&motor_off_timer, jiffies + FD_MOTOR_OFF_DELAY); @@ -606,15 +610,15 @@ static void fd_error( void ) return; } - if (!CURRENT) + if (!fd_request) return; - CURRENT->errors++; - if (CURRENT->errors >= MAX_ERRORS) { + fd_request->errors++; + if (fd_request->errors >= MAX_ERRORS) { printk(KERN_ERR "fd%d: too many errors.\n", SelectedDrive ); - __blk_end_request_cur(CURRENT, -EIO); + fd_end_request_cur(-EIO); } - else if (CURRENT->errors == RECALIBRATE_ERRORS) { + else if (fd_request->errors == RECALIBRATE_ERRORS) { printk(KERN_WARNING "fd%d: recalibrating\n", SelectedDrive ); if (SelectedDrive != -1) SUD.track = -1; @@ -725,14 +729,14 @@ static void do_fd_action( int drive ) if (IS_BUFFERED( drive, ReqSide, ReqTrack )) { if (ReqCmd == READ) { copy_buffer( SECTOR_BUFFER(ReqSector), ReqData ); - if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) { + if (++ReqCnt < blk_rq_cur_sectors(fd_request)) { /* read next sector */ setup_req_params( drive ); goto repeat; } else { /* all sectors finished */ - __blk_end_request_cur(CURRENT, 0); + fd_end_request_cur(0); redo_fd_request(); return; } @@ -1130,14 +1134,14 @@ static void fd_rwsec_done1(int status) } } - if (++ReqCnt < blk_rq_cur_sectors(CURRENT)) { + if (++ReqCnt < blk_rq_cur_sectors(fd_request)) { /* read next sector */ setup_req_params( SelectedDrive ); do_fd_action( SelectedDrive ); } else { /* all sectors finished */ - __blk_end_request_cur(CURRENT, 0); + fd_end_request_cur(0); redo_fd_request(); } return; @@ -1378,7 +1382,7 @@ static void setup_req_params( int drive ) ReqData = ReqBuffer + 512 * ReqCnt; if (UseTrackbuffer) - read_track = (ReqCmd == READ && CURRENT->errors == 0); + read_track = (ReqCmd == READ && fd_request->errors == 0); else read_track = 0; @@ -1392,25 +1396,28 @@ static void redo_fd_request(void) int drive, type; struct atari_floppy_struct *floppy; - DPRINT(("redo_fd_request: CURRENT=%p dev=%s CURRENT->sector=%ld\n", - CURRENT, CURRENT ? CURRENT->rq_disk->disk_name : "", - CURRENT ? blk_rq_pos(CURRENT) : 0 )); + DPRINT(("redo_fd_request: fd_request=%p dev=%s fd_request->sector=%ld\n", + fd_request, fd_request ? fd_request->rq_disk->disk_name : "", + fd_request ? blk_rq_pos(fd_request) : 0 )); IsFormatting = 0; repeat: + if (!fd_request) { + fd_request = elv_next_request(floppy_queue); + if (!fd_request) + goto the_end; + blkdev_dequeue_request(fd_request); + } - if (!CURRENT) - goto the_end; - - floppy = CURRENT->rq_disk->private_data; + floppy = fd_request->rq_disk->private_data; drive = floppy - unit; type = floppy->type; if (!UD.connected) { /* drive not connected */ printk(KERN_ERR "Unknown Device: fd%d\n", drive ); - __blk_end_request_cur(CURRENT, -EIO); + fd_end_request_cur(-EIO); goto repeat; } @@ -1426,12 +1433,12 @@ repeat: /* user supplied disk type */ if (--type >= NUM_DISK_MINORS) { printk(KERN_WARNING "fd%d: invalid disk format", drive ); - __blk_end_request_cur(CURRENT, -EIO); + fd_end_request_cur(-EIO); goto repeat; } if (minor2disktype[type].drive_types > DriveType) { printk(KERN_WARNING "fd%d: unsupported disk format", drive ); - __blk_end_request_cur(CURRENT, -EIO); + fd_end_request_cur(-EIO); goto repeat; } type = minor2disktype[type].index; @@ -1440,8 +1447,8 @@ repeat: UD.autoprobe = 0; } - if (blk_rq_pos(CURRENT) + 1 > UDT->blocks) { - __blk_end_request_cur(CURRENT, -EIO); + if (blk_rq_pos(fd_request) + 1 > UDT->blocks) { + fd_end_request_cur(-EIO); goto repeat; } @@ -1449,9 +1456,9 @@ repeat: del_timer( &motor_off_timer ); ReqCnt = 0; - ReqCmd = rq_data_dir(CURRENT); - ReqBlock = blk_rq_pos(CURRENT); - ReqBuffer = CURRENT->buffer; + ReqCmd = rq_data_dir(fd_request); + ReqBlock = blk_rq_pos(fd_request); + ReqBuffer = fd_request->buffer; setup_req_params( drive ); do_fd_action( drive ); From f4bd4b90bfa86e867111aa182895cb5ac203aa7a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:04 +0900 Subject: [PATCH 0942/2061] swim3: dequeue in-flight request swim3 has at most single request in flight and already tracks it using fd_req. Convert it to dequeuing model by updating request fetching and wrapping completion function. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/swim3.c | 47 +++++++++++++++++++++++++++++++------------ 1 file changed, 34 insertions(+), 13 deletions(-) diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index c1b9a4dc11b..f48c6dd47e0 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -251,6 +251,20 @@ static int floppy_release(struct gendisk *disk, fmode_t mode); static int floppy_check_change(struct gendisk *disk); static int floppy_revalidate(struct gendisk *disk); +static bool swim3_end_request(int err, unsigned int nr_bytes) +{ + if (__blk_end_request(fd_req, err, nr_bytes)) + return true; + + fd_req = NULL; + return false; +} + +static bool swim3_end_request_cur(int err) +{ + return swim3_end_request(err, blk_rq_cur_bytes(fd_req)); +} + static void swim3_select(struct floppy_state *fs, int sel) { struct swim3 __iomem *sw = fs->swim3; @@ -310,7 +324,14 @@ static void start_request(struct floppy_state *fs) wake_up(&fs->wait); return; } - while (fs->state == idle && (req = elv_next_request(swim3_queue))) { + while (fs->state == idle) { + if (!fd_req) { + fd_req = elv_next_request(swim3_queue); + if (!fd_req) + break; + blkdev_dequeue_request(fd_req); + } + req = fd_req; #if 0 printk("do_fd_req: dev=%s cmd=%d sec=%ld nr_sec=%u buf=%p\n", req->rq_disk->disk_name, req->cmd, @@ -320,11 +341,11 @@ static void start_request(struct floppy_state *fs) #endif if (blk_rq_pos(req) >= fs->total_secs) { - __blk_end_request_cur(req, -EIO); + swim3_end_request_cur(-EIO); continue; } if (fs->ejected) { - __blk_end_request_cur(req, -EIO); + swim3_end_request_cur(-EIO); continue; } @@ -332,7 +353,7 @@ static void start_request(struct floppy_state *fs) if (fs->write_prot < 0) fs->write_prot = swim3_readbit(fs, WRITE_PROT); if (fs->write_prot) { - __blk_end_request_cur(req, -EIO); + swim3_end_request_cur(-EIO); continue; } } @@ -505,7 +526,7 @@ static void act(struct floppy_state *fs) case do_transfer: if (fs->cur_cyl != fs->req_cyl) { if (fs->retries > 5) { - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; return; } @@ -537,7 +558,7 @@ static void scan_timeout(unsigned long data) out_8(&sw->intr_enable, 0); fs->cur_cyl = -1; if (fs->retries > 5) { - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); } else { @@ -556,7 +577,7 @@ static void seek_timeout(unsigned long data) out_8(&sw->select, RELAX); out_8(&sw->intr_enable, 0); printk(KERN_ERR "swim3: seek timeout\n"); - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); } @@ -580,7 +601,7 @@ static void settle_timeout(unsigned long data) return; } printk(KERN_ERR "swim3: seek settle timeout\n"); - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); } @@ -603,7 +624,7 @@ static void xfer_timeout(unsigned long data) printk(KERN_ERR "swim3: timeout %sing sector %ld\n", (rq_data_dir(fd_req)==WRITE? "writ": "read"), (long)blk_rq_pos(fd_req)); - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); } @@ -634,7 +655,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: seen sector but cyl=ff?\n"); fs->cur_cyl = -1; if (fs->retries > 5) { - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); } else { @@ -717,7 +738,7 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk("swim3: error %sing block %ld (err=%x)\n", rq_data_dir(fd_req) == WRITE? "writ": "read", (long)blk_rq_pos(fd_req), err); - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; } } else { @@ -726,12 +747,12 @@ static irqreturn_t swim3_interrupt(int irq, void *dev_id) printk(KERN_ERR "swim3: fd dma: stat=%x resid=%d\n", stat, resid); printk(KERN_ERR " state=%d, dir=%x, intr=%x, err=%x\n", fs->state, rq_data_dir(fd_req), intr, err); - __blk_end_request_cur(fd_req, -EIO); + swim3_end_request_cur(-EIO); fs->state = idle; start_request(fs); break; } - if (__blk_end_request(fd_req, 0, fs->scount << 9)) { + if (swim3_end_request(0, fs->scount << 9)) { fs->req_sector += fs->scount; if (fs->req_sector > fs->secpertrack) { fs->req_sector -= fs->secpertrack; From 2d75ce084eec2a8154225c7e978191b364029003 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:05 +0900 Subject: [PATCH 0943/2061] xsysace: dequeue in-flight request xsysace already tracks in-flight request using ace->req. Converting to dequeueing model is mostly a matter of adding dequeueing call after request fetching. The only tricky part is handling CF removal which should complete both in flight and on queue requests. Convert to dequeueing model. While at it, remove explicit blk_rq_cur_bytes() and use __blk_end_request_cur() instead. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Grant Likely Signed-off-by: Jens Axboe --- drivers/block/xsysace.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 97c99b43f88..edf137b6c37 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -466,7 +466,8 @@ struct request *ace_get_next_request(struct request_queue * q) while ((req = elv_next_request(q)) != NULL) { if (blk_fs_request(req)) break; - __blk_end_request_cur(req, -EIO); + blkdev_dequeue_request(req); + __blk_end_request_all(req, -EIO); } return req; } @@ -492,9 +493,15 @@ static void ace_fsm_dostate(struct ace_device *ace) set_capacity(ace->gd, 0); dev_info(ace->dev, "No CF in slot\n"); - /* Drop all pending requests */ - while ((req = elv_next_request(ace->queue)) != NULL) - __blk_end_request_cur(req, -EIO); + /* Drop all in-flight and pending requests */ + if (ace->req) { + __blk_end_request_all(ace->req, -EIO); + ace->req = NULL; + } + while ((req = elv_next_request(ace->queue)) != NULL) { + blkdev_dequeue_request(req); + __blk_end_request_all(req, -EIO); + } /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; @@ -642,6 +649,7 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->fsm_state = ACE_FSM_STATE_IDLE; break; } + blkdev_dequeue_request(req); /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, @@ -718,8 +726,7 @@ static void ace_fsm_dostate(struct ace_device *ace) } /* bio finished; is there another one? */ - if (__blk_end_request(ace->req, 0, - blk_rq_cur_bytes(ace->req))) { + if (__blk_end_request_cur(ace->req, 0)) { /* dev_dbg(ace->dev, "next block; h=%u c=%u\n", * blk_rq_sectors(ace->req), * blk_rq_cur_sectors(ace->req)); From b12d4f82c1a3cdcb2441c803a3368a9426f2f47f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:06 +0900 Subject: [PATCH 0944/2061] paride: dequeue in-flight request pd/pf/pcd have track in-flight request by pd/pf/pcd_req. They can be converted to dequeueing model by updating fetching and completion paths. Convert them. Note that removal of elv_next_request() call from pf_next_buf() doesn't make any functional difference. The path is traveled only during partial completion of a request and elv_next_request() call must return the same request anyway. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Tim Waugh Signed-off-by: Jens Axboe --- drivers/block/paride/pcd.c | 18 ++++++++++++------ drivers/block/paride/pd.c | 14 +++++++++----- drivers/block/paride/pf.c | 14 +++++++------- 3 files changed, 28 insertions(+), 18 deletions(-) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 2d5dc0af55e..425f81586a3 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -719,9 +719,12 @@ static void do_pcd_request(struct request_queue * q) if (pcd_busy) return; while (1) { - pcd_req = elv_next_request(q); - if (!pcd_req) - return; + if (!pcd_req) { + pcd_req = elv_next_request(q); + if (!pcd_req) + return; + blkdev_dequeue_request(pcd_req); + } if (rq_data_dir(pcd_req) == READ) { struct pcd_unit *cd = pcd_req->rq_disk->private_data; @@ -734,8 +737,10 @@ static void do_pcd_request(struct request_queue * q) pcd_busy = 1; ps_set_intr(do_pcd_read, NULL, 0, nice); return; - } else - __blk_end_request_cur(pcd_req, -EIO); + } else { + __blk_end_request_all(pcd_req, -EIO); + pcd_req = NULL; + } } } @@ -744,7 +749,8 @@ static inline void next_request(int err) unsigned long saved_flags; spin_lock_irqsave(&pcd_lock, saved_flags); - __blk_end_request_cur(pcd_req, err); + if (!__blk_end_request_cur(pcd_req, err)) + pcd_req = NULL; pcd_busy = 0; do_pcd_request(pcd_queue); spin_unlock_irqrestore(&pcd_lock, saved_flags); diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index 9ec5d4ac0b6..d2ca3f55206 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -410,11 +410,14 @@ static void run_fsm(void) pd_claimed = 0; phase = NULL; spin_lock_irqsave(&pd_lock, saved_flags); - __blk_end_request_cur(pd_req, - res == Ok ? 0 : -EIO); - pd_req = elv_next_request(pd_queue); - if (!pd_req) - stop = 1; + if (!__blk_end_request_cur(pd_req, + res == Ok ? 0 : -EIO)) { + pd_req = elv_next_request(pd_queue); + if (!pd_req) + stop = 1; + else + blkdev_dequeue_request(pd_req); + } spin_unlock_irqrestore(&pd_lock, saved_flags); if (stop) return; @@ -706,6 +709,7 @@ static void do_pd_request(struct request_queue * q) pd_req = elv_next_request(q); if (!pd_req) return; + blkdev_dequeue_request(pd_req); schedule_fsm(); } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index e88c889aa7f..d6f7bd84ed3 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -752,10 +752,8 @@ static struct request_queue *pf_queue; static void pf_end_request(int err) { - if (pf_req) { - __blk_end_request_cur(pf_req, err); + if (pf_req && !__blk_end_request_cur(pf_req, err)) pf_req = NULL; - } } static void do_pf_request(struct request_queue * q) @@ -763,9 +761,12 @@ static void do_pf_request(struct request_queue * q) if (pf_busy) return; repeat: - pf_req = elv_next_request(q); - if (!pf_req) - return; + if (!pf_req) { + pf_req = elv_next_request(q); + if (!pf_req) + return; + blkdev_dequeue_request(pf_req); + } pf_current = pf_req->rq_disk->private_data; pf_block = blk_rq_pos(pf_req); @@ -806,7 +807,6 @@ static int pf_next_buf(void) if (!pf_count) { spin_lock_irqsave(&pf_spin_lock, saved_flags); pf_end_request(0); - pf_req = elv_next_request(pf_queue); spin_unlock_irqrestore(&pf_spin_lock, saved_flags); if (!pf_req) return 1; From 10e1e629b386aef97bf66de6ef28d450bec06ee3 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:07 +0900 Subject: [PATCH 0945/2061] ps3disk: dequeue in-flight request Other than in issue error paths, ps3disk always completely finishes fetched requests. With full completion on error paths, it can be easily converted to dequeueing model. * After L1 r/w call failure, ps3disk_submit_request_sg() now fails the whole request. Issue failure isn't likely to benefit from partial retry anyway and ps3disk uses full failure in completion error path too, so I don't think this amounts to any meaningful functionality loss. * flush completion is converted to _all for consistency. It doesn't make any functional difference. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Geert Uytterhoeven Signed-off-by: Jens Axboe --- drivers/block/ps3disk.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 8d583081b50..f4d8db944e7 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -157,7 +157,7 @@ static int ps3disk_submit_request_sg(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: %s failed %d\n", __func__, __LINE__, op, res); - __blk_end_request_cur(req, -EIO); + __blk_end_request_all(req, -EIO); return 0; } @@ -179,7 +179,7 @@ static int ps3disk_submit_flush_request(struct ps3_storage_device *dev, if (res) { dev_err(&dev->sbd.core, "%s:%u: sync cache failed 0x%llx\n", __func__, __LINE__, res); - __blk_end_request_cur(req, -EIO); + __blk_end_request_all(req, -EIO); return 0; } @@ -195,6 +195,8 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); while ((req = elv_next_request(q))) { + blkdev_dequeue_request(req); + if (blk_fs_request(req)) { if (ps3disk_submit_request_sg(dev, req)) break; @@ -204,7 +206,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, break; } else { blk_dump_rq_flags(req, DEVICE_NAME " bad request"); - __blk_end_request_cur(req, -EIO); + __blk_end_request_all(req, -EIO); continue; } } From 9e31bebee2d8b5f8abe9677f2a29b90cba848657 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:08 +0900 Subject: [PATCH 0946/2061] amiflop: dequeue in-flight request Request processing in amiflop is done sequentially in redo_fd_request() proper and redo_fd_request() can easily be converted to track in-flight request. Remove CURRENT, track in-flight request directly and dequeue it when processing starts. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/amiflop.c | 48 ++++++++++++++++++++--------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index e4a14b94828..80a68b2e045 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -112,8 +112,6 @@ module_param(fd_def_df0, ulong, 0); MODULE_LICENSE("GPL"); static struct request_queue *floppy_queue; -#define QUEUE (floppy_queue) -#define CURRENT elv_next_request(floppy_queue) /* * Macros @@ -1335,59 +1333,61 @@ static int get_track(int drive, int track) static void redo_fd_request(void) { + struct request *rq; unsigned int cnt, block, track, sector; int drive; struct amiga_floppy_struct *floppy; char *data; unsigned long flags; + int err; - repeat: - if (!CURRENT) { +next_req: + rq = elv_next_request(floppy_queue); + if (!rq) { /* Nothing left to do */ return; } + blkdev_dequeue_request(rq); - floppy = CURRENT->rq_disk->private_data; + floppy = rq->rq_disk->private_data; drive = floppy - unit; +next_segment: /* Here someone could investigate to be more efficient */ - for (cnt = 0; cnt < blk_rq_cur_sectors(CURRENT); cnt++) { + for (cnt = 0, err = 0; cnt < blk_rq_cur_sectors(rq); cnt++) { #ifdef DEBUG printk("fd: sector %ld + %d requested for %s\n", - blk_rq_pos(CURRENT), cnt, - (rq_data_dir(CURRENT) == READ) ? "read" : "write"); + blk_rq_pos(rq), cnt, + (rq_data_dir(rq) == READ) ? "read" : "write"); #endif - block = blk_rq_pos(CURRENT) + cnt; + block = blk_rq_pos(rq) + cnt; if ((int)block > floppy->blocks) { - __blk_end_request_cur(CURRENT, -EIO); - goto repeat; + err = -EIO; + break; } track = block / (floppy->dtype->sects * floppy->type->sect_mult); sector = block % (floppy->dtype->sects * floppy->type->sect_mult); - data = CURRENT->buffer + 512 * cnt; + data = rq->buffer + 512 * cnt; #ifdef DEBUG printk("access to track %d, sector %d, with buffer at " "0x%08lx\n", track, sector, data); #endif if (get_track(drive, track) == -1) { - __blk_end_request_cur(CURRENT, -EIO); - goto repeat; + err = -EIO; + break; } - switch (rq_data_dir(CURRENT)) { - case READ: + if (rq_data_dir(rq) == READ) { memcpy(data, floppy->trackbuf + sector * 512, 512); - break; - - case WRITE: + } else { memcpy(floppy->trackbuf + sector * 512, data, 512); /* keep the drive spinning while writes are scheduled */ if (!fd_motor_on(drive)) { - __blk_end_request_cur(CURRENT, -EIO); - goto repeat; + err = -EIO; + break; } /* * setup a callback to write the track buffer @@ -1399,12 +1399,12 @@ static void redo_fd_request(void) /* reset the timer */ mod_timer (flush_track_timer + drive, jiffies + 1); local_irq_restore(flags); - break; } } - __blk_end_request_cur(CURRENT, 0); - goto repeat; + if (__blk_end_request_cur(rq, err)) + goto next_segment; + goto next_req; } static void do_fd_request(struct request_queue * q) From 06b0608e2b46465e8e663214e7db982ddb000346 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:09 +0900 Subject: [PATCH 0947/2061] swim: dequeue in-flight request swim processes requests one-by-one synchronously and can easily be converted to dequeuing model. Convert it. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Laurent Vivier Signed-off-by: Jens Axboe --- drivers/block/swim.c | 47 ++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 24 deletions(-) diff --git a/drivers/block/swim.c b/drivers/block/swim.c index fc6a1c32293..dedd4893f5e 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -514,7 +514,7 @@ static int floppy_read_sectors(struct floppy_state *fs, ret = swim_read_sector(fs, side, track, sector, buffer); if (try-- == 0) - return -1; + return -EIO; } while (ret != 512); buffer += ret; @@ -528,38 +528,37 @@ static void redo_fd_request(struct request_queue *q) struct request *req; struct floppy_state *fs; - while ((req = elv_next_request(q))) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + + while (req) { + int err = -EIO; fs = req->rq_disk->private_data; - if (blk_rq_pos(req) >= fs->total_secs) { - __blk_end_request_cur(req, -EIO); - continue; - } - if (!fs->disk_in) { - __blk_end_request_cur(req, -EIO); - continue; - } - if (rq_data_dir(req) == WRITE) { - if (fs->write_protected) { - __blk_end_request_cur(req, -EIO); - continue; - } - } + if (blk_rq_pos(req) >= fs->total_secs) + goto done; + if (!fs->disk_in) + goto done; + if (rq_data_dir(req) == WRITE && fs->write_protected) + goto done; + switch (rq_data_dir(req)) { case WRITE: /* NOT IMPLEMENTED */ - __blk_end_request_cur(req, -EIO); break; case READ: - if (floppy_read_sectors(fs, blk_rq_pos(req), - blk_rq_cur_sectors(req), - req->buffer)) { - __blk_end_request_cur(req, -EIO); - continue; - } - __blk_end_request_cur(req, 0); + err = floppy_read_sectors(fs, blk_rq_pos(req), + blk_rq_cur_sectors(req), + req->buffer); break; } + done: + if (!__blk_end_request_cur(req, err)) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + } } } From bab2a807a489822ded0c9d4a5344c80bcac10b0a Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:10 +0900 Subject: [PATCH 0948/2061] xd: dequeue in-flight request xd processes requests one-by-one synchronously and can be easily converted to dequeueing model. Convert it. While at it, use rq_cur_bytes instead of rq_bytes when checking for sector overflow. This is for for consistency and better behavior for merged requests. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/xd.c | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/drivers/block/xd.c b/drivers/block/xd.c index 4ef88018bcd..d4c4352354b 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -305,26 +305,31 @@ static void do_xd_request (struct request_queue * q) if (xdc_busy) return; - while ((req = elv_next_request(q)) != NULL) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + + while (req) { unsigned block = blk_rq_pos(req); - unsigned count = blk_rq_sectors(req); + unsigned count = blk_rq_cur_sectors(req); XD_INFO *disk = req->rq_disk->private_data; - int res = 0; + int res = -EIO; int retry; - if (!blk_fs_request(req)) { - __blk_end_request_cur(req, -EIO); - continue; - } - if (block + count > get_capacity(req->rq_disk)) { - __blk_end_request_cur(req, -EIO); - continue; - } + if (!blk_fs_request(req)) + goto done; + if (block + count > get_capacity(req->rq_disk)) + goto done; for (retry = 0; (retry < XD_RETRIES) && !res; retry++) res = xd_readwrite(rq_data_dir(req), disk, req->buffer, block, count); + done: /* wrap up, 0 = success, -errno = fail */ - __blk_end_request_cur(req, res); + if (!__blk_end_request_cur(req, res)) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + } } } From 1498ada7a8e80afe324f2b8d86158925d0096ec9 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:11 +0900 Subject: [PATCH 0949/2061] mtd_blkdevs: dequeue in-flight request mtd_blkdevs processes requests one-by-one synchronously from a kthread and can be easily converted to dequeueing model. Convert it. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: David Woodhouse Signed-off-by: Jens Axboe --- drivers/mtd/mtd_blkdevs.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 50c76a2ca76..3e10442615d 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -89,18 +89,22 @@ static int mtd_blktrans_thread(void *arg) { struct mtd_blktrans_ops *tr = arg; struct request_queue *rq = tr->blkcore_priv->rq; + struct request *req = NULL; /* we might get involved when memory gets low, so use PF_MEMALLOC */ current->flags |= PF_MEMALLOC; spin_lock_irq(rq->queue_lock); + while (!kthread_should_stop()) { - struct request *req; struct mtd_blktrans_dev *dev; int res; - req = elv_next_request(rq); - + if (!req) { + req = elv_next_request(rq); + if (req) + blkdev_dequeue_request(req); + } if (!req) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(rq->queue_lock); @@ -120,8 +124,13 @@ static int mtd_blktrans_thread(void *arg) spin_lock_irq(rq->queue_lock); - __blk_end_request_cur(req, res); + if (!__blk_end_request_cur(req, res)) + req = NULL; } + + if (req) + __blk_end_request_all(req, -EIO); + spin_unlock_irq(rq->queue_lock); return 0; From 6b0bf407b586b6ba8e060ad9979cb2bc3370b7eb Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:12 +0900 Subject: [PATCH 0950/2061] jsflash: dequeue in-flight request jsflash processes requests one-by-one synchronously from a kthread and can be easily converted to dequeueing model. Convert it. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Pete Zaitcev Signed-off-by: Jens Axboe --- drivers/sbus/char/jsflash.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index d56ddaa7703..f572a4a1d14 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -186,31 +186,37 @@ static void jsfd_do_request(struct request_queue *q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + + while (req) { struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; size_t len = blk_rq_cur_bytes(req); + int err = -EIO; - if ((offset + len) > jdp->dsize) { - __blk_end_request_cur(req, -EIO); - continue; - } + if ((offset + len) > jdp->dsize) + goto end; if (rq_data_dir(req) != READ) { printk(KERN_ERR "jsfd: write\n"); - __blk_end_request_cur(req, -EIO); - continue; + goto end; } if ((jdp->dbase & 0xff000000) != 0x20000000) { printk(KERN_ERR "jsfd: bad base %x\n", (int)jdp->dbase); - __blk_end_request_cur(req, -EIO); - continue; + goto end; } jsfd_read(req->buffer, jdp->dbase + offset, len); - - __blk_end_request_cur(req, 0); + err = 0; + end: + if (!__blk_end_request_cur(req, err)) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + } } } From fb3ac7f6b811eac8e0aafa3df1c16ed872e898a8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:13 +0900 Subject: [PATCH 0951/2061] z2ram: dequeue in-flight request z2ram processes requests one-by-one synchronously and can be easily converted to dequeueing model. Convert it. [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- drivers/block/z2ram.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index 6a1383834ec..c909c1a3f65 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -70,15 +70,21 @@ static struct gendisk *z2ram_gendisk; static void do_z2_request(struct request_queue *q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + + while (req) { unsigned long start = blk_rq_pos(req) << 9; unsigned long len = blk_rq_cur_bytes(req); + int err = 0; if (start + len > z2ram_size) { printk( KERN_ERR DEVICE_NAME ": bad access: block=%lu, count=%u\n", blk_rq_pos(req), blk_rq_cur_sectors(req)); - __blk_end_request_cur(req, -EIO); - continue; + err = -EIO; + goto done; } while (len) { unsigned long addr = start & Z2RAM_CHUNKMASK; @@ -93,7 +99,12 @@ static void do_z2_request(struct request_queue *q) start += size; len -= size; } - __blk_end_request_cur(req, 0); + done: + if (!__blk_end_request_cur(req, err)) { + req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + } } } From 296b2f6ae654581adc27f0d6f0af454c7f3d06ee Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:15 +0900 Subject: [PATCH 0952/2061] block: convert to dequeueing model (easy ones) plat-omap/mailbox, floppy, viocd, mspro_block, i2o_block and mmc/card/queue are already pretty close to dequeueing model and can be converted with simple changes. Convert them. While at it, * xen-blkfront: !fs check moved downwards to share dequeue call with normal path. * mspro_block: __blk_end_request(..., blk_rq_cur_byte()) converted to __blk_end_request_cur() * mmc/card/queue: loop of __blk_end_request() converted to __blk_end_request_all() [ Impact: dequeue in-flight request ] Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: Jeremy Fitzhardinge Cc: Stephen Rothwell Cc: Alex Dubov Cc: Markus Lidel Cc: Pierre Ossman Signed-off-by: Jens Axboe --- arch/arm/plat-omap/mailbox.c | 9 +++++++++ drivers/block/floppy.c | 4 +++- drivers/block/xen-blkfront.c | 13 +++++++------ drivers/cdrom/viocd.c | 2 ++ drivers/memstick/core/mspro_block.c | 8 +++++--- drivers/message/i2o/i2o_block.c | 6 ++++-- drivers/mmc/card/queue.c | 12 ++++++------ 7 files changed, 36 insertions(+), 18 deletions(-) diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 538ba7541d3..7a1f5c25fd1 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -198,6 +198,8 @@ static void mbox_tx_work(struct work_struct *work) spin_lock(q->queue_lock); rq = elv_next_request(q); + if (rq) + blkdev_dequeue_request(rq); spin_unlock(q->queue_lock); if (!rq) @@ -208,6 +210,9 @@ static void mbox_tx_work(struct work_struct *work) ret = __mbox_msg_send(mbox, tx_data->msg, tx_data->arg); if (ret) { enable_mbox_irq(mbox, IRQ_TX); + spin_lock(q->queue_lock); + blk_requeue_request(q, rq); + spin_unlock(q->queue_lock); return; } @@ -238,6 +243,8 @@ static void mbox_rx_work(struct work_struct *work) while (1) { spin_lock_irqsave(q->queue_lock, flags); rq = elv_next_request(q); + if (rq) + blkdev_dequeue_request(rq); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) break; @@ -345,6 +352,8 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) while (1) { spin_lock_irqsave(q->queue_lock, flags); rq = elv_next_request(q); + if (rq) + blkdev_dequeue_request(rq); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index 1e27ed9208b..e2c70d2085a 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -931,7 +931,7 @@ static inline void unlock_fdc(void) del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); - if (elv_next_request(floppy_queue)) + if (current_req || elv_next_request(floppy_queue)) do_fd_request(floppy_queue); spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); @@ -2913,6 +2913,8 @@ static void redo_fd_request(void) spin_lock_irq(floppy_queue->queue_lock); req = elv_next_request(floppy_queue); + if (req) + blkdev_dequeue_request(req); spin_unlock_irq(floppy_queue->queue_lock); if (!req) { do_floppy = NULL; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 91fc56597e9..66f834571b8 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -301,22 +301,23 @@ static void do_blkif_request(struct request_queue *rq) while ((req = elv_next_request(rq)) != NULL) { info = req->rq_disk->private_data; - if (!blk_fs_request(req)) { - __blk_end_request_cur(req, -EIO); - continue; - } if (RING_FULL(&info->ring)) goto wait; + blkdev_dequeue_request(req); + + if (!blk_fs_request(req)) { + __blk_end_request_all(req, -EIO); + continue; + } + pr_debug("do_blk_req %p: cmd %p, sec %lx, " "(%u/%u) buffer:%p [%s]\n", req, req->cmd, (unsigned long)blk_rq_pos(req), blk_rq_cur_sectors(req), blk_rq_sectors(req), req->buffer, rq_data_dir(req) ? "write" : "read"); - - blkdev_dequeue_request(req); if (blkif_queue_request(req)) { blk_requeue_request(rq, req); wait: diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index 6e190a93d8d..bbe9f086734 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -298,6 +298,8 @@ static void do_viocd_request(struct request_queue *q) struct request *req; while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) { + blkdev_dequeue_request(req); + if (!blk_fs_request(req)) __blk_end_request_all(req, -EIO); else if (send_request(req) < 0) { diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 93b2c618565..58f5be8cd69 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -672,8 +672,7 @@ try_again: msb->req_sg); if (!msb->seg_count) { - chunk = __blk_end_request(msb->block_req, -ENOMEM, - blk_rq_cur_bytes(msb->block_req)); + chunk = __blk_end_request_cur(msb->block_req, -ENOMEM); continue; } @@ -711,6 +710,7 @@ try_again: dev_dbg(&card->dev, "issue end\n"); return -EAGAIN; } + blkdev_dequeue_request(msb->block_req); dev_dbg(&card->dev, "trying again\n"); chunk = 1; @@ -825,8 +825,10 @@ static void mspro_block_submit_req(struct request_queue *q) return; if (msb->eject) { - while ((req = elv_next_request(q)) != NULL) + while ((req = elv_next_request(q)) != NULL) { + blkdev_dequeue_request(req); __blk_end_request_all(req, -ENODEV); + } return; } diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index e153f5d5237..8b5cbfc3ba9 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -916,8 +916,10 @@ static void i2o_block_request_fn(struct request_queue *q) blk_stop_queue(q); break; } - } else - __blk_end_request_cur(req, -EIO); + } else { + blkdev_dequeue_request(req); + __blk_end_request_all(req, -EIO); + } } }; diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 7a72e75d5c6..4b70f1e2834 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -54,8 +54,11 @@ static int mmc_queue_thread(void *d) spin_lock_irq(q->queue_lock); set_current_state(TASK_INTERRUPTIBLE); - if (!blk_queue_plugged(q)) + if (!blk_queue_plugged(q)) { req = elv_next_request(q); + if (req) + blkdev_dequeue_request(req); + } mq->req = req; spin_unlock_irq(q->queue_lock); @@ -88,15 +91,12 @@ static void mmc_request(struct request_queue *q) { struct mmc_queue *mq = q->queuedata; struct request *req; - int ret; if (!mq) { printk(KERN_ERR "MMC: killing requests for dead queue\n"); while ((req = elv_next_request(q)) != NULL) { - do { - ret = __blk_end_request(req, -EIO, - blk_rq_cur_bytes(req)); - } while (ret); + blkdev_dequeue_request(req); + __blk_end_request_all(req, -EIO); } return; } From 2343046826a8ca426b07601d9593ee046c298b68 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:14 +0900 Subject: [PATCH 0953/2061] gdrom: dequeue in-flight request gdrom already dequeues and fully completes requests on normal path and the error paths can be easily converted to do so too. Clean it up and dequeue requests on error paths too. While at it remove superflous blk_fs_request() && !blk_rq_sectors() condition check. [ Impact: dequeue in-flight request, cleanup ] Signed-off-by: Tejun Heo Cc: Adrian McMenamin Signed-off-by: Jens Axboe --- drivers/cdrom/gdrom.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 488423cab51..3cc02bfe828 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -638,33 +638,31 @@ static void gdrom_readdisk_dma(struct work_struct *work) kfree(read_command); } -static void gdrom_request_handler_dma(struct request *req) -{ - /* dequeue, add to list of deferred work - * and then schedule workqueue */ - blkdev_dequeue_request(req); - list_add_tail(&req->queuelist, &gdrom_deferred); - schedule_work(&work); -} - static void gdrom_request(struct request_queue *rq) { struct request *req; while ((req = elv_next_request(rq)) != NULL) { + blkdev_dequeue_request(req); + if (!blk_fs_request(req)) { printk(KERN_DEBUG "GDROM: Non-fs request ignored\n"); - __blk_end_request_cur(req, -EIO); + __blk_end_request_all(req, -EIO); + continue; } if (rq_data_dir(req) != READ) { printk(KERN_NOTICE "GDROM: Read only device -"); printk(" write request ignored\n"); - __blk_end_request_cur(req, -EIO); + __blk_end_request_all(req, -EIO); + continue; } - if (blk_rq_sectors(req)) - gdrom_request_handler_dma(req); - else - __blk_end_request_cur(req, -EIO); + + /* + * Add to list of deferred work and then schedule + * workqueue. + */ + list_add_tail(&req->queuelist, &gdrom_deferred); + schedule_work(&work); } } From 9934c8c04561413609d2bc38c6b9f268cba774a4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 8 May 2009 11:54:16 +0900 Subject: [PATCH 0954/2061] block: implement and enforce request peek/start/fetch Till now block layer allowed two separate modes of request execution. A request is always acquired from the request queue via elv_next_request(). After that, drivers are free to either dequeue it or process it without dequeueing. Dequeue allows elv_next_request() to return the next request so that multiple requests can be in flight. Executing requests without dequeueing has its merits mostly in allowing drivers for simpler devices which can't do sg to deal with segments only without considering request boundary. However, the benefit this brings is dubious and declining while the cost of the API ambiguity is increasing. Segment based drivers are usually for very old or limited devices and as converting to dequeueing model isn't difficult, it doesn't justify the API overhead it puts on block layer and its more modern users. Previous patches converted all block low level drivers to dequeueing model. This patch completes the API transition by... * renaming elv_next_request() to blk_peek_request() * renaming blkdev_dequeue_request() to blk_start_request() * adding blk_fetch_request() which is combination of peek and start * disallowing completion of queued (not started) requests * applying new API to all LLDs Renamings are for consistency and to break out of tree code so that it's apparent that out of tree drivers need updating. [ Impact: block request issue API cleanup, no functional change ] Signed-off-by: Tejun Heo Cc: Rusty Russell Cc: James Bottomley Cc: Mike Miller Cc: unsik Kim Cc: Paul Clements Cc: Tim Waugh Cc: Geert Uytterhoeven Cc: David S. Miller Cc: Laurent Vivier Cc: Jeff Garzik Cc: Jeremy Fitzhardinge Cc: Grant Likely Cc: Adrian McMenamin Cc: Stephen Rothwell Cc: Bartlomiej Zolnierkiewicz Cc: Borislav Petkov Cc: Sergei Shtylyov Cc: Alex Dubov Cc: Pierre Ossman Cc: David Woodhouse Cc: Markus Lidel Cc: Stefan Weinhuber Cc: Martin Schwidefsky Cc: Pete Zaitcev Cc: FUJITA Tomonori Signed-off-by: Jens Axboe --- arch/arm/plat-omap/mailbox.c | 12 +--- arch/um/drivers/ubd_kern.c | 3 +- block/blk-barrier.c | 4 +- block/blk-core.c | 105 ++++++++++++++++++++-------- block/blk-tag.c | 2 +- block/blk.h | 1 + drivers/block/DAC960.c | 4 +- drivers/block/amiflop.c | 3 +- drivers/block/ataflop.c | 3 +- drivers/block/cciss.c | 4 +- drivers/block/cpqarray.c | 4 +- drivers/block/floppy.c | 6 +- drivers/block/hd.c | 3 +- drivers/block/mg_disk.c | 12 ++-- drivers/block/nbd.c | 4 +- drivers/block/paride/pcd.c | 3 +- drivers/block/paride/pd.c | 7 +- drivers/block/paride/pf.c | 3 +- drivers/block/ps3disk.c | 4 +- drivers/block/sunvdc.c | 3 +- drivers/block/swim.c | 12 +--- drivers/block/swim3.c | 3 +- drivers/block/sx8.c | 8 +-- drivers/block/ub.c | 8 +-- drivers/block/viodasd.c | 4 +- drivers/block/virtio_blk.c | 4 +- drivers/block/xd.c | 12 +--- drivers/block/xen-blkfront.c | 4 +- drivers/block/xsysace.c | 10 ++- drivers/block/z2ram.c | 12 +--- drivers/cdrom/gdrom.c | 4 +- drivers/cdrom/viocd.c | 4 +- drivers/ide/ide-atapi.c | 2 +- drivers/ide/ide-io.c | 9 +-- drivers/memstick/core/mspro_block.c | 9 +-- drivers/message/i2o/i2o_block.c | 6 +- drivers/mmc/card/queue.c | 11 +-- drivers/mtd/mtd_blkdevs.c | 7 +- drivers/s390/block/dasd.c | 16 ++--- drivers/s390/char/tape_block.c | 7 +- drivers/sbus/char/jsflash.c | 12 +--- drivers/scsi/scsi_lib.c | 10 +-- drivers/scsi/scsi_transport_sas.c | 4 +- include/linux/blkdev.h | 9 ++- include/linux/elevator.h | 2 - 45 files changed, 172 insertions(+), 207 deletions(-) diff --git a/arch/arm/plat-omap/mailbox.c b/arch/arm/plat-omap/mailbox.c index 7a1f5c25fd1..40424edae93 100644 --- a/arch/arm/plat-omap/mailbox.c +++ b/arch/arm/plat-omap/mailbox.c @@ -197,9 +197,7 @@ static void mbox_tx_work(struct work_struct *work) struct omap_msg_tx_data *tx_data; spin_lock(q->queue_lock); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq); + rq = blk_fetch_request(q); spin_unlock(q->queue_lock); if (!rq) @@ -242,9 +240,7 @@ static void mbox_rx_work(struct work_struct *work) while (1) { spin_lock_irqsave(q->queue_lock, flags); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq); + rq = blk_fetch_request(q); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) break; @@ -351,9 +347,7 @@ omap_mbox_read(struct device *dev, struct device_attribute *attr, char *buf) while (1) { spin_lock_irqsave(q->queue_lock, flags); - rq = elv_next_request(q); - if (rq) - blkdev_dequeue_request(rq); + rq = blk_fetch_request(q); spin_unlock_irqrestore(q->queue_lock, flags); if (!rq) diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c index 402ba8f70fc..aa9e926e13d 100644 --- a/arch/um/drivers/ubd_kern.c +++ b/arch/um/drivers/ubd_kern.c @@ -1228,12 +1228,11 @@ static void do_ubd_request(struct request_queue *q) while(1){ struct ubd *dev = q->queuedata; if(dev->end_sg == 0){ - struct request *req = elv_next_request(q); + struct request *req = blk_fetch_request(q); if(req == NULL) return; dev->request = req; - blkdev_dequeue_request(req); dev->start_sg = 0; dev->end_sg = blk_rq_map_sg(q, req, dev->sg); } diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 8713c2fbc4f..0ab81a0a750 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -180,7 +180,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } /* stash away the original request */ - elv_dequeue_request(q, rq); + blk_dequeue_request(rq); q->orig_bar_rq = rq; rq = NULL; @@ -248,7 +248,7 @@ bool blk_do_ordered(struct request_queue *q, struct request **rqp) * Queue ordering not supported. Terminate * with prejudice. */ - elv_dequeue_request(q, rq); + blk_dequeue_request(rq); __blk_end_request_all(rq, -EOPNOTSUPP); *rqp = NULL; return false; diff --git a/block/blk-core.c b/block/blk-core.c index 6226a380fb6..93691d2ac5a 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -902,6 +902,8 @@ EXPORT_SYMBOL(blk_get_request); */ void blk_requeue_request(struct request_queue *q, struct request *rq) { + BUG_ON(blk_queued_rq(rq)); + blk_delete_timer(rq); blk_clear_rq_complete(rq); trace_block_rq_requeue(q, rq); @@ -1610,28 +1612,6 @@ int blk_insert_cloned_request(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL_GPL(blk_insert_cloned_request); -/** - * blkdev_dequeue_request - dequeue request and start timeout timer - * @req: request to dequeue - * - * Dequeue @req and start timeout timer on it. This hands off the - * request to the driver. - * - * Block internal functions which don't want to start timer should - * call elv_dequeue_request(). - */ -void blkdev_dequeue_request(struct request *req) -{ - elv_dequeue_request(req->q, req); - - /* - * We are now handing the request to the hardware, add the - * timeout handler. - */ - blk_add_timer(req); -} -EXPORT_SYMBOL(blkdev_dequeue_request); - static void blk_account_io_completion(struct request *req, unsigned int bytes) { if (blk_do_io_stat(req)) { @@ -1671,7 +1651,23 @@ static void blk_account_io_done(struct request *req) } } -struct request *elv_next_request(struct request_queue *q) +/** + * blk_peek_request - peek at the top of a request queue + * @q: request queue to peek at + * + * Description: + * Return the request at the top of @q. The returned request + * should be started using blk_start_request() before LLD starts + * processing it. + * + * Return: + * Pointer to the request at the top of @q if available. Null + * otherwise. + * + * Context: + * queue_lock must be held. + */ +struct request *blk_peek_request(struct request_queue *q) { struct request *rq; int ret; @@ -1748,10 +1744,12 @@ struct request *elv_next_request(struct request_queue *q) return rq; } -EXPORT_SYMBOL(elv_next_request); +EXPORT_SYMBOL(blk_peek_request); -void elv_dequeue_request(struct request_queue *q, struct request *rq) +void blk_dequeue_request(struct request *rq) { + struct request_queue *q = rq->q; + BUG_ON(list_empty(&rq->queuelist)); BUG_ON(ELV_ON_HASH(rq)); @@ -1766,6 +1764,58 @@ void elv_dequeue_request(struct request_queue *q, struct request *rq) q->in_flight++; } +/** + * blk_start_request - start request processing on the driver + * @req: request to dequeue + * + * Description: + * Dequeue @req and start timeout timer on it. This hands off the + * request to the driver. + * + * Block internal functions which don't want to start timer should + * call blk_dequeue_request(). + * + * Context: + * queue_lock must be held. + */ +void blk_start_request(struct request *req) +{ + blk_dequeue_request(req); + + /* + * We are now handing the request to the hardware, add the + * timeout handler. + */ + blk_add_timer(req); +} +EXPORT_SYMBOL(blk_start_request); + +/** + * blk_fetch_request - fetch a request from a request queue + * @q: request queue to fetch a request from + * + * Description: + * Return the request at the top of @q. The request is started on + * return and LLD can start processing it immediately. + * + * Return: + * Pointer to the request at the top of @q if available. Null + * otherwise. + * + * Context: + * queue_lock must be held. + */ +struct request *blk_fetch_request(struct request_queue *q) +{ + struct request *rq; + + rq = blk_peek_request(q); + if (rq) + blk_start_request(rq); + return rq; +} +EXPORT_SYMBOL(blk_fetch_request); + /** * blk_update_request - Special helper function for request stacking drivers * @rq: the request being processed @@ -1937,12 +1987,11 @@ static bool blk_update_bidi_request(struct request *rq, int error, */ static void blk_finish_request(struct request *req, int error) { + BUG_ON(blk_queued_rq(req)); + if (blk_rq_tagged(req)) blk_queue_end_tag(req->q, req); - if (blk_queued_rq(req)) - elv_dequeue_request(req->q, req); - if (unlikely(laptop_mode) && blk_fs_request(req)) laptop_io_completion(); diff --git a/block/blk-tag.c b/block/blk-tag.c index 3c518e3303a..c260f7c30dd 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -374,7 +374,7 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) rq->cmd_flags |= REQ_QUEUED; rq->tag = tag; bqt->tag_index[tag] = rq; - blkdev_dequeue_request(rq); + blk_start_request(rq); list_add(&rq->queuelist, &q->tag_busy_list); return 0; } diff --git a/block/blk.h b/block/blk.h index ab54529103c..9e0042ca949 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,7 @@ extern struct kobj_type blk_queue_ktype; void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); +void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); void blk_unplug_work(struct work_struct *work); diff --git a/drivers/block/DAC960.c b/drivers/block/DAC960.c index 774ab05973a..668dc234b8e 100644 --- a/drivers/block/DAC960.c +++ b/drivers/block/DAC960.c @@ -3321,7 +3321,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_ DAC960_Command_T *Command; while(1) { - Request = elv_next_request(req_q); + Request = blk_peek_request(req_q); if (!Request) return 1; @@ -3341,7 +3341,7 @@ static int DAC960_process_queue(DAC960_Controller_T *Controller, struct request_ Command->BlockNumber = blk_rq_pos(Request); Command->BlockCount = blk_rq_sectors(Request); Command->Request = Request; - blkdev_dequeue_request(Request); + blk_start_request(Request); Command->SegmentCount = blk_rq_map_sg(req_q, Command->Request, Command->cmd_sglist); /* pci_map_sg MAY change the value of SegCount */ diff --git a/drivers/block/amiflop.c b/drivers/block/amiflop.c index 80a68b2e045..9c6e5b0fe89 100644 --- a/drivers/block/amiflop.c +++ b/drivers/block/amiflop.c @@ -1342,12 +1342,11 @@ static void redo_fd_request(void) int err; next_req: - rq = elv_next_request(floppy_queue); + rq = blk_fetch_request(floppy_queue); if (!rq) { /* Nothing left to do */ return; } - blkdev_dequeue_request(rq); floppy = rq->rq_disk->private_data; drive = floppy - unit; diff --git a/drivers/block/ataflop.c b/drivers/block/ataflop.c index 89a591d9c83..f5e7180d7f4 100644 --- a/drivers/block/ataflop.c +++ b/drivers/block/ataflop.c @@ -1404,10 +1404,9 @@ static void redo_fd_request(void) repeat: if (!fd_request) { - fd_request = elv_next_request(floppy_queue); + fd_request = blk_fetch_request(floppy_queue); if (!fd_request) goto the_end; - blkdev_dequeue_request(fd_request); } floppy = fd_request->rq_disk->private_data; diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index ab7b04c0db7..e714e7cce6f 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -2801,7 +2801,7 @@ static void do_cciss_request(struct request_queue *q) goto startio; queue: - creq = elv_next_request(q); + creq = blk_peek_request(q); if (!creq) goto startio; @@ -2810,7 +2810,7 @@ static void do_cciss_request(struct request_queue *q) if ((c = cmd_alloc(h, 1)) == NULL) goto full; - blkdev_dequeue_request(creq); + blk_start_request(creq); spin_unlock_irq(q->queue_lock); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index a5caeff4718..a02dcfc00f1 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -903,7 +903,7 @@ static void do_ida_request(struct request_queue *q) goto startio; queue_next: - creq = elv_next_request(q); + creq = blk_peek_request(q); if (!creq) goto startio; @@ -912,7 +912,7 @@ queue_next: if ((c = cmd_alloc(h,1)) == NULL) goto startio; - blkdev_dequeue_request(creq); + blk_start_request(creq); c->ctlr = h->ctlr; c->hdr.unit = (drv_info_t *)(creq->rq_disk->private_data) - h->drv; diff --git a/drivers/block/floppy.c b/drivers/block/floppy.c index e2c70d2085a..90877fee0ee 100644 --- a/drivers/block/floppy.c +++ b/drivers/block/floppy.c @@ -931,7 +931,7 @@ static inline void unlock_fdc(void) del_timer(&fd_timeout); cont = NULL; clear_bit(0, &fdc_busy); - if (current_req || elv_next_request(floppy_queue)) + if (current_req || blk_peek_request(floppy_queue)) do_fd_request(floppy_queue); spin_unlock_irqrestore(&floppy_lock, flags); wake_up(&fdc_wait); @@ -2912,9 +2912,7 @@ static void redo_fd_request(void) struct request *req; spin_lock_irq(floppy_queue->queue_lock); - req = elv_next_request(floppy_queue); - if (req) - blkdev_dequeue_request(req); + req = blk_fetch_request(floppy_queue); spin_unlock_irq(floppy_queue->queue_lock); if (!req) { do_floppy = NULL; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 288ab63c102..961de56d00a 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -592,12 +592,11 @@ repeat: del_timer(&device_timer); if (!hd_req) { - hd_req = elv_next_request(hd_queue); + hd_req = blk_fetch_request(hd_queue); if (!hd_req) { do_hd = NULL; return; } - blkdev_dequeue_request(hd_req); } req = hd_req; diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index 1ca5d1423fa..c0cd0a03f69 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -671,10 +671,8 @@ static void mg_request_poll(struct request_queue *q) while (1) { if (!host->req) { - host->req = elv_next_request(q); - if (host->req) - blkdev_dequeue_request(host->req); - else + host->req = blk_fetch_request(q); + if (!host->req) break; } @@ -744,10 +742,8 @@ static void mg_request(struct request_queue *q) while (1) { if (!host->req) { - host->req = elv_next_request(q); - if (host->req) - blkdev_dequeue_request(host->req); - else + host->req = blk_fetch_request(q); + if (!host->req) break; } req = host->req; diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c index fad167de23b..5d23ffad7c7 100644 --- a/drivers/block/nbd.c +++ b/drivers/block/nbd.c @@ -533,11 +533,9 @@ static void do_nbd_request(struct request_queue *q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_fetch_request(q)) != NULL) { struct nbd_device *lo; - blkdev_dequeue_request(req); - spin_unlock_irq(q->queue_lock); dprintk(DBG_BLKDEV, "%s: request %p: dequeued (flags=%x)\n", diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index 425f81586a3..911dfd98d81 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -720,10 +720,9 @@ static void do_pcd_request(struct request_queue * q) return; while (1) { if (!pcd_req) { - pcd_req = elv_next_request(q); + pcd_req = blk_fetch_request(q); if (!pcd_req) return; - blkdev_dequeue_request(pcd_req); } if (rq_data_dir(pcd_req) == READ) { diff --git a/drivers/block/paride/pd.c b/drivers/block/paride/pd.c index d2ca3f55206..bf5955b3d87 100644 --- a/drivers/block/paride/pd.c +++ b/drivers/block/paride/pd.c @@ -412,11 +412,9 @@ static void run_fsm(void) spin_lock_irqsave(&pd_lock, saved_flags); if (!__blk_end_request_cur(pd_req, res == Ok ? 0 : -EIO)) { - pd_req = elv_next_request(pd_queue); + pd_req = blk_fetch_request(pd_queue); if (!pd_req) stop = 1; - else - blkdev_dequeue_request(pd_req); } spin_unlock_irqrestore(&pd_lock, saved_flags); if (stop) @@ -706,10 +704,9 @@ static void do_pd_request(struct request_queue * q) { if (pd_req) return; - pd_req = elv_next_request(q); + pd_req = blk_fetch_request(q); if (!pd_req) return; - blkdev_dequeue_request(pd_req); schedule_fsm(); } diff --git a/drivers/block/paride/pf.c b/drivers/block/paride/pf.c index d6f7bd84ed3..68a90834e99 100644 --- a/drivers/block/paride/pf.c +++ b/drivers/block/paride/pf.c @@ -762,10 +762,9 @@ static void do_pf_request(struct request_queue * q) return; repeat: if (!pf_req) { - pf_req = elv_next_request(q); + pf_req = blk_fetch_request(q); if (!pf_req) return; - blkdev_dequeue_request(pf_req); } pf_current = pf_req->rq_disk->private_data; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index f4d8db944e7..338cee4cc0b 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -194,9 +194,7 @@ static void ps3disk_do_request(struct ps3_storage_device *dev, dev_dbg(&dev->sbd.core, "%s:%u\n", __func__, __LINE__); - while ((req = elv_next_request(q))) { - blkdev_dequeue_request(req); - + while ((req = blk_fetch_request(q))) { if (blk_fs_request(req)) { if (ps3disk_submit_request_sg(dev, req)) break; diff --git a/drivers/block/sunvdc.c b/drivers/block/sunvdc.c index 9f351bfa15e..cbfd9c0aef0 100644 --- a/drivers/block/sunvdc.c +++ b/drivers/block/sunvdc.c @@ -441,12 +441,11 @@ out: static void do_vdc_request(struct request_queue *q) { while (1) { - struct request *req = elv_next_request(q); + struct request *req = blk_fetch_request(q); if (!req) break; - blkdev_dequeue_request(req); if (__send_request(req) < 0) __blk_end_request_all(req, -EIO); } diff --git a/drivers/block/swim.c b/drivers/block/swim.c index dedd4893f5e..cf7877fb8a7 100644 --- a/drivers/block/swim.c +++ b/drivers/block/swim.c @@ -528,10 +528,7 @@ static void redo_fd_request(struct request_queue *q) struct request *req; struct floppy_state *fs; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { int err = -EIO; @@ -554,11 +551,8 @@ static void redo_fd_request(struct request_queue *q) break; } done: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/block/swim3.c b/drivers/block/swim3.c index f48c6dd47e0..80df93e3cdd 100644 --- a/drivers/block/swim3.c +++ b/drivers/block/swim3.c @@ -326,10 +326,9 @@ static void start_request(struct floppy_state *fs) } while (fs->state == idle) { if (!fd_req) { - fd_req = elv_next_request(swim3_queue); + fd_req = blk_fetch_request(swim3_queue); if (!fd_req) break; - blkdev_dequeue_request(fd_req); } req = fd_req; #if 0 diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 087c94c8b2d..da403b6a7f4 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -810,12 +810,10 @@ static void carm_oob_rq_fn(struct request_queue *q) while (1) { DPRINTK("get req\n"); - rq = elv_next_request(q); + rq = blk_fetch_request(q); if (!rq) break; - blkdev_dequeue_request(rq); - crq = rq->special; assert(crq != NULL); assert(crq->rq == rq); @@ -846,7 +844,7 @@ static void carm_rq_fn(struct request_queue *q) queue_one_request: VPRINTK("get req\n"); - rq = elv_next_request(q); + rq = blk_peek_request(q); if (!rq) return; @@ -857,7 +855,7 @@ queue_one_request: } crq->rq = rq; - blkdev_dequeue_request(rq); + blk_start_request(rq); if (rq_data_dir(rq) == WRITE) { writing = 1; diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 40d03cf63f2..178f459a50e 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -627,7 +627,7 @@ static void ub_request_fn(struct request_queue *q) struct ub_lun *lun = q->queuedata; struct request *rq; - while ((rq = elv_next_request(q)) != NULL) { + while ((rq = blk_peek_request(q)) != NULL) { if (ub_request_fn_1(lun, rq) != 0) { blk_stop_queue(q); break; @@ -643,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) int n_elem; if (atomic_read(&sc->poison)) { - blkdev_dequeue_request(rq); + blk_start_request(rq); ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq)); return 0; } if (lun->changed && !blk_pc_request(rq)) { - blkdev_dequeue_request(rq); + blk_start_request(rq); ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq)); return 0; } @@ -660,7 +660,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) return -1; memset(cmd, 0, sizeof(struct ub_scsi_cmd)); - blkdev_dequeue_request(rq); + blk_start_request(rq); urq = &lun->urq; memset(urq, 0, sizeof(struct ub_request)); diff --git a/drivers/block/viodasd.c b/drivers/block/viodasd.c index 2086cb12d3e..390d69bb7c4 100644 --- a/drivers/block/viodasd.c +++ b/drivers/block/viodasd.c @@ -361,11 +361,9 @@ static void do_viodasd_request(struct request_queue *q) * back later. */ while (num_req_outstanding < VIOMAXREQ) { - req = elv_next_request(q); + req = blk_fetch_request(q); if (req == NULL) return; - /* dequeue the current request from the queue */ - blkdev_dequeue_request(req); /* check that request contains a valid command */ if (!blk_fs_request(req)) { viodasd_end_request(req, -EIO, blk_rq_sectors(req)); diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 1980ab45635..29a9daf4862 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -128,7 +128,7 @@ static void do_virtblk_request(struct request_queue *q) struct request *req; unsigned int issued = 0; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_peek_request(q)) != NULL) { vblk = req->rq_disk->private_data; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); @@ -138,7 +138,7 @@ static void do_virtblk_request(struct request_queue *q) blk_stop_queue(q); break; } - blkdev_dequeue_request(req); + blk_start_request(req); issued++; } diff --git a/drivers/block/xd.c b/drivers/block/xd.c index d4c4352354b..ce242921992 100644 --- a/drivers/block/xd.c +++ b/drivers/block/xd.c @@ -305,10 +305,7 @@ static void do_xd_request (struct request_queue * q) if (xdc_busy) return; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { unsigned block = blk_rq_pos(req); unsigned count = blk_rq_cur_sectors(req); @@ -325,11 +322,8 @@ static void do_xd_request (struct request_queue * q) block, count); done: /* wrap up, 0 = success, -errno = fail */ - if (!__blk_end_request_cur(req, res)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, res)) + req = blk_fetch_request(q); } } diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 66f834571b8..6d4ac76c280 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -299,13 +299,13 @@ static void do_blkif_request(struct request_queue *rq) queued = 0; - while ((req = elv_next_request(rq)) != NULL) { + while ((req = blk_peek_request(rq)) != NULL) { info = req->rq_disk->private_data; if (RING_FULL(&info->ring)) goto wait; - blkdev_dequeue_request(req); + blk_start_request(req); if (!blk_fs_request(req)) { __blk_end_request_all(req, -EIO); diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index edf137b6c37..3a4397edab7 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -463,10 +463,10 @@ struct request *ace_get_next_request(struct request_queue * q) { struct request *req; - while ((req = elv_next_request(q)) != NULL) { + while ((req = blk_peek_request(q)) != NULL) { if (blk_fs_request(req)) break; - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); } return req; @@ -498,10 +498,8 @@ static void ace_fsm_dostate(struct ace_device *ace) __blk_end_request_all(ace->req, -EIO); ace->req = NULL; } - while ((req = elv_next_request(ace->queue)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(ace->queue)) != NULL) __blk_end_request_all(req, -EIO); - } /* Drop back to IDLE state and notify waiters */ ace->fsm_state = ACE_FSM_STATE_IDLE; @@ -649,7 +647,7 @@ static void ace_fsm_dostate(struct ace_device *ace) ace->fsm_state = ACE_FSM_STATE_IDLE; break; } - blkdev_dequeue_request(req); + blk_start_request(req); /* Okay, it's a data request, set it up for transfer */ dev_dbg(ace->dev, diff --git a/drivers/block/z2ram.c b/drivers/block/z2ram.c index c909c1a3f65..4575171e5be 100644 --- a/drivers/block/z2ram.c +++ b/drivers/block/z2ram.c @@ -71,10 +71,7 @@ static void do_z2_request(struct request_queue *q) { struct request *req; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { unsigned long start = blk_rq_pos(req) << 9; unsigned long len = blk_rq_cur_bytes(req); @@ -100,11 +97,8 @@ static void do_z2_request(struct request_queue *q) len -= size; } done: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 3cc02bfe828..1e366ad8f68 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -642,9 +642,7 @@ static void gdrom_request(struct request_queue *rq) { struct request *req; - while ((req = elv_next_request(rq)) != NULL) { - blkdev_dequeue_request(req); - + while ((req = blk_fetch_request(rq)) != NULL) { if (!blk_fs_request(req)) { printk(KERN_DEBUG "GDROM: Non-fs request ignored\n"); __blk_end_request_all(req, -EIO); diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index bbe9f086734..ca741c21e4a 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -297,9 +297,7 @@ static void do_viocd_request(struct request_queue *q) { struct request *req; - while ((rwreq == 0) && ((req = elv_next_request(q)) != NULL)) { - blkdev_dequeue_request(req); - + while ((rwreq == 0) && ((req = blk_fetch_request(q)) != NULL)) { if (!blk_fs_request(req)) __blk_end_request_all(req, -EIO); else if (send_request(req) < 0) { diff --git a/drivers/ide/ide-atapi.c b/drivers/ide/ide-atapi.c index 2874c3d703a..8a894fa37b5 100644 --- a/drivers/ide/ide-atapi.c +++ b/drivers/ide/ide-atapi.c @@ -269,7 +269,7 @@ void ide_retry_pc(ide_drive_t *drive) blk_requeue_request(failed_rq->q, failed_rq); drive->hwif->rq = NULL; if (ide_queue_sense_rq(drive, pc)) { - blkdev_dequeue_request(failed_rq); + blk_start_request(failed_rq); ide_complete_rq(drive, -EIO, blk_rq_bytes(failed_rq)); } } diff --git a/drivers/ide/ide-io.c b/drivers/ide/ide-io.c index abda7337b3f..e4e3a0e3201 100644 --- a/drivers/ide/ide-io.c +++ b/drivers/ide/ide-io.c @@ -519,11 +519,8 @@ repeat: * we know that the queue isn't empty, but this can happen * if the q->prep_rq_fn() decides to kill a request */ - if (!rq) { - rq = elv_next_request(drive->queue); - if (rq) - blkdev_dequeue_request(rq); - } + if (!rq) + rq = blk_fetch_request(drive->queue); spin_unlock_irq(q->queue_lock); spin_lock_irq(&hwif->lock); @@ -536,7 +533,7 @@ repeat: /* * Sanity: don't accept a request that isn't a PM request * if we are currently power managed. This is very important as - * blk_stop_queue() doesn't prevent the elv_next_request() + * blk_stop_queue() doesn't prevent the blk_fetch_request() * above to return us whatever is in the queue. Since we call * ide_do_request() ourselves, we end up taking requests while * the queue is blocked... diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index 58f5be8cd69..c0bebc6a2f2 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -704,13 +704,12 @@ try_again: return 0; } - dev_dbg(&card->dev, "elv_next\n"); - msb->block_req = elv_next_request(msb->queue); + dev_dbg(&card->dev, "blk_fetch\n"); + msb->block_req = blk_fetch_request(msb->queue); if (!msb->block_req) { dev_dbg(&card->dev, "issue end\n"); return -EAGAIN; } - blkdev_dequeue_request(msb->block_req); dev_dbg(&card->dev, "trying again\n"); chunk = 1; @@ -825,10 +824,8 @@ static void mspro_block_submit_req(struct request_queue *q) return; if (msb->eject) { - while ((req = elv_next_request(q)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(q)) != NULL) __blk_end_request_all(req, -ENODEV); - } return; } diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 8b5cbfc3ba9..6573ef4408f 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -877,7 +877,7 @@ static void i2o_block_request_fn(struct request_queue *q) struct request *req; while (!blk_queue_plugged(q)) { - req = elv_next_request(q); + req = blk_peek_request(q); if (!req) break; @@ -890,7 +890,7 @@ static void i2o_block_request_fn(struct request_queue *q) if (queue_depth < I2O_BLOCK_MAX_OPEN_REQUESTS) { if (!i2o_block_transfer(req)) { - blkdev_dequeue_request(req); + blk_start_request(req); continue; } else osm_info("transfer error\n"); @@ -917,7 +917,7 @@ static void i2o_block_request_fn(struct request_queue *q) break; } } else { - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); } } diff --git a/drivers/mmc/card/queue.c b/drivers/mmc/card/queue.c index 4b70f1e2834..49e582356c6 100644 --- a/drivers/mmc/card/queue.c +++ b/drivers/mmc/card/queue.c @@ -54,11 +54,8 @@ static int mmc_queue_thread(void *d) spin_lock_irq(q->queue_lock); set_current_state(TASK_INTERRUPTIBLE); - if (!blk_queue_plugged(q)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!blk_queue_plugged(q)) + req = blk_fetch_request(q); mq->req = req; spin_unlock_irq(q->queue_lock); @@ -94,10 +91,8 @@ static void mmc_request(struct request_queue *q) if (!mq) { printk(KERN_ERR "MMC: killing requests for dead queue\n"); - while ((req = elv_next_request(q)) != NULL) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(q)) != NULL) __blk_end_request_all(req, -EIO); - } return; } diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 3e10442615d..502622f628b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -100,12 +100,7 @@ static int mtd_blktrans_thread(void *arg) struct mtd_blktrans_dev *dev; int res; - if (!req) { - req = elv_next_request(rq); - if (req) - blkdev_dequeue_request(req); - } - if (!req) { + if (!req && !(req = blk_fetch_request(rq))) { set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(rq->queue_lock); schedule(); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index 7df03c7aea0..e64f62d5e0f 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1656,17 +1656,13 @@ static void __dasd_process_request_queue(struct dasd_block *block) if (basedev->state < DASD_STATE_READY) return; /* Now we try to fetch requests from the request queue */ - while (!blk_queue_plugged(queue) && - elv_next_request(queue)) { - - req = elv_next_request(queue); - + while (!blk_queue_plugged(queue) && (req = blk_peek_request(queue))) { if (basedev->features & DASD_FEATURE_READONLY && rq_data_dir(req) == WRITE) { DBF_DEV_EVENT(DBF_ERR, basedev, "Rejecting write request %p", req); - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); continue; } @@ -1695,7 +1691,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) "CCW creation failed (rc=%ld) " "on request %p", PTR_ERR(cqr), req); - blkdev_dequeue_request(req); + blk_start_request(req); __blk_end_request_all(req, -EIO); continue; } @@ -1705,7 +1701,7 @@ static void __dasd_process_request_queue(struct dasd_block *block) */ cqr->callback_data = (void *) req; cqr->status = DASD_CQR_FILLED; - blkdev_dequeue_request(req); + blk_start_request(req); list_add_tail(&cqr->blocklist, &block->ccw_queue); dasd_profile_start(block, cqr, req); } @@ -2029,10 +2025,8 @@ static void dasd_flush_request_queue(struct dasd_block *block) return; spin_lock_irq(&block->request_queue_lock); - while ((req = elv_next_request(block->request_queue))) { - blkdev_dequeue_request(req); + while ((req = blk_fetch_request(block->request_queue))) __blk_end_request_all(req, -EIO); - } spin_unlock_irq(&block->request_queue_lock); } diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 5d035e4939d..1e796767598 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -93,7 +93,7 @@ __tapeblock_end_request(struct tape_request *ccw_req, void *data) device->blk_data.block_position = -1; device->discipline->free_bread(ccw_req); if (!list_empty(&device->req_queue) || - elv_next_request(device->blk_data.request_queue)) + blk_peek_request(device->blk_data.request_queue)) tapeblock_trigger_requeue(device); } @@ -162,19 +162,16 @@ tapeblock_requeue(struct work_struct *work) { spin_lock_irq(&device->blk_data.request_queue_lock); while ( !blk_queue_plugged(queue) && - elv_next_request(queue) && + (req = blk_fetch_request(queue)) && nr_queued < TAPEBLOCK_MIN_REQUEUE ) { - req = elv_next_request(queue); if (rq_data_dir(req) == WRITE) { DBF_EVENT(1, "TBLOCK: Rejecting write request\n"); - blkdev_dequeue_request(req); spin_unlock_irq(&device->blk_data.request_queue_lock); blk_end_request_all(req, -EIO); spin_lock_irq(&device->blk_data.request_queue_lock); continue; } - blkdev_dequeue_request(req); nr_queued++; spin_unlock_irq(&device->blk_data.request_queue_lock); rc = tapeblock_start_request(device, req); diff --git a/drivers/sbus/char/jsflash.c b/drivers/sbus/char/jsflash.c index f572a4a1d14..6d465168468 100644 --- a/drivers/sbus/char/jsflash.c +++ b/drivers/sbus/char/jsflash.c @@ -186,10 +186,7 @@ static void jsfd_do_request(struct request_queue *q) { struct request *req; - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - + req = blk_fetch_request(q); while (req) { struct jsfd_part *jdp = req->rq_disk->private_data; unsigned long offset = blk_rq_pos(req) << 9; @@ -212,11 +209,8 @@ static void jsfd_do_request(struct request_queue *q) jsfd_read(req->buffer, jdp->dbase + offset, len); err = 0; end: - if (!__blk_end_request_cur(req, err)) { - req = elv_next_request(q); - if (req) - blkdev_dequeue_request(req); - } + if (!__blk_end_request_cur(req, err)) + req = blk_fetch_request(q); } } diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index ee308f6f798..b12750f8216 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -1207,7 +1207,7 @@ int scsi_prep_return(struct request_queue *q, struct request *req, int ret) break; case BLKPREP_DEFER: /* - * If we defer, the elv_next_request() returns NULL, but the + * If we defer, the blk_peek_request() returns NULL, but the * queue must be restarted, so we plug here if no returning * command will automatically do that. */ @@ -1385,7 +1385,7 @@ static void scsi_kill_request(struct request *req, struct request_queue *q) struct scsi_target *starget = scsi_target(sdev); struct Scsi_Host *shost = sdev->host; - blkdev_dequeue_request(req); + blk_start_request(req); if (unlikely(cmd == NULL)) { printk(KERN_CRIT "impossible request in %s.\n", @@ -1477,7 +1477,7 @@ static void scsi_request_fn(struct request_queue *q) if (!sdev) { printk("scsi: killing requests for dead queue\n"); - while ((req = elv_next_request(q)) != NULL) + while ((req = blk_peek_request(q)) != NULL) scsi_kill_request(req, q); return; } @@ -1498,7 +1498,7 @@ static void scsi_request_fn(struct request_queue *q) * that the request is fully prepared even if we cannot * accept it. */ - req = elv_next_request(q); + req = blk_peek_request(q); if (!req || !scsi_dev_queue_ready(q, sdev)) break; @@ -1514,7 +1514,7 @@ static void scsi_request_fn(struct request_queue *q) * Remove the request from the request list. */ if (!(blk_queue_tagged(q) && !blk_queue_start_tag(q, req))) - blkdev_dequeue_request(req); + blk_start_request(req); sdev->device_busy++; spin_unlock(q->queue_lock); diff --git a/drivers/scsi/scsi_transport_sas.c b/drivers/scsi/scsi_transport_sas.c index 50988cbf7b2..d606452297c 100644 --- a/drivers/scsi/scsi_transport_sas.c +++ b/drivers/scsi/scsi_transport_sas.c @@ -163,12 +163,10 @@ static void sas_smp_request(struct request_queue *q, struct Scsi_Host *shost, int (*handler)(struct Scsi_Host *, struct sas_rphy *, struct request *); while (!blk_queue_plugged(q)) { - req = elv_next_request(q); + req = blk_fetch_request(q); if (!req) break; - blkdev_dequeue_request(req); - spin_unlock_irq(q->queue_lock); handler = to_sas_internal(shost->transportt)->f->smp_handler; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index c7558034570..6e59d3b92ff 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -818,8 +818,6 @@ static inline void blk_run_address_space(struct address_space *mapping) blk_run_backing_dev(mapping->backing_dev_info, NULL); } -extern void blkdev_dequeue_request(struct request *req); - /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request @@ -852,6 +850,13 @@ static inline unsigned int blk_rq_cur_sectors(const struct request *rq) return blk_rq_cur_bytes(rq) >> 9; } +/* + * Request issue related functions. + */ +extern struct request *blk_peek_request(struct request_queue *q); +extern void blk_start_request(struct request *rq); +extern struct request *blk_fetch_request(struct request_queue *q); + /* * Request completion related functions. * diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 4e462878c9c..1cb3372e65d 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -103,10 +103,8 @@ extern int elv_merge(struct request_queue *, struct request **, struct bio *); extern void elv_merge_requests(struct request_queue *, struct request *, struct request *); extern void elv_merged_request(struct request_queue *, struct request *, int); -extern void elv_dequeue_request(struct request_queue *, struct request *); extern void elv_requeue_request(struct request_queue *, struct request *); extern int elv_queue_empty(struct request_queue *); -extern struct request *elv_next_request(struct request_queue *q); extern struct request *elv_former_request(struct request_queue *, struct request *); extern struct request *elv_latter_request(struct request_queue *, struct request *); extern int elv_register_queue(struct request_queue *q); From 7bce6c2740fab36708233e998a9e53115649b193 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Mon, 11 May 2009 06:51:28 +0000 Subject: [PATCH 0955/2061] sh: sh7785lcr: fix I2C device address map for 32-bit mode This fixes up the broken I2C offset in 32-bit mode. The cause is because the board datasheet had a mistake. Signed-off-by: Yoshihiro Shimoda Signed-off-by: Paul Mundt --- arch/sh/include/mach-common/mach/sh7785lcr.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/sh/include/mach-common/mach/sh7785lcr.h b/arch/sh/include/mach-common/mach/sh7785lcr.h index 1ce27d5c749..90011d435f3 100644 --- a/arch/sh/include/mach-common/mach/sh7785lcr.h +++ b/arch/sh/include/mach-common/mach/sh7785lcr.h @@ -9,11 +9,11 @@ * -----------------------------+---------------+--------------- * 0x00000000 - 0x03ffffff(CS0) | NOR Flash | NOR Flash * 0x04000000 - 0x05ffffff(CS1) | PLD | PLD - * 0x06000000 - 0x07ffffff(CS1) | reserved | I2C + * 0x06000000 - 0x07ffffff(CS1) | I2C | I2C * 0x08000000 - 0x0bffffff(CS2) | USB | DDR SDRAM * 0x0c000000 - 0x0fffffff(CS3) | SD | DDR SDRAM * 0x10000000 - 0x13ffffff(CS4) | SM107 | SM107 - * 0x14000000 - 0x17ffffff(CS5) | I2C | USB + * 0x14000000 - 0x17ffffff(CS5) | reserved | USB * 0x18000000 - 0x1bffffff(CS6) | reserved | SD * 0x40000000 - 0x5fffffff | DDR SDRAM | (cannot use) * @@ -32,6 +32,9 @@ #define PLD_VERSR (PLD_BASE_ADDR + 0x0c) #define PLD_MMSR (PLD_BASE_ADDR + 0x0e) +#define PCA9564_ADDR 0x06000000 /* I2C */ +#define PCA9564_SIZE 0x00000100 + #define SM107_MEM_ADDR 0x10000000 #define SM107_MEM_SIZE 0x00e00000 #define SM107_REG_ADDR 0x13e00000 @@ -40,16 +43,13 @@ #if defined(CONFIG_SH_SH7785LCR_29BIT_PHYSMAPS) #define R8A66597_ADDR 0x14000000 /* USB */ #define CG200_ADDR 0x18000000 /* SD */ -#define PCA9564_ADDR 0x06000000 /* I2C */ #else #define R8A66597_ADDR 0x08000000 #define CG200_ADDR 0x0c000000 -#define PCA9564_ADDR 0x14000000 #endif #define R8A66597_SIZE 0x00000100 #define CG200_SIZE 0x00010000 -#define PCA9564_SIZE 0x00000100 #endif /* __ASM_SH_RENESAS_SH7785LCR_H */ From 1822952ba2b9f22f79019d07ebbeca31dc14b718 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 11 May 2009 17:56:07 +0900 Subject: [PATCH 0956/2061] block: let blk_end_request_all handle bidi requests blk_end_request_all() and __blk_end_request_all() should finish all bytes including bidi, by definition. That's what all bidi users need , bidi requests must be complete as a whole (partial completion is impossible). Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 6e59d3b92ff..1069f4483c6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -910,8 +910,12 @@ static inline bool blk_end_request(struct request *rq, int error, static inline void blk_end_request_all(struct request *rq, int error) { bool pending; + unsigned int bidi_bytes = 0; - pending = blk_end_request(rq, error, blk_rq_bytes(rq)); + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } @@ -962,8 +966,12 @@ static inline bool __blk_end_request(struct request *rq, int error, static inline void __blk_end_request_all(struct request *rq, int error) { bool pending; + unsigned int bidi_bytes = 0; - pending = __blk_end_request(rq, error, blk_rq_bytes(rq)); + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); BUG_ON(pending); } From e6bb7a96c2c36f20c05ef648f15bd3c2b1834c78 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 11 May 2009 17:56:08 +0900 Subject: [PATCH 0957/2061] scsi: simplify the bidi completion Let's use blk_end_request_all() instead of blk_end_bidi_request(). Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 43 +++++++++++++---------------------------- 1 file changed, 13 insertions(+), 30 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index b12750f8216..a54bec99438 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -672,33 +672,6 @@ void scsi_release_buffers(struct scsi_cmnd *cmd) } EXPORT_SYMBOL(scsi_release_buffers); -/* - * Bidi commands Must be complete as a whole, both sides at once. If - * part of the bytes were written and lld returned scsi_in()->resid - * and/or scsi_out()->resid this information will be left in - * req->resid_len and req->next_rq->resid_len. The upper-layer driver - * can decide what to do with this information. - */ -static void scsi_end_bidi_request(struct scsi_cmnd *cmd) -{ - struct request *req = cmd->request; - - req->resid_len = scsi_out(cmd)->resid; - req->next_rq->resid_len = scsi_in(cmd)->resid; - - /* The req and req->next_rq have not been completed */ - BUG_ON(blk_end_bidi_request(req, 0, blk_rq_bytes(req), - blk_rq_bytes(req->next_rq))); - - scsi_release_buffers(cmd); - - /* - * This will goose the queue request function at the end, so we don't - * need to worry about launching another command. - */ - scsi_next_command(cmd); -} - /* * Function: scsi_io_completion() * @@ -772,12 +745,22 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) if (!sense_deferred) error = -EIO; } + + req->resid_len = scsi_get_resid(cmd); + if (scsi_bidi_cmnd(cmd)) { - /* will also release_buffers */ - scsi_end_bidi_request(cmd); + /* + * Bidi commands Must be complete as a whole, + * both sides at once. + */ + req->next_rq->resid_len = scsi_in(cmd)->resid; + + blk_end_request_all(req, 0); + + scsi_release_buffers(cmd); + scsi_next_command(cmd); return; } - req->resid_len = scsi_get_resid(cmd); } BUG_ON(blk_bidi_rq(req)); /* bidi not support for !blk_pc_request yet */ From b1f744937f1be3e6d3009382a755679133cf782d Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Mon, 11 May 2009 17:56:09 +0900 Subject: [PATCH 0958/2061] block: move completion related functions back to blk-core.c Let's put the completion related functions back to block/blk-core.c where they have lived. We can also unexport blk_end_bidi_request() and __blk_end_bidi_request(), which nobody uses. Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- block/blk-core.c | 128 +++++++++++++++++++++++++++++++++++++++-- include/linux/blkdev.h | 128 +++-------------------------------------- 2 files changed, 130 insertions(+), 126 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 93691d2ac5a..a2d97de1a12 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -2026,8 +2026,8 @@ static void blk_finish_request(struct request *req, int error) * %false - we are done with this request * %true - still buffers pending for this request **/ -bool blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { struct request_queue *q = rq->q; unsigned long flags; @@ -2041,7 +2041,6 @@ bool blk_end_bidi_request(struct request *rq, int error, return false; } -EXPORT_SYMBOL_GPL(blk_end_bidi_request); /** * __blk_end_bidi_request - Complete a bidi request with queue lock held @@ -2058,8 +2057,8 @@ EXPORT_SYMBOL_GPL(blk_end_bidi_request); * %false - we are done with this request * %true - still buffers pending for this request **/ -bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, unsigned int bidi_bytes) +static bool __blk_end_bidi_request(struct request *rq, int error, + unsigned int nr_bytes, unsigned int bidi_bytes) { if (blk_update_bidi_request(rq, error, nr_bytes, bidi_bytes)) return true; @@ -2068,7 +2067,124 @@ bool __blk_end_bidi_request(struct request *rq, int error, return false; } -EXPORT_SYMBOL_GPL(__blk_end_bidi_request); + +/** + * blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Ends I/O on a number of bytes attached to @rq. + * If @rq has leftover, sets it up for the next range of segments. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +bool blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + return blk_end_bidi_request(rq, error, nr_bytes, 0); +} +EXPORT_SYMBOL_GPL(blk_end_request); + +/** + * blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. + */ +void blk_end_request_all(struct request *rq, int error) +{ + bool pending; + unsigned int bidi_bytes = 0; + + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + BUG_ON(pending); +} +EXPORT_SYMBOL_GPL(blk_end_request_all); + +/** + * blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool blk_end_request_cur(struct request *rq, int error) +{ + return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); +} +EXPORT_SYMBOL_GPL(blk_end_request_cur); + +/** + * __blk_end_request - Helper function for drivers to complete the request. + * @rq: the request being processed + * @error: %0 for success, < %0 for error + * @nr_bytes: number of bytes to complete + * + * Description: + * Must be called with queue lock held unlike blk_end_request(). + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + **/ +bool __blk_end_request(struct request *rq, int error, unsigned int nr_bytes) +{ + return __blk_end_bidi_request(rq, error, nr_bytes, 0); +} +EXPORT_SYMBOL_GPL(__blk_end_request); + +/** + * __blk_end_request_all - Helper function for drives to finish the request. + * @rq: the request to finish + * @err: %0 for success, < %0 for error + * + * Description: + * Completely finish @rq. Must be called with queue lock held. + */ +void __blk_end_request_all(struct request *rq, int error) +{ + bool pending; + unsigned int bidi_bytes = 0; + + if (unlikely(blk_bidi_rq(rq))) + bidi_bytes = blk_rq_bytes(rq->next_rq); + + pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); + BUG_ON(pending); +} +EXPORT_SYMBOL_GPL(__blk_end_request_all); + +/** + * __blk_end_request_cur - Helper function to finish the current request chunk. + * @rq: the request to finish the current chunk for + * @err: %0 for success, < %0 for error + * + * Description: + * Complete the current consecutively mapped chunk from @rq. Must + * be called with queue lock held. + * + * Return: + * %false - we are done with this request + * %true - still buffers pending for this request + */ +bool __blk_end_request_cur(struct request *rq, int error) +{ + return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); +} +EXPORT_SYMBOL_GPL(__blk_end_request_cur); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1069f4483c6..f9d60a78c08 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -872,126 +872,14 @@ extern struct request *blk_fetch_request(struct request_queue *q); */ extern bool blk_update_request(struct request *rq, int error, unsigned int nr_bytes); -extern bool blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, - unsigned int bidi_bytes); -extern bool __blk_end_bidi_request(struct request *rq, int error, - unsigned int nr_bytes, - unsigned int bidi_bytes); - -/** - * blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Ends I/O on a number of bytes attached to @rq. - * If @rq has leftover, sets it up for the next range of segments. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static inline bool blk_end_request(struct request *rq, int error, - unsigned int nr_bytes) -{ - return blk_end_bidi_request(rq, error, nr_bytes, 0); -} - -/** - * blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @err: %0 for success, < %0 for error - * - * Description: - * Completely finish @rq. - */ -static inline void blk_end_request_all(struct request *rq, int error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} - -/** - * blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @err: %0 for success, < %0 for error - * - * Description: - * Complete the current consecutively mapped chunk from @rq. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -static inline bool blk_end_request_cur(struct request *rq, int error) -{ - return blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} - -/** - * __blk_end_request - Helper function for drivers to complete the request. - * @rq: the request being processed - * @error: %0 for success, < %0 for error - * @nr_bytes: number of bytes to complete - * - * Description: - * Must be called with queue lock held unlike blk_end_request(). - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - **/ -static inline bool __blk_end_request(struct request *rq, int error, - unsigned int nr_bytes) -{ - return __blk_end_bidi_request(rq, error, nr_bytes, 0); -} - -/** - * __blk_end_request_all - Helper function for drives to finish the request. - * @rq: the request to finish - * @err: %0 for success, < %0 for error - * - * Description: - * Completely finish @rq. Must be called with queue lock held. - */ -static inline void __blk_end_request_all(struct request *rq, int error) -{ - bool pending; - unsigned int bidi_bytes = 0; - - if (unlikely(blk_bidi_rq(rq))) - bidi_bytes = blk_rq_bytes(rq->next_rq); - - pending = __blk_end_bidi_request(rq, error, blk_rq_bytes(rq), bidi_bytes); - BUG_ON(pending); -} - -/** - * __blk_end_request_cur - Helper function to finish the current request chunk. - * @rq: the request to finish the current chunk for - * @err: %0 for success, < %0 for error - * - * Description: - * Complete the current consecutively mapped chunk from @rq. Must - * be called with queue lock held. - * - * Return: - * %false - we are done with this request - * %true - still buffers pending for this request - */ -static inline bool __blk_end_request_cur(struct request *rq, int error) -{ - return __blk_end_request(rq, error, blk_rq_cur_bytes(rq)); -} +extern bool blk_end_request(struct request *rq, int error, + unsigned int nr_bytes); +extern void blk_end_request_all(struct request *rq, int error); +extern bool blk_end_request_cur(struct request *rq, int error); +extern bool __blk_end_request(struct request *rq, int error, + unsigned int nr_bytes); +extern void __blk_end_request_all(struct request *rq, int error); +extern bool __blk_end_request_cur(struct request *rq, int error); extern void blk_complete_request(struct request *); extern void __blk_complete_request(struct request *); From 80989ce0643c1034822f3e339ed8d790b649abe1 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Sat, 9 May 2009 23:47:42 -0700 Subject: [PATCH 0959/2061] x86: clean up and and print out initial max_pfn_mapped Do this so we can check the range that is mapped before init_memory_mapping(). To be able to print out meaningful info, we first have to fix 64-bit to have max_pfn_mapped assigned before that call. This also unifies the code-path a bit. [ Impact: print more debug info, cleanup ] Signed-off-by: Yinghai Lu LKML-Reference: <49BF0978.40605@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup.c | 4 ++++ arch/x86/mm/init.c | 7 +++---- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index 0d77e56e821..4031d6cb3ff 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -862,12 +862,16 @@ void __init setup_arch(char **cmdline_p) max_low_pfn = max_pfn; high_memory = (void *)__va(max_pfn * PAGE_SIZE - 1) + 1; + max_pfn_mapped = KERNEL_IMAGE_SIZE >> PAGE_SHIFT; #endif #ifdef CONFIG_X86_CHECK_BIOS_CORRUPTION setup_bios_corruption_check(); #endif + printk(KERN_DEBUG "initial memory mapped : 0 - %08lx\n", + max_pfn_mapped< Date: Fri, 8 May 2009 00:36:44 -0700 Subject: [PATCH 0960/2061] x86: Sanity check the e820 against the SRAT table using e820 map only node_cover_memory() sanity checks the SRAT table by ensuring that all PXMs cover the memory reported in the e820. However, when calculating the size of the holes in the e820, it uses the early_node_map[] which contains information taken from both SRAT and e820. If the SRAT is missing an entry, then it is not detected that the SRAT table is incorrect and missing entries. This patch uses the e820 map to calculate the holes instead of early_node_map[]. comment is from Mel. [ Impact: reject incorrect SRAT tables ] Signed-off-by: Yinghai Lu Acked-by: Mel Gorman Cc: Andrew Morton LKML-Reference: <4A03E10C.60906@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index 01765955baa..c7a18aaccf8 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -345,7 +345,7 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) pxmram = 0; } - e820ram = max_pfn - absent_pages_in_range(0, max_pfn); + e820ram = max_pfn - (e820_hole_size(0, max_pfn<>PAGE_SHIFT); /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ if ((long)(e820ram - pxmram) >= 1*1024*1024) { printk(KERN_ERR From 0964b0562bb9c93194e852b47bab2397b9e11c18 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 8 May 2009 00:37:34 -0700 Subject: [PATCH 0961/2061] x86: Allow 1MB of slack between the e820 map and SRAT, not 4GB It is expected that there might be slight differences between the e820 map and the SRAT table and the intention was that 1MB of slack be allowed. The calculation comparing e820ram and pxmram assumes the units are bytes, when they are in fact pages. This means 4GB of slack is being allowed, not 1MB. This patch makes the correct comparison. comment is from Mel. [ Impact: don't accept buggy SRATs that could dump up to 4G of RAM ] Signed-off-by: Yinghai Lu Acked-by: Mel Gorman Cc: Andrew Morton LKML-Reference: <4A03E13E.6050107@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/mm/srat_64.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index c7a18aaccf8..87b45bff250 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -346,8 +346,8 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) } e820ram = max_pfn - (e820_hole_size(0, max_pfn<>PAGE_SHIFT); - /* We seem to lose 3 pages somewhere. Allow a bit of slack. */ - if ((long)(e820ram - pxmram) >= 1*1024*1024) { + /* We seem to lose 3 pages somewhere. Allow 1M of slack. */ + if ((long)(e820ram - pxmram) >= (1<<(20 - PAGE_SHIFT))) { printk(KERN_ERR "SRAT: PXMs only cover %luMB of your %luMB e820 RAM. Not used.\n", (pxmram << PAGE_SHIFT) >> 20, From 53c0054c3f11b49fc09f24e46f58661def952728 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 08:45:27 +0000 Subject: [PATCH 0962/2061] sh: include empty_zero_page in text Include empty_zero_page in _text. This fixes a problem introduced by c3e2586b794b12ffcdf69b4e547030b51e18e6d9 which results in broken boot on R2D-Plus. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/vmlinux.lds.S | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/vmlinux.lds.S b/arch/sh/kernel/vmlinux.lds.S index d73723080fd..f53c76acaed 100644 --- a/arch/sh/kernel/vmlinux.lds.S +++ b/arch/sh/kernel/vmlinux.lds.S @@ -26,12 +26,13 @@ SECTIONS . = CONFIG_PAGE_OFFSET + CONFIG_MEMORY_START + CONFIG_ZERO_PAGE_OFFSET; #endif + _text = .; /* Text and read-only data */ + .empty_zero_page : AT(ADDR(.empty_zero_page) - LOAD_OFFSET) { *(.empty_zero_page) } = 0 .text : AT(ADDR(.text) - LOAD_OFFSET) { - _text = .; /* Text and read-only data */ HEAD_TEXT TEXT_TEXT From 03f408f1aad8d0f5eb6380732bffc0f72b250fa7 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 08:54:54 +0000 Subject: [PATCH 0963/2061] sh: TMU platform data for sh775x This patch adds TMU platform data for sh775x. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/setup-sh7750.c | 187 ++++++++++++++++++++++++++ 1 file changed, 187 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c index a1c80d909cd..09da0c187d4 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c @@ -12,6 +12,7 @@ #include #include #include +#include #include static struct resource rtc_resources[] = { @@ -60,9 +61,177 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +/* SH7750R, SH7751 and SH7751R all have two extra timer channels */ +#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751R) + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xfe100008, + .end = 0xfe100013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 72, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xfe100014, + .end = 0xfe10001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 76, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +#endif + static struct platform_device *sh7750_devices[] __initdata = { &rtc_device, &sci_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, +#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751R) + &tmu3_device, + &tmu4_device, +#endif }; static int __init sh7750_devices_setup(void) @@ -72,6 +241,24 @@ static int __init sh7750_devices_setup(void) } __initcall(sh7750_devices_setup); +static struct platform_device *sh7750_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +#if defined(CONFIG_CPU_SUBTYPE_SH7750R) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751) || \ + defined(CONFIG_CPU_SUBTYPE_SH7751R) + &tmu3_device, + &tmu4_device, +#endif +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7750_early_devices, + ARRAY_SIZE(sh7750_early_devices)); +} + enum { UNUSED = 0, From c42f32dca3855d8f867387ec6993d9b62516a00e Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 09:06:24 +0000 Subject: [PATCH 0964/2061] sh: TMU platform data for sh7760 This patch adds TMU platform data for sh7760. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/setup-sh7760.c | 109 ++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index d9bdc931ac0..4a6fd0d3c7b 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -168,8 +169,104 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + + static struct platform_device *sh7760_devices[] __initdata = { &sci_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, }; static int __init sh7760_devices_setup(void) @@ -179,6 +276,18 @@ static int __init sh7760_devices_setup(void) } __initcall(sh7760_devices_setup); +static struct platform_device *sh7760_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7760_early_devices, + ARRAY_SIZE(sh7760_early_devices)); +} + #define INTC_ICR 0xffd00000UL #define INTC_ICR_IRLM (1 << 7) From 3551f88f6439cf4da3f5a3747b320280e30500de Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Thu, 7 May 2009 15:35:41 +0300 Subject: [PATCH 0965/2061] x86: unify 64-bit UMA and NUMA paging_init() 64-bit UMA and NUMA versions of paging_init() are almost identical. Therefore, merge the copy in mm/numa_64.c to mm/init_64.c to remove duplicate code. [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1241699741.17846.30.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 6 +++++- arch/x86/mm/numa_64.c | 15 --------------- 2 files changed, 5 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index 6a1a573e20f..be7e1279178 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -585,6 +585,7 @@ void __init initmem_init(unsigned long start_pfn, unsigned long end_pfn) early_res_to_bootmem(0, end_pfn< Date: Thu, 7 May 2009 15:35:42 +0300 Subject: [PATCH 0966/2061] x86: use sparse_memory_present_with_active_regions() on UMA There's no need to use call memory_present() manually on UMA because initmem_init() sets up early_node_map by calling e820_register_active_regions(). [ Impact: cleanup ] Signed-off-by: Pekka Enberg LKML-Reference: <1241699742.17846.31.camel@penberg-laptop> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_64.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c index be7e1279178..52bb9519bb8 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -596,11 +596,7 @@ void __init paging_init(void) max_zone_pfns[ZONE_DMA32] = MAX_DMA32_PFN; max_zone_pfns[ZONE_NORMAL] = max_pfn; -#ifdef CONFIG_NUMA sparse_memory_present_with_active_regions(MAX_NUMNODES); -#else - memory_present(0, 0, max_pfn); -#endif sparse_init(); free_area_init_nodes(max_zone_pfns); } From 8823392360dc4992f87bf4c623834d315f297493 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 10 May 2009 10:53:05 +0200 Subject: [PATCH 0967/2061] perf_counter, x86: clean up throttling printk s/PERFMON/perfcounters for perfcounter interrupt throttling warning. 'perfmon' is the CPU feature name that is Intel-only, while we do throttling in a generic way. [ Impact: cleanup ] Signed-off-by: Mike Galbraith Cc: Robert Richter Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index a6878b0798e..da27419923a 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -814,7 +814,7 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { if (printk_ratelimit()) - printk(KERN_WARNING "PERFMON: max interrupts exceeded!\n"); + printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); hw_perf_restore(cpuc->throttle_ctrl); } cpuc->interrupts = 0; From 6751b71ea2c7ab8c0d65f01973a3fc8ea16992f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 12:08:02 +1000 Subject: [PATCH 0968/2061] perf_counter: Put whole group on when enabling group leader Currently, if you have a group where the leader is disabled and there are siblings that are enabled, and then you enable the leader, we only put the leader on the PMU, and not its enabled siblings. This is incorrect, since the enabled group members should be all on or all off at any given point. This fixes it by adding a call to group_sched_in in __perf_counter_enable in the case where we're enabling a group leader. To avoid the need for a forward declaration this also moves group_sched_in up before __perf_counter_enable. The actual content of group_sched_in is unchanged by this patch. [ Impact: fix bug in counter enable code ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.34946.451546.691693@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 99 ++++++++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 48 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d850a1fb8d4..a5bdc93ac47 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -419,6 +419,54 @@ counter_sched_in(struct perf_counter *counter, return 0; } +static int +group_sched_in(struct perf_counter *group_counter, + struct perf_cpu_context *cpuctx, + struct perf_counter_context *ctx, + int cpu) +{ + struct perf_counter *counter, *partial_group; + int ret; + + if (group_counter->state == PERF_COUNTER_STATE_OFF) + return 0; + + ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); + if (ret) + return ret < 0 ? ret : 0; + + group_counter->prev_state = group_counter->state; + if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) + return -EAGAIN; + + /* + * Schedule in siblings as one group (if any): + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + counter->prev_state = counter->state; + if (counter_sched_in(counter, cpuctx, ctx, cpu)) { + partial_group = counter; + goto group_error; + } + } + + return 0; + +group_error: + /* + * Groups can be scheduled in as one unit only, so undo any + * partial group before returning: + */ + list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { + if (counter == partial_group) + break; + counter_sched_out(counter, cpuctx, ctx); + } + counter_sched_out(group_counter, cpuctx, ctx); + + return -EAGAIN; +} + /* * Return 1 for a group consisting entirely of software counters, * 0 if the group contains any hardware counters. @@ -643,6 +691,9 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) err = -EEXIST; + else if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); @@ -791,54 +842,6 @@ static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) __perf_counter_sched_out(&cpuctx->ctx, cpuctx); } -static int -group_sched_in(struct perf_counter *group_counter, - struct perf_cpu_context *cpuctx, - struct perf_counter_context *ctx, - int cpu) -{ - struct perf_counter *counter, *partial_group; - int ret; - - if (group_counter->state == PERF_COUNTER_STATE_OFF) - return 0; - - ret = hw_perf_group_sched_in(group_counter, cpuctx, ctx, cpu); - if (ret) - return ret < 0 ? ret : 0; - - group_counter->prev_state = group_counter->state; - if (counter_sched_in(group_counter, cpuctx, ctx, cpu)) - return -EAGAIN; - - /* - * Schedule in siblings as one group (if any): - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - counter->prev_state = counter->state; - if (counter_sched_in(counter, cpuctx, ctx, cpu)) { - partial_group = counter; - goto group_error; - } - } - - return 0; - -group_error: - /* - * Groups can be scheduled in as one unit only, so undo any - * partial group before returning: - */ - list_for_each_entry(counter, &group_counter->sibling_list, list_entry) { - if (counter == partial_group) - break; - counter_sched_out(counter, cpuctx, ctx); - } - counter_sched_out(group_counter, cpuctx, ctx); - - return -EAGAIN; -} - static void __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) From a08b159fc243dbfe415250466d24cfc5010deee5 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 15:46:10 +1000 Subject: [PATCH 0969/2061] perf_counter: don't count scheduler ticks as context switches The context-switch software counter gives inflated values at present because each scheduler tick and each process-wide counter enable/disable prctl gets counted as a context switch. This happens because perf_counter_task_tick, perf_counter_task_disable and perf_counter_task_enable all call perf_counter_task_sched_out, which calls perf_swcounter_event to record a context switch event. This fixes it by introducing a variant of perf_counter_task_sched_out with two underscores in front for internal use within the perf_counter code, and makes perf_counter_task_{tick,disable,enable} call it. This variant doesn't record a context switch event, and takes a struct perf_counter_context *. This adds the new variant rather than changing the behaviour or interface of perf_counter_task_sched_out because that is called from other code. [ Impact: fix inflated context-switch event counts ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.48034.485580.498953@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index a5bdc93ac47..7373b96bc36 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -837,6 +837,14 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) cpuctx->task_ctx = NULL; } +static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) +{ + struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + + __perf_counter_sched_out(ctx, cpuctx); + cpuctx->task_ctx = NULL; +} + static void perf_counter_cpu_sched_out(struct perf_cpu_context *cpuctx) { __perf_counter_sched_out(&cpuctx->ctx, cpuctx); @@ -943,15 +951,13 @@ int perf_counter_task_disable(void) struct perf_counter *counter; unsigned long flags; u64 perf_flags; - int cpu; if (likely(!ctx->nr_counters)) return 0; local_irq_save(flags); - cpu = smp_processor_id(); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); @@ -989,7 +995,7 @@ int perf_counter_task_enable(void) local_irq_save(flags); cpu = smp_processor_id(); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); spin_lock(&ctx->lock); @@ -1054,7 +1060,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) ctx = &curr->perf_counter_ctx; perf_counter_cpu_sched_out(cpuctx); - perf_counter_task_sched_out(curr, cpu); + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); rotate_ctx(ctx); From 615a3f1e055ac9b0ae74e1f935a12ea2cfe2a2ad Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 11 May 2009 15:50:21 +1000 Subject: [PATCH 0970/2061] perf_counter: call atomic64_set for counter->count A compile warning triggered because we are calling atomic_set(&counter->count). But since counter->count is an atomic64_t, we have to use atomic64_set. So the count can be set short, resulting in the reset ioctl only resetting the low word. [ Impact: clear counter properly during the reset ioctl ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18951.48285.270311.981806@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7373b96bc36..5ea0240adab 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1281,7 +1281,7 @@ static unsigned int perf_poll(struct file *file, poll_table *wait) static void perf_counter_reset(struct perf_counter *counter) { (void)perf_counter_read(counter); - atomic_set(&counter->count, 0); + atomic64_set(&counter->count, 0); perf_counter_update_userpage(counter); } From 049862579333cc6cd9e6edfd6987cd0addfd8c59 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 14:33:23 +0800 Subject: [PATCH 0971/2061] blktrace: pdu_buf of pc events should be unsigned I got this: 8,0 1 305.417782332 2037 I R 32 (ffffff9e 10 00 ...) [bash] It should be: 8,0 1 305.417782332 2037 I R 32 (9e 10 00 ...) [bash] [ Impact: fix output of pc events ] Signed-off-by: Li Zefan Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: Steven Rostedt Cc: Frederic Weisbecker LKML-Reference: <4A07C6B3.9080802@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index e099f8cc1d1..05b4747fd87 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -1065,7 +1065,7 @@ static int blk_log_action(struct trace_iterator *iter, const char *act) static int blk_log_dump_pdu(struct trace_seq *s, const struct trace_entry *ent) { - const char *pdu_buf; + const unsigned char *pdu_buf; int pdu_len; int i, end, ret; From 79c5d3ce614d8fe706545c7bca2158b63db6bb5e Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 11 May 2009 15:06:46 +0800 Subject: [PATCH 0972/2061] blktrace: from-sector redundant in trace_block_remap, cleanup The last argument of block_remap prober is the original sector before remap, so it should be 'from', not 'to'. [ Impact: clean up ] Signed-off-by: Li Zefan Cc: "Alan D. Brunelle" Cc: Jens Axboe Cc: Arnaldo Carvalho de Melo Cc: KOSAKI Motohiro LKML-Reference: <4A07CE86.5090301@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- include/trace/block.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/block.h b/include/trace/block.h index 8ac945b7746..5b12efa096b 100644 --- a/include/trace/block.h +++ b/include/trace/block.h @@ -70,7 +70,7 @@ DECLARE_TRACE(block_split, DECLARE_TRACE(block_remap, TP_PROTO(struct request_queue *q, struct bio *bio, dev_t dev, - sector_t to), - TP_ARGS(q, bio, dev, to)); + sector_t from), + TP_ARGS(q, bio, dev, from)); #endif From c969f58ca43fc403c75f5d3da4cf1e21de7afaa0 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 7 Apr 2009 14:13:01 +0100 Subject: [PATCH 0973/2061] GFS2: Update the rw flags After Jens recent updates: http://git.kernel.org/?p=linux/kernel/git/torvalds/linux-2.6.git;a=commitdiff;h=a1f242524c3c1f5d40f1c9c343427e34d1aadd6e et al. this is a patch to bring gfs2 uptodate with the core code. Also I've managed to squash another call to ll_rw_block() along the way. There is still one part of the GFS2 I/O paths which are not correctly annotated and that is due to the sharing of the writeback code between the data and metadata address spaces. I would like to change that too, but this patch is still worth doing on its own, I think. Signed-off-by: Steven Whitehouse --- fs/gfs2/log.c | 6 +++--- fs/gfs2/lops.c | 14 ++++++++------ fs/gfs2/meta_io.c | 38 +++++++++++++++++++++++++++----------- 3 files changed, 38 insertions(+), 20 deletions(-) diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 98918a75641..aa62cf5976e 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -120,7 +120,7 @@ __acquires(&sdp->sd_log_lock) lock_buffer(bh); if (test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); @@ -604,7 +604,7 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags, int pull) if (test_bit(SDF_NOBARRIERS, &sdp->sd_flags)) goto skip_barrier; get_bh(bh); - submit_bh(WRITE_BARRIER | (1 << BIO_RW_META), bh); + submit_bh(WRITE_SYNC | (1 << BIO_RW_BARRIER) | (1 << BIO_RW_META), bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); @@ -664,7 +664,7 @@ static void gfs2_ordered_write(struct gfs2_sbd *sdp) lock_buffer(bh); if (buffer_mapped(bh) && test_clear_buffer_dirty(bh)) { bh->b_end_io = end_buffer_write_sync; - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } else { unlock_buffer(bh); brelse(bh); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 80e4f5f898b..00315f50fa4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include "gfs2.h" #include "incore.h" @@ -189,7 +191,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_log_unlock(sdp); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); n = 0; @@ -199,7 +201,7 @@ static void buf_lo_before_commit(struct gfs2_sbd *sdp) gfs2_log_unlock(sdp); lock_buffer(bd2->bd_bh); bh = gfs2_log_fake_buf(sdp, bd2->bd_bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); if (++n >= num) break; @@ -341,7 +343,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) sdp->sd_log_num_revoke--; if (offset + sizeof(u64) > sdp->sd_sb.sb_bsize) { - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); bh = gfs2_log_get_buf(sdp); mh = (struct gfs2_meta_header *)bh->b_data; @@ -358,7 +360,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp) } gfs2_assert_withdraw(sdp, !sdp->sd_log_num_revoke); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); } static void revoke_lo_before_scan(struct gfs2_jdesc *jd, @@ -560,7 +562,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, ptr = bh_log_ptr(bh); get_bh(bh); - submit_bh(WRITE, bh); + submit_bh(WRITE_SYNC_PLUG, bh); gfs2_log_lock(sdp); while(!list_empty(list)) { bd = list_entry(list->next, struct gfs2_bufdata, bd_le.le_list); @@ -586,7 +588,7 @@ static void gfs2_write_blocks(struct gfs2_sbd *sdp, struct buffer_head *bh, } else { bh1 = gfs2_log_fake_buf(sdp, bd->bd_bh); } - submit_bh(WRITE, bh1); + submit_bh(WRITE_SYNC_PLUG, bh1); gfs2_log_lock(sdp); ptr += 2; } diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 8d6f13256b2..75b2aec06f8 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -201,16 +201,32 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) int gfs2_meta_read(struct gfs2_glock *gl, u64 blkno, int flags, struct buffer_head **bhp) { - *bhp = gfs2_getbuf(gl, blkno, CREATE); - if (!buffer_uptodate(*bhp)) { - ll_rw_block(READ_META, 1, bhp); - if (flags & DIO_WAIT) { - int error = gfs2_meta_wait(gl->gl_sbd, *bhp); - if (error) { - brelse(*bhp); - return error; - } - } + struct gfs2_sbd *sdp = gl->gl_sbd; + struct buffer_head *bh; + + if (unlikely(test_bit(SDF_SHUTDOWN, &sdp->sd_flags))) + return -EIO; + + *bhp = bh = gfs2_getbuf(gl, blkno, CREATE); + + lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + return 0; + } + bh->b_end_io = end_buffer_read_sync; + get_bh(bh); + submit_bh(READ_SYNC | (1 << BIO_RW_META), bh); + if (!(flags & DIO_WAIT)) + return 0; + + wait_on_buffer(bh); + if (unlikely(!buffer_uptodate(bh))) { + struct gfs2_trans *tr = current->journal_info; + if (tr && tr->tr_touched) + gfs2_io_error_bh(sdp, bh); + brelse(bh); + return -EIO; } return 0; @@ -404,7 +420,7 @@ struct buffer_head *gfs2_meta_ra(struct gfs2_glock *gl, u64 dblock, u32 extlen) if (buffer_uptodate(first_bh)) goto out; if (!buffer_locked(first_bh)) - ll_rw_block(READ_META, 1, &first_bh); + ll_rw_block(READ_SYNC | (1 << BIO_RW_META), 1, &first_bh); dblock++; extlen--; From 4a0f9a321a113392b448e477018311d14fba2b34 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Mon, 20 Apr 2009 08:16:26 +0100 Subject: [PATCH 0974/2061] GFS2: Optimise writepage for metadata This adds a GFS2 specific writepage for metadata, rather than continuing to use the VFS function. As a result we now tag all our metadata I/O with the correct flag so that blktraces will now be less confusing. Also, the generic function was checking for a number of corner cases which cannot happen on the metadata address spaces so that this should be faster too. Signed-off-by: Steven Whitehouse --- fs/gfs2/meta_io.c | 66 ++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 57 insertions(+), 9 deletions(-) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 75b2aec06f8..78a5f431266 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -33,17 +33,65 @@ #include "util.h" #include "ops_address.h" -static int aspace_get_block(struct inode *inode, sector_t lblock, - struct buffer_head *bh_result, int create) +static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { - gfs2_assert_warn(inode->i_sb->s_fs_info, 0); - return -EOPNOTSUPP; -} + int err; + struct buffer_head *bh, *head; + int nr_underway = 0; + int write_op = (1 << BIO_RW_META) | ((wbc->sync_mode == WB_SYNC_ALL ? + WRITE_SYNC_PLUG : WRITE)); -static int gfs2_aspace_writepage(struct page *page, - struct writeback_control *wbc) -{ - return block_write_full_page(page, aspace_get_block, wbc); + BUG_ON(!PageLocked(page)); + BUG_ON(!page_has_buffers(page)); + + head = page_buffers(page); + bh = head; + + do { + if (!buffer_mapped(bh)) + continue; + /* + * If it's a fully non-blocking write attempt and we cannot + * lock the buffer then redirty the page. Note that this can + * potentially cause a busy-wait loop from pdflush and kswapd + * activity, but those code paths have their own higher-level + * throttling. + */ + if (wbc->sync_mode != WB_SYNC_NONE || !wbc->nonblocking) { + lock_buffer(bh); + } else if (!trylock_buffer(bh)) { + redirty_page_for_writepage(wbc, page); + continue; + } + if (test_clear_buffer_dirty(bh)) { + mark_buffer_async_write(bh); + } else { + unlock_buffer(bh); + } + } while ((bh = bh->b_this_page) != head); + + /* + * The page and its buffers are protected by PageWriteback(), so we can + * drop the bh refcounts early. + */ + BUG_ON(PageWriteback(page)); + set_page_writeback(page); + + do { + struct buffer_head *next = bh->b_this_page; + if (buffer_async_write(bh)) { + submit_bh(write_op, bh); + nr_underway++; + } + bh = next; + } while (bh != head); + unlock_page(page); + + err = 0; + if (nr_underway == 0) + end_page_writeback(page); + + return err; } static const struct address_space_operations aspace_aops = { From 48bf2b1711dc498494e77705c415ee46bb508fd9 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 29 Apr 2009 13:59:35 +0100 Subject: [PATCH 0975/2061] GFS2: Something nonlinear this way comes! For some reason GFS2 has been missing support for non-linear mappings. This patch fixes that, and also avoids taking any locks for mmap in the O_NOATIME case. In fact we don't actually need to take the lock here at all - just doing file_accessed() would be enough, but we have to take the lock eventually and this helps it hit disk (and thus be seen by other nodes) faster. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_file.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c index 5d82e91887e..0ee7bd287c5 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/ops_file.c @@ -425,33 +425,36 @@ static struct vm_operations_struct gfs2_vm_ops = { .page_mkwrite = gfs2_page_mkwrite, }; - /** * gfs2_mmap - * @file: The file to map * @vma: The VMA which described the mapping * - * Returns: 0 or error code + * There is no need to get a lock here unless we should be updating + * atime. We ignore any locking errors since the only consequence is + * a missed atime update (which will just be deferred until later). + * + * Returns: 0 */ static int gfs2_mmap(struct file *file, struct vm_area_struct *vma) { struct gfs2_inode *ip = GFS2_I(file->f_mapping->host); - struct gfs2_holder i_gh; - int error; - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; + if (!(file->f_flags & O_NOATIME)) { + struct gfs2_holder i_gh; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + file_accessed(file); + if (error == 0) + gfs2_glock_dq_uninit(&i_gh); } - vma->vm_ops = &gfs2_vm_ops; + vma->vm_flags |= VM_CAN_NONLINEAR; - gfs2_glock_dq_uninit(&i_gh); - - return error; + return 0; } /** From 7c77f0b3f9208c339a4b40737bb2cb0f0319bb8d Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:35 +0200 Subject: [PATCH 0976/2061] splice: implement pipe to pipe splicing Allow splice(2) to work when both the input and the output is a pipe. Based on the impementation of the tee(2) syscall, but instead of duplicating the buffer references move the buffers from the input pipe to the output pipe. Moving the whole buffer only succeeds if the full length of the buffer is spliced. Otherwise duplicate the buffer, just like tee(2), set the length of the output buffer and advance the offset on the input buffer. Since splice is operating on two pipes, special care needs to be taken with locking to prevent AN ABBA deadlock. Again this is done similarly to the tee(2) syscall, first preparing the input and output pipes so there's data to consume and space for that data, and then doing the move operation while holding both locks. If other processes are doing I/O on the same pipes parallel to the splice, then by the time both inodes are locked there might be no buffers left to move, or no space to move them to. In this case retry the whole operation, including the preparation phase. This could lead to starvation, but I'm not sure if that's serious enough to worry about. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 162 ++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 151 insertions(+), 11 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 666953d59a3..e405cf552f5 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1112,6 +1112,9 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags); /* * After the inode slimming patch, i_pipe/i_bdev/i_cdev share the same * location, so checking ->i_pipe is not enough to verify that this is a @@ -1132,12 +1135,32 @@ static long do_splice(struct file *in, loff_t __user *off_in, struct file *out, loff_t __user *off_out, size_t len, unsigned int flags) { - struct pipe_inode_info *pipe; + struct pipe_inode_info *ipipe; + struct pipe_inode_info *opipe; loff_t offset, *off; long ret; - pipe = pipe_info(in->f_path.dentry->d_inode); - if (pipe) { + ipipe = pipe_info(in->f_path.dentry->d_inode); + opipe = pipe_info(out->f_path.dentry->d_inode); + + if (ipipe && opipe) { + if (off_in || off_out) + return -ESPIPE; + + if (!(in->f_mode & FMODE_READ)) + return -EBADF; + + if (!(out->f_mode & FMODE_WRITE)) + return -EBADF; + + /* Splicing to self would be fun, but... */ + if (ipipe == opipe) + return -EINVAL; + + return splice_pipe_to_pipe(ipipe, opipe, len, flags); + } + + if (ipipe) { if (off_in) return -ESPIPE; if (off_out) { @@ -1149,7 +1172,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &out->f_pos; - ret = do_splice_from(pipe, out, off, len, flags); + ret = do_splice_from(ipipe, out, off, len, flags); if (off_out && copy_to_user(off_out, off, sizeof(loff_t))) ret = -EFAULT; @@ -1157,8 +1180,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, return ret; } - pipe = pipe_info(out->f_path.dentry->d_inode); - if (pipe) { + if (opipe) { if (off_out) return -ESPIPE; if (off_in) { @@ -1170,7 +1192,7 @@ static long do_splice(struct file *in, loff_t __user *off_in, } else off = &in->f_pos; - ret = do_splice_to(in, off, pipe, len, flags); + ret = do_splice_to(in, off, opipe, len, flags); if (off_in && copy_to_user(off_in, off, sizeof(loff_t))) ret = -EFAULT; @@ -1511,7 +1533,7 @@ SYSCALL_DEFINE6(splice, int, fd_in, loff_t __user *, off_in, * Make sure there's data to read. Wait for input if we can, otherwise * return an appropriate error. */ -static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1549,7 +1571,7 @@ static int link_ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Make sure there's writeable room. Wait for room if we can, otherwise * return an appropriate error. */ -static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) +static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) { int ret; @@ -1586,6 +1608,124 @@ static int link_opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) return ret; } +/* + * Splice contents of ipipe to opipe. + */ +static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, nbuf; + bool input_wakeup = false; + + +retry: + ret = ipipe_prep(ipipe, flags); + if (ret) + return ret; + + ret = opipe_prep(opipe, flags); + if (ret) + return ret; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by pipe info address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + pipe_double_lock(ipipe, opipe); + + do { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (!ipipe->nrbufs && !ipipe->writers) + break; + + /* + * Cannot make any progress, because either the input + * pipe is empty or the output pipe is full. + */ + if (!ipipe->nrbufs || opipe->nrbufs >= PIPE_BUFFERS) { + /* Already processed some buffers, break */ + if (ret) + break; + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + /* + * We raced with another reader/writer and haven't + * managed to process any buffers. A zero return + * value means EOF, so retry instead. + */ + pipe_unlock(ipipe); + pipe_unlock(opipe); + goto retry; + } + + ibuf = ipipe->bufs + ipipe->curbuf; + nbuf = (opipe->curbuf + opipe->nrbufs) % PIPE_BUFFERS; + obuf = opipe->bufs + nbuf; + + if (len >= ibuf->len) { + /* + * Simply move the whole buffer from ipipe to opipe + */ + *obuf = *ibuf; + ibuf->ops = NULL; + opipe->nrbufs++; + ipipe->curbuf = (ipipe->curbuf + 1) % PIPE_BUFFERS; + ipipe->nrbufs--; + input_wakeup = true; + } else { + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + *obuf = *ibuf; + + /* + * Don't inherit the gift flag, we need to + * prevent multiple steals of this page. + */ + obuf->flags &= ~PIPE_BUF_FLAG_GIFT; + + obuf->len = len; + opipe->nrbufs++; + ibuf->offset += obuf->len; + ibuf->len -= obuf->len; + } + ret += obuf->len; + len -= obuf->len; + } while (len); + + pipe_unlock(ipipe); + pipe_unlock(opipe); + + /* + * If we put data in the output pipe, wakeup any potential readers. + */ + if (ret > 0) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + if (input_wakeup) + wakeup_pipe_writers(ipipe); + + return ret; +} + /* * Link contents of ipipe to opipe. */ @@ -1690,9 +1830,9 @@ static long do_tee(struct file *in, struct file *out, size_t len, * Keep going, unless we encounter an error. The ipipe/opipe * ordering doesn't really matter. */ - ret = link_ipipe_prep(ipipe, flags); + ret = ipipe_prep(ipipe, flags); if (!ret) { - ret = link_opipe_prep(opipe, flags); + ret = opipe_prep(opipe, flags); if (!ret) ret = link_pipe(ipipe, opipe, len, flags); } From 6818173bd658439b83896a2a7586f64ab51bf29c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:36 +0200 Subject: [PATCH 0977/2061] splice: implement default splice_read method If f_op->splice_read() is not implemented, fall back to a plain read. Use vfs_readv() to read into previously allocated pages. This will allow splice and functions using splice, such as the loop device, to work on all filesystems. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- drivers/block/loop.c | 11 +--- fs/coda/file.c | 9 ++- fs/pipe.c | 14 +++++ fs/read_write.c | 7 +-- fs/splice.c | 120 ++++++++++++++++++++++++++++++++++++-- include/linux/fs.h | 2 + include/linux/pipe_fs_i.h | 1 + 7 files changed, 140 insertions(+), 24 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 9ca4bb01465..801f4ab8330 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -709,10 +709,6 @@ static int loop_change_fd(struct loop_device *lo, struct block_device *bdev, if (!S_ISREG(inode->i_mode) && !S_ISBLK(inode->i_mode)) goto out_putf; - /* new backing store needs to support loop (eg splice_read) */ - if (!inode->i_fop->splice_read) - goto out_putf; - /* size of the new backing store needs to be the same */ if (get_loop_size(lo, file) != get_loop_size(lo, old_file)) goto out_putf; @@ -788,12 +784,7 @@ static int loop_set_fd(struct loop_device *lo, fmode_t mode, error = -EINVAL; if (S_ISREG(inode->i_mode) || S_ISBLK(inode->i_mode)) { const struct address_space_operations *aops = mapping->a_ops; - /* - * If we can't read - sorry. If we only can't write - well, - * it's going to be read-only. - */ - if (!file->f_op->splice_read) - goto out_putf; + if (aops->write_begin) lo_flags |= LO_FLAGS_USE_AOPS; if (!(lo_flags & LO_FLAGS_USE_AOPS) && !file->f_op->write) diff --git a/fs/coda/file.c b/fs/coda/file.c index 6a347fbc998..ffd42815fda 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -47,6 +47,8 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, struct pipe_inode_info *pipe, size_t count, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); struct coda_file_info *cfi; struct file *host_file; @@ -54,10 +56,11 @@ coda_file_splice_read(struct file *coda_file, loff_t *ppos, BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); host_file = cfi->cfi_container; - if (!host_file->f_op || !host_file->f_op->splice_read) - return -EINVAL; + splice_read = host_file->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; - return host_file->f_op->splice_read(host_file, ppos, pipe, count,flags); + return splice_read(host_file, ppos, pipe, count, flags); } static ssize_t diff --git a/fs/pipe.c b/fs/pipe.c index 13414ec45b8..f7dd21ad85a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -302,6 +302,20 @@ int generic_pipe_buf_confirm(struct pipe_inode_info *info, return 0; } +/** + * generic_pipe_buf_release - put a reference to a &struct pipe_buffer + * @pipe: the pipe that the buffer belongs to + * @buf: the buffer to put a reference to + * + * Description: + * This function releases a reference to @buf. + */ +void generic_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); +} + static const struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = generic_pipe_buf_map, diff --git a/fs/read_write.c b/fs/read_write.c index 9d1e76bb9ee..6c8c55dec2b 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -805,12 +805,6 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, goto out; if (!(in_file->f_mode & FMODE_READ)) goto fput_in; - retval = -EINVAL; - in_inode = in_file->f_path.dentry->d_inode; - if (!in_inode) - goto fput_in; - if (!in_file->f_op || !in_file->f_op->splice_read) - goto fput_in; retval = -ESPIPE; if (!ppos) ppos = &in_file->f_pos; @@ -834,6 +828,7 @@ static ssize_t do_sendfile(int out_fd, int in_fd, loff_t *ppos, retval = -EINVAL; if (!out_file->f_op || !out_file->f_op->sendpage) goto fput_out; + in_inode = in_file->f_path.dentry->d_inode; out_inode = out_file->f_path.dentry->d_inode; retval = rw_verify_area(WRITE, out_file, &out_file->f_pos, count); if (retval < 0) diff --git a/fs/splice.c b/fs/splice.c index e405cf552f5..3bd9cb21b38 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -507,9 +507,116 @@ ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, return ret; } - EXPORT_SYMBOL(generic_file_splice_read); +static const struct pipe_buf_operations default_pipe_buf_ops = { + .can_merge = 0, + .map = generic_pipe_buf_map, + .unmap = generic_pipe_buf_unmap, + .confirm = generic_pipe_buf_confirm, + .release = generic_pipe_buf_release, + .steal = generic_pipe_buf_steal, + .get = generic_pipe_buf_get, +}; + +static ssize_t kernel_readv(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t offset) +{ + mm_segment_t old_fs; + loff_t pos = offset; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_readv(file, (const struct iovec __user *)vec, vlen, &pos); + set_fs(old_fs); + + return res; +} + +ssize_t default_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + unsigned int nr_pages; + unsigned int nr_freed; + size_t offset; + struct page *pages[PIPE_BUFFERS]; + struct partial_page partial[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + pgoff_t index; + ssize_t res; + size_t this_len; + int error; + int i; + struct splice_pipe_desc spd = { + .pages = pages, + .partial = partial, + .flags = flags, + .ops = &default_pipe_buf_ops, + .spd_release = spd_release_page, + }; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { + struct page *page; + + page = alloc_page(GFP_HIGHUSER); + error = -ENOMEM; + if (!page) + goto err; + + this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); + vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_len = this_len; + pages[i] = page; + spd.nr_pages++; + len -= this_len; + offset = 0; + } + + res = kernel_readv(in, vec, spd.nr_pages, *ppos); + if (res < 0) + goto err; + + error = 0; + if (!res) + goto err; + + nr_freed = 0; + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + this_len = min_t(size_t, vec[i].iov_len, res); + partial[i].offset = 0; + partial[i].len = this_len; + if (!this_len) { + __free_page(pages[i]); + pages[i] = NULL; + nr_freed++; + } + res -= this_len; + } + spd.nr_pages -= nr_freed; + + res = splice_to_pipe(pipe, &spd); + if (res > 0) + *ppos += res; + + return res; + +err: + for (i = 0; i < spd.nr_pages; i++) { + kunmap(pages[i]); + __free_page(pages[i]); + } + return error; +} +EXPORT_SYMBOL(default_file_splice_read); + /* * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' * using sendpage(). Return the number of bytes sent. @@ -933,11 +1040,10 @@ static long do_splice_to(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) { + ssize_t (*splice_read)(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); int ret; - if (unlikely(!in->f_op || !in->f_op->splice_read)) - return -EINVAL; - if (unlikely(!(in->f_mode & FMODE_READ))) return -EBADF; @@ -945,7 +1051,11 @@ static long do_splice_to(struct file *in, loff_t *ppos, if (unlikely(ret < 0)) return ret; - return in->f_op->splice_read(in, ppos, pipe, len, flags); + splice_read = in->f_op->splice_read; + if (!splice_read) + splice_read = default_file_splice_read; + + return splice_read(in, ppos, pipe, len, flags); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index 5bed436f435..d926c2bea16 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2204,6 +2204,8 @@ extern int generic_segment_checks(const struct iovec *iov, /* fs/splice.c */ extern ssize_t generic_file_splice_read(struct file *, loff_t *, struct pipe_inode_info *, size_t, unsigned int); +extern ssize_t default_file_splice_read(struct file *, loff_t *, + struct pipe_inode_info *, size_t, unsigned int); extern ssize_t generic_file_splice_write(struct pipe_inode_info *, struct file *, loff_t *, size_t, unsigned int); extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index c8f038554e8..b43a9e03905 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -152,5 +152,6 @@ void generic_pipe_buf_unmap(struct pipe_inode_info *, struct pipe_buffer *, void void generic_pipe_buf_get(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_confirm(struct pipe_inode_info *, struct pipe_buffer *); int generic_pipe_buf_steal(struct pipe_inode_info *, struct pipe_buffer *); +void generic_pipe_buf_release(struct pipe_inode_info *, struct pipe_buffer *); #endif From 0b0a47f5c4a30b58432e20ae1706a27baea91a88 Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Thu, 7 May 2009 15:37:37 +0200 Subject: [PATCH 0978/2061] splice: implement default splice_write method If f_op->splice_write() is not implemented, fall back to a plain write. Use vfs_writev() to write from the pipe buffers. This will allow splice on all filesystems and file types. This includes "direct_io" files in fuse which bypass the page cache. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 138 insertions(+), 4 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 3bd9cb21b38..eefd96b1d7f 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,6 +535,21 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } +static ssize_t kernel_writev(struct file *file, const struct iovec *vec, + unsigned long vlen, loff_t *ppos) +{ + mm_segment_t old_fs; + ssize_t res; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + set_fs(old_fs); + + return res; +} + ssize_t default_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe, size_t len, unsigned int flags) @@ -988,6 +1003,122 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); +static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +{ + return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; +} + +static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, + struct file *out, loff_t *ppos, + size_t len, unsigned int flags) +{ + ssize_t ret = 0; + ssize_t total_len = 0; + int do_wakeup = 0; + + pipe_lock(pipe); + while (len) { + struct pipe_buffer *buf; + void *data[PIPE_BUFFERS]; + struct iovec vec[PIPE_BUFFERS]; + unsigned int nr_pages = 0; + unsigned int write_len = 0; + unsigned int now_len = len; + unsigned int this_len; + int i; + + BUG_ON(pipe->nrbufs > PIPE_BUFFERS); + for (i = 0; i < pipe->nrbufs && now_len; i++) { + buf = nth_pipe_buf(pipe, i); + + ret = buf->ops->confirm(pipe, buf); + if (ret) + break; + + data[i] = buf->ops->map(pipe, buf, 0); + this_len = min(buf->len, now_len); + vec[i].iov_base = (void __user *) data[i] + buf->offset; + vec[i].iov_len = this_len; + now_len -= this_len; + write_len += this_len; + nr_pages++; + } + + if (nr_pages) { + ret = kernel_writev(out, vec, nr_pages, ppos); + if (ret == 0) + ret = -EIO; + if (ret > 0) { + len -= ret; + total_len += ret; + } + } + + for (i = 0; i < nr_pages; i++) { + buf = nth_pipe_buf(pipe, i); + buf->ops->unmap(pipe, buf, data[i]); + + if (ret > 0) { + this_len = min_t(unsigned, vec[i].iov_len, ret); + buf->offset += this_len; + buf->len -= this_len; + ret -= this_len; + } + } + + if (ret < 0) + break; + + while (pipe->nrbufs) { + const struct pipe_buf_operations *ops; + + buf = nth_pipe_buf(pipe, 0); + if (buf->len) + break; + + ops = buf->ops; + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; + } + + if (pipe->nrbufs) + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (total_len) + break; + } + + if (flags & SPLICE_F_NONBLOCK) { + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + wakeup_pipe_writers(pipe); + do_wakeup = 0; + } + + pipe_wait(pipe); + } + pipe_unlock(pipe); + + if (do_wakeup) + wakeup_pipe_writers(pipe); + + return total_len ? total_len : ret; +} + /** * generic_splice_sendpage - splice data from a pipe to a socket * @pipe: pipe to splice from @@ -1015,11 +1146,10 @@ EXPORT_SYMBOL(generic_splice_sendpage); static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { + ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, + loff_t *, size_t, unsigned int); int ret; - if (unlikely(!out->f_op || !out->f_op->splice_write)) - return -EINVAL; - if (unlikely(!(out->f_mode & FMODE_WRITE))) return -EBADF; @@ -1030,7 +1160,11 @@ static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, if (unlikely(ret < 0)) return ret; - return out->f_op->splice_write(pipe, out, ppos, len, flags); + splice_write = out->f_op->splice_write; + if (!splice_write) + splice_write = default_file_splice_write; + + return splice_write(pipe, out, ppos, len, flags); } /* From 3d6ad460214cc72b93333f51f498441a56d622e9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 09:01:08 +0000 Subject: [PATCH 0979/2061] sh: multiple vectors per irq - sh7760 Update intc tables and platform data to use one linux irq per maskable interrupt source instead of keeping the one-to-one mapping between vectors and linux irqs. This fixes potential irq masking issues for sh7760 hardware blocks such as DMAC/TMU2/REF. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/drivers/dma/Kconfig | 3 ++- arch/sh/kernel/cpu/sh4/setup-sh7760.c | 31 ++++++++++----------------- 2 files changed, 13 insertions(+), 21 deletions(-) diff --git a/arch/sh/drivers/dma/Kconfig b/arch/sh/drivers/dma/Kconfig index 666713ac5fc..63e9dd30b41 100644 --- a/arch/sh/drivers/dma/Kconfig +++ b/arch/sh/drivers/dma/Kconfig @@ -16,7 +16,8 @@ config SH_DMA_IRQ_MULTI CPU_SUBTYPE_SH7750S || CPU_SUBTYPE_SH7750R || \ CPU_SUBTYPE_SH7751R || CPU_SUBTYPE_SH7091 || \ CPU_SUBTYPE_SH7763 || CPU_SUBTYPE_SH7764 || \ - CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 + CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ + CPU_SUBTYPE_SH7760 config NR_ONCHIP_DMA_CHANNELS int diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index 4a6fd0d3c7b..cd097335758 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -19,10 +19,7 @@ enum { /* interrupt sources */ IRL0, IRL1, IRL2, IRL3, - HUDI, GPIOI, - DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2, DMAC_DMTE3, - DMAC_DMTE4, DMAC_DMTE5, DMAC_DMTE6, DMAC_DMTE7, - DMAC_DMAE, + HUDI, GPIOI, DMAC, IRQ4, IRQ5, IRQ6, IRQ7, HCAN20, HCAN21, SSI0, SSI1, @@ -37,21 +34,20 @@ enum { HSPI, MMCIF0, MMCIF1, MMCIF2, MMCIF3, MFI, ADC, CMT, - TMU0, TMU1, TMU2_TUNI, TMU2_TICPI, - WDT, - REF_RCMI, REF_ROVI, + TMU0, TMU1, TMU2, + WDT, REF, /* interrupt groups */ - DMAC, DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF, TMU2, REF, + DMABRG, SCIF0, SCIF1, SCIF2, SIM, MMCIF, }; static struct intc_vect vectors[] __initdata = { INTC_VECT(HUDI, 0x600), INTC_VECT(GPIOI, 0x620), - INTC_VECT(DMAC_DMTE0, 0x640), INTC_VECT(DMAC_DMTE1, 0x660), - INTC_VECT(DMAC_DMTE2, 0x680), INTC_VECT(DMAC_DMTE3, 0x6a0), - INTC_VECT(DMAC_DMTE4, 0x780), INTC_VECT(DMAC_DMTE5, 0x7a0), - INTC_VECT(DMAC_DMTE6, 0x7c0), INTC_VECT(DMAC_DMTE7, 0x7e0), - INTC_VECT(DMAC_DMAE, 0x6c0), + INTC_VECT(DMAC, 0x640), INTC_VECT(DMAC, 0x660), + INTC_VECT(DMAC, 0x680), INTC_VECT(DMAC, 0x6a0), + INTC_VECT(DMAC, 0x780), INTC_VECT(DMAC, 0x7a0), + INTC_VECT(DMAC, 0x7c0), INTC_VECT(DMAC, 0x7e0), + INTC_VECT(DMAC, 0x6c0), INTC_VECT(IRQ4, 0x800), INTC_VECT(IRQ5, 0x820), INTC_VECT(IRQ6, 0x840), INTC_VECT(IRQ6, 0x860), INTC_VECT(HCAN20, 0x900), INTC_VECT(HCAN21, 0x920), @@ -75,23 +71,18 @@ static struct intc_vect vectors[] __initdata = { INTC_VECT(MFI, 0xe80), /* 0xf80 according to data sheet */ INTC_VECT(ADC, 0xf80), INTC_VECT(CMT, 0xfa0), INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420), - INTC_VECT(TMU2_TUNI, 0x440), INTC_VECT(TMU2_TICPI, 0x460), + INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460), INTC_VECT(WDT, 0x560), - INTC_VECT(REF_RCMI, 0x580), INTC_VECT(REF_ROVI, 0x5a0), + INTC_VECT(REF, 0x580), INTC_VECT(REF, 0x5a0), }; static struct intc_group groups[] __initdata = { - INTC_GROUP(DMAC, DMAC_DMTE0, DMAC_DMTE1, DMAC_DMTE2, - DMAC_DMTE3, DMAC_DMTE4, DMAC_DMTE5, - DMAC_DMTE6, DMAC_DMTE7, DMAC_DMAE), INTC_GROUP(DMABRG, DMABRG0, DMABRG1, DMABRG2), INTC_GROUP(SCIF0, SCIF0_ERI, SCIF0_RXI, SCIF0_BRI, SCIF0_TXI), INTC_GROUP(SCIF1, SCIF1_ERI, SCIF1_RXI, SCIF1_BRI, SCIF1_TXI), INTC_GROUP(SCIF2, SCIF2_ERI, SCIF2_RXI, SCIF2_BRI, SCIF2_TXI), INTC_GROUP(SIM, SIM_ERI, SIM_RXI, SIM_TXI, SIM_TEI), INTC_GROUP(MMCIF, MMCIF0, MMCIF1, MMCIF2, MMCIF3), - INTC_GROUP(TMU2, TMU2_TUNI, TMU2_TICPI), - INTC_GROUP(REF, REF_RCMI, REF_ROVI), }; static struct intc_mask_reg mask_registers[] __initdata = { From 50e2d0d3b4ffc945a6c27f16d4e5e557e725ec67 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 11:34:16 +0000 Subject: [PATCH 0980/2061] sh: r7780 highlander clock fixes Update the r7780 highlander defconfig to fix PCLK value, while at it fix cmdline on r7785. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/configs/r7780mp_defconfig | 2 +- arch/sh/configs/r7785rp_defconfig | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/configs/r7780mp_defconfig b/arch/sh/configs/r7780mp_defconfig index 68e5eff5d7f..943da63a385 100644 --- a/arch/sh/configs/r7780mp_defconfig +++ b/arch/sh/configs/r7780mp_defconfig @@ -260,7 +260,7 @@ CONFIG_SH_R7780MP=y # CONFIG_SH_TMU=y CONFIG_SH_TIMER_IRQ=28 -CONFIG_SH_PCLK_FREQ=32000000 +CONFIG_SH_PCLK_FREQ=33333333 # CONFIG_NO_HZ is not set # CONFIG_HIGH_RES_TIMERS is not set CONFIG_GENERIC_CLOCKEVENTS_BUILD=y diff --git a/arch/sh/configs/r7785rp_defconfig b/arch/sh/configs/r7785rp_defconfig index 5e677a8e3a3..82658f67239 100644 --- a/arch/sh/configs/r7785rp_defconfig +++ b/arch/sh/configs/r7785rp_defconfig @@ -333,7 +333,7 @@ CONFIG_GUSA=y CONFIG_ZERO_PAGE_OFFSET=0x00001000 CONFIG_BOOT_LINK_OFFSET=0x00800000 CONFIG_CMDLINE_BOOL=y -CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1 mode4_pin=high" +CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" # # Bus options From ccc195655fb25d7d967b278c4a4725dc5e7a6bf4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Mon, 11 May 2009 11:37:16 +0000 Subject: [PATCH 0981/2061] sh: TMU platform data for sh7780 This patch adds TMU platform data for sh7780. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 204 +++++++++++++++++++++++++ 1 file changed, 204 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index 6f7227cd65b..f1df0209506 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -12,6 +12,189 @@ #include #include #include +#include + +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 28, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 29, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 30, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffdc0008, + .end = 0xffdc0013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 96, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffdc0014, + .end = 0xffdc001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 97, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffdc0020, + .end = 0xffdc002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 98, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; static struct resource rtc_resources[] = { [0] = { @@ -58,6 +241,12 @@ static struct platform_device sci_device = { }; static struct platform_device *sh7780_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, &rtc_device, &sci_device, }; @@ -69,6 +258,21 @@ static int __init sh7780_devices_setup(void) } __initcall(sh7780_devices_setup); +static struct platform_device *sh7780_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7780_early_devices, + ARRAY_SIZE(sh7780_early_devices)); +} + enum { UNUSED = 0, From d756f4adb9d8a86e347a2d5435bb5cc95744733e Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:29:52 +0400 Subject: [PATCH 0982/2061] x86, 32-bit: ifdef out struct thread_struct::fs After commit 464d1a78fbf8cf6c7fd970e7b3e2db50a320ce28 aka "[PATCH] i386: Convert i386 PDA code to use %fs" %fs saved during context switch moved from thread_struct to pt_regs and value on thread_struct became unused. [ Impact: reduce thread_struct size on 32-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503232952.GI16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index c2cceae709c..a6732ff7b01 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -428,7 +428,9 @@ struct thread_struct { unsigned short gsindex; #endif unsigned long ip; +#ifdef CONFIG_X86_64 unsigned long fs; +#endif unsigned long gs; /* Hardware debugging registers: */ unsigned long debugreg0; @@ -874,7 +876,6 @@ static inline void spin_lock_prefetch(const void *x) .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ - .fs = __KERNEL_PERCPU, \ } /* From 0c23590f00f85467b318ad0c20c36796a5bd4c60 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Mon, 4 May 2009 03:30:15 +0400 Subject: [PATCH 0983/2061] x86, 64-bit: ifdef out struct thread_struct::ip struct thread_struct::ip isn't used on x86_64, struct pt_regs::ip is used instead. kgdb should be reading 0 always, but I can't check it. [ Impact: (potentially) reduce thread_struct size on 64-bit ] Signed-off-by: Alexey Dobriyan Cc: containers@lists.linux-foundation.org LKML-Reference: <20090503233015.GJ16631@x200.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 ++ arch/x86/kernel/kgdb.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index a6732ff7b01..a9ba7436821 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -427,7 +427,9 @@ struct thread_struct { unsigned short fsindex; unsigned short gsindex; #endif +#ifdef CONFIG_X86_32 unsigned long ip; +#endif #ifdef CONFIG_X86_64 unsigned long fs; #endif diff --git a/arch/x86/kernel/kgdb.c b/arch/x86/kernel/kgdb.c index eedfaebe106..d07706f1976 100644 --- a/arch/x86/kernel/kgdb.c +++ b/arch/x86/kernel/kgdb.c @@ -141,7 +141,7 @@ void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) gdb_regs32[GDB_PS] = *(unsigned long *)(p->thread.sp + 8); gdb_regs32[GDB_CS] = __KERNEL_CS; gdb_regs32[GDB_SS] = __KERNEL_DS; - gdb_regs[GDB_PC] = p->thread.ip; + gdb_regs[GDB_PC] = 0; gdb_regs[GDB_R8] = 0; gdb_regs[GDB_R9] = 0; gdb_regs[GDB_R10] = 0; From 5a772b2b3c68e7e0b503c5a48469113bb0634314 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 10:56:33 -0400 Subject: [PATCH 0984/2061] ring-buffer: replace constants with time macros in ring-buffer-benchmark The use of numeric constants is discouraged. It is cleaner and more descriptive to use macros for constant time conversions. This patch also removes an extra new line. [ Impact: more descriptive time conversions ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index a21aa7b3d05..7d3aef93c49 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -253,7 +253,7 @@ static void ring_buffer_producer(void) } time = end_tv.tv_sec - start_tv.tv_sec; - time *= 1000000; + time *= USEC_PER_SEC; time += (long long)((long)end_tv.tv_usec - (long)start_tv.tv_usec); entries = ring_buffer_entries(buffer); @@ -273,7 +273,8 @@ static void ring_buffer_producer(void) pr_info("Missed: %ld\n", missed); pr_info("Hit: %ld\n", hit); - do_div(time, 1000); + /* Convert time from usecs to millisecs */ + do_div(time, USEC_PER_MSEC); if (time) hit /= (long)time; else @@ -282,18 +283,19 @@ static void ring_buffer_producer(void) pr_info("Entries per millisec: %ld\n", hit); if (hit) { - avg = 1000000 / hit; + /* Calculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / hit; pr_info("%ld ns per entry\n", avg); } - if (missed) { if (time) missed /= (long)time; pr_info("Total iterations per millisec: %ld\n", hit + missed); - avg = 1000000 / (hit + missed); + /* Caculate the average time in nanosecs */ + avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); } } From d988ff94c1074c4c914235c8591bcceafb585ecf Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 8 May 2009 11:03:57 -0400 Subject: [PATCH 0985/2061] ring-buffer: check for divide by zero in ring-buffer-benchmark Although we check if "missed" is not zero, we divide by hit + missed, and the addition can possible overflow and become a divide by zero. This patch checks for this case, and will report it when it happens then modify "hit" to make the calculation be non zero. [ Impact: prevent possible divide by zero in ring-buffer-benchmark ] Reported-by: Andrew Morton Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer_benchmark.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/trace/ring_buffer_benchmark.c b/kernel/trace/ring_buffer_benchmark.c index 7d3aef93c49..8d68e149a8b 100644 --- a/kernel/trace/ring_buffer_benchmark.c +++ b/kernel/trace/ring_buffer_benchmark.c @@ -294,6 +294,12 @@ static void ring_buffer_producer(void) pr_info("Total iterations per millisec: %ld\n", hit + missed); + /* it is possible that hit + missed will overflow and be zero */ + if (!(hit + missed)) { + pr_info("hit + missed overflowed and totalled zero!\n"); + hit--; /* make it non zero */ + } + /* Caculate the average time in nanosecs */ avg = NSEC_PER_MSEC / (hit + missed); pr_info("%ld ns per entry\n", avg); From 1cd8d7358948909ab80b254eb14bcebc555ad417 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:08:09 -0400 Subject: [PATCH 0986/2061] ring-buffer: remove type parameter from rb_reserve_next_event The rb_reserve_next_event is only called for the data type (type = 0). There is no reason to pass in the type to the function. Before: text data bss dec hex filename 16554 24 12 16590 40ce kernel/trace/ring_buffer.o After: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o [ Impact: cleaner, smaller and slightly more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 361170609bd..fe40f6c3507 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1389,7 +1389,7 @@ rb_add_time_stamp(struct ring_buffer_per_cpu *cpu_buffer, static struct ring_buffer_event * rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, - unsigned type, unsigned long length) + unsigned long length) { struct ring_buffer_event *event; u64 ts, delta; @@ -1448,7 +1448,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, /* Non commits have zero deltas */ delta = 0; - event = __rb_reserve_next(cpu_buffer, type, length, &ts); + event = __rb_reserve_next(cpu_buffer, 0, length, &ts); if (PTR_ERR(event) == -EAGAIN) goto again; @@ -1556,7 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (length > BUF_PAGE_SIZE) goto out; - event = rb_reserve_next_event(cpu_buffer, 0, length); + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; @@ -1782,7 +1782,7 @@ int ring_buffer_write(struct ring_buffer *buffer, goto out; event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, 0, event_length); + event = rb_reserve_next_event(cpu_buffer, event_length); if (!event) goto out; From be957c447f7233a67904a1b11eb3ab61e702bf4d Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 14:42:53 -0400 Subject: [PATCH 0987/2061] ring-buffer: move calculation of event length The event length is calculated and passed in to rb_reserve_next_event in two different locations. Having rb_reserve_next_event do the calculations directly makes only one location to do the change and causes the calculation to be inlined by gcc. Before: text data bss dec hex filename 16538 24 12 16574 40be kernel/trace/ring_buffer.o After: text data bss dec hex filename 16490 24 12 16526 408e kernel/trace/ring_buffer.o [ Impact: smaller more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index fe40f6c3507..493cba46abc 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -367,6 +367,9 @@ static inline int test_time_stamp(u64 delta) #define BUF_PAGE_SIZE (PAGE_SIZE - BUF_PAGE_HDR_SIZE) +/* Max payload is BUF_PAGE_SIZE - header (8bytes) */ +#define BUF_MAX_DATA_SIZE (BUF_PAGE_SIZE - (sizeof(u32) * 2)) + int ring_buffer_print_page_header(struct trace_seq *s) { struct buffer_data_page field; @@ -1396,6 +1399,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, int commit = 0; int nr_loops = 0; + length = rb_calculate_event_length(length); again: /* * We allow for interrupts to reenter here and do a trace. @@ -1552,8 +1556,7 @@ ring_buffer_lock_reserve(struct ring_buffer *buffer, unsigned long length) if (atomic_read(&cpu_buffer->record_disabled)) goto out; - length = rb_calculate_event_length(length); - if (length > BUF_PAGE_SIZE) + if (length > BUF_MAX_DATA_SIZE) goto out; event = rb_reserve_next_event(cpu_buffer, length); @@ -1758,7 +1761,6 @@ int ring_buffer_write(struct ring_buffer *buffer, { struct ring_buffer_per_cpu *cpu_buffer; struct ring_buffer_event *event; - unsigned long event_length; void *body; int ret = -EBUSY; int cpu, resched; @@ -1781,8 +1783,10 @@ int ring_buffer_write(struct ring_buffer *buffer, if (atomic_read(&cpu_buffer->record_disabled)) goto out; - event_length = rb_calculate_event_length(length); - event = rb_reserve_next_event(cpu_buffer, event_length); + if (length > BUF_MAX_DATA_SIZE) + goto out; + + event = rb_reserve_next_event(cpu_buffer, length); if (!event) goto out; From b68d8201433a91cabbcbeae48b53d8c1c426433a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 03:45:08 +0900 Subject: [PATCH 0988/2061] sh: clkfwk: Make recalc return an unsigned long. This is prep work for cleaning up some of the rate propagation bits. Trivial conversion. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 3 ++- arch/sh/kernel/cpu/clock.c | 33 ++++++++++++++------------ arch/sh/kernel/cpu/sh2/clock-sh7619.c | 13 +++++----- arch/sh/kernel/cpu/sh2a/clock-sh7201.c | 14 +++++------ arch/sh/kernel/cpu/sh2a/clock-sh7203.c | 12 +++++----- arch/sh/kernel/cpu/sh2a/clock-sh7206.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh3.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh7705.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh7706.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh7709.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh7710.c | 12 +++++----- arch/sh/kernel/cpu/sh3/clock-sh7712.c | 8 +++---- arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 14 +++++------ arch/sh/kernel/cpu/sh4/clock-sh4.c | 12 +++++----- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 31 ++++++++++-------------- arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 16 ++++++------- arch/sh/kernel/cpu/sh4a/clock-sh7770.c | 12 +++++----- arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 16 ++++++------- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 24 +++++++++---------- arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 20 ++++++++-------- arch/sh/kernel/cpu/sh4a/clock-shx3.c | 16 ++++++------- arch/sh/kernel/cpu/sh5/clock-sh5.c | 12 +++++----- arch/sh/kernel/timers/timer-tmu.c | 16 ++++++------- 23 files changed, 170 insertions(+), 174 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index b1f29199e4b..d63352b375c 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -12,7 +12,7 @@ struct clk_ops { void (*init)(struct clk *clk); void (*enable)(struct clk *clk); void (*disable)(struct clk *clk); - void (*recalc)(struct clk *clk); + unsigned long (*recalc)(struct clk *clk); int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id); int (*set_parent)(struct clk *clk, struct clk *parent); long (*round_rate)(struct clk *clk, unsigned long rate); @@ -96,4 +96,5 @@ enum clk_sh_algo_id { IP_N1, }; + #endif /* __ASM_SH_CLOCK_H */ diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 133dbe40334..b022affb44c 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -75,6 +75,7 @@ static struct clk *onchip_clocks[] = { &cpu_clk, }; +/* Propagate rate to children */ static void propagate_rate(struct clk *clk) { struct clk *clkp; @@ -83,7 +84,7 @@ static void propagate_rate(struct clk *clk) if (likely(clkp->parent != clk)) continue; if (likely(clkp->ops && clkp->ops->recalc)) - clkp->ops->recalc(clkp); + clkp->rate = clkp->ops->recalc(clkp); if (unlikely(clkp->flags & CLK_RATE_PROPAGATES)) propagate_rate(clkp); } @@ -240,7 +241,7 @@ void clk_recalc_rate(struct clk *clk) unsigned long flags; spin_lock_irqsave(&clock_lock, flags); - clk->ops->recalc(clk); + clk->rate = clk->ops->recalc(clk); spin_unlock_irqrestore(&clock_lock, flags); } @@ -377,20 +378,22 @@ static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state) switch (state.event) { case PM_EVENT_ON: /* Resumeing from hibernation */ - if (prev_state.event == PM_EVENT_FREEZE) { - list_for_each_entry(clkp, &clock_list, node) - if (likely(clkp->ops)) { - unsigned long rate = clkp->rate; + if (prev_state.event != PM_EVENT_FREEZE) + break; - if (likely(clkp->ops->set_parent)) - clkp->ops->set_parent(clkp, - clkp->parent); - if (likely(clkp->ops->set_rate)) - clkp->ops->set_rate(clkp, - rate, NO_CHANGE); - else if (likely(clkp->ops->recalc)) - clkp->ops->recalc(clkp); - } + list_for_each_entry(clkp, &clock_list, node) { + if (likely(clkp->ops)) { + unsigned long rate = clkp->rate; + + if (likely(clkp->ops->set_parent)) + clkp->ops->set_parent(clkp, + clkp->parent); + if (likely(clkp->ops->set_rate)) + clkp->ops->set_rate(clkp, + rate, NO_CHANGE); + else if (likely(clkp->ops->recalc)) + clkp->rate = clkp->ops->recalc(clkp); + } } break; case PM_EVENT_FREEZE: diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c index d2c15791799..26799139aa7 100644 --- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c @@ -38,28 +38,28 @@ static struct clk_ops sh7619_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7619_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { - clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7]; + return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 7]; } static struct clk_ops sh7619_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { - clk->rate = clk->parent->rate; + return clk->parent->rate; } static struct clk_ops sh7619_cpu_clk_ops = { @@ -78,4 +78,3 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) if (idx < ARRAY_SIZE(sh7619_clk_ops)) *ops = sh7619_clk_ops[idx]; } - diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c index 4a5e5973233..7814c76159a 100644 --- a/arch/sh/kernel/cpu/sh2a/clock-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/clock-sh7201.c @@ -34,37 +34,37 @@ static const int pfc_divisors[]={1,2,3,4,6,8,12}; static void master_clk_init(struct clk *clk) { - clk->rate = 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007]; + return 10000000 * PLL2 * pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007]; } static struct clk_ops sh7201_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7201_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7201_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inw(FREQCR) >> 4) & 0x0007); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7201_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c index fb781329848..f8c6933857b 100644 --- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c @@ -46,29 +46,29 @@ static struct clk_ops sh7203_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7203_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx-2]; + return clk->parent->rate / pfc_divisors[idx-2]; } static struct clk_ops sh7203_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { - clk->rate = clk->parent->rate; + return clk->parent->rate; } static struct clk_ops sh7203_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c index 82d7f991ef6..c2268bdecee 100644 --- a/arch/sh/kernel/cpu/sh2a/clock-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/clock-sh7206.c @@ -41,29 +41,29 @@ static struct clk_ops sh7206_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7206_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { - clk->rate = clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007]; + return clk->parent->rate / pll1rate[(ctrl_inw(FREQCR) >> 8) & 0x0007]; } static struct clk_ops sh7206_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FREQCR) & 0x0007); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7206_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh3.c b/arch/sh/kernel/cpu/sh3/clock-sh3.c index c3c945958ba..27b8738f0b0 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh3.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh3.c @@ -38,36 +38,36 @@ static struct clk_ops sh3_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh3_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4); - clk->rate = clk->parent->rate / stc_multipliers[idx]; + return clk->parent->rate / stc_multipliers[idx]; } static struct clk_ops sh3_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh3_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7705.c b/arch/sh/kernel/cpu/sh3/clock-sh7705.c index dfdbf3277fd..0ca8f2c3646 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh7705.c @@ -39,30 +39,30 @@ static struct clk_ops sh7705_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = ctrl_inw(FRQCR) & 0x0003; - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7705_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0300) >> 8; - clk->rate = clk->parent->rate / stc_multipliers[idx]; + return clk->parent->rate / stc_multipliers[idx]; } static struct clk_ops sh7705_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0030) >> 4; - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7705_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7706.c b/arch/sh/kernel/cpu/sh3/clock-sh7706.c index 0cf96f9833b..4bf7887d310 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh7706.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh7706.c @@ -34,36 +34,36 @@ static struct clk_ops sh7706_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7706_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4); - clk->rate = clk->parent->rate / stc_multipliers[idx]; + return clk->parent->rate / stc_multipliers[idx]; } static struct clk_ops sh7706_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7706_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7709.c b/arch/sh/kernel/cpu/sh3/clock-sh7709.c index b791a29fdb6..fa30b601773 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh7709.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh7709.c @@ -41,12 +41,12 @@ static struct clk_ops sh7709_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x2000) >> 11) | (frqcr & 0x0003); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7709_module_clk_ops = { @@ -56,25 +56,25 @@ static struct clk_ops sh7709_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = (frqcr & 0x0080) ? ((frqcr & 0x8000) >> 13) | ((frqcr & 0x0030) >> 4) : 1; - clk->rate = clk->parent->rate * stc_multipliers[idx]; + return clk->parent->rate * stc_multipliers[idx]; } static struct clk_ops sh7709_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = ((frqcr & 0x4000) >> 12) | ((frqcr & 0x000c) >> 2); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7709_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7710.c b/arch/sh/kernel/cpu/sh3/clock-sh7710.c index 4744c50ec44..030a58ba18a 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh7710.c @@ -33,30 +33,30 @@ static struct clk_ops sh7710_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0007); - clk->rate = clk->parent->rate / md_table[idx]; + return clk->parent->rate / md_table[idx]; } static struct clk_ops sh7710_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0700) >> 8; - clk->rate = clk->parent->rate / md_table[idx]; + return clk->parent->rate / md_table[idx]; } static struct clk_ops sh7710_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0070) >> 4; - clk->rate = clk->parent->rate / md_table[idx]; + return clk->parent->rate / md_table[idx]; } static struct clk_ops sh7710_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh3/clock-sh7712.c b/arch/sh/kernel/cpu/sh3/clock-sh7712.c index 54f54df51ef..6428ee6c77e 100644 --- a/arch/sh/kernel/cpu/sh3/clock-sh7712.c +++ b/arch/sh/kernel/cpu/sh3/clock-sh7712.c @@ -33,24 +33,24 @@ static struct clk_ops sh7712_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = frqcr & 0x0007; - clk->rate = clk->parent->rate / divisors[idx]; + return clk->parent->rate / divisors[idx]; } static struct clk_ops sh7712_module_clk_ops = { .recalc = module_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int frqcr = ctrl_inw(FRQCR); int idx = (frqcr & 0x0030) >> 4; - clk->rate = clk->parent->rate / divisors[idx]; + return clk->parent->rate / divisors[idx]; } static struct clk_ops sh7712_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index a33429463e9..628d50ea6f6 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -21,10 +21,10 @@ static int frqcr3_divisors[] = { 1, 2, 3, 4, 6, 8, 16 }; static int frqcr3_values[] = { 0, 1, 2, 3, 4, 5, 6 }; -static void emi_clk_recalc(struct clk *clk) +static unsigned long emi_clk_recalc(struct clk *clk) { int idx = ctrl_inl(CPG2_FRQCR3) & 0x0007; - clk->rate = clk->parent->rate / frqcr3_divisors[idx]; + return clk->parent->rate / frqcr3_divisors[idx]; } static inline int frqcr3_lookup(struct clk *clk, unsigned long rate) @@ -50,10 +50,10 @@ static struct clk sh4202_emi_clk = { .ops = &sh4202_emi_clk_ops, }; -static void femi_clk_recalc(struct clk *clk) +static unsigned long femi_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(CPG2_FRQCR3) >> 3) & 0x0007; - clk->rate = clk->parent->rate / frqcr3_divisors[idx]; + return clk->parent->rate / frqcr3_divisors[idx]; } static struct clk_ops sh4202_femi_clk_ops = { @@ -90,10 +90,10 @@ static void shoc_clk_init(struct clk *clk) WARN_ON(i == ARRAY_SIZE(frqcr3_divisors)); /* Undefined clock */ } -static void shoc_clk_recalc(struct clk *clk) +static unsigned long shoc_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(CPG2_FRQCR3) >> 6) & 0x0007; - clk->rate = clk->parent->rate / frqcr3_divisors[idx]; + return clk->parent->rate / frqcr3_divisors[idx]; } static int shoc_clk_verify_rate(struct clk *clk, unsigned long rate) @@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id) frqcr3 |= tmp << 6; ctrl_outl(frqcr3, CPG2_FRQCR3); - clk->rate = clk->parent->rate / frqcr3_divisors[tmp]; + return clk->parent->rate / frqcr3_divisors[tmp]; return 0; } diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4.c b/arch/sh/kernel/cpu/sh4/clock-sh4.c index dca9f87a12d..73294d9cd04 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4.c @@ -35,30 +35,30 @@ static struct clk_ops sh4_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) & 0x0007); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh4_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) >> 3) & 0x0007; - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh4_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(FRQCR) >> 6) & 0x0007; - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh4_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 1ccdfc561fe..5b1427f1ed4 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -151,11 +151,11 @@ static int divisors2[] = { 4, 1, 8, 12, 16, 24, 32, 1, 48, 64, 72, 96, 1, 144 }; static int divisors2[] = { 2, 3, 4, 5, 6, 8, 10, 12, 16, 20, 24, 32, 40 }; #endif -static void master_clk_recalc(struct clk *clk) +static unsigned long master_clk_recalc(struct clk *clk) { unsigned frqcr = ctrl_inl(FRQCR); - clk->rate = CONFIG_SH_PCLK_FREQ * STCPLL(frqcr); + return CONFIG_SH_PCLK_FREQ * STCPLL(frqcr); } static void master_clk_init(struct clk *clk) @@ -166,12 +166,11 @@ static void master_clk_init(struct clk *clk) master_clk_recalc(clk); } - -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { unsigned long frqcr = ctrl_inl(FRQCR); - clk->rate = clk->parent->rate / STCPLL(frqcr); + return clk->parent->rate / STCPLL(frqcr); } #if defined(CONFIG_CPU_SUBTYPE_SH7724) @@ -283,14 +282,14 @@ static int sh7722_find_div_index(unsigned long parent_rate, unsigned rate) return index; } -static void sh7722_frqcr_recalc(struct clk *clk) +static unsigned long sh7722_frqcr_recalc(struct clk *clk) { struct frqcr_context ctx = sh7722_get_clk_context(clk->name); unsigned long frqcr = ctrl_inl(FRQCR); int index; index = (frqcr >> ctx.shift) & ctx.mask; - clk->rate = clk->parent->rate * 2 / divisors2[index]; + return clk->parent->rate * 2 / divisors2[index]; } static int sh7722_frqcr_set_rate(struct clk *clk, unsigned long rate, @@ -439,11 +438,8 @@ static struct clk_ops sh7722_frqcr_clk_ops = { /* * clock ops methods for SIU A/B and IrDA clock - * */ - #ifndef CONFIG_CPU_SUBTYPE_SH7343 - static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id) { unsigned long r; @@ -458,12 +454,12 @@ static int sh7722_siu_set_rate(struct clk *clk, unsigned long rate, int algo_id) return 0; } -static void sh7722_siu_recalc(struct clk *clk) +static unsigned long sh7722_siu_recalc(struct clk *clk) { unsigned long r; r = ctrl_inl(clk->arch_flags); - clk->rate = clk->parent->rate * 2 / divisors2[r & 0xF]; + return clk->parent->rate * 2 / divisors2[r & 0xF]; } static int sh7722_siu_start_stop(struct clk *clk, int enable) @@ -525,12 +521,12 @@ static int sh7722_video_set_rate(struct clk *clk, unsigned long rate, return 0; } -static void sh7722_video_recalc(struct clk *clk) +static unsigned long sh7722_video_recalc(struct clk *clk) { unsigned long r; r = ctrl_inl(VCLKCR); - clk->rate = clk->parent->rate / ((r & 0x3F) + 1); + return clk->parent->rate / ((r & 0x3F) + 1); } static struct clk_ops sh7722_video_clk_ops = { @@ -627,7 +623,7 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable) break; default: return -EINVAL; - } + } r = ctrl_inl(reg); @@ -650,10 +646,9 @@ static void sh7722_mstpcr_disable(struct clk *clk) sh7722_mstpcr_start_stop(clk, 0); } -static void sh7722_mstpcr_recalc(struct clk *clk) +static unsigned long sh7722_mstpcr_recalc(struct clk *clk) { - if (clk->parent) - clk->rate = clk->parent->rate; + return clk->parent->rate; } static struct clk_ops sh7722_mstpcr_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index 3177d0d1e06..26630fb190c 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -29,29 +29,29 @@ static struct clk_ops sh7763_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 4) & 0x07); - clk->rate = clk->parent->rate / p0fc_divisors[idx]; + return clk->parent->rate / p0fc_divisors[idx]; } static struct clk_ops sh7763_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 16) & 0x07); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh7763_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { - clk->rate = clk->parent->rate; + return clk->parent->rate; } static struct clk_ops sh7763_cpu_clk_ops = { @@ -71,10 +71,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) *ops = sh7763_clk_ops[idx]; } -static void shyway_clk_recalc(struct clk *clk) +static unsigned long shyway_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 20) & 0x07); - clk->rate = clk->parent->rate / cfc_divisors[idx]; + return clk->parent->rate / cfc_divisors[idx]; } static struct clk_ops sh7763_shyway_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c index 8e236062c72..e0b89676920 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7770.c @@ -28,30 +28,30 @@ static struct clk_ops sh7770_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 28) & 0x000f); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7770_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(FRQCR) & 0x000f); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh7770_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 24) & 0x000f); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7770_cpu_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c index 01f3da619d3..ba8dacc4ba2 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c @@ -29,30 +29,30 @@ static struct clk_ops sh7780_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(FRQCR) & 0x0003); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7780_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 16) & 0x0007); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh7780_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 24) & 0x0001); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7780_cpu_clk_ops = { @@ -72,10 +72,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) *ops = sh7780_clk_ops[idx]; } -static void shyway_clk_recalc(struct clk *clk) +static unsigned long shyway_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> 20) & 0x0007); - clk->rate = clk->parent->rate / cfc_divisors[idx]; + return clk->parent->rate / cfc_divisors[idx]; } static struct clk_ops sh7780_shyway_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index 27fa81bef6a..52691eaeb9b 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -33,30 +33,30 @@ static struct clk_ops sh7785_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(FRQMR1) & 0x000f); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7785_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh7785_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7785_cpu_clk_ops = { @@ -76,10 +76,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) *ops = sh7785_clk_ops[idx]; } -static void shyway_clk_recalc(struct clk *clk) +static unsigned long shyway_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003); - clk->rate = clk->parent->rate / sfc_divisors[idx]; + return clk->parent->rate / sfc_divisors[idx]; } static struct clk_ops sh7785_shyway_clk_ops = { @@ -92,10 +92,10 @@ static struct clk sh7785_shyway_clk = { .ops = &sh7785_shyway_clk_ops, }; -static void ddr_clk_recalc(struct clk *clk) +static unsigned long ddr_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003); - clk->rate = clk->parent->rate / mfc_divisors[idx]; + return clk->parent->rate / mfc_divisors[idx]; } static struct clk_ops sh7785_ddr_clk_ops = { @@ -108,10 +108,10 @@ static struct clk sh7785_ddr_clk = { .ops = &sh7785_ddr_clk_ops, }; -static void ram_clk_recalc(struct clk *clk) +static unsigned long ram_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003); - clk->rate = clk->parent->rate / ufc_divisors[idx]; + return clk->parent->rate / ufc_divisors[idx]; } static struct clk_ops sh7785_ram_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c index f84a9c13447..2e00ff436c6 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c @@ -36,30 +36,30 @@ static struct clk_ops sh7786_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inl(FRQMR1) & 0x000f); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops sh7786_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops sh7786_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops sh7786_cpu_clk_ops = { @@ -79,10 +79,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) *ops = sh7786_clk_ops[idx]; } -static void shyway_clk_recalc(struct clk *clk) +static unsigned long shyway_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003); - clk->rate = clk->parent->rate / sfc_divisors[idx]; + return clk->parent->rate / sfc_divisors[idx]; } static struct clk_ops sh7786_shyway_clk_ops = { @@ -95,10 +95,10 @@ static struct clk sh7786_shyway_clk = { .ops = &sh7786_shyway_clk_ops, }; -static void ddr_clk_recalc(struct clk *clk) +static unsigned long ddr_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003); - clk->rate = clk->parent->rate / mfc_divisors[idx]; + return clk->parent->rate / mfc_divisors[idx]; } static struct clk_ops sh7786_ddr_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c index c630b29e06a..770934e6828 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c @@ -40,30 +40,30 @@ static struct clk_ops shx3_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> PFC_POS) & PFC_MSK); - clk->rate = clk->parent->rate / pfc_divisors[idx]; + return clk->parent->rate / pfc_divisors[idx]; } static struct clk_ops shx3_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> BFC_POS) & BFC_MSK); - clk->rate = clk->parent->rate / bfc_divisors[idx]; + return clk->parent->rate / bfc_divisors[idx]; } static struct clk_ops shx3_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> IFC_POS) & IFC_MSK); - clk->rate = clk->parent->rate / ifc_divisors[idx]; + return clk->parent->rate / ifc_divisors[idx]; } static struct clk_ops shx3_cpu_clk_ops = { @@ -83,10 +83,10 @@ void __init arch_init_clk_ops(struct clk_ops **ops, int idx) *ops = shx3_clk_ops[idx]; } -static void shyway_clk_recalc(struct clk *clk) +static unsigned long shyway_clk_recalc(struct clk *clk) { int idx = ((ctrl_inl(FRQCR) >> CFC_POS) & CFC_MSK); - clk->rate = clk->parent->rate / cfc_divisors[idx]; + return clk->parent->rate / cfc_divisors[idx]; } static struct clk_ops shx3_shyway_clk_ops = { diff --git a/arch/sh/kernel/cpu/sh5/clock-sh5.c b/arch/sh/kernel/cpu/sh5/clock-sh5.c index 5486324880e..7f864ebc51d 100644 --- a/arch/sh/kernel/cpu/sh5/clock-sh5.c +++ b/arch/sh/kernel/cpu/sh5/clock-sh5.c @@ -32,30 +32,30 @@ static struct clk_ops sh5_master_clk_ops = { .init = master_clk_init, }; -static void module_clk_recalc(struct clk *clk) +static unsigned long module_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(cprc_base) >> 12) & 0x0007; - clk->rate = clk->parent->rate / ifc_table[idx]; + return clk->parent->rate / ifc_table[idx]; } static struct clk_ops sh5_module_clk_ops = { .recalc = module_clk_recalc, }; -static void bus_clk_recalc(struct clk *clk) +static unsigned long bus_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(cprc_base) >> 3) & 0x0007; - clk->rate = clk->parent->rate / ifc_table[idx]; + return clk->parent->rate / ifc_table[idx]; } static struct clk_ops sh5_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static void cpu_clk_recalc(struct clk *clk) +static unsigned long cpu_clk_recalc(struct clk *clk) { int idx = (ctrl_inw(cprc_base) & 0x0007); - clk->rate = clk->parent->rate / ifc_table[idx]; + return clk->parent->rate / ifc_table[idx]; } static struct clk_ops sh5_cpu_clk_ops = { diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c index fe8d8930ccb..a693dcdbe13 100644 --- a/arch/sh/kernel/timers/timer-tmu.c +++ b/arch/sh/kernel/timers/timer-tmu.c @@ -172,22 +172,19 @@ static void __init tmu_clk_init(struct clk *clk) clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1)); } -static void tmu_clk_recalc(struct clk *clk) +static unsigned long tmu_clk_recalc(struct clk *clk) { int tmu_num = clk->name[3]-'0'; - unsigned long prev_rate = clk_get_rate(clk); + unsigned long new_rate; unsigned long flags; u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7; - clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1)); - if(prev_rate==clk_get_rate(clk)) - return; - - if(tmu_num) - return; /* No more work on TMU1 */ + new_rate = clk_get_rate(clk->parent) / (4 << (divisor << 1)); + if (clk->rate == new_rate || tmu_num) + return clk->rate; /* No more work on TMU1 */ local_irq_save(flags); - tmus_are_scaled = (prev_rate > clk->rate); + tmus_are_scaled = (clk->rate > new_rate); _tmu_stop(TMU0); @@ -210,6 +207,7 @@ static void tmu_clk_recalc(struct clk *clk) _tmu_start(TMU0); local_irq_restore(flags); + return new_rate; } static struct clk_ops tmu_clk_ops = { From a02cb230bb4fca04f091746c593de720a0e3a94a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 03:50:44 +0900 Subject: [PATCH 0989/2061] sh: clkfwk: Add a followparent_recalc() helper. This adds a followparent_recalc() helper for clocks that just follow the parent's rate. Switch over the few CPUs that use this scheme for some of their clocks. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 2 +- arch/sh/kernel/cpu/clock.c | 6 ++++++ arch/sh/kernel/cpu/sh2/clock-sh7619.c | 7 +------ arch/sh/kernel/cpu/sh2a/clock-sh7203.c | 7 +------ arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 7 +------ arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 7 +------ 6 files changed, 11 insertions(+), 25 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index d63352b375c..241f1c1d9ce 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -44,7 +44,7 @@ int __init arch_clk_init(void); /* arch/sh/kernel/cpu/clock.c */ int clk_init(void); - +unsigned long followparent_recalc(struct clk *clk); void clk_recalc_rate(struct clk *); int clk_register(struct clk *); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index b022affb44c..17f6c078e85 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -90,6 +90,12 @@ static void propagate_rate(struct clk *clk) } } +/* Used for clocks that always have same value as the parent clock */ +unsigned long followparent_recalc(struct clk *clk) +{ + return clk->parent->rate; +} + static void __clk_init(struct clk *clk) { /* diff --git a/arch/sh/kernel/cpu/sh2/clock-sh7619.c b/arch/sh/kernel/cpu/sh2/clock-sh7619.c index 26799139aa7..4fe863170e3 100644 --- a/arch/sh/kernel/cpu/sh2/clock-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/clock-sh7619.c @@ -57,13 +57,8 @@ static struct clk_ops sh7619_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static unsigned long cpu_clk_recalc(struct clk *clk) -{ - return clk->parent->rate; -} - static struct clk_ops sh7619_cpu_clk_ops = { - .recalc = cpu_clk_recalc, + .recalc = followparent_recalc, }; static struct clk_ops *sh7619_clk_ops[] = { diff --git a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c index f8c6933857b..94098696510 100644 --- a/arch/sh/kernel/cpu/sh2a/clock-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/clock-sh7203.c @@ -66,13 +66,8 @@ static struct clk_ops sh7203_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static unsigned long cpu_clk_recalc(struct clk *clk) -{ - return clk->parent->rate; -} - static struct clk_ops sh7203_cpu_clk_ops = { - .recalc = cpu_clk_recalc, + .recalc = followparent_recalc, }; static struct clk_ops *sh7203_clk_ops[] = { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 5b1427f1ed4..4bdae84aa6b 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -646,15 +646,10 @@ static void sh7722_mstpcr_disable(struct clk *clk) sh7722_mstpcr_start_stop(clk, 0); } -static unsigned long sh7722_mstpcr_recalc(struct clk *clk) -{ - return clk->parent->rate; -} - static struct clk_ops sh7722_mstpcr_clk_ops = { .enable = sh7722_mstpcr_enable, .disable = sh7722_mstpcr_disable, - .recalc = sh7722_mstpcr_recalc, + .recalc = followparent_recalc, }; #define MSTPCR(_name, _parent, regnr, bitnr) \ diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index 26630fb190c..db51cffc5d5 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -49,13 +49,8 @@ static struct clk_ops sh7763_bus_clk_ops = { .recalc = bus_clk_recalc, }; -static unsigned long cpu_clk_recalc(struct clk *clk) -{ - return clk->parent->rate; -} - static struct clk_ops sh7763_cpu_clk_ops = { - .recalc = cpu_clk_recalc, + .recalc = followparent_recalc, }; static struct clk_ops *sh7763_clk_ops[] = { From b1f6cfe48c3cb1dfa77db3d2f42f765febaef9bc Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 04:27:43 +0900 Subject: [PATCH 0990/2061] sh: clkfwk: refactor rate propagation. This resyncs the rate propagation strategy with the scheme used by the OMAP clock framework. Child clocks are tracked on a list under each parent and propagation happens there specifically rather than constantly iterating over the global clock list. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 9 +- arch/sh/kernel/cpu/clock.c | 124 ++++++++++++++++--------- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 8 +- 3 files changed, 87 insertions(+), 54 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 241f1c1d9ce..5dc8b73a2bd 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -27,6 +27,9 @@ struct clk { struct clk *parent; struct clk_ops *ops; + struct list_head children; + struct list_head sibling; /* node for children */ + int usecount; unsigned long rate; @@ -35,7 +38,6 @@ struct clk { }; #define CLK_ALWAYS_ENABLED (1 << 0) -#define CLK_RATE_PROPAGATES (1 << 1) #define CLK_NEEDS_INIT (1 << 2) /* Should be defined by processor-specific code */ @@ -44,9 +46,10 @@ int __init arch_clk_init(void); /* arch/sh/kernel/cpu/clock.c */ int clk_init(void); -unsigned long followparent_recalc(struct clk *clk); +unsigned long followparent_recalc(struct clk *); +void recalculate_root_clocks(void); +void propagate_rate(struct clk *); void clk_recalc_rate(struct clk *); - int clk_register(struct clk *); void clk_unregister(struct clk *); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 17f6c078e85..0a06df8cde2 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -1,11 +1,11 @@ /* * arch/sh/kernel/cpu/clock.c - SuperH clock framework * - * Copyright (C) 2005, 2006, 2007 Paul Mundt + * Copyright (C) 2005 - 2009 Paul Mundt * * This clock framework is derived from the OMAP version by: * - * Copyright (C) 2004 - 2005 Nokia Corporation + * Copyright (C) 2004 - 2008 Nokia Corporation * Written by Tuukka Tikkanen * * Modified for omap shared clock framework by Tony Lindgren @@ -43,20 +43,20 @@ static DEFINE_MUTEX(clock_list_sem); */ static struct clk master_clk = { .name = "master_clk", - .flags = CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES, + .flags = CLK_ALWAYS_ENABLED, .rate = CONFIG_SH_PCLK_FREQ, }; static struct clk module_clk = { .name = "module_clk", .parent = &master_clk, - .flags = CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES, + .flags = CLK_ALWAYS_ENABLED, }; static struct clk bus_clk = { .name = "bus_clk", .parent = &master_clk, - .flags = CLK_ALWAYS_ENABLED | CLK_RATE_PROPAGATES, + .flags = CLK_ALWAYS_ENABLED, }; static struct clk cpu_clk = { @@ -75,27 +75,24 @@ static struct clk *onchip_clocks[] = { &cpu_clk, }; -/* Propagate rate to children */ -static void propagate_rate(struct clk *clk) -{ - struct clk *clkp; - - list_for_each_entry(clkp, &clock_list, node) { - if (likely(clkp->parent != clk)) - continue; - if (likely(clkp->ops && clkp->ops->recalc)) - clkp->rate = clkp->ops->recalc(clkp); - if (unlikely(clkp->flags & CLK_RATE_PROPAGATES)) - propagate_rate(clkp); - } -} - /* Used for clocks that always have same value as the parent clock */ unsigned long followparent_recalc(struct clk *clk) { return clk->parent->rate; } +/* Propagate rate to children */ +void propagate_rate(struct clk *tclk) +{ + struct clk *clkp; + + list_for_each_entry(clkp, &tclk->children, sibling) { + if (clkp->ops->recalc) + clkp->rate = clkp->ops->recalc(clkp); + propagate_rate(clkp); + } +} + static void __clk_init(struct clk *clk) { /* @@ -180,10 +177,46 @@ void clk_disable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_disable); +static LIST_HEAD(root_clks); + +/** + * recalculate_root_clocks - recalculate and propagate all root clocks + * + * Recalculates all root clocks (clocks with no parent), which if the + * clock's .recalc is set correctly, should also propagate their rates. + * Called at init. + */ +void recalculate_root_clocks(void) +{ + struct clk *clkp; + + list_for_each_entry(clkp, &root_clks, sibling) { + if (clkp->ops->recalc) + clkp->rate = clkp->ops->recalc(clkp); + propagate_rate(clkp); + } +} + int clk_register(struct clk *clk) { + if (clk == NULL || IS_ERR(clk)) + return -EINVAL; + + /* + * trap out already registered clocks + */ + if (clk->node.next || clk->node.prev) + return 0; + mutex_lock(&clock_list_sem); + INIT_LIST_HEAD(&clk->children); + + if (clk->parent) + list_add(&clk->sibling, &clk->parent->children); + else + list_add(&clk->sibling, &root_clks); + list_add(&clk->node, &clock_list); clk->usecount = 0; clk->flags |= CLK_NEEDS_INIT; @@ -205,6 +238,7 @@ EXPORT_SYMBOL_GPL(clk_register); void clk_unregister(struct clk *clk) { mutex_lock(&clock_list_sem); + list_del(&clk->sibling); list_del(&clk->node); mutex_unlock(&clock_list_sem); } @@ -231,50 +265,53 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id) spin_lock_irqsave(&clock_lock, flags); ret = clk->ops->set_rate(clk, rate, algo_id); + if (ret == 0) { + if (clk->ops->recalc) + clk->rate = clk->ops->recalc(clk); + propagate_rate(clk); + } spin_unlock_irqrestore(&clock_lock, flags); } - if (unlikely(clk->flags & CLK_RATE_PROPAGATES)) - propagate_rate(clk); - return ret; } EXPORT_SYMBOL_GPL(clk_set_rate_ex); void clk_recalc_rate(struct clk *clk) { - if (likely(clk->ops && clk->ops->recalc)) { - unsigned long flags; + unsigned long flags; - spin_lock_irqsave(&clock_lock, flags); - clk->rate = clk->ops->recalc(clk); - spin_unlock_irqrestore(&clock_lock, flags); - } + if (!clk->ops->recalc) + return; - if (unlikely(clk->flags & CLK_RATE_PROPAGATES)) - propagate_rate(clk); + spin_lock_irqsave(&clock_lock, flags); + clk->rate = clk->ops->recalc(clk); + propagate_rate(clk); + spin_unlock_irqrestore(&clock_lock, flags); } EXPORT_SYMBOL_GPL(clk_recalc_rate); int clk_set_parent(struct clk *clk, struct clk *parent) { + unsigned long flags; int ret = -EINVAL; - struct clk *old; if (!parent || !clk) return ret; - old = clk->parent; - if (likely(clk->ops && clk->ops->set_parent)) { - unsigned long flags; - spin_lock_irqsave(&clock_lock, flags); - ret = clk->ops->set_parent(clk, parent); - spin_unlock_irqrestore(&clock_lock, flags); - clk->parent = (ret ? old : parent); - } + spin_lock_irqsave(&clock_lock, flags); + if (clk->usecount == 0) { + if (clk->ops->set_parent) + ret = clk->ops->set_parent(clk, parent); + if (ret == 0) { + if (clk->ops->recalc) + clk->rate = clk->ops->recalc(clk); + propagate_rate(clk); + } + } else + ret = -EBUSY; + spin_unlock_irqrestore(&clock_lock, flags); - if (unlikely(clk->flags & CLK_RATE_PROPAGATES)) - propagate_rate(clk); return ret; } EXPORT_SYMBOL_GPL(clk_set_parent); @@ -457,8 +494,7 @@ int __init clk_init(void) ret |= arch_clk_init(); /* Kick the child clocks.. */ - propagate_rate(&master_clk); - propagate_rate(&bus_clk); + recalculate_root_clocks(); return ret; } diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 4bdae84aa6b..8e53829ca07 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -161,9 +161,7 @@ static unsigned long master_clk_recalc(struct clk *clk) static void master_clk_init(struct clk *clk) { clk->parent = NULL; - clk->flags |= CLK_RATE_PROPAGATES; - clk->rate = CONFIG_SH_PCLK_FREQ; - master_clk_recalc(clk); + clk->rate = master_clk_recalc(clk); } static unsigned long module_clk_recalc(struct clk *clk) @@ -541,19 +539,16 @@ static struct clk_ops sh7722_video_clk_ops = { static struct clk sh7722_umem_clock = { .name = "umem_clk", .ops = &sh7722_frqcr_clk_ops, - .flags = CLK_RATE_PROPAGATES, }; static struct clk sh7722_sh_clock = { .name = "sh_clk", .ops = &sh7722_frqcr_clk_ops, - .flags = CLK_RATE_PROPAGATES, }; static struct clk sh7722_peripheral_clock = { .name = "peripheral_clk", .ops = &sh7722_frqcr_clk_ops, - .flags = CLK_RATE_PROPAGATES, }; static struct clk sh7722_sdram_clock = { @@ -564,7 +559,6 @@ static struct clk sh7722_sdram_clock = { static struct clk sh7722_r_clock = { .name = "r_clk", .rate = 32768, - .flags = CLK_RATE_PROPAGATES, }; #if !defined(CONFIG_CPU_SUBTYPE_SH7343) &&\ From 4ff29ff8e8723a41e7defd8bc78a7b16cbf940a2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 05:14:53 +0900 Subject: [PATCH 0991/2061] sh: clkfwk: Consolidate the ALWAYS_ENABLED / NEEDS_INIT mess. There is no real distinction here in behaviour, either a clock needs to be enabled on initialiation or not. The ALWAYS_ENABLED flag was always intended to only apply to clocks that were physically always on and could simply not be disabled at all from software. Unfortunately over time this was abused and the meaning became a bit blurry. So, we kill off both of all of those paths now, as well as the newer NEEDS_INIT flag, and consolidate on a CLK_ENABLE_ON_INIT. Clocks that need to be enabled on initialization can set this, and it will purposely enable them and bump the refcount up. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 3 +- arch/sh/kernel/cpu/clock.c | 94 +++++++++----------------- arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 6 +- arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 2 +- arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 2 +- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 6 +- arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 4 +- arch/sh/kernel/cpu/sh4a/clock-shx3.c | 2 +- 8 files changed, 44 insertions(+), 75 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 5dc8b73a2bd..246f9ebbed2 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -37,8 +37,7 @@ struct clk { unsigned long arch_flags; }; -#define CLK_ALWAYS_ENABLED (1 << 0) -#define CLK_NEEDS_INIT (1 << 2) +#define CLK_ENABLE_ON_INIT (1 << 0) /* Should be defined by processor-specific code */ void arch_init_clk_ops(struct clk_ops **, int type); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 0a06df8cde2..c683be5ba8b 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -43,26 +43,26 @@ static DEFINE_MUTEX(clock_list_sem); */ static struct clk master_clk = { .name = "master_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .rate = CONFIG_SH_PCLK_FREQ, }; static struct clk module_clk = { .name = "module_clk", .parent = &master_clk, - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, }; static struct clk bus_clk = { .name = "bus_clk", .parent = &master_clk, - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, }; static struct clk cpu_clk = { .name = "cpu_clk", .parent = &master_clk, - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, }; /* @@ -93,39 +93,11 @@ void propagate_rate(struct clk *tclk) } } -static void __clk_init(struct clk *clk) -{ - /* - * See if this is the first time we're enabling the clock, some - * clocks that are always enabled still require "special" - * initialization. This is especially true if the clock mode - * changes and the clock needs to hunt for the proper set of - * divisors to use before it can effectively recalc. - */ - - if (clk->flags & CLK_NEEDS_INIT) { - if (clk->ops && clk->ops->init) - clk->ops->init(clk); - - clk->flags &= ~CLK_NEEDS_INIT; - } -} - static int __clk_enable(struct clk *clk) { - if (!clk) - return -EINVAL; - - clk->usecount++; - - /* nothing to do if always enabled */ - if (clk->flags & CLK_ALWAYS_ENABLED) - return 0; - - if (clk->usecount == 1) { - __clk_init(clk); - - __clk_enable(clk->parent); + if (clk->usecount++ == 0) { + if (clk->parent) + __clk_enable(clk->parent); if (clk->ops && clk->ops->enable) clk->ops->enable(clk); @@ -139,6 +111,9 @@ int clk_enable(struct clk *clk) unsigned long flags; int ret; + if (!clk) + return -EINVAL; + spin_lock_irqsave(&clock_lock, flags); ret = __clk_enable(clk); spin_unlock_irqrestore(&clock_lock, flags); @@ -149,21 +124,11 @@ EXPORT_SYMBOL_GPL(clk_enable); static void __clk_disable(struct clk *clk) { - if (!clk) - return; - - clk->usecount--; - - WARN_ON(clk->usecount < 0); - - if (clk->flags & CLK_ALWAYS_ENABLED) - return; - - if (clk->usecount == 0) { + if (clk->usecount > 0 && !(--clk->usecount)) { if (likely(clk->ops && clk->ops->disable)) clk->ops->disable(clk); - - __clk_disable(clk->parent); + if (likely(clk->parent)) + __clk_disable(clk->parent); } } @@ -171,6 +136,9 @@ void clk_disable(struct clk *clk) { unsigned long flags; + if (!clk) + return; + spin_lock_irqsave(&clock_lock, flags); __clk_disable(clk); spin_unlock_irqrestore(&clock_lock, flags); @@ -211,6 +179,7 @@ int clk_register(struct clk *clk) mutex_lock(&clock_list_sem); INIT_LIST_HEAD(&clk->children); + clk->usecount = 0; if (clk->parent) list_add(&clk->sibling, &clk->parent->children); @@ -218,19 +187,10 @@ int clk_register(struct clk *clk) list_add(&clk->sibling, &root_clks); list_add(&clk->node, &clock_list); - clk->usecount = 0; - clk->flags |= CLK_NEEDS_INIT; - + if (clk->ops->init) + clk->ops->init(clk); mutex_unlock(&clock_list_sem); - if (clk->flags & CLK_ALWAYS_ENABLED) { - __clk_init(clk); - pr_debug( "Clock '%s' is ALWAYS_ENABLED\n", clk->name); - if (clk->ops && clk->ops->enable) - clk->ops->enable(clk); - pr_debug( "Enabled."); - } - return 0; } EXPORT_SYMBOL_GPL(clk_register); @@ -244,6 +204,15 @@ void clk_unregister(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_unregister); +static void clk_enable_init_clocks(void) +{ + struct clk *clkp; + + list_for_each_entry(clkp, &clock_list, node) + if (clkp->flags & CLK_ENABLE_ON_INIT) + clk_enable(clkp); +} + unsigned long clk_get_rate(struct clk *clk) { return clk->rate; @@ -404,9 +373,7 @@ static int show_clocks(char *buf, char **start, off_t off, p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name, rate / 1000000, (rate % 1000000) / 10000, - ((clk->flags & CLK_ALWAYS_ENABLED) || - clk->usecount > 0) ? - "enabled" : "disabled"); + (clk->usecount > 0) ? "enabled" : "disabled"); } return p - buf; @@ -496,6 +463,9 @@ int __init clk_init(void) /* Kick the child clocks.. */ recalculate_root_clocks(); + /* Enable the necessary init clocks */ + clk_enable_init_clocks(); + return ret; } diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index 628d50ea6f6..0caca9f99fe 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -46,7 +46,7 @@ static struct clk_ops sh4202_emi_clk_ops = { static struct clk sh4202_emi_clk = { .name = "emi_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh4202_emi_clk_ops, }; @@ -62,7 +62,7 @@ static struct clk_ops sh4202_femi_clk_ops = { static struct clk sh4202_femi_clk = { .name = "femi_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh4202_femi_clk_ops, }; @@ -140,7 +140,7 @@ static struct clk_ops sh4202_shoc_clk_ops = { static struct clk sh4202_shoc_clk = { .name = "shoc_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh4202_shoc_clk_ops, }; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index db51cffc5d5..21bd70f9ee4 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -78,7 +78,7 @@ static struct clk_ops sh7763_shyway_clk_ops = { static struct clk sh7763_shyway_clk = { .name = "shyway_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7763_shyway_clk_ops, }; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c index ba8dacc4ba2..4c11f8917e4 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c @@ -84,7 +84,7 @@ static struct clk_ops sh7780_shyway_clk_ops = { static struct clk sh7780_shyway_clk = { .name = "shyway_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7780_shyway_clk_ops, }; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index 52691eaeb9b..edd432894bd 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -88,7 +88,7 @@ static struct clk_ops sh7785_shyway_clk_ops = { static struct clk sh7785_shyway_clk = { .name = "shyway_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7785_shyway_clk_ops, }; @@ -104,7 +104,7 @@ static struct clk_ops sh7785_ddr_clk_ops = { static struct clk sh7785_ddr_clk = { .name = "ddr_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7785_ddr_clk_ops, }; @@ -120,7 +120,7 @@ static struct clk_ops sh7785_ram_clk_ops = { static struct clk sh7785_ram_clk = { .name = "ram_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7785_ram_clk_ops, }; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c index 2e00ff436c6..2825494f85d 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c @@ -91,7 +91,7 @@ static struct clk_ops sh7786_shyway_clk_ops = { static struct clk sh7786_shyway_clk = { .name = "shyway_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7786_shyway_clk_ops, }; @@ -107,7 +107,7 @@ static struct clk_ops sh7786_ddr_clk_ops = { static struct clk sh7786_ddr_clk = { .name = "ddr_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &sh7786_ddr_clk_ops, }; diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c index 770934e6828..6e5c864cf40 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c @@ -95,7 +95,7 @@ static struct clk_ops shx3_shyway_clk_ops = { static struct clk shx3_shyway_clk = { .name = "shyway_clk", - .flags = CLK_ALWAYS_ENABLED, + .flags = CLK_ENABLE_ON_INIT, .ops = &shx3_shyway_clk_ops, }; From 154502e160e02dee7b00ec2149762ae5d48e0bb4 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 05:18:13 +0900 Subject: [PATCH 0992/2061] sh: clkfwk: Convert SH-Mobile CPUs to use CLK_ENABLE_ON_INIT. Kill off all of the clk_always_enabled leftovers and use the new flag directly. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 16 -- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 365 +++++++++++++------------ arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 6 - arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 6 - arch/sh/kernel/cpu/sh4a/setup-sh7722.c | 6 - arch/sh/kernel/cpu/sh4a/setup-sh7723.c | 5 - arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 4 - 7 files changed, 183 insertions(+), 225 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 246f9ebbed2..e9fced9bd8f 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -52,22 +52,6 @@ void clk_recalc_rate(struct clk *); int clk_register(struct clk *); void clk_unregister(struct clk *); -static inline int clk_always_enable(const char *id) -{ - struct clk *clk; - int ret; - - clk = clk_get(NULL, id); - if (IS_ERR(clk)) - return PTR_ERR(clk); - - ret = clk_enable(clk); - if (ret) - clk_put(clk); - - return ret; -} - /* the exported API, in addition to clk_set_rate */ /** * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 8e53829ca07..f777d00d4af 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -646,207 +646,208 @@ static struct clk_ops sh7722_mstpcr_clk_ops = { .recalc = followparent_recalc, }; -#define MSTPCR(_name, _parent, regnr, bitnr) \ +#define MSTPCR(_name, _parent, regnr, bitnr, _flags) \ { \ .name = _name, \ + .flags = _flags, \ .arch_flags = MSTPCR_ARCH_FLAGS(regnr, bitnr), \ .ops = (void *)_parent, \ } static struct clk sh7722_mstpcr_clocks[] = { #if defined(CONFIG_CPU_SUBTYPE_SH7722) - MSTPCR("uram0", "umem_clk", 0, 28), - MSTPCR("xymem0", "bus_clk", 0, 26), - MSTPCR("tmu0", "peripheral_clk", 0, 15), - MSTPCR("cmt0", "r_clk", 0, 14), - MSTPCR("rwdt0", "r_clk", 0, 13), - MSTPCR("flctl0", "peripheral_clk", 0, 10), - MSTPCR("scif0", "peripheral_clk", 0, 7), - MSTPCR("scif1", "peripheral_clk", 0, 6), - MSTPCR("scif2", "peripheral_clk", 0, 5), - MSTPCR("i2c0", "peripheral_clk", 1, 9), - MSTPCR("rtc0", "r_clk", 1, 8), - MSTPCR("sdhi0", "peripheral_clk", 2, 18), - MSTPCR("keysc0", "r_clk", 2, 14), - MSTPCR("usbf0", "peripheral_clk", 2, 11), - MSTPCR("2dg0", "bus_clk", 2, 9), - MSTPCR("siu0", "bus_clk", 2, 8), - MSTPCR("vou0", "bus_clk", 2, 5), - MSTPCR("jpu0", "bus_clk", 2, 6), - MSTPCR("beu0", "bus_clk", 2, 4), - MSTPCR("ceu0", "bus_clk", 2, 3), - MSTPCR("veu0", "bus_clk", 2, 2), - MSTPCR("vpu0", "bus_clk", 2, 1), - MSTPCR("lcdc0", "bus_clk", 2, 0), + MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT), + MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT), + MSTPCR("tmu0", "peripheral_clk", 0, 15, 0), + MSTPCR("cmt0", "r_clk", 0, 14, 0), + MSTPCR("rwdt0", "r_clk", 0, 13, 0), + MSTPCR("flctl0", "peripheral_clk", 0, 10, 0), + MSTPCR("scif0", "peripheral_clk", 0, 7, 0), + MSTPCR("scif1", "peripheral_clk", 0, 6, 0), + MSTPCR("scif2", "peripheral_clk", 0, 5, 0), + MSTPCR("i2c0", "peripheral_clk", 1, 9, 0), + MSTPCR("rtc0", "r_clk", 1, 8, 0), + MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0), + MSTPCR("keysc0", "r_clk", 2, 14, 0), + MSTPCR("usbf0", "peripheral_clk", 2, 11, 0), + MSTPCR("2dg0", "bus_clk", 2, 9, 0), + MSTPCR("siu0", "bus_clk", 2, 8, 0), + MSTPCR("vou0", "bus_clk", 2, 5, 0), + MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT), + MSTPCR("beu0", "bus_clk", 2, 4, 0), + MSTPCR("ceu0", "bus_clk", 2, 3, 0), + MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT), + MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT), + MSTPCR("lcdc0", "bus_clk", 2, 0, 0), #endif #if defined(CONFIG_CPU_SUBTYPE_SH7723) /* See page 60 of Datasheet V1.0: Overview -> Block Diagram */ - MSTPCR("tlb0", "cpu_clk", 0, 31), - MSTPCR("ic0", "cpu_clk", 0, 30), - MSTPCR("oc0", "cpu_clk", 0, 29), - MSTPCR("l2c0", "sh_clk", 0, 28), - MSTPCR("ilmem0", "cpu_clk", 0, 27), - MSTPCR("fpu0", "cpu_clk", 0, 24), - MSTPCR("intc0", "cpu_clk", 0, 22), - MSTPCR("dmac0", "bus_clk", 0, 21), - MSTPCR("sh0", "sh_clk", 0, 20), - MSTPCR("hudi0", "peripheral_clk", 0, 19), - MSTPCR("ubc0", "cpu_clk", 0, 17), - MSTPCR("tmu0", "peripheral_clk", 0, 15), - MSTPCR("cmt0", "r_clk", 0, 14), - MSTPCR("rwdt0", "r_clk", 0, 13), - MSTPCR("dmac1", "bus_clk", 0, 12), - MSTPCR("tmu1", "peripheral_clk", 0, 11), - MSTPCR("flctl0", "peripheral_clk", 0, 10), - MSTPCR("scif0", "peripheral_clk", 0, 9), - MSTPCR("scif1", "peripheral_clk", 0, 8), - MSTPCR("scif2", "peripheral_clk", 0, 7), - MSTPCR("scif3", "bus_clk", 0, 6), - MSTPCR("scif4", "bus_clk", 0, 5), - MSTPCR("scif5", "bus_clk", 0, 4), - MSTPCR("msiof0", "bus_clk", 0, 2), - MSTPCR("msiof1", "bus_clk", 0, 1), - MSTPCR("meram0", "sh_clk", 0, 0), - MSTPCR("i2c0", "peripheral_clk", 1, 9), - MSTPCR("rtc0", "r_clk", 1, 8), - MSTPCR("atapi0", "sh_clk", 2, 28), - MSTPCR("adc0", "peripheral_clk", 2, 28), - MSTPCR("tpu0", "bus_clk", 2, 25), - MSTPCR("irda0", "peripheral_clk", 2, 24), - MSTPCR("tsif0", "bus_clk", 2, 22), - MSTPCR("icb0", "bus_clk", 2, 21), - MSTPCR("sdhi0", "bus_clk", 2, 18), - MSTPCR("sdhi1", "bus_clk", 2, 17), - MSTPCR("keysc0", "r_clk", 2, 14), - MSTPCR("usb0", "bus_clk", 2, 11), - MSTPCR("2dg0", "bus_clk", 2, 10), - MSTPCR("siu0", "bus_clk", 2, 8), - MSTPCR("veu1", "bus_clk", 2, 6), - MSTPCR("vou0", "bus_clk", 2, 5), - MSTPCR("beu0", "bus_clk", 2, 4), - MSTPCR("ceu0", "bus_clk", 2, 3), - MSTPCR("veu0", "bus_clk", 2, 2), - MSTPCR("vpu0", "bus_clk", 2, 1), - MSTPCR("lcdc0", "bus_clk", 2, 0), + MSTPCR("tlb0", "cpu_clk", 0, 31, 0), + MSTPCR("ic0", "cpu_clk", 0, 30, 0), + MSTPCR("oc0", "cpu_clk", 0, 29, 0), + MSTPCR("l2c0", "sh_clk", 0, 28, 0), + MSTPCR("ilmem0", "cpu_clk", 0, 27, 0), + MSTPCR("fpu0", "cpu_clk", 0, 24, 0), + MSTPCR("intc0", "cpu_clk", 0, 22, 0), + MSTPCR("dmac0", "bus_clk", 0, 21, 0), + MSTPCR("sh0", "sh_clk", 0, 20, 0), + MSTPCR("hudi0", "peripheral_clk", 0, 19, 0), + MSTPCR("ubc0", "cpu_clk", 0, 17, 0), + MSTPCR("tmu0", "peripheral_clk", 0, 15, 0), + MSTPCR("cmt0", "r_clk", 0, 14, 0), + MSTPCR("rwdt0", "r_clk", 0, 13, 0), + MSTPCR("dmac1", "bus_clk", 0, 12, 0), + MSTPCR("tmu1", "peripheral_clk", 0, 11, 0), + MSTPCR("flctl0", "peripheral_clk", 0, 10, 0), + MSTPCR("scif0", "peripheral_clk", 0, 9, 0), + MSTPCR("scif1", "peripheral_clk", 0, 8, 0), + MSTPCR("scif2", "peripheral_clk", 0, 7, 0), + MSTPCR("scif3", "bus_clk", 0, 6, 0), + MSTPCR("scif4", "bus_clk", 0, 5, 0), + MSTPCR("scif5", "bus_clk", 0, 4, 0), + MSTPCR("msiof0", "bus_clk", 0, 2, 0), + MSTPCR("msiof1", "bus_clk", 0, 1, 0), + MSTPCR("meram0", "sh_clk", 0, 0, CLK_ENABLE_ON_INIT), + MSTPCR("i2c0", "peripheral_clk", 1, 9, 0), + MSTPCR("rtc0", "r_clk", 1, 8, 0), + MSTPCR("atapi0", "sh_clk", 2, 28, 0), + MSTPCR("adc0", "peripheral_clk", 2, 28, 0), + MSTPCR("tpu0", "bus_clk", 2, 25, 0), + MSTPCR("irda0", "peripheral_clk", 2, 24, 0), + MSTPCR("tsif0", "bus_clk", 2, 22, 0), + MSTPCR("icb0", "bus_clk", 2, 21, 0), + MSTPCR("sdhi0", "bus_clk", 2, 18, 0), + MSTPCR("sdhi1", "bus_clk", 2, 17, 0), + MSTPCR("keysc0", "r_clk", 2, 14, 0), + MSTPCR("usb0", "bus_clk", 2, 11, 0), + MSTPCR("2dg0", "bus_clk", 2, 10, 0), + MSTPCR("siu0", "bus_clk", 2, 8, 0), + MSTPCR("veu1", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT), + MSTPCR("vou0", "bus_clk", 2, 5, 0), + MSTPCR("beu0", "bus_clk", 2, 4, 0), + MSTPCR("ceu0", "bus_clk", 2, 3, 0), + MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT), + MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT), + MSTPCR("lcdc0", "bus_clk", 2, 0, 0), #endif #if defined(CONFIG_CPU_SUBTYPE_SH7724) /* See Datasheet : Overview -> Block Diagram */ - MSTPCR("tlb0", "cpu_clk", 0, 31), - MSTPCR("ic0", "cpu_clk", 0, 30), - MSTPCR("oc0", "cpu_clk", 0, 29), - MSTPCR("rs0", "bus_clk", 0, 28), - MSTPCR("ilmem0", "cpu_clk", 0, 27), - MSTPCR("l2c0", "sh_clk", 0, 26), - MSTPCR("fpu0", "cpu_clk", 0, 24), - MSTPCR("intc0", "peripheral_clk", 0, 22), - MSTPCR("dmac0", "bus_clk", 0, 21), - MSTPCR("sh0", "sh_clk", 0, 20), - MSTPCR("hudi0", "peripheral_clk", 0, 19), - MSTPCR("ubc0", "cpu_clk", 0, 17), - MSTPCR("tmu0", "peripheral_clk", 0, 15), - MSTPCR("cmt0", "r_clk", 0, 14), - MSTPCR("rwdt0", "r_clk", 0, 13), - MSTPCR("dmac1", "bus_clk", 0, 12), - MSTPCR("tmu1", "peripheral_clk", 0, 10), - MSTPCR("scif0", "peripheral_clk", 0, 9), - MSTPCR("scif1", "peripheral_clk", 0, 8), - MSTPCR("scif2", "peripheral_clk", 0, 7), - MSTPCR("scif3", "bus_clk", 0, 6), - MSTPCR("scif4", "bus_clk", 0, 5), - MSTPCR("scif5", "bus_clk", 0, 4), - MSTPCR("msiof0", "bus_clk", 0, 2), - MSTPCR("msiof1", "bus_clk", 0, 1), - MSTPCR("keysc0", "r_clk", 1, 12), - MSTPCR("rtc0", "r_clk", 1, 11), - MSTPCR("i2c0", "peripheral_clk", 1, 9), - MSTPCR("i2c1", "peripheral_clk", 1, 8), - MSTPCR("mmc0", "bus_clk", 2, 29), - MSTPCR("eth0", "bus_clk", 2, 28), - MSTPCR("atapi0", "bus_clk", 2, 26), - MSTPCR("tpu0", "bus_clk", 2, 25), - MSTPCR("irda0", "peripheral_clk", 2, 24), - MSTPCR("tsif0", "bus_clk", 2, 22), - MSTPCR("usb1", "bus_clk", 2, 21), - MSTPCR("usb0", "bus_clk", 2, 20), - MSTPCR("2dg0", "bus_clk", 2, 19), - MSTPCR("sdhi0", "bus_clk", 2, 18), - MSTPCR("sdhi1", "bus_clk", 2, 17), - MSTPCR("veu1", "bus_clk", 2, 15), - MSTPCR("ceu1", "bus_clk", 2, 13), - MSTPCR("beu1", "bus_clk", 2, 12), - MSTPCR("2ddmac0", "sh_clk", 2, 10), - MSTPCR("spu0", "bus_clk", 2, 9), - MSTPCR("jpu0", "bus_clk", 2, 6), - MSTPCR("vou0", "bus_clk", 2, 5), - MSTPCR("beu0", "bus_clk", 2, 4), - MSTPCR("ceu0", "bus_clk", 2, 3), - MSTPCR("veu0", "bus_clk", 2, 2), - MSTPCR("vpu0", "bus_clk", 2, 1), - MSTPCR("lcdc0", "bus_clk", 2, 0), + MSTPCR("tlb0", "cpu_clk", 0, 31, 0), + MSTPCR("ic0", "cpu_clk", 0, 30, 0), + MSTPCR("oc0", "cpu_clk", 0, 29, 0), + MSTPCR("rs0", "bus_clk", 0, 28, 0), + MSTPCR("ilmem0", "cpu_clk", 0, 27, 0), + MSTPCR("l2c0", "sh_clk", 0, 26, 0), + MSTPCR("fpu0", "cpu_clk", 0, 24, 0), + MSTPCR("intc0", "peripheral_clk", 0, 22, 0), + MSTPCR("dmac0", "bus_clk", 0, 21, 0), + MSTPCR("sh0", "sh_clk", 0, 20, 0), + MSTPCR("hudi0", "peripheral_clk", 0, 19, 0), + MSTPCR("ubc0", "cpu_clk", 0, 17, 0), + MSTPCR("tmu0", "peripheral_clk", 0, 15, 0), + MSTPCR("cmt0", "r_clk", 0, 14, 0), + MSTPCR("rwdt0", "r_clk", 0, 13, 0), + MSTPCR("dmac1", "bus_clk", 0, 12, 0), + MSTPCR("tmu1", "peripheral_clk", 0, 10, 0), + MSTPCR("scif0", "peripheral_clk", 0, 9, 0), + MSTPCR("scif1", "peripheral_clk", 0, 8, 0), + MSTPCR("scif2", "peripheral_clk", 0, 7, 0), + MSTPCR("scif3", "bus_clk", 0, 6, 0), + MSTPCR("scif4", "bus_clk", 0, 5, 0), + MSTPCR("scif5", "bus_clk", 0, 4, 0), + MSTPCR("msiof0", "bus_clk", 0, 2, 0), + MSTPCR("msiof1", "bus_clk", 0, 1, 0), + MSTPCR("keysc0", "r_clk", 1, 12, 0), + MSTPCR("rtc0", "r_clk", 1, 11, 0), + MSTPCR("i2c0", "peripheral_clk", 1, 9, 0), + MSTPCR("i2c1", "peripheral_clk", 1, 8, 0), + MSTPCR("mmc0", "bus_clk", 2, 29, 0), + MSTPCR("eth0", "bus_clk", 2, 28, 0), + MSTPCR("atapi0", "bus_clk", 2, 26, 0), + MSTPCR("tpu0", "bus_clk", 2, 25, 0), + MSTPCR("irda0", "peripheral_clk", 2, 24, 0), + MSTPCR("tsif0", "bus_clk", 2, 22, 0), + MSTPCR("usb1", "bus_clk", 2, 21, 0), + MSTPCR("usb0", "bus_clk", 2, 20, 0), + MSTPCR("2dg0", "bus_clk", 2, 19, 0), + MSTPCR("sdhi0", "bus_clk", 2, 18, 0), + MSTPCR("sdhi1", "bus_clk", 2, 17, 0), + MSTPCR("veu1", "bus_clk", 2, 15, CLK_ENABLE_ON_INIT), + MSTPCR("ceu1", "bus_clk", 2, 13, 0), + MSTPCR("beu1", "bus_clk", 2, 12, 0), + MSTPCR("2ddmac0", "sh_clk", 2, 10, 0), + MSTPCR("spu0", "bus_clk", 2, 9, 0), + MSTPCR("jpu0", "bus_clk", 2, 6, 0), + MSTPCR("vou0", "bus_clk", 2, 5, 0), + MSTPCR("beu0", "bus_clk", 2, 4, 0), + MSTPCR("ceu0", "bus_clk", 2, 3, 0), + MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT), + MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT), + MSTPCR("lcdc0", "bus_clk", 2, 0, 0), #endif #if defined(CONFIG_CPU_SUBTYPE_SH7343) - MSTPCR("uram0", "umem_clk", 0, 28), - MSTPCR("xymem0", "bus_clk", 0, 26), - MSTPCR("tmu0", "peripheral_clk", 0, 15), - MSTPCR("cmt0", "r_clk", 0, 14), - MSTPCR("rwdt0", "r_clk", 0, 13), - MSTPCR("scif0", "peripheral_clk", 0, 7), - MSTPCR("scif1", "peripheral_clk", 0, 6), - MSTPCR("scif2", "peripheral_clk", 0, 5), - MSTPCR("scif3", "peripheral_clk", 0, 4), - MSTPCR("i2c0", "peripheral_clk", 1, 9), - MSTPCR("i2c1", "peripheral_clk", 1, 8), - MSTPCR("sdhi0", "peripheral_clk", 2, 18), - MSTPCR("keysc0", "r_clk", 2, 14), - MSTPCR("usbf0", "peripheral_clk", 2, 11), - MSTPCR("siu0", "bus_clk", 2, 8), - MSTPCR("jpu0", "bus_clk", 2, 6), - MSTPCR("vou0", "bus_clk", 2, 5), - MSTPCR("beu0", "bus_clk", 2, 4), - MSTPCR("ceu0", "bus_clk", 2, 3), - MSTPCR("veu0", "bus_clk", 2, 2), - MSTPCR("vpu0", "bus_clk", 2, 1), - MSTPCR("lcdc0", "bus_clk", 2, 0), + MSTPCR("uram0", "umem_clk", 0, 28, CLK_ENABLE_ON_INIT), + MSTPCR("xymem0", "bus_clk", 0, 26, CLK_ENABLE_ON_INIT), + MSTPCR("tmu0", "peripheral_clk", 0, 15, 0), + MSTPCR("cmt0", "r_clk", 0, 14, 0), + MSTPCR("rwdt0", "r_clk", 0, 13, 0), + MSTPCR("scif0", "peripheral_clk", 0, 7, 0), + MSTPCR("scif1", "peripheral_clk", 0, 6, 0), + MSTPCR("scif2", "peripheral_clk", 0, 5, 0), + MSTPCR("scif3", "peripheral_clk", 0, 4, 0), + MSTPCR("i2c0", "peripheral_clk", 1, 9, 0), + MSTPCR("i2c1", "peripheral_clk", 1, 8, 0), + MSTPCR("sdhi0", "peripheral_clk", 2, 18, 0), + MSTPCR("keysc0", "r_clk", 2, 14, 0), + MSTPCR("usbf0", "peripheral_clk", 2, 11, 0), + MSTPCR("siu0", "bus_clk", 2, 8, 0), + MSTPCR("jpu0", "bus_clk", 2, 6, CLK_ENABLE_ON_INIT), + MSTPCR("vou0", "bus_clk", 2, 5, 0), + MSTPCR("beu0", "bus_clk", 2, 4, 0), + MSTPCR("ceu0", "bus_clk", 2, 3, 0), + MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT), + MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT), + MSTPCR("lcdc0", "bus_clk", 2, 0, 0), #endif #if defined(CONFIG_CPU_SUBTYPE_SH7366) /* See page 52 of Datasheet V0.40: Overview -> Block Diagram */ - MSTPCR("tlb0", "cpu_clk", 0, 31), - MSTPCR("ic0", "cpu_clk", 0, 30), - MSTPCR("oc0", "cpu_clk", 0, 29), - MSTPCR("rsmem0", "sh_clk", 0, 28), - MSTPCR("xymem0", "cpu_clk", 0, 26), - MSTPCR("intc30", "peripheral_clk", 0, 23), - MSTPCR("intc0", "peripheral_clk", 0, 22), - MSTPCR("dmac0", "bus_clk", 0, 21), - MSTPCR("sh0", "sh_clk", 0, 20), - MSTPCR("hudi0", "peripheral_clk", 0, 19), - MSTPCR("ubc0", "cpu_clk", 0, 17), - MSTPCR("tmu0", "peripheral_clk", 0, 15), - MSTPCR("cmt0", "r_clk", 0, 14), - MSTPCR("rwdt0", "r_clk", 0, 13), - MSTPCR("flctl0", "peripheral_clk", 0, 10), - MSTPCR("scif0", "peripheral_clk", 0, 7), - MSTPCR("scif1", "bus_clk", 0, 6), - MSTPCR("scif2", "bus_clk", 0, 5), - MSTPCR("msiof0", "peripheral_clk", 0, 2), - MSTPCR("sbr0", "peripheral_clk", 0, 1), - MSTPCR("i2c0", "peripheral_clk", 1, 9), - MSTPCR("icb0", "bus_clk", 2, 27), - MSTPCR("meram0", "sh_clk", 2, 26), - MSTPCR("dacc0", "peripheral_clk", 2, 24), - MSTPCR("dacy0", "peripheral_clk", 2, 23), - MSTPCR("tsif0", "bus_clk", 2, 22), - MSTPCR("sdhi0", "bus_clk", 2, 18), - MSTPCR("mmcif0", "bus_clk", 2, 17), - MSTPCR("usb0", "bus_clk", 2, 11), - MSTPCR("siu0", "bus_clk", 2, 8), - MSTPCR("veu1", "bus_clk", 2, 7), - MSTPCR("vou0", "bus_clk", 2, 5), - MSTPCR("beu0", "bus_clk", 2, 4), - MSTPCR("ceu0", "bus_clk", 2, 3), - MSTPCR("veu0", "bus_clk", 2, 2), - MSTPCR("vpu0", "bus_clk", 2, 1), - MSTPCR("lcdc0", "bus_clk", 2, 0), + MSTPCR("tlb0", "cpu_clk", 0, 31, 0), + MSTPCR("ic0", "cpu_clk", 0, 30, 0), + MSTPCR("oc0", "cpu_clk", 0, 29, 0), + MSTPCR("rsmem0", "sh_clk", 0, 28, CLK_ENABLE_ON_INIT), + MSTPCR("xymem0", "cpu_clk", 0, 26, CLK_ENABLE_ON_INIT), + MSTPCR("intc30", "peripheral_clk", 0, 23, 0), + MSTPCR("intc0", "peripheral_clk", 0, 22, 0), + MSTPCR("dmac0", "bus_clk", 0, 21, 0), + MSTPCR("sh0", "sh_clk", 0, 20, 0), + MSTPCR("hudi0", "peripheral_clk", 0, 19, 0), + MSTPCR("ubc0", "cpu_clk", 0, 17, 0), + MSTPCR("tmu0", "peripheral_clk", 0, 15, 0), + MSTPCR("cmt0", "r_clk", 0, 14, 0), + MSTPCR("rwdt0", "r_clk", 0, 13, 0), + MSTPCR("flctl0", "peripheral_clk", 0, 10, 0), + MSTPCR("scif0", "peripheral_clk", 0, 7, 0), + MSTPCR("scif1", "bus_clk", 0, 6, 0), + MSTPCR("scif2", "bus_clk", 0, 5, 0), + MSTPCR("msiof0", "peripheral_clk", 0, 2, 0), + MSTPCR("sbr0", "peripheral_clk", 0, 1, 0), + MSTPCR("i2c0", "peripheral_clk", 1, 9, 0), + MSTPCR("icb0", "bus_clk", 2, 27, 0), + MSTPCR("meram0", "sh_clk", 2, 26, 0), + MSTPCR("dacc0", "peripheral_clk", 2, 24, 0), + MSTPCR("dacy0", "peripheral_clk", 2, 23, 0), + MSTPCR("tsif0", "bus_clk", 2, 22, 0), + MSTPCR("sdhi0", "bus_clk", 2, 18, 0), + MSTPCR("mmcif0", "bus_clk", 2, 17, 0), + MSTPCR("usb0", "bus_clk", 2, 11, 0), + MSTPCR("siu0", "bus_clk", 2, 8, 0), + MSTPCR("veu1", "bus_clk", 2, 7, CLK_ENABLE_ON_INIT), + MSTPCR("vou0", "bus_clk", 2, 5, 0), + MSTPCR("beu0", "bus_clk", 2, 4, 0), + MSTPCR("ceu0", "bus_clk", 2, 3, 0), + MSTPCR("veu0", "bus_clk", 2, 2, CLK_ENABLE_ON_INIT), + MSTPCR("vpu0", "bus_clk", 2, 1, CLK_ENABLE_ON_INIT), + MSTPCR("lcdc0", "bus_clk", 2, 0, 0), #endif }; diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index b9c361e8cc8..42ce5fcbd73 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -223,12 +223,6 @@ static struct platform_device *sh7343_devices[] __initdata = { static int __init sh7343_devices_setup(void) { - clk_always_enable("uram0"); /* URAM */ - clk_always_enable("xymem0"); /* XYMEM */ - clk_always_enable("veu0"); /* VEU */ - clk_always_enable("vpu0"); /* VPU */ - clk_always_enable("jpu0"); /* JPU */ - platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20); platform_resource_setup_memory(&veu_device, "veu", 2 << 20); platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20); diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index d4a994be4a7..d0172afa0b5 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -212,12 +212,6 @@ static struct platform_device *sh7366_devices[] __initdata = { static int __init sh7366_devices_setup(void) { - clk_always_enable("rsmem0"); /* RSMEM */ - clk_always_enable("xymem0"); /* XYMEM */ - clk_always_enable("veu1"); /* VEU-2 */ - clk_always_enable("veu0"); /* VEU-1 */ - clk_always_enable("vpu0"); /* VPU */ - platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20); platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20); platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20); diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c index 5d6247fecd6..ea524a2da3e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7722.c @@ -352,12 +352,6 @@ static struct platform_device *sh7722_devices[] __initdata = { static int __init sh7722_devices_setup(void) { - clk_always_enable("uram0"); /* URAM */ - clk_always_enable("xymem0"); /* XYMEM */ - clk_always_enable("veu0"); /* VEU */ - clk_always_enable("vpu0"); /* VPU */ - clk_always_enable("jpu0"); /* JPU */ - platform_resource_setup_memory(&vpu_device, "vpu", 1 << 20); platform_resource_setup_memory(&veu_device, "veu", 2 << 20); platform_resource_setup_memory(&jpu_device, "jpu", 2 << 20); diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c index 1429fc5e428..04cb4aae7ea 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7723.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7723.c @@ -460,11 +460,6 @@ static struct platform_device *sh7723_devices[] __initdata = { static int __init sh7723_devices_setup(void) { - clk_always_enable("meram0"); /* MERAM */ - clk_always_enable("veu1"); /* VEU2H1 */ - clk_always_enable("veu0"); /* VEU2H0 */ - clk_always_enable("vpu0"); /* VPU */ - platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20); platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20); platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20); diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index a3039ce1a6a..080f541ae08 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -281,10 +281,6 @@ static struct platform_device *sh7724_devices[] __initdata = { static int __init sh7724_devices_setup(void) { - clk_always_enable("vpu0"); /* VPU */ - clk_always_enable("veu1"); /* VEU3F1 */ - clk_always_enable("veu0"); /* VEU3F0 */ - platform_resource_setup_memory(&vpu_device, "vpu", 2 << 20); platform_resource_setup_memory(&veu0_device, "veu0", 2 << 20); platform_resource_setup_memory(&veu1_device, "veu1", 2 << 20); From ae891a4264c91246c0b4c22be68b9838747ae48d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 05:30:10 +0900 Subject: [PATCH 0993/2061] sh: clkfwk: Fix up the clk_enable() error path. There are a couple of instances where a clk_enable() can fail, which the SH-Mobile code presently handles, but doesn't get reported all the way back up. This fixes up the return type so the errors make it all the way down to the drivers. Additionally, we now also error out properly if the parent enable fails. Prep work for aggressively turning off unused clocks on boot. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 2 +- arch/sh/kernel/cpu/clock.c | 81 ++++++++++++++++---------- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 11 ++-- 3 files changed, 58 insertions(+), 36 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index e9fced9bd8f..5de72eef972 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -10,7 +10,7 @@ struct clk; struct clk_ops { void (*init)(struct clk *clk); - void (*enable)(struct clk *clk); + int (*enable)(struct clk *clk); void (*disable)(struct clk *clk); unsigned long (*recalc)(struct clk *clk); int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index c683be5ba8b..e027fe5898d 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -93,38 +93,16 @@ void propagate_rate(struct clk *tclk) } } -static int __clk_enable(struct clk *clk) -{ - if (clk->usecount++ == 0) { - if (clk->parent) - __clk_enable(clk->parent); - - if (clk->ops && clk->ops->enable) - clk->ops->enable(clk); - } - - return 0; -} - -int clk_enable(struct clk *clk) -{ - unsigned long flags; - int ret; - - if (!clk) - return -EINVAL; - - spin_lock_irqsave(&clock_lock, flags); - ret = __clk_enable(clk); - spin_unlock_irqrestore(&clock_lock, flags); - - return ret; -} -EXPORT_SYMBOL_GPL(clk_enable); - static void __clk_disable(struct clk *clk) { - if (clk->usecount > 0 && !(--clk->usecount)) { + if (clk->usecount == 0) { + printk(KERN_ERR "Trying disable clock %s with 0 usecount\n", + clk->name); + WARN_ON(1); + return; + } + + if (!(--clk->usecount)) { if (likely(clk->ops && clk->ops->disable)) clk->ops->disable(clk); if (likely(clk->parent)) @@ -145,6 +123,49 @@ void clk_disable(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_disable); +static int __clk_enable(struct clk *clk) +{ + int ret = 0; + + if (clk->usecount++ == 0) { + if (clk->parent) { + ret = __clk_enable(clk->parent); + if (unlikely(ret)) + goto err; + } + + if (clk->ops && clk->ops->enable) { + ret = clk->ops->enable(clk); + if (ret) { + if (clk->parent) + __clk_disable(clk->parent); + goto err; + } + } + } + + return ret; +err: + clk->usecount--; + return ret; +} + +int clk_enable(struct clk *clk) +{ + unsigned long flags; + int ret; + + if (!clk) + return -EINVAL; + + spin_lock_irqsave(&clock_lock, flags); + ret = __clk_enable(clk); + spin_unlock_irqrestore(&clock_lock, flags); + + return ret; +} +EXPORT_SYMBOL_GPL(clk_enable); + static LIST_HEAD(root_clks); /** diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index f777d00d4af..1c41db41de7 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -472,9 +472,9 @@ static int sh7722_siu_start_stop(struct clk *clk, int enable) return 0; } -static void sh7722_siu_enable(struct clk *clk) +static int sh7722_siu_enable(struct clk *clk) { - sh7722_siu_start_stop(clk, 1); + return sh7722_siu_start_stop(clk, 1); } static void sh7722_siu_disable(struct clk *clk) @@ -491,12 +491,13 @@ static struct clk_ops sh7722_siu_clk_ops = { #endif /* CONFIG_CPU_SUBTYPE_SH7343 */ -static void sh7722_video_enable(struct clk *clk) +static int sh7722_video_enable(struct clk *clk) { unsigned long r; r = ctrl_inl(VCLKCR); ctrl_outl( r & ~(1<<8), VCLKCR); + return 0; } static void sh7722_video_disable(struct clk *clk) @@ -630,9 +631,9 @@ static int sh7722_mstpcr_start_stop(struct clk *clk, int enable) return 0; } -static void sh7722_mstpcr_enable(struct clk *clk) +static int sh7722_mstpcr_enable(struct clk *clk) { - sh7722_mstpcr_start_stop(clk, 1); + return sh7722_mstpcr_start_stop(clk, 1); } static void sh7722_mstpcr_disable(struct clk *clk) From aa87aa343f2cd236b5eccd643abd4df918ed5c4f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 05:51:05 +0900 Subject: [PATCH 0994/2061] sh: clkfwk: Improve the generic clk_set_parent() implementation. This causes the generic clk_set_parent() implementation to be a bit more intelligent. A clk_reparent() is added to move the clock over to the new parent's sibling list, which then allows the generic rate propagation code to succeed. This also becomes a nop if the new and old parents are unchanged. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 1 + arch/sh/kernel/cpu/clock.c | 20 ++++++++++++++++++++ 2 files changed, 21 insertions(+) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 5de72eef972..fdb915608db 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -48,6 +48,7 @@ int clk_init(void); unsigned long followparent_recalc(struct clk *); void recalculate_root_clocks(void); void propagate_rate(struct clk *); +int clk_reparent(struct clk *child, struct clk *parent); void clk_recalc_rate(struct clk *); int clk_register(struct clk *); void clk_unregister(struct clk *); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index e027fe5898d..e3d1de8a46f 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -81,6 +81,19 @@ unsigned long followparent_recalc(struct clk *clk) return clk->parent->rate; } +int clk_reparent(struct clk *child, struct clk *parent) +{ + list_del_init(&child->sibling); + if (parent) + list_add(&child->sibling, &parent->children); + child->parent = parent; + + /* now do the debugfs renaming to reattach the child + to the proper parent */ + + return 0; +} + /* Propagate rate to children */ void propagate_rate(struct clk *tclk) { @@ -288,12 +301,19 @@ int clk_set_parent(struct clk *clk, struct clk *parent) if (!parent || !clk) return ret; + if (clk->parent == parent) + return 0; spin_lock_irqsave(&clock_lock, flags); if (clk->usecount == 0) { if (clk->ops->set_parent) ret = clk->ops->set_parent(clk, parent); + else + ret = clk_reparent(clk, parent); + if (ret == 0) { + pr_debug("clock: set parent of %s to %s (new rate %ld)\n", + clk->name, clk->parent->name, clk->rate); if (clk->ops->recalc) clk->rate = clk->ops->recalc(clk); propagate_rate(clk); From f5c84cf50812c80133e64683d0500b2416d55cb3 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 05:59:27 +0900 Subject: [PATCH 0995/2061] sh: clkfwk: Tidy up on-chip clock registration and rate propagation. This tidies up the set_rate hack that the on-chip clocks were abusing to trigger rate propagation, which is now handled generically. Additionally, now that CLK_ENABLE_ON_INIT is wired up where it needs to be for these clocks, the clk_enable() can go away. In some cases this was bumping up the refcount higher than it should have been. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 19 ++++--------------- arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 17 +++-------------- arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 17 +++-------------- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 15 +++------------ arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 15 +++------------ arch/sh/kernel/cpu/sh4a/clock-shx3.c | 15 +++------------ 6 files changed, 19 insertions(+), 79 deletions(-) diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index 0caca9f99fe..435f4f12ffb 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -127,7 +127,7 @@ static int shoc_clk_set_rate(struct clk *clk, unsigned long rate, int algo_id) frqcr3 |= tmp << 6; ctrl_outl(frqcr3, CPG2_FRQCR3); - return clk->parent->rate / frqcr3_divisors[tmp]; + clk->rate = clk->parent->rate / frqcr3_divisors[tmp]; return 0; } @@ -153,28 +153,17 @@ static struct clk *sh4202_onchip_clocks[] = { static int __init sh4202_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) { struct clk *clkp = sh4202_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } - arch_initcall(sh4202_clk_init); - diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index 21bd70f9ee4..0110da64a43 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -93,28 +93,17 @@ static struct clk *sh7763_onchip_clocks[] = { static int __init sh7763_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) { struct clk *clkp = sh7763_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } - arch_initcall(sh7763_clk_init); - diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c index 4c11f8917e4..0a22d50b109 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c @@ -99,28 +99,17 @@ static struct clk *sh7780_onchip_clocks[] = { static int __init sh7780_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) { struct clk *clkp = sh7780_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } - arch_initcall(sh7780_clk_init); - diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index edd432894bd..4dcd1f6f0cb 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -137,26 +137,17 @@ static struct clk *sh7785_onchip_clocks[] = { static int __init sh7785_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) { struct clk *clkp = sh7785_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } arch_initcall(sh7785_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c index 2825494f85d..825556fe230 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c @@ -123,26 +123,17 @@ static struct clk *sh7786_onchip_clocks[] = { static int __init sh7786_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) { struct clk *clkp = sh7786_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } arch_initcall(sh7786_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c index 6e5c864cf40..1eb149b0fe6 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c @@ -110,26 +110,17 @@ static struct clk *shx3_onchip_clocks[] = { static int __init shx3_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); - int i; + int i, ret = 0; for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) { struct clk *clkp = shx3_onchip_clocks[i]; clkp->parent = clk; - clk_register(clkp); - clk_enable(clkp); + ret |= clk_register(clkp); } - /* - * Now that we have the rest of the clocks registered, we need to - * force the parent clock to propagate so that these clocks will - * automatically figure out their rate. We cheat by handing the - * parent clock its current rate and forcing child propagation. - */ - clk_set_rate(clk, clk_get_rate(clk)); - clk_put(clk); - return 0; + return ret; } arch_initcall(shx3_clk_init); From 007e8363b656768fe3f59c180824ff704680bb25 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 06:05:32 +0900 Subject: [PATCH 0996/2061] sh: clkfwk: Kill off clk_recalc_rate(). The only user for this is the SH-Mobile r_clk, which is now added as a root clock and can be kicked via propagate_rate() as usual. Given that, there is no longer any need for the special clk_recalc_rate(), so we kill it off. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 1 - arch/sh/kernel/cpu/clock.c | 14 -------------- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 +- 3 files changed, 1 insertion(+), 16 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index fdb915608db..6a2f4633337 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -49,7 +49,6 @@ unsigned long followparent_recalc(struct clk *); void recalculate_root_clocks(void); void propagate_rate(struct clk *); int clk_reparent(struct clk *child, struct clk *parent); -void clk_recalc_rate(struct clk *); int clk_register(struct clk *); void clk_unregister(struct clk *); diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index e3d1de8a46f..7a356de99d7 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -280,20 +280,6 @@ int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id) } EXPORT_SYMBOL_GPL(clk_set_rate_ex); -void clk_recalc_rate(struct clk *clk) -{ - unsigned long flags; - - if (!clk->ops->recalc) - return; - - spin_lock_irqsave(&clock_lock, flags); - clk->rate = clk->ops->recalc(clk); - propagate_rate(clk); - spin_unlock_irqrestore(&clock_lock, flags); -} -EXPORT_SYMBOL_GPL(clk_recalc_rate); - int clk_set_parent(struct clk *clk, struct clk *parent) { unsigned long flags; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 1c41db41de7..426065b4326 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -912,7 +912,7 @@ int __init arch_clk_init(void) clk_put(clk); } - clk_recalc_rate(&sh7722_r_clock); /* make sure rate gets propagated */ + propagate_rate(&sh7722_r_clock); /* make sure rate gets propagated */ return 0; } From 0dae89572cbcd5f676ea52a9448d9639d97a53d6 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 06:18:09 +0900 Subject: [PATCH 0997/2061] sh: clkfwk: Wire up clk_get_sys() support. This stubs in clk_get_sys() from the ARM clkdev implementation. Tentatively conver the clk_get() lookup code to use this, and once the rest of the in-tree users are happy with this, it can replace the fallback lookups. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 7 +++++ arch/sh/kernel/cpu/clock.c | 59 +++++++++++++++++++++++++++++++++++++ 2 files changed, 66 insertions(+) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 6a2f4633337..e2f5bf1b4a9 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -37,6 +37,13 @@ struct clk { unsigned long arch_flags; }; +struct clk_lookup { + struct list_head node; + const char *dev_id; + const char *con_id; + struct clk *clk; +}; + #define CLK_ENABLE_ON_INIT (1 << 0) /* Should be defined by processor-specific code */ diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 7a356de99d7..34707b86776 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -10,6 +10,10 @@ * * Modified for omap shared clock framework by Tony Lindgren * + * With clkdev bits: + * + * Copyright (C) 2008 Russell King. + * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. @@ -334,15 +338,70 @@ long clk_round_rate(struct clk *clk, unsigned long rate) } EXPORT_SYMBOL_GPL(clk_round_rate); +/* + * Find the correct struct clk for the device and connection ID. + * We do slightly fuzzy matching here: + * An entry with a NULL ID is assumed to be a wildcard. + * If an entry has a device ID, it must match + * If an entry has a connection ID, it must match + * Then we take the most specific entry - with the following + * order of precidence: dev+con > dev only > con only. + */ +static struct clk *clk_find(const char *dev_id, const char *con_id) +{ + struct clk_lookup *p; + struct clk *clk = NULL; + int match, best = 0; + + list_for_each_entry(p, &clock_list, node) { + match = 0; + if (p->dev_id) { + if (!dev_id || strcmp(p->dev_id, dev_id)) + continue; + match += 2; + } + if (p->con_id) { + if (!con_id || strcmp(p->con_id, con_id)) + continue; + match += 1; + } + if (match == 0) + continue; + + if (match > best) { + clk = p->clk; + best = match; + } + } + return clk; +} + +struct clk *clk_get_sys(const char *dev_id, const char *con_id) +{ + struct clk *clk; + + mutex_lock(&clock_list_sem); + clk = clk_find(dev_id, con_id); + mutex_unlock(&clock_list_sem); + + return clk ? clk : ERR_PTR(-ENOENT); +} +EXPORT_SYMBOL_GPL(clk_get_sys); + /* * Returns a clock. Note that we first try to use device id on the bus * and clock name. If this fails, we try to use clock name only. */ struct clk *clk_get(struct device *dev, const char *id) { + const char *dev_id = dev ? dev_name(dev) : NULL; struct clk *p, *clk = ERR_PTR(-ENOENT); int idno; + clk = clk_get_sys(dev_id, id); + if (clk) + return clk; + if (dev == NULL || dev->bus != &platform_bus_type) idno = -1; else From 77d1a4999502c260df0eb2de437d320bf8c64b36 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 14:21:12 -0700 Subject: [PATCH 0998/2061] x86, boot: make symbols from the main vmlinux available Make symbols from the main vmlinux, as opposed to just compressed/vmlinux, available to header.S. Also, export a few additional symbols. This will be used in a subsequent patch to export the total memory footprint of the kernel. [ Impact: enable future enhancement ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 24 ++++++++++++++++-------- arch/x86/boot/header.S | 7 ++++--- 2 files changed, 20 insertions(+), 11 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 6633b6e7505..75e0301fc69 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -86,19 +86,27 @@ $(obj)/vmlinux.bin: $(obj)/compressed/vmlinux FORCE SETUP_OBJS = $(addprefix $(obj)/,$(setup-y)) -sed-offsets := -e 's/^00*/0/' \ - -e 's/^\([0-9a-fA-F]*\) . \(input_data\|input_data_end\)$$/\#define \2 0x\1/p' +sed-voffset := -e 's/^\([0-9a-fA-F]*\) . \(_text\|_end\)$$/\#define VO_\2 0x\1/p' -quiet_cmd_offsets = OFFSETS $@ - cmd_offsets = $(NM) $< | sed -n $(sed-offsets) > $@ +quiet_cmd_voffset = VOFFSET $@ + cmd_voffset = $(NM) $< | sed -n $(sed-voffset) > $@ -$(obj)/offsets.h: $(obj)/compressed/vmlinux FORCE - $(call if_changed,offsets) +targets += voffset.h +$(obj)/voffset.h: vmlinux FORCE + $(call if_changed,voffset) + +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' + +quiet_cmd_zoffset = ZOFFSET $@ + cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ + +targets += zoffset.h +$(obj)/zoffset.h: $(obj)/compressed/vmlinux FORCE + $(call if_changed,zoffset) -targets += offsets.h AFLAGS_header.o += -I$(obj) -$(obj)/header.o: $(obj)/offsets.h +$(obj)/header.o: $(obj)/voffset.h $(obj)/zoffset.h LDFLAGS_setup.elf := -T $(obj)/setup.elf: $(src)/setup.ld $(SETUP_OBJS) FORCE diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 5d84d1c74e4..27285143ade 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -22,7 +22,8 @@ #include #include #include "boot.h" -#include "offsets.h" +#include "voffset.h" +#include "zoffset.h" BOOTSEG = 0x07C0 /* original address of boot-sector */ SYSSEG = 0x1000 /* historical load address >> 4 */ @@ -212,8 +213,8 @@ hardware_subarch: .long 0 # subarchitecture, added with 2.07 hardware_subarch_data: .quad 0 -payload_offset: .long input_data -payload_length: .long input_data_end-input_data +payload_offset: .long ZO_input_data +payload_length: .long ZO_z_input_len setup_data: .quad 0 # 64-bit physical pointer to # single linked list of From 40b387a8a9a821878ecdf9fb117958c426fc1385 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 14:41:55 -0700 Subject: [PATCH 0999/2061] x86, boot: use LOAD_PHYSICAL_ADDR on 64 bits Use LOAD_PHYSICAL_ADDR instead of CONFIG_PHYSICAL_START in the 64-bit decompression code, for equivalence with the 32-bit code. [ Impact: cleanup, increases code similarity ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2b9f2510507..4135d438b66 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -87,7 +87,7 @@ ENTRY(startup_32) addl $(PMD_PAGE_SIZE -1), %ebx andl $PMD_PAGE_MASK, %ebx #else - movl $CONFIG_PHYSICAL_START, %ebx + movl $LOAD_PHYSICAL_ADDR, %ebx #endif /* Target address to relocate to for decompression */ @@ -215,7 +215,7 @@ ENTRY(startup_64) * * If it is a relocatable kernel then decompress and run the kernel * from load address aligned to 2MB addr, otherwise decompress and - * run the kernel from CONFIG_PHYSICAL_START + * run the kernel from LOAD_PHYSICAL_ADDR * * We cannot rely on the calculation done in 32-bit mode, since we * may have been invoked via the 64-bit entry point. @@ -228,7 +228,7 @@ ENTRY(startup_64) andq $PMD_PAGE_MASK, %rbp movq %rbp, %rbx #else - movq $CONFIG_PHYSICAL_START, %rbp + movq $LOAD_PHYSICAL_ADDR, %rbp movq %rbp, %rbx #endif From 2ff799d3cff1ecb274049378b28120ee5c1c5e5f Mon Sep 17 00:00:00 2001 From: Vaidyanathan Srinivasan Date: Mon, 11 May 2009 20:09:14 +0530 Subject: [PATCH 1000/2061] sched: Don't export sched_mc_power_savings on multi-socket single core system Fix to prevent sched_mc_power_saving from being exported through sysfs for multi-scoket single core system. Max cores should be always greater than one (1). My earlier patch that introduced fix for not exporting 'sched_mc_power_saving' on laptops broke it on multi-socket single core system. This fix addresses issue on both laptop and multi-socket single core system. Below are the Test results: 1. Single socket - multi-core Before Patch: Does not export 'sched_mc_power_saving' After Patch: Does not export 'sched_mc_power_saving' Result: Pass 2. Multi Socket - single core Before Patch: exports 'sched_mc_power_saving' After Patch: Does not export 'sched_mc_power_saving' Result: Pass 3. Multi Socket - Multi core Before Patch: exports 'sched_mc_power_saving' After Patch: exports 'sched_mc_power_saving' [ Impact: make the sched_mc_power_saving control available more consistently ] Signed-off-by: Mahesh Salgaonkar Cc: Suresh B Siddha Cc: Venkatesh Pallipadi Cc: Peter Zijlstra LKML-Reference: <20090511143914.GB4853@dirshya.in.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/topology.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h index f44b49abca4..066ef590d7e 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -203,7 +203,8 @@ struct pci_bus; void x86_pci_root_bus_res_quirks(struct pci_bus *b); #ifdef CONFIG_SMP -#define mc_capable() (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids) +#define mc_capable() ((boot_cpu_data.x86_max_cores > 1) && \ + (cpumask_weight(cpu_core_mask(0)) != nr_cpu_ids)) #define smt_capable() (smp_num_siblings > 1) #endif From 99aa45595f45603526513d5e29fc00f8afbf3913 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:02:10 -0700 Subject: [PATCH 1001/2061] x86, boot: remove dead code from boot/compressed/head_*.S Remove a couple of lines of dead code from arch/x86/boot/compressed/head_*.S; all of these update registers that are dead in the current code. [ Impact: cleanup ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 10 ---------- arch/x86/boot/compressed/head_64.S | 2 -- 2 files changed, 12 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 470474bafc4..2b8e0dfa4b2 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -99,16 +99,6 @@ ENTRY(startup_32) cld popl %esi -/* - * Compute the kernel start address. - */ -#ifdef CONFIG_RELOCATABLE - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebp - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebp -#else - movl $LOAD_PHYSICAL_ADDR, %ebp -#endif - /* * Jump to the relocated address. */ diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 4135d438b66..2bb500af1bd 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -226,10 +226,8 @@ ENTRY(startup_64) leaq startup_32(%rip) /* - $startup_32 */, %rbp addq $(PMD_PAGE_SIZE - 1), %rbp andq $PMD_PAGE_MASK, %rbp - movq %rbp, %rbx #else movq $LOAD_PHYSICAL_ADDR, %rbp - movq %rbp, %rbx #endif /* Target address to relocate to for decompression */ From 37ba7ab5e33cebc25c68fffe33e9f21e7c2014e8 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 15:56:08 -0700 Subject: [PATCH 1002/2061] x86, boot: make kernel_alignment adjustable; new bzImage fields Make the kernel_alignment field adjustable; this allows us to set it to a large value (intended to be 16 MB to avoid ZONE_DMA contention, memory holes and other weirdness) while a smart bootloader can still force a loading at a lesser alignment if absolutely necessary. Also export pref_address (preferred loading address, corresponding to the link-time address) and init_size, the total amount of linear memory the kernel will require during initialization. [ Impact: allows better kernel placement, gives bootloader more info ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/head_32.S | 7 +++++-- arch/x86/boot/compressed/head_64.S | 14 ++++++++++---- arch/x86/boot/header.S | 15 +++++++++++++-- arch/x86/include/asm/boot.h | 15 +++++++++++++++ arch/x86/kernel/asm-offsets_32.c | 1 + arch/x86/kernel/asm-offsets_64.c | 1 + 6 files changed, 45 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 2b8e0dfa4b2..75e4f001e70 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -69,8 +69,11 @@ ENTRY(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(CONFIG_PHYSICAL_ALIGN - 1), %ebx - andl $(~(CONFIG_PHYSICAL_ALIGN - 1)), %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else movl $LOAD_PHYSICAL_ADDR, %ebx #endif diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 2bb500af1bd..f62c284db9e 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -84,8 +84,11 @@ ENTRY(startup_32) #ifdef CONFIG_RELOCATABLE movl %ebp, %ebx - addl $(PMD_PAGE_SIZE -1), %ebx - andl $PMD_PAGE_MASK, %ebx + movl BP_kernel_alignment(%esi), %eax + decl %eax + addl %eax, %ebx + notl %eax + andl %eax, %ebx #else movl $LOAD_PHYSICAL_ADDR, %ebx #endif @@ -224,8 +227,11 @@ ENTRY(startup_64) /* Start with the delta to where the kernel will run at. */ #ifdef CONFIG_RELOCATABLE leaq startup_32(%rip) /* - $startup_32 */, %rbp - addq $(PMD_PAGE_SIZE - 1), %rbp - andq $PMD_PAGE_MASK, %rbp + movl BP_kernel_alignment(%rsi), %eax + decl %eax + addq %rax, %rbp + notq %rax + andq %rax, %rbp #else movq $LOAD_PHYSICAL_ADDR, %rbp #endif diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 27285143ade..a0b426978d5 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -116,7 +116,7 @@ _start: # Part 2 of the header, from the old setup.S .ascii "HdrS" # header signature - .word 0x0209 # header version number (>= 0x0105) + .word 0x020a # header version number (>= 0x0105) # or else old loadlin-1.5 will fail) .globl realmode_swtch realmode_swtch: .word 0, 0 # default_switch, SETUPSEG @@ -201,7 +201,7 @@ relocatable_kernel: .byte 1 #else relocatable_kernel: .byte 0 #endif -pad2: .byte 0 +min_alignment: .byte MIN_KERNEL_ALIGN_LG2 # minimum alignment pad3: .word 0 cmdline_size: .long COMMAND_LINE_SIZE-1 #length of the command line, @@ -220,6 +220,17 @@ setup_data: .quad 0 # 64-bit physical pointer to # single linked list of # struct setup_data +pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr + +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_extract_offset) +#define VO_INIT_SIZE (VO__end - VO__text) +#if ZO_INIT_SIZE > VO_INIT_SIZE +#define INIT_SIZE ZO_INIT_SIZE +#else +#define INIT_SIZE VO_INIT_SIZE +#endif +init_size: .long INIT_SIZE # kernel initialization size + # End of setup header ##################################################### .section ".inittext", "ax" diff --git a/arch/x86/include/asm/boot.h b/arch/x86/include/asm/boot.h index 6ba23dd9fc9..418e632d4a8 100644 --- a/arch/x86/include/asm/boot.h +++ b/arch/x86/include/asm/boot.h @@ -8,11 +8,26 @@ #ifdef __KERNEL__ +#include + /* Physical address where kernel should be loaded. */ #define LOAD_PHYSICAL_ADDR ((CONFIG_PHYSICAL_START \ + (CONFIG_PHYSICAL_ALIGN - 1)) \ & ~(CONFIG_PHYSICAL_ALIGN - 1)) +/* Minimum kernel alignment, as a power of two */ +#ifdef CONFIG_x86_64 +#define MIN_KERNEL_ALIGN_LG2 PMD_SHIFT +#else +#define MIN_KERNEL_ALIGN_LG2 (PAGE_SHIFT+1) +#endif +#define MIN_KERNEL_ALIGN (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2) + +#if (CONFIG_PHYSICAL_ALIGN & (CONFIG_PHYSICAL_ALIGN-1)) || \ + (CONFIG_PHYSICAL_ALIGN < (_AC(1, UL) << MIN_KERNEL_ALIGN_LG2)) +#error "Invalid value for CONFIG_PHYSICAL_ALIGN" +#endif + #ifdef CONFIG_KERNEL_BZIP2 #define BOOT_HEAP_SIZE 0x400000 #else /* !CONFIG_KERNEL_BZIP2 */ diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 5a6aa1c1162..1a830cbd701 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -146,4 +146,5 @@ void foo(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); } diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index e72f062fb4b..898ecc47e12 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -125,6 +125,7 @@ int main(void) OFFSET(BP_loadflags, boot_params, hdr.loadflags); OFFSET(BP_hardware_subarch, boot_params, hdr.hardware_subarch); OFFSET(BP_version, boot_params, hdr.version); + OFFSET(BP_kernel_alignment, boot_params, hdr.kernel_alignment); BLANK(); DEFINE(PAGE_SIZE_asm, PAGE_SIZE); From d297366ba692faf1f0384811a6ff0b20c3470b1b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:06:23 -0700 Subject: [PATCH 1003/2061] x86: document new bzImage fields Document the new bzImage fields for kernel memory placement. [ Impact: adds documentation ] Signed-off-by: H. Peter Anvin --- Documentation/x86/boot.txt | 69 +++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 4 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index e0203662f9e..cf8dfc70a11 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -50,6 +50,11 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. +Protocol 2.10: (Kernel 2.6.31) A protocol for relaxed alignment + beyond the kernel_alignment added, new init_size and + pref_address fields. + + **** MEMORY LAYOUT The traditional memory map for the kernel loader, used for Image or @@ -173,7 +178,7 @@ Offset Proto Name Meaning 022C/4 2.03+ ramdisk_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel 0234/1 2.05+ relocatable_kernel Whether kernel is relocatable or not -0235/1 N/A pad2 Unused +0235/1 2.10+ min_alignment Minimum alignment, as a power of two 0236/2 N/A pad3 Unused 0238/4 2.06+ cmdline_size Maximum size of the kernel command line 023C/4 2.07+ hardware_subarch Hardware subarchitecture @@ -182,6 +187,8 @@ Offset Proto Name Meaning 024C/4 2.08+ payload_length Length of kernel payload 0250/8 2.09+ setup_data 64-bit physical pointer to linked list of struct setup_data +0258/8 2.10+ pref_address Preferred loading address +0260/4 2.10+ init_size Linear memory required during initialization (1) For backwards compatibility, if the setup_sects field contains 0, the real value is 4. @@ -482,11 +489,19 @@ Protocol: 2.03+ 0x37FFFFFF, you can start your ramdisk at 0x37FE0000.) Field name: kernel_alignment -Type: read (reloc) +Type: read/modify (reloc) Offset/size: 0x230/4 -Protocol: 2.05+ +Protocol: 2.05+ (read), 2.10+ (modify) - Alignment unit required by the kernel (if relocatable_kernel is true.) + Alignment unit required by the kernel (if relocatable_kernel is + true.) A relocatable kernel that is loaded at an alignment + incompatible with the value in this field will be realigned during + kernel initialization. + + Starting with protocol version 2.10, this reflects the kernel + alignment preferred for optimal performance; it is possible for the + loader to modify this field to permit a lesser alignment. See the + min_alignment and pref_address field below. Field name: relocatable_kernel Type: read (reloc) @@ -498,6 +513,22 @@ Protocol: 2.05+ After loading, the boot loader must set the code32_start field to point to the loaded code, or to a boot loader hook. +Field name: min_alignment +Type: read (reloc) +Offset/size: 0x235/1 +Protocol: 2.10+ + + This field, if nonzero, indicates as a power of two the minimum + alignment required, as opposed to preferred, by the kernel to boot. + If a boot loader makes use of this field, it should update the + kernel_alignment field with the alignment unit desired; typically: + + kernel_alignment = 1 << min_alignment + + There may be a considerable performance cost with an excessively + misaligned kernel. Therefore, a loader should typically try each + power-of-two alignment from kernel_alignment down to this alignment. + Field name: cmdline_size Type: read Offset/size: 0x238/4 @@ -582,6 +613,36 @@ Protocol: 2.09+ sure to consider the case where the linked list already contains entries. +Field name: pref_address +Type: read (reloc) +Offset/size: 0x258/8 +Protocol: 2.10+ + + This field, if nonzero, represents a preferred load address for the + kernel. A relocating bootloader should attempt to load at this + address if possible. + + A non-relocatable kernel will unconditionally move itself and to run + at this address. + +Field name: init_size +Type: read +Offset/size: 0x25c/4 + + This field indicates the amount of linear contiguous memory starting + at the kernel runtime start address that the kernel needs before it + is capable of examining its memory map. This is not the same thing + as the total amount of memory the kernel needs to boot, but it can + be used by a relocating boot loader to help select a safe load + address for the kernel. + + The kernel runtime start address is determined by the following algorithm: + + if (relocatable_kernel) + runtime_start = align_up(load_address, kernel_alignment) + else + runtime_start = pref_address + **** THE IMAGE CHECKSUM From ceefccc93932b920a8ec6f35f596db05202a12fe Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:12:16 -0700 Subject: [PATCH 1004/2061] x86: default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to 16 MB Default CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN each to 16 MB, so that both non-relocatable and relocatable kernels are loaded at 16 MB by a non-relocating bootloader. This is somewhat hacky, but it appears to be the only way to do this that does not break some some set of existing bootloaders. We want to avoid the bottom 16 MB because of large page breakup, memory holes, and ZONE_DMA. Embedded systems may need to reduce this, or update their bootloaders to be aware of the new min_alignment field. [ Impact: performance improvement, avoids problems on some systems ] Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 27 ++++++++++++--------------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 5aee45356b5..50fbb47f529 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1455,9 +1455,7 @@ config KEXEC_JUMP config PHYSICAL_START hex "Physical address where the kernel is loaded" if (EMBEDDED || CRASH_DUMP) - default "0x1000000" if X86_NUMAQ - default "0x200000" if X86_64 - default "0x100000" + default "0x1000000" ---help--- This gives the physical address where the kernel is loaded. @@ -1476,15 +1474,15 @@ config PHYSICAL_START to be specifically compiled to run from a specific memory area (normally a reserved region) and this option comes handy. - So if you are using bzImage for capturing the crash dump, leave - the value here unchanged to 0x100000 and set CONFIG_RELOCATABLE=y. - Otherwise if you plan to use vmlinux for capturing the crash dump - change this value to start of the reserved region (Typically 16MB - 0x1000000). In other words, it can be set based on the "X" value as - specified in the "crashkernel=YM@XM" command line boot parameter - passed to the panic-ed kernel. Typically this parameter is set as - crashkernel=64M@16M. Please take a look at - Documentation/kdump/kdump.txt for more details about crash dumps. + So if you are using bzImage for capturing the crash dump, + leave the value here unchanged to 0x1000000 and set + CONFIG_RELOCATABLE=y. Otherwise if you plan to use vmlinux + for capturing the crash dump change this value to start of + the reserved region. In other words, it can be set based on + the "X" value as specified in the "crashkernel=YM@XM" + command line boot parameter passed to the panic-ed + kernel. Please take a look at Documentation/kdump/kdump.txt + for more details about crash dumps. Usage of bzImage for capturing the crash dump is recommended as one does not have to build two kernels. Same kernel can be used @@ -1521,9 +1519,8 @@ config X86_NEED_RELOCS config PHYSICAL_ALIGN hex prompt "Alignment value to which kernel should be aligned" if X86_32 - default "0x100000" if X86_32 - default "0x200000" if X86_64 - range 0x2000 0x400000 + default "0x1000000" + range 0x2000 0x1000000 ---help--- This value puts the alignment restrictions on physical address where kernel is loaded and run from. Kernel is compiled for an From 26717808f93a27c22d4853c4fb17fa225f4ccc68 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 14:19:34 -0700 Subject: [PATCH 1005/2061] x86: make CONFIG_RELOCATABLE the default Remove the EXPERIMENTAL tag from CONFIG_RELOCATABLE and make it the default. Relocatable kernels have been used for a while now, and should now have identical semantics to non-relocatable kernels when loaded by a non-relocating bootloader. Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 50fbb47f529..3e0f80a764a 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1495,8 +1495,8 @@ config PHYSICAL_START Don't change this unless you know what you are doing. config RELOCATABLE - bool "Build a relocatable kernel (EXPERIMENTAL)" - depends on EXPERIMENTAL + bool "Build a relocatable kernel" + default y ---help--- This builds a kernel image that retains relocation information so it can be loaded someplace besides the default 1MB. From c4a994645d04d5fa6bfa52a3204af87dd92168d5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:20:12 -0700 Subject: [PATCH 1006/2061] x86, defconfig: update to current, no material changes Update defconfigs to reflect current configuration files. No other changes. [ Impact: updates defconfigs to match what "make defconfig" generates ] Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 143 ++++++++++++++++++++++------- arch/x86/configs/x86_64_defconfig | 148 ++++++++++++++++++++++-------- 2 files changed, 223 insertions(+), 68 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 235b81d0f6f..70036a7a950 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:50:58 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:19:47 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y # CONFIG_X86_64 is not set CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf32-i386" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -33,6 +34,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y # CONFIG_HAVE_CPUMASK_OF_CPU_MAP is not set CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -40,15 +42,16 @@ CONFIG_ARCH_SUSPEND_POSSIBLE=y CONFIG_ARCH_POPULATES_NODE_MAP=y # CONFIG_AUDIT_ARCH is not set CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_32_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y +CONFIG_X86_32_LAZY_GS=y CONFIG_KTIME_SCALAR=y CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -60,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -113,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -139,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -154,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set CONFIG_HAVE_GENERIC_DMA_COHERENT=y CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +183,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y # CONFIG_LBD is not set -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set @@ -194,12 +209,12 @@ CONFIG_HIGH_RES_TIMERS=y CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y +# CONFIG_X86_BIGSMP is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set -# CONFIG_X86_VSMP is not set # CONFIG_X86_RDC321X is not set +# CONFIG_X86_32_NON_STANDARD is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,8 +245,10 @@ CONFIG_M686=y # CONFIG_GENERIC_CPU is not set CONFIG_X86_GENERIC=y CONFIG_X86_CPU=y +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=5 CONFIG_X86_XADD=y # CONFIG_X86_PPRO_FENCE is not set CONFIG_X86_WP_WORKS_OK=y @@ -247,7 +264,7 @@ CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_CYRIX_32=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_32=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_CPU_SUP_TRANSMETA_32=y CONFIG_CPU_SUP_UMC_32=y CONFIG_X86_DS=y @@ -279,6 +296,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set # CONFIG_NOHIGHMEM is not set CONFIG_HIGHMEM4G=y # CONFIG_HIGHMEM64G is not set @@ -302,6 +320,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_HIGHPTE=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y @@ -312,6 +332,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -363,7 +384,6 @@ CONFIG_ACPI_THERMAL=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -425,6 +445,7 @@ CONFIG_PCI_BIOS=y CONFIG_PCI_DIRECT=y CONFIG_PCI_MMCONFIG=y CONFIG_PCI_DOMAINS=y +# CONFIG_DMAR is not set CONFIG_PCIEPORTBUS=y # CONFIG_HOTPLUG_PCI_PCIE is not set CONFIG_PCIEAER=y @@ -435,6 +456,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y # CONFIG_ISA is not set # CONFIG_MCA is not set @@ -481,7 +503,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +660,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +718,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +729,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -789,6 +810,7 @@ CONFIG_MISC_DEVICES=y # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set # CONFIG_HP_ILO is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -842,6 +864,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +963,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +1001,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1052,7 @@ CONFIG_E1000=y CONFIG_E1000E=y # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1067,7 @@ CONFIG_BNX2=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1077,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1087,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_IBMLS is not set @@ -1073,8 +1103,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1084,21 +1114,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1209,6 +1239,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1303,6 +1335,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set CONFIG_HW_RANDOM_INTEL=y CONFIG_HW_RANDOM_AMD=y CONFIG_HW_RANDOM_GEODE=y @@ -1390,7 +1423,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1424,6 +1456,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1433,6 +1466,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1448,11 +1482,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1643,7 +1680,6 @@ CONFIG_FB_EFI=y # CONFIG_FB_3DFX is not set # CONFIG_FB_VOODOO1 is not set # CONFIG_FB_VT8623 is not set -# CONFIG_FB_CYBLA is not set # CONFIG_FB_TRIDENT is not set # CONFIG_FB_ARK is not set # CONFIG_FB_PM3 is not set @@ -1652,6 +1688,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1738,6 +1775,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1811,15 +1850,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1885,11 +1926,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1931,7 +1972,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1947,6 +1987,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1958,8 +1999,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1969,6 +2012,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2037,6 +2084,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2071,6 +2119,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2100,6 +2149,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -2151,6 +2205,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2164,7 +2219,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2251,6 +2305,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2266,6 +2321,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_HIGHMEM is not set CONFIG_DEBUG_BUGVERBOSE=y @@ -2289,13 +2345,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2305,13 +2367,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2321,7 +2391,6 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y @@ -2329,7 +2398,7 @@ CONFIG_DEBUG_RODATA=y CONFIG_DEBUG_NX_TEST=m # CONFIG_4KSTACKS is not set CONFIG_DOUBLEFAULT=y -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2365,6 +2434,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2380,10 +2451,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2456,6 +2529,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2467,11 +2541,13 @@ CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_GEODE is not set # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_LGUEST is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2489,7 +2565,10 @@ CONFIG_CRC32=y # CONFIG_LIBCRC32C is not set CONFIG_AUDIT_GENERIC=y CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index 9fe5d212ab4..f3e53e21f43 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,12 +1,13 @@ # # Automatically generated make config: don't edit -# Linux kernel version: 2.6.29-rc4 -# Tue Feb 24 15:44:16 2009 +# Linux kernel version: 2.6.30-rc2 +# Mon May 11 16:19:24 2009 # CONFIG_64BIT=y # CONFIG_X86_32 is not set CONFIG_X86_64=y CONFIG_X86=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" CONFIG_GENERIC_TIME=y CONFIG_GENERIC_CMOS_UPDATE=y @@ -34,6 +35,7 @@ CONFIG_ARCH_HAS_CPU_RELAX=y CONFIG_ARCH_HAS_DEFAULT_IDLE=y CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_HAVE_DYNAMIC_PER_CPU_AREA=y CONFIG_HAVE_CPUMASK_OF_CPU_MAP=y CONFIG_ARCH_HIBERNATION_POSSIBLE=y CONFIG_ARCH_SUSPEND_POSSIBLE=y @@ -41,14 +43,14 @@ CONFIG_ZONE_DMA32=y CONFIG_ARCH_POPULATES_NODE_MAP=y CONFIG_AUDIT_ARCH=y CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y CONFIG_GENERIC_IRQ_PROBE=y CONFIG_GENERIC_PENDING_IRQ=y -CONFIG_X86_SMP=y CONFIG_USE_GENERIC_SMP_HELPERS=y CONFIG_X86_64_SMP=y CONFIG_X86_HT=y -CONFIG_X86_BIOS_REBOOT=y CONFIG_X86_TRAMPOLINE=y # CONFIG_KTIME_SCALAR is not set CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" @@ -61,10 +63,17 @@ CONFIG_LOCK_KERNEL=y CONFIG_INIT_ENV_ARG_LIMIT=32 CONFIG_LOCALVERSION="" # CONFIG_LOCALVERSION_AUTO is not set +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_KERNEL_GZIP=y +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set CONFIG_SWAP=y CONFIG_SYSVIPC=y CONFIG_SYSVIPC_SYSCTL=y CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y CONFIG_BSD_PROCESS_ACCT=y # CONFIG_BSD_PROCESS_ACCT_V3 is not set CONFIG_TASKSTATS=y @@ -114,23 +123,26 @@ CONFIG_PID_NS=y CONFIG_NET_NS=y CONFIG_BLK_DEV_INITRD=y CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y CONFIG_CC_OPTIMIZE_FOR_SIZE=y CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y # CONFIG_EMBEDDED is not set CONFIG_UID16=y CONFIG_SYSCTL_SYSCALL=y CONFIG_KALLSYMS=y CONFIG_KALLSYMS_ALL=y CONFIG_KALLSYMS_EXTRA_PASS=y +# CONFIG_STRIP_ASM_SYMS is not set CONFIG_HOTPLUG=y CONFIG_PRINTK=y CONFIG_BUG=y CONFIG_ELF_CORE=y CONFIG_PCSPKR_PLATFORM=y -# CONFIG_COMPAT_BRK is not set CONFIG_BASE_FULL=y CONFIG_FUTEX=y -CONFIG_ANON_INODES=y CONFIG_EPOLL=y CONFIG_SIGNALFD=y CONFIG_TIMERFD=y @@ -140,6 +152,7 @@ CONFIG_AIO=y CONFIG_VM_EVENT_COUNTERS=y CONFIG_PCI_QUIRKS=y CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y # CONFIG_SLOB is not set @@ -155,6 +168,8 @@ CONFIG_HAVE_IOREMAP_PROT=y CONFIG_HAVE_KPROBES=y CONFIG_HAVE_KRETPROBES=y CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set # CONFIG_HAVE_GENERIC_DMA_COHERENT is not set CONFIG_SLABINFO=y CONFIG_RT_MUTEXES=y @@ -167,7 +182,6 @@ CONFIG_MODULE_FORCE_UNLOAD=y # CONFIG_MODULE_SRCVERSION_ALL is not set CONFIG_STOP_MACHINE=y CONFIG_BLOCK=y -CONFIG_BLK_DEV_IO_TRACE=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_INTEGRITY is not set CONFIG_BLOCK_COMPAT=y @@ -196,11 +210,10 @@ CONFIG_GENERIC_CLOCKEVENTS_BUILD=y CONFIG_SMP=y CONFIG_SPARSE_IRQ=y # CONFIG_NUMA_MIGRATE_IRQ_DESC is not set -CONFIG_X86_FIND_SMP_CONFIG=y CONFIG_X86_MPPARSE=y -# CONFIG_X86_ELAN is not set -# CONFIG_X86_GENERICARCH is not set +CONFIG_X86_EXTENDED_PLATFORM=y # CONFIG_X86_VSMP is not set +# CONFIG_X86_UV is not set CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_PARAVIRT_GUEST is not set # CONFIG_MEMTEST is not set @@ -230,10 +243,10 @@ CONFIG_SCHED_OMIT_FRAME_POINTER=y # CONFIG_MCORE2 is not set CONFIG_GENERIC_CPU=y CONFIG_X86_CPU=y -CONFIG_X86_L1_CACHE_BYTES=128 -CONFIG_X86_INTERNODE_CACHE_BYTES=128 +CONFIG_X86_L1_CACHE_BYTES=64 +CONFIG_X86_INTERNODE_CACHE_BYTES=64 CONFIG_X86_CMPXCHG=y -CONFIG_X86_L1_CACHE_SHIFT=7 +CONFIG_X86_L1_CACHE_SHIFT=6 CONFIG_X86_WP_WORKS_OK=y CONFIG_X86_TSC=y CONFIG_X86_CMPXCHG64=y @@ -242,7 +255,7 @@ CONFIG_X86_MINIMUM_CPU_FAMILY=64 CONFIG_X86_DEBUGCTLMSR=y CONFIG_CPU_SUP_INTEL=y CONFIG_CPU_SUP_AMD=y -CONFIG_CPU_SUP_CENTAUR_64=y +CONFIG_CPU_SUP_CENTAUR=y CONFIG_X86_DS=y CONFIG_X86_PTRACE_BTS=y CONFIG_HPET_TIMER=y @@ -269,6 +282,7 @@ CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y # CONFIG_I8K is not set CONFIG_MICROCODE=y CONFIG_MICROCODE_INTEL=y @@ -276,6 +290,7 @@ CONFIG_MICROCODE_AMD=y CONFIG_MICROCODE_OLD_INTERFACE=y CONFIG_X86_MSR=y CONFIG_X86_CPUID=y +# CONFIG_X86_CPU_DEBUG is not set CONFIG_ARCH_PHYS_ADDR_T_64BIT=y CONFIG_DIRECT_GBPAGES=y CONFIG_NUMA=y @@ -309,6 +324,8 @@ CONFIG_ZONE_DMA_FLAG=1 CONFIG_BOUNCE=y CONFIG_VIRT_TO_BUS=y CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y CONFIG_X86_CHECK_BIOS_CORRUPTION=y CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y CONFIG_X86_RESERVE_LOW_64K=y @@ -317,6 +334,7 @@ CONFIG_MTRR=y CONFIG_X86_PAT=y CONFIG_EFI=y CONFIG_SECCOMP=y +# CONFIG_CC_STACKPROTECTOR is not set # CONFIG_HZ_100 is not set # CONFIG_HZ_250 is not set # CONFIG_HZ_300 is not set @@ -325,9 +343,10 @@ CONFIG_HZ=1000 CONFIG_SCHED_HRTICK=y CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y +# CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 # CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set @@ -370,7 +389,6 @@ CONFIG_ACPI_NUMA=y CONFIG_ACPI_BLACKLIST_YEAR=0 # CONFIG_ACPI_DEBUG is not set # CONFIG_ACPI_PCI_SLOT is not set -CONFIG_ACPI_SYSTEM=y CONFIG_X86_PM_TIMER=y CONFIG_ACPI_CONTAINER=y # CONFIG_ACPI_SBS is not set @@ -436,6 +454,7 @@ CONFIG_PCI_MSI=y # CONFIG_PCI_DEBUG is not set # CONFIG_PCI_STUB is not set CONFIG_HT_IRQ=y +# CONFIG_PCI_IOV is not set CONFIG_ISA_DMA_API=y CONFIG_K8_NB=y CONFIG_PCCARD=y @@ -481,7 +500,6 @@ CONFIG_NET=y # # Networking options # -CONFIG_COMPAT_NET_DEV_OPS=y CONFIG_PACKET=y CONFIG_PACKET_MMAP=y CONFIG_UNIX=y @@ -639,6 +657,7 @@ CONFIG_LLC=y # CONFIG_LAPB is not set # CONFIG_ECONET is not set # CONFIG_WAN_ROUTER is not set +# CONFIG_PHONET is not set CONFIG_NET_SCHED=y # @@ -696,6 +715,7 @@ CONFIG_NET_SCH_FIFO=y # # CONFIG_NET_PKTGEN is not set # CONFIG_NET_TCPPROBE is not set +# CONFIG_NET_DROP_MONITOR is not set CONFIG_HAMRADIO=y # @@ -706,12 +726,10 @@ CONFIG_HAMRADIO=y # CONFIG_IRDA is not set # CONFIG_BT is not set # CONFIG_AF_RXRPC is not set -# CONFIG_PHONET is not set CONFIG_FIB_RULES=y CONFIG_WIRELESS=y CONFIG_CFG80211=y # CONFIG_CFG80211_REG_DEBUG is not set -CONFIG_NL80211=y CONFIG_WIRELESS_OLD_REGULATORY=y CONFIG_WIRELESS_EXT=y CONFIG_WIRELESS_EXT_SYSFS=y @@ -788,9 +806,8 @@ CONFIG_MISC_DEVICES=y # CONFIG_TIFM_CORE is not set # CONFIG_ICS932S401 is not set # CONFIG_ENCLOSURE_SERVICES is not set -# CONFIG_SGI_XP is not set # CONFIG_HP_ILO is not set -# CONFIG_SGI_GRU is not set +# CONFIG_ISL29003 is not set # CONFIG_C2PORT is not set # @@ -844,6 +861,7 @@ CONFIG_SCSI_SPI_ATTRS=y # CONFIG_SCSI_LOWLEVEL is not set # CONFIG_SCSI_LOWLEVEL_PCMCIA is not set # CONFIG_SCSI_DH is not set +# CONFIG_SCSI_OSD_INITIATOR is not set CONFIG_ATA=y # CONFIG_ATA_NONSTANDARD is not set CONFIG_ATA_ACPI=y @@ -940,6 +958,7 @@ CONFIG_DM_ZERO=y CONFIG_MACINTOSH_DRIVERS=y CONFIG_MAC_EMUMOUSEBTN=y CONFIG_NETDEVICES=y +CONFIG_COMPAT_NET_DEV_OPS=y # CONFIG_IFB is not set # CONFIG_DUMMY is not set # CONFIG_BONDING is not set @@ -977,6 +996,8 @@ CONFIG_MII=y CONFIG_NET_VENDOR_3COM=y # CONFIG_VORTEX is not set # CONFIG_TYPHOON is not set +# CONFIG_ETHOC is not set +# CONFIG_DNET is not set CONFIG_NET_TULIP=y # CONFIG_DE2104X is not set # CONFIG_TULIP is not set @@ -1026,6 +1047,7 @@ CONFIG_E1000=y # CONFIG_E1000E is not set # CONFIG_IP1000 is not set # CONFIG_IGB is not set +# CONFIG_IGBVF is not set # CONFIG_NS83820 is not set # CONFIG_HAMACHI is not set # CONFIG_YELLOWFIN is not set @@ -1040,6 +1062,7 @@ CONFIG_TIGON3=y # CONFIG_QLA3XXX is not set # CONFIG_ATL1 is not set # CONFIG_ATL1E is not set +# CONFIG_ATL1C is not set # CONFIG_JME is not set CONFIG_NETDEV_10000=y # CONFIG_CHELSIO_T1 is not set @@ -1049,6 +1072,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_IXGBE is not set # CONFIG_IXGB is not set # CONFIG_S2IO is not set +# CONFIG_VXGE is not set # CONFIG_MYRI10GE is not set # CONFIG_NETXEN_NIC is not set # CONFIG_NIU is not set @@ -1058,6 +1082,7 @@ CONFIG_CHELSIO_T3_DEPENDS=y # CONFIG_BNX2X is not set # CONFIG_QLGE is not set # CONFIG_SFC is not set +# CONFIG_BE2NET is not set CONFIG_TR=y # CONFIG_IBMOL is not set # CONFIG_3C359 is not set @@ -1072,8 +1097,8 @@ CONFIG_WLAN_80211=y # CONFIG_LIBERTAS is not set # CONFIG_LIBERTAS_THINFIRM is not set # CONFIG_AIRO is not set -# CONFIG_HERMES is not set # CONFIG_ATMEL is not set +# CONFIG_AT76C50X_USB is not set # CONFIG_AIRO_CS is not set # CONFIG_PCMCIA_WL3501 is not set # CONFIG_PRISM54 is not set @@ -1083,21 +1108,21 @@ CONFIG_WLAN_80211=y # CONFIG_RTL8187 is not set # CONFIG_ADM8211 is not set # CONFIG_MAC80211_HWSIM is not set +# CONFIG_MWL8K is not set # CONFIG_P54_COMMON is not set CONFIG_ATH5K=y # CONFIG_ATH5K_DEBUG is not set # CONFIG_ATH9K is not set +# CONFIG_AR9170_USB is not set # CONFIG_IPW2100 is not set # CONFIG_IPW2200 is not set -# CONFIG_IWLCORE is not set -# CONFIG_IWLWIFI_LEDS is not set -# CONFIG_IWLAGN is not set -# CONFIG_IWL3945 is not set +# CONFIG_IWLWIFI is not set # CONFIG_HOSTAP is not set # CONFIG_B43 is not set # CONFIG_B43LEGACY is not set # CONFIG_ZD1211RW is not set # CONFIG_RT2X00 is not set +# CONFIG_HERMES is not set # # Enable WiMAX (Networking options) to see the WiMAX drivers @@ -1208,6 +1233,8 @@ CONFIG_INPUT_TABLET=y # CONFIG_TABLET_USB_KBTAB is not set # CONFIG_TABLET_USB_WACOM is not set CONFIG_INPUT_TOUCHSCREEN=y +# CONFIG_TOUCHSCREEN_AD7879_I2C is not set +# CONFIG_TOUCHSCREEN_AD7879 is not set # CONFIG_TOUCHSCREEN_FUJITSU is not set # CONFIG_TOUCHSCREEN_GUNZE is not set # CONFIG_TOUCHSCREEN_ELO is not set @@ -1301,6 +1328,7 @@ CONFIG_UNIX98_PTYS=y # CONFIG_LEGACY_PTYS is not set # CONFIG_IPMI_HANDLER is not set CONFIG_HW_RANDOM=y +# CONFIG_HW_RANDOM_TIMERIOMEM is not set # CONFIG_HW_RANDOM_INTEL is not set # CONFIG_HW_RANDOM_AMD is not set CONFIG_NVRAM=y @@ -1382,7 +1410,6 @@ CONFIG_I2C_I801=y # CONFIG_SENSORS_PCF8574 is not set # CONFIG_PCF8575 is not set # CONFIG_SENSORS_PCA9539 is not set -# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_MAX6875 is not set # CONFIG_SENSORS_TSL2550 is not set # CONFIG_I2C_DEBUG_CORE is not set @@ -1416,6 +1443,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_ADT7475 is not set # CONFIG_SENSORS_K8TEMP is not set # CONFIG_SENSORS_ASB100 is not set +# CONFIG_SENSORS_ATK0110 is not set # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS1621 is not set # CONFIG_SENSORS_I5K_AMB is not set @@ -1425,6 +1453,7 @@ CONFIG_HWMON=y # CONFIG_SENSORS_FSCHER is not set # CONFIG_SENSORS_FSCPOS is not set # CONFIG_SENSORS_FSCHMD is not set +# CONFIG_SENSORS_G760A is not set # CONFIG_SENSORS_GL518SM is not set # CONFIG_SENSORS_GL520SM is not set # CONFIG_SENSORS_CORETEMP is not set @@ -1440,11 +1469,14 @@ CONFIG_HWMON=y # CONFIG_SENSORS_LM90 is not set # CONFIG_SENSORS_LM92 is not set # CONFIG_SENSORS_LM93 is not set +# CONFIG_SENSORS_LTC4215 is not set # CONFIG_SENSORS_LTC4245 is not set +# CONFIG_SENSORS_LM95241 is not set # CONFIG_SENSORS_MAX1619 is not set # CONFIG_SENSORS_MAX6650 is not set # CONFIG_SENSORS_PC87360 is not set # CONFIG_SENSORS_PC87427 is not set +# CONFIG_SENSORS_PCF8591 is not set # CONFIG_SENSORS_SIS5595 is not set # CONFIG_SENSORS_DME1737 is not set # CONFIG_SENSORS_SMSC47M1 is not set @@ -1635,6 +1667,7 @@ CONFIG_FB_EFI=y # CONFIG_FB_VIRTUAL is not set # CONFIG_FB_METRONOME is not set # CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set CONFIG_BACKLIGHT_LCD_SUPPORT=y # CONFIG_LCD_CLASS_DEVICE is not set CONFIG_BACKLIGHT_CLASS_DEVICE=y @@ -1720,6 +1753,8 @@ CONFIG_SND_PCI=y # CONFIG_SND_INDIGO is not set # CONFIG_SND_INDIGOIO is not set # CONFIG_SND_INDIGODJ is not set +# CONFIG_SND_INDIGOIOX is not set +# CONFIG_SND_INDIGODJX is not set # CONFIG_SND_EMU10K1 is not set # CONFIG_SND_EMU10K1X is not set # CONFIG_SND_ENS1370 is not set @@ -1792,15 +1827,17 @@ CONFIG_USB_HIDDEV=y # # Special HID drivers # -CONFIG_HID_COMPAT=y CONFIG_HID_A4TECH=y CONFIG_HID_APPLE=y CONFIG_HID_BELKIN=y CONFIG_HID_CHERRY=y CONFIG_HID_CHICONY=y CONFIG_HID_CYPRESS=y +# CONFIG_DRAGONRISE_FF is not set CONFIG_HID_EZKEY=y +CONFIG_HID_KYE=y CONFIG_HID_GYRATION=y +CONFIG_HID_KENSINGTON=y CONFIG_HID_LOGITECH=y CONFIG_LOGITECH_FF=y # CONFIG_LOGIRUMBLEPAD2_FF is not set @@ -1866,11 +1903,11 @@ CONFIG_USB_PRINTER=y # CONFIG_USB_TMC is not set # -# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may also be needed; +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may # # -# see USB_STORAGE Help for more information +# also be needed; see USB_STORAGE Help for more info # CONFIG_USB_STORAGE=y # CONFIG_USB_STORAGE_DEBUG is not set @@ -1912,7 +1949,6 @@ CONFIG_USB_LIBUSUAL=y # CONFIG_USB_LED is not set # CONFIG_USB_CYPRESS_CY7C63 is not set # CONFIG_USB_CYTHERM is not set -# CONFIG_USB_PHIDGET is not set # CONFIG_USB_IDMOUSE is not set # CONFIG_USB_FTDI_ELAN is not set # CONFIG_USB_APPLEDISPLAY is not set @@ -1928,6 +1964,7 @@ CONFIG_USB_LIBUSUAL=y # # OTG and related infrastructure # +# CONFIG_NOP_USB_XCEIV is not set # CONFIG_UWB is not set # CONFIG_MMC is not set # CONFIG_MEMSTICK is not set @@ -1939,8 +1976,10 @@ CONFIG_LEDS_CLASS=y # # CONFIG_LEDS_ALIX2 is not set # CONFIG_LEDS_PCA9532 is not set +# CONFIG_LEDS_LP5521 is not set # CONFIG_LEDS_CLEVO_MAIL is not set # CONFIG_LEDS_PCA955X is not set +# CONFIG_LEDS_BD2802 is not set # # LED Triggers @@ -1950,6 +1989,10 @@ CONFIG_LEDS_TRIGGERS=y # CONFIG_LEDS_TRIGGER_HEARTBEAT is not set # CONFIG_LEDS_TRIGGER_BACKLIGHT is not set # CONFIG_LEDS_TRIGGER_DEFAULT_ON is not set + +# +# iptables trigger is under Netfilter config (LED target) +# # CONFIG_ACCESSIBILITY is not set # CONFIG_INFINIBAND is not set CONFIG_EDAC=y @@ -2018,6 +2061,7 @@ CONFIG_DMADEVICES=y # DMA Devices # # CONFIG_INTEL_IOATDMA is not set +# CONFIG_AUXDISPLAY is not set # CONFIG_UIO is not set # CONFIG_STAGING is not set CONFIG_X86_PLATFORM_DEVICES=y @@ -2051,6 +2095,7 @@ CONFIG_DMIID=y # # CONFIG_EXT2_FS is not set CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set CONFIG_EXT3_FS_XATTR=y CONFIG_EXT3_FS_POSIX_ACL=y CONFIG_EXT3_FS_SECURITY=y @@ -2081,6 +2126,11 @@ CONFIG_AUTOFS4_FS=y # CONFIG_FUSE_FS is not set CONFIG_GENERIC_ACL=y +# +# Caches +# +# CONFIG_FSCACHE is not set + # # CD-ROM/DVD Filesystems # @@ -2132,6 +2182,7 @@ CONFIG_MISC_FILESYSTEMS=y # CONFIG_ROMFS_FS is not set # CONFIG_SYSV_FS is not set # CONFIG_UFS_FS is not set +# CONFIG_NILFS2_FS is not set CONFIG_NETWORK_FILESYSTEMS=y CONFIG_NFS_FS=y CONFIG_NFS_V3=y @@ -2145,7 +2196,6 @@ CONFIG_NFS_ACL_SUPPORT=y CONFIG_NFS_COMMON=y CONFIG_SUNRPC=y CONFIG_SUNRPC_GSS=y -# CONFIG_SUNRPC_REGISTER_V4 is not set CONFIG_RPCSEC_GSS_KRB5=y # CONFIG_RPCSEC_GSS_SPKM3 is not set # CONFIG_SMB_FS is not set @@ -2232,6 +2282,7 @@ CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y # CONFIG_DEBUG_SHIRQ is not set # CONFIG_DETECT_SOFTLOCKUP is not set +# CONFIG_DETECT_HUNG_TASK is not set # CONFIG_SCHED_DEBUG is not set CONFIG_SCHEDSTATS=y CONFIG_TIMER_STATS=y @@ -2247,6 +2298,7 @@ CONFIG_TIMER_STATS=y # CONFIG_LOCK_STAT is not set # CONFIG_DEBUG_SPINLOCK_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +CONFIG_STACKTRACE=y # CONFIG_DEBUG_KOBJECT is not set CONFIG_DEBUG_BUGVERBOSE=y # CONFIG_DEBUG_INFO is not set @@ -2269,13 +2321,19 @@ CONFIG_FRAME_POINTER=y # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set CONFIG_SYSCTL_SYSCALL_CHECK=y +# CONFIG_DEBUG_PAGEALLOC is not set CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y CONFIG_HAVE_FUNCTION_TRACER=y CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y CONFIG_HAVE_FUNCTION_TRACE_MCOUNT_TEST=y CONFIG_HAVE_DYNAMIC_FTRACE=y CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y CONFIG_HAVE_HW_BRANCH_TRACER=y +CONFIG_HAVE_FTRACE_SYSCALLS=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y # # Tracers @@ -2285,13 +2343,21 @@ CONFIG_HAVE_HW_BRANCH_TRACER=y # CONFIG_SYSPROF_TRACER is not set # CONFIG_SCHED_TRACER is not set # CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_FTRACE_SYSCALLS is not set # CONFIG_BOOT_TRACER is not set # CONFIG_TRACE_BRANCH_PROFILING is not set # CONFIG_POWER_TRACER is not set # CONFIG_STACK_TRACER is not set # CONFIG_HW_BRANCH_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +CONFIG_BLK_DEV_IO_TRACE=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set CONFIG_PROVIDE_OHCI1394_DMA_INIT=y -# CONFIG_DYNAMIC_PRINTK_DEBUG is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set # CONFIG_SAMPLES is not set CONFIG_HAVE_ARCH_KGDB=y # CONFIG_KGDB is not set @@ -2301,14 +2367,13 @@ CONFIG_EARLY_PRINTK=y CONFIG_EARLY_PRINTK_DBGP=y CONFIG_DEBUG_STACKOVERFLOW=y CONFIG_DEBUG_STACK_USAGE=y -# CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set # CONFIG_X86_PTDUMP is not set CONFIG_DEBUG_RODATA=y # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_NX_TEST=m # CONFIG_IOMMU_DEBUG is not set -# CONFIG_MMIOTRACE is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y CONFIG_IO_DELAY_TYPE_0X80=0 CONFIG_IO_DELAY_TYPE_0XED=1 CONFIG_IO_DELAY_TYPE_UDELAY=2 @@ -2344,6 +2409,8 @@ CONFIG_SECURITY_SELINUX_AVC_STATS=y CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE=1 # CONFIG_SECURITY_SELINUX_POLICYDB_VERSION_MAX is not set # CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +# CONFIG_IMA is not set CONFIG_CRYPTO=y # @@ -2359,10 +2426,12 @@ CONFIG_CRYPTO_BLKCIPHER2=y CONFIG_CRYPTO_HASH=y CONFIG_CRYPTO_HASH2=y CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_PCOMP=y CONFIG_CRYPTO_MANAGER=y CONFIG_CRYPTO_MANAGER2=y # CONFIG_CRYPTO_GF128MUL is not set # CONFIG_CRYPTO_NULL is not set +CONFIG_CRYPTO_WORKQUEUE=y # CONFIG_CRYPTO_CRYPTD is not set CONFIG_CRYPTO_AUTHENC=y # CONFIG_CRYPTO_TEST is not set @@ -2414,6 +2483,7 @@ CONFIG_CRYPTO_SHA1=y # CONFIG_CRYPTO_AES=y # CONFIG_CRYPTO_AES_X86_64 is not set +# CONFIG_CRYPTO_AES_NI_INTEL is not set # CONFIG_CRYPTO_ANUBIS is not set CONFIG_CRYPTO_ARC4=y # CONFIG_CRYPTO_BLOWFISH is not set @@ -2435,6 +2505,7 @@ CONFIG_CRYPTO_DES=y # Compression # # CONFIG_CRYPTO_DEFLATE is not set +# CONFIG_CRYPTO_ZLIB is not set # CONFIG_CRYPTO_LZO is not set # @@ -2444,10 +2515,12 @@ CONFIG_CRYPTO_DES=y CONFIG_CRYPTO_HW=y # CONFIG_CRYPTO_DEV_HIFN_795X is not set CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y CONFIG_VIRTUALIZATION=y # CONFIG_KVM is not set # CONFIG_VIRTIO_PCI is not set # CONFIG_VIRTIO_BALLOON is not set +CONFIG_BINARY_PRINTF=y # # Library routines @@ -2464,7 +2537,10 @@ CONFIG_CRC32=y # CONFIG_CRC7 is not set # CONFIG_LIBCRC32C is not set CONFIG_ZLIB_INFLATE=y -CONFIG_PLIST=y +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT=y CONFIG_HAS_DMA=y +CONFIG_NLATTR=y From fe83fcc0a14dcf71996de5eb84771b2369ba7abc Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 11 May 2009 16:23:16 -0700 Subject: [PATCH 1007/2061] x86, defconfig: update kernel position parameters Update CONFIG_RELOCATABLE, CONFIG_PHYSICAL_START and CONFIG_PHYSICAL_ALIGN to reflect the current defaults. [ Impact: make defconfig match Kconfig defaults ] Signed-off-by: H. Peter Anvin --- arch/x86/configs/i386_defconfig | 7 ++++--- arch/x86/configs/x86_64_defconfig | 4 ++-- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig index 70036a7a950..edb992ebef9 100644 --- a/arch/x86/configs/i386_defconfig +++ b/arch/x86/configs/i386_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.30-rc2 -# Mon May 11 16:19:47 2009 +# Mon May 11 16:21:55 2009 # # CONFIG_64BIT is not set CONFIG_X86_32=y @@ -343,8 +343,9 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set -CONFIG_PHYSICAL_ALIGN=0x200000 +CONFIG_RELOCATABLE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set # CONFIG_CMDLINE_BOOL is not set diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig index f3e53e21f43..4ba7d4ef9aa 100644 --- a/arch/x86/configs/x86_64_defconfig +++ b/arch/x86/configs/x86_64_defconfig @@ -1,7 +1,7 @@ # # Automatically generated make config: don't edit # Linux kernel version: 2.6.30-rc2 -# Mon May 11 16:19:24 2009 +# Mon May 11 16:22:00 2009 # CONFIG_64BIT=y # CONFIG_X86_32 is not set @@ -345,7 +345,7 @@ CONFIG_KEXEC=y CONFIG_CRASH_DUMP=y # CONFIG_KEXEC_JUMP is not set CONFIG_PHYSICAL_START=0x1000000 -# CONFIG_RELOCATABLE is not set +CONFIG_RELOCATABLE=y CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_HOTPLUG_CPU=y # CONFIG_COMPAT_VDSO is not set From 5031296c57024a78ddad4edfc993367dbf4abb98 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Thu, 7 May 2009 16:54:11 -0700 Subject: [PATCH 1008/2061] x86: add extension fields for bootloader type and version A long ago, in days of yore, it all began with a god named Thor. There were vikings and boats and some plans for a Linux kernel header. Unfortunately, a single 8-bit field was used for bootloader type and version. This has generally worked without *too* much pain, but we're getting close to flat running out of ID fields. Add extension fields for both type and version. The type will be extended if it the old field is 0xE; the version is a simple MSB extension. Keep /proc/sys/kernel/bootloader_type containing (type << 4) + (ver & 0xf) for backwards compatiblity, but also add /proc/sys/kernel/bootloader_version which contains the full version number. [ Impact: new feature to support more bootloaders ] Signed-off-by: H. Peter Anvin --- Documentation/x86/boot.txt | 59 ++++++++++++++++++++++++++++---- arch/x86/boot/header.S | 6 +++- arch/x86/include/asm/bootparam.h | 3 +- arch/x86/include/asm/processor.h | 1 + arch/x86/kernel/setup.c | 10 ++++-- kernel/sysctl.c | 8 +++++ 6 files changed, 76 insertions(+), 11 deletions(-) diff --git a/Documentation/x86/boot.txt b/Documentation/x86/boot.txt index cf8dfc70a11..8da3a795083 100644 --- a/Documentation/x86/boot.txt +++ b/Documentation/x86/boot.txt @@ -50,10 +50,9 @@ Protocol 2.08: (Kernel 2.6.26) Added crc32 checksum and ELF format Protocol 2.09: (Kernel 2.6.26) Added a field of 64-bit physical pointer to single linked list of struct setup_data. -Protocol 2.10: (Kernel 2.6.31) A protocol for relaxed alignment +Protocol 2.10: (Kernel 2.6.31) Added a protocol for relaxed alignment beyond the kernel_alignment added, new init_size and - pref_address fields. - + pref_address fields. Added extended boot loader IDs. **** MEMORY LAYOUT @@ -173,7 +172,8 @@ Offset Proto Name Meaning 021C/4 2.00+ ramdisk_size initrd size (set by boot loader) 0220/4 2.00+ bootsect_kludge DO NOT USE - for bootsect.S use only 0224/2 2.01+ heap_end_ptr Free memory after setup end -0226/2 N/A pad1 Unused +0226/1 2.02+(3 ext_loader_ver Extended boot loader version +0227/1 2.02+(3 ext_loader_type Extended boot loader ID 0228/4 2.02+ cmd_line_ptr 32-bit pointer to the kernel command line 022C/4 2.03+ ramdisk_max Highest legal initrd address 0230/4 2.05+ kernel_alignment Physical addr alignment required for kernel @@ -197,6 +197,8 @@ Offset Proto Name Meaning field are unusable, which means the size of a bzImage kernel cannot be determined. +(3) Ignored, but safe to set, for boot protocols 2.02-2.09. + If the "HdrS" (0x53726448) magic number is not found at offset 0x202, the boot protocol version is "old". Loading an old kernel, the following parameters should be assumed: @@ -350,18 +352,32 @@ Protocol: 2.00+ 0xTV here, where T is an identifier for the boot loader and V is a version number. Otherwise, enter 0xFF here. + For boot loader IDs above T = 0xD, write T = 0xE to this field and + write the extended ID minus 0x10 to the ext_loader_type field. + Similarly, the ext_loader_ver field can be used to provide more than + four bits for the bootloader version. + + For example, for T = 0x15, V = 0x234, write: + + type_of_loader <- 0xE4 + ext_loader_type <- 0x05 + ext_loader_ver <- 0x23 + Assigned boot loader ids: 0 LILO (0x00 reserved for pre-2.00 bootloader) 1 Loadlin 2 bootsect-loader (0x20, all other values reserved) - 3 SYSLINUX - 4 EtherBoot + 3 Syslinux + 4 Etherboot/gPXE 5 ELILO 7 GRUB - 8 U-BOOT + 8 U-Boot 9 Xen A Gujin B Qemu + C Arcturus Networks uCbootloader + E Extended (see ext_loader_type) + F Special (0xFF = undefined) Please contact if you need a bootloader ID value assigned. @@ -460,6 +476,35 @@ Protocol: 2.01+ Set this field to the offset (from the beginning of the real-mode code) of the end of the setup stack/heap, minus 0x0200. +Field name: ext_loader_ver +Type: write (optional) +Offset/size: 0x226/1 +Protocol: 2.02+ + + This field is used as an extension of the version number in the + type_of_loader field. The total version number is considered to be + (type_of_loader & 0x0f) + (ext_loader_ver << 4). + + The use of this field is boot loader specific. If not written, it + is zero. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. + +Field name: ext_loader_type +Type: write (obligatory if (type_of_loader & 0xf0) == 0xe0) +Offset/size: 0x227/1 +Protocol: 2.02+ + + This field is used as an extension of the type number in + type_of_loader field. If the type in type_of_loader is 0xE, then + the actual type is (ext_loader_type + 0x10). + + This field is ignored if the type in type_of_loader is not 0xE. + + Kernels prior to 2.6.31 did not recognize this field, but it is safe + to write for protocol version 2.02 or higher. + Field name: cmd_line_ptr Type: write (obligatory) Offset/size: 0x228/4 diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index a0b426978d5..68c3bfbaff2 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -169,7 +169,11 @@ heap_end_ptr: .word _end+STACK_SIZE-512 # end of setup code can be used by setup # for local heap purposes. -pad1: .word 0 +ext_loader_ver: + .byte 0 # Extended boot loader version +ext_loader_type: + .byte 0 # Extended boot loader type + cmd_line_ptr: .long 0 # (Header version 0x0202 or later) # If nonzero, a 32-bit pointer # to the kernel command line. diff --git a/arch/x86/include/asm/bootparam.h b/arch/x86/include/asm/bootparam.h index 433adaebf9b..1724e8de317 100644 --- a/arch/x86/include/asm/bootparam.h +++ b/arch/x86/include/asm/bootparam.h @@ -50,7 +50,8 @@ struct setup_header { __u32 ramdisk_size; __u32 bootsect_kludge; __u16 heap_end_ptr; - __u16 _pad1; + __u8 ext_loader_ver; + __u8 ext_loader_type; __u32 cmd_line_ptr; __u32 initrd_addr_max; __u32 kernel_alignment; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index fcf4d92e7e0..6384d25121c 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -814,6 +814,7 @@ extern unsigned int BIOS_revision; /* Boot loader type from the setup header: */ extern int bootloader_type; +extern int bootloader_version; extern char ignore_fpu_irq; diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c index b4158439bf6..2b093451aec 100644 --- a/arch/x86/kernel/setup.c +++ b/arch/x86/kernel/setup.c @@ -214,8 +214,8 @@ unsigned long mmu_cr4_features; unsigned long mmu_cr4_features = X86_CR4_PAE; #endif -/* Boot loader ID as an integer, for the benefit of proc_dointvec */ -int bootloader_type; +/* Boot loader ID and version as integers, for the benefit of proc_dointvec */ +int bootloader_type, bootloader_version; /* * Setup options @@ -706,6 +706,12 @@ void __init setup_arch(char **cmdline_p) #endif saved_video_mode = boot_params.hdr.vid_mode; bootloader_type = boot_params.hdr.type_of_loader; + if ((bootloader_type >> 4) == 0xe) { + bootloader_type &= 0xf; + bootloader_type |= (boot_params.hdr.ext_loader_type+0x10) << 4; + } + bootloader_version = bootloader_type & 0xf; + bootloader_version |= boot_params.hdr.ext_loader_ver << 4; #ifdef CONFIG_BLK_DEV_RAM rd_image_start = boot_params.hdr.ram_size & RAMDISK_IMAGE_START_MASK; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index e3d2c7dd59b..cf91c9317b2 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -727,6 +727,14 @@ static struct ctl_table kern_table[] = { .mode = 0444, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "bootloader_version", + .data = &bootloader_version, + .maxlen = sizeof (int), + .mode = 0444, + .proc_handler = &proc_dointvec, + }, { .ctl_name = CTL_UNNUMBERED, .procname = "kstack_depth_to_print", From 37bcbf13d32e4e453e9def79ee72bd953b88302f Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 May 2009 13:59:10 -0400 Subject: [PATCH 1009/2061] IMA: use current_cred() instead of current->cred Proper invocation of the current credentials is to use current_cred() not current->cred. This patches makes IMA use the new method. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_audit.c | 2 +- security/integrity/ima/ima_main.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/security/integrity/ima/ima_audit.c b/security/integrity/ima/ima_audit.c index b628eea477a..ff513ff737f 100644 --- a/security/integrity/ima/ima_audit.c +++ b/security/integrity/ima/ima_audit.c @@ -41,7 +41,7 @@ void integrity_audit_msg(int audit_msgno, struct inode *inode, ab = audit_log_start(current->audit_context, GFP_KERNEL, audit_msgno); audit_log_format(ab, "integrity: pid=%d uid=%u auid=%u ses=%u", - current->pid, current->cred->uid, + current->pid, current_cred()->uid, audit_get_loginuid(current), audit_get_sessionid(current)); audit_log_task_context(ab); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 122f17fc7fc..cdae13c5ae0 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -184,7 +184,7 @@ int ima_path_check(struct path *path, int mask) struct dentry *dentry = dget(path->dentry); struct vfsmount *mnt = mntget(path->mnt); - file = dentry_open(dentry, mnt, O_RDONLY, current->cred); + file = dentry_open(dentry, mnt, O_RDONLY, current_cred()); rc = get_path_measurement(iint, file, dentry->d_name.name); } out: From f06dd16a03f6f7f72fab4db03be36e28c28c6fd6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 May 2009 13:59:16 -0400 Subject: [PATCH 1010/2061] IMA: Handle dentry_open failures Currently IMA does not handle failures from dentry_open(). This means that we leave a pointer set to ERR_PTR(errno) and then try to use it just a few lines later in fput(). Oops. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_main.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index cdae13c5ae0..1987424623c 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -116,10 +116,6 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, { int rc = 0; - if (IS_ERR(file)) { - pr_info("%s dentry_open failed\n", filename); - return rc; - } iint->opencount++; iint->readcount++; @@ -185,6 +181,12 @@ int ima_path_check(struct path *path, int mask) struct vfsmount *mnt = mntget(path->mnt); file = dentry_open(dentry, mnt, O_RDONLY, current_cred()); + if (IS_ERR(file)) { + pr_info("%s dentry_open failed\n", dentry->d_name.name); + rc = PTR_ERR(file); + file = NULL; + goto out; + } rc = get_path_measurement(iint, file, dentry->d_name.name); } out: From 1a62e958fa4aaeeb752311b4f5e16b2a86737b23 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 May 2009 13:59:22 -0400 Subject: [PATCH 1011/2061] IMA: open all files O_LARGEFILE If IMA tried to measure a file which was larger than 4G dentry_open would fail with -EOVERFLOW since IMA wasn't passing O_LARGEFILE. This patch passes O_LARGEFILE to all IMA opens to avoid this problem. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index 1987424623c..c4228c0eb2d 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -180,7 +180,8 @@ int ima_path_check(struct path *path, int mask) struct dentry *dentry = dget(path->dentry); struct vfsmount *mnt = mntget(path->mnt); - file = dentry_open(dentry, mnt, O_RDONLY, current_cred()); + file = dentry_open(dentry, mnt, O_RDONLY | O_LARGEFILE, + current_cred()); if (IS_ERR(file)) { pr_info("%s dentry_open failed\n", dentry->d_name.name); rc = PTR_ERR(file); From d93e4c940f51ae06b59c14523c4d55947f9597d6 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 11 May 2009 20:47:15 -0400 Subject: [PATCH 1012/2061] securityfs: securityfs_remove should handle IS_ERR pointers Both of the securityfs users (TPM and IMA) can call securityfs_remove and pass an IS_ERR(dentry) in their failure paths. This patch handles those rather than panicing when it tries to start deferencing some negative memory. Signed-off-by: Eric Paris Signed-off-by: James Morris --- security/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/inode.c b/security/inode.c index f3b91bfbe4c..f7496c6a022 100644 --- a/security/inode.c +++ b/security/inode.c @@ -287,7 +287,7 @@ void securityfs_remove(struct dentry *dentry) { struct dentry *parent; - if (!dentry) + if (!dentry || IS_ERR(dentry)) return; parent = dentry->d_parent; From 0f0c85fc80adbbd2265d89867d743f929d516805 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:08:00 -0400 Subject: [PATCH 1013/2061] ring-buffer: small optimizations Doing some small changes in the fast path of the ring buffer recording saves over 3% in the ring-buffer-benchmark test. [ Impact: a little faster ring buffer recording ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 493cba46abc..f452de2ce49 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1000,7 +1000,7 @@ rb_event_index(struct ring_buffer_event *event) return (addr & ~PAGE_MASK) - (PAGE_SIZE - BUF_PAGE_SIZE); } -static int +static inline int rb_is_commit(struct ring_buffer_per_cpu *cpu_buffer, struct ring_buffer_event *event) { @@ -1423,9 +1423,9 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * also be made. But only the entry that did the actual * commit will be something other than zero. */ - if (cpu_buffer->tail_page == cpu_buffer->commit_page && - rb_page_write(cpu_buffer->tail_page) == - rb_commit_index(cpu_buffer)) { + if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && + rb_page_write(cpu_buffer->tail_page) == + rb_commit_index(cpu_buffer))) { delta = ts - cpu_buffer->write_stamp; @@ -1436,7 +1436,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (unlikely(ts < cpu_buffer->write_stamp)) delta = 0; - if (test_time_stamp(delta)) { + else if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); @@ -1470,7 +1470,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, * If the timestamp was commited, make the commit our entry * now so that we will update it when needed. */ - if (commit) + if (unlikely(commit)) rb_set_commit_event(cpu_buffer, event); else if (!rb_is_commit(cpu_buffer, event)) delta = 0; From 88eb0125362f2ab272cbaf84252cf101ddc2dec9 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 16:28:23 -0400 Subject: [PATCH 1014/2061] ring-buffer: use internal time stamp function The ring_buffer_time_stamp that is exported adds a little more overhead than is needed for using it internally. This patch adds an internal timestamp function that can be inlined (a single line function) and used internally for the ring buffer. [ Impact: a little less overhead to the ring buffer ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index f452de2ce49..a9e645a5bc1 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -454,13 +454,18 @@ struct ring_buffer_iter { /* Up this if you want to test the TIME_EXTENTS and normalization */ #define DEBUG_SHIFT 0 +static inline u64 rb_time_stamp(struct ring_buffer *buffer, int cpu) +{ + /* shift to debug/test normalization and TIME_EXTENTS */ + return buffer->clock() << DEBUG_SHIFT; +} + u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) { u64 time; preempt_disable_notrace(); - /* shift to debug/test normalization and TIME_EXTENTS */ - time = buffer->clock() << DEBUG_SHIFT; + time = rb_time_stamp(buffer, cpu); preempt_enable_no_resched_notrace(); return time; @@ -1247,7 +1252,7 @@ rb_move_tail(struct ring_buffer_per_cpu *cpu_buffer, cpu_buffer->tail_page = next_page; /* reread the time stamp */ - *ts = ring_buffer_time_stamp(buffer, cpu_buffer->cpu); + *ts = rb_time_stamp(buffer, cpu_buffer->cpu); cpu_buffer->tail_page->page->time_stamp = *ts; } @@ -1413,7 +1418,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (RB_WARN_ON(cpu_buffer, ++nr_loops > 1000)) return NULL; - ts = ring_buffer_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); + ts = rb_time_stamp(cpu_buffer->buffer, cpu_buffer->cpu); /* * Only the first commit can update the timestamp. From 168b6b1d0594c7866caa73b12f3b8d91075695f2 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Mon, 11 May 2009 22:11:05 -0400 Subject: [PATCH 1015/2061] ring-buffer: move code around to remove some branches This is a bit of micro-optimizations. But since the ring buffer is used in tracing every function call, it is an extreme hot path. Every nanosecond counts. This change shows over 5% improvement in the ring-buffer-benchmark. [ Impact: more efficient code ] Signed-off-by: Steven Rostedt --- kernel/trace/ring_buffer.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9e645a5bc1..16b24d49604 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -1400,7 +1400,7 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, unsigned long length) { struct ring_buffer_event *event; - u64 ts, delta; + u64 ts, delta = 0; int commit = 0; int nr_loops = 0; @@ -1431,20 +1431,21 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, if (likely(cpu_buffer->tail_page == cpu_buffer->commit_page && rb_page_write(cpu_buffer->tail_page) == rb_commit_index(cpu_buffer))) { + u64 diff; - delta = ts - cpu_buffer->write_stamp; + diff = ts - cpu_buffer->write_stamp; - /* make sure this delta is calculated here */ + /* make sure this diff is calculated here */ barrier(); /* Did the write stamp get updated already? */ if (unlikely(ts < cpu_buffer->write_stamp)) - delta = 0; + goto get_event; - else if (unlikely(test_time_stamp(delta))) { + delta = diff; + if (unlikely(test_time_stamp(delta))) { commit = rb_add_time_stamp(cpu_buffer, &ts, &delta); - if (commit == -EBUSY) return NULL; @@ -1453,12 +1454,11 @@ rb_reserve_next_event(struct ring_buffer_per_cpu *cpu_buffer, RB_WARN_ON(cpu_buffer, commit < 0); } - } else - /* Non commits have zero deltas */ - delta = 0; + } + get_event: event = __rb_reserve_next(cpu_buffer, 0, length, &ts); - if (PTR_ERR(event) == -EAGAIN) + if (unlikely(PTR_ERR(event) == -EAGAIN)) goto again; if (!event) { From bc8e67409ccdcff72c3f1656b1fb1aad7ff396db Mon Sep 17 00:00:00 2001 From: Vincent Minet Date: Fri, 15 May 2009 08:33:18 -0400 Subject: [PATCH 1016/2061] ext4: Fix spinlock assertions on UP systems On UP systems without DEBUG_SPINLOCK, ext4_is_group_locked always fails which triggers a BUG_ON() call. This patch fixes it by using assert_spin_locked instead. Signed-off-by: Vincent Minet Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 6 ------ fs/ext4/mballoc.c | 10 +++++----- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 149e02dc360..89190ae671f 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1583,12 +1583,6 @@ static inline void ext4_unlock_group(struct super_block *sb, spin_unlock(ext4_group_lock_ptr(sb, group)); } -static inline int ext4_is_group_locked(struct super_block *sb, - ext4_group_t group) -{ - return spin_is_locked(ext4_group_lock_ptr(sb, group)); -} - /* * Inodes and files operations */ diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index e76459cedcd..541bd9adffa 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -436,7 +436,7 @@ static void mb_free_blocks_double(struct inode *inode, struct ext4_buddy *e4b, if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); for (i = 0; i < count; i++) { if (!mb_test_bit(first + i, e4b->bd_info->bb_bitmap)) { ext4_fsblk_t blocknr; @@ -460,7 +460,7 @@ static void mb_mark_used_double(struct ext4_buddy *e4b, int first, int count) if (unlikely(e4b->bd_info->bb_bitmap == NULL)) return; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); for (i = 0; i < count; i++) { BUG_ON(mb_test_bit(first + i, e4b->bd_info->bb_bitmap)); mb_set_bit(first + i, e4b->bd_info->bb_bitmap); @@ -1115,7 +1115,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, struct super_block *sb = e4b->bd_sb; BUG_ON(first + count > (sb->s_blocksize << 3)); - BUG_ON(!ext4_is_group_locked(sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(sb, e4b->bd_group)); mb_check_buddy(e4b); mb_free_blocks_double(inode, e4b, first, count); @@ -1196,7 +1196,7 @@ static int mb_find_extent(struct ext4_buddy *e4b, int order, int block, int ord; void *buddy; - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); BUG_ON(ex == NULL); buddy = mb_find_buddy(e4b, order, &max); @@ -1260,7 +1260,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) BUG_ON(start + len > (e4b->bd_sb->s_blocksize << 3)); BUG_ON(e4b->bd_group != ex->fe_group); - BUG_ON(!ext4_is_group_locked(e4b->bd_sb, e4b->bd_group)); + assert_spin_locked(ext4_group_lock_ptr(e4b->bd_sb, e4b->bd_group)); mb_check_buddy(e4b); mb_mark_used_double(e4b, start, len); From f888e652d758bfe0c04c209b72a05972daeba386 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:21:29 -0400 Subject: [PATCH 1017/2061] ext4: Simplify function signature for ext4_da_get_block_write() The function ext4_da_get_block_write() is called in exactly one write, and the last argument, create, is always 1. Remove it to simplify the code slightly. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 4e7f363e303..476d843610a 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2000,7 +2000,7 @@ static void ext4_print_free_blocks(struct inode *inode) #define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) + struct buffer_head *bh_result) { int ret; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; @@ -2010,7 +2010,7 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, create, 0, EXT4_DELALLOC_RSVED); + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2088,7 +2088,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (!new.b_size) return 0; - err = ext4_da_get_block_write(mpd->inode, next, &new, 1); + err = ext4_da_get_block_write(mpd->inode, next, &new); if (err) { /* * If get block returns with error we simply From e4d996ca806e93dddb5d76c0d3d859b494c559f6 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 00:25:28 -0400 Subject: [PATCH 1018/2061] ext4: Rename ext4_get_blocks_handle() to be ext4_ind_get_blocks() The static function ext4_get_blocks_handle() is badly named. Of *course* it takes a handle. Since its counterpart for extent-based file is ext4_ext_get_blocks(), rename it to be ext4_ind_get_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 476d843610a..f758e8021d1 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -914,7 +914,7 @@ err_out: * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) */ -static int ext4_get_blocks_handle(handle_t *handle, struct inode *inode, +static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, int create, int extend_disksize) @@ -1129,7 +1129,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * mapped. * * If file type is extents based, it will call ext4_ext_get_blocks(), - * Otherwise, call with ext4_get_blocks_handle() to handle indirect mapping + * Otherwise, call with ext4_ind_get_blocks() to handle indirect mapping * based files * * On success, it returns the number of blocks being mapped or allocate. @@ -1160,8 +1160,8 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, 0, 0); } else { - retval = ext4_get_blocks_handle(handle, - inode, block, max_blocks, bh, 0, 0); + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, + bh, 0, 0); } up_read((&EXT4_I(inode)->i_data_sem)); @@ -1215,7 +1215,7 @@ int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); } else { - retval = ext4_get_blocks_handle(handle, inode, block, + retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, bh, create, extend_disksize); if (retval > 0 && buffer_new(bh)) { @@ -1297,7 +1297,7 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_get_blocks_wrap(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_handle() returns number of blocks + * ext4_get_blocks_wrap() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { From e458824f9d32e9bf7700d1eb0d201749af48eee0 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 12 May 2009 08:49:32 +0200 Subject: [PATCH 1019/2061] scsi: fix resid_len mis-conversion in scsi_end_request() Commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f introduced rq->data_len and converted residual count users to it. While converting, it mistakenly converted scsi_end_request() to finish requests with residual count when it wants to do is fully complete the request. Fix it by using blk_end_request_all() instead. This bug was spotted by Boaz Harrosh. Signed-off-by: Tejun Heo Spotted-by: Boaz Harrosh Cc: FUJITA Tomonori Cc: James Bottomley Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index a54bec99438..e410d667910 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -546,14 +546,9 @@ static struct scsi_cmnd *scsi_end_request(struct scsi_cmnd *cmd, int error, * to queue the remainder of them. */ if (blk_end_request(req, error, bytes)) { - int leftover = blk_rq_bytes(req); - - if (blk_pc_request(req)) - leftover = req->resid_len; - /* kill remainder if no retrys */ if (error && scsi_noretry_cmd(cmd)) - blk_end_request(req, error, leftover); + blk_end_request_all(req, error); else { if (requeue) { /* From f3f8290cb3fa4aa627321530bb85d5f35e487433 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 16:07:40 +0900 Subject: [PATCH 1020/2061] sh: clkfwk: Handle clk_get_sys() returning an ERR_PTR. clk_get() needs to also perform an IS_ERR() check to see whether clk_get_sys() failed or not. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 34707b86776..9e1fc133a47 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -399,7 +399,7 @@ struct clk *clk_get(struct device *dev, const char *id) int idno; clk = clk_get_sys(dev_id, id); - if (clk) + if (clk && !IS_ERR(clk)) return clk; if (dev == NULL || dev->bus != &platform_bus_type) From 871b72dd1e12afc3f024479531d25a9339d2e3f9 Mon Sep 17 00:00:00 2001 From: Dmitry Adamushko Date: Mon, 11 May 2009 23:48:27 +0200 Subject: [PATCH 1021/2061] x86: microcode: use smp_call_function_single instead of set_cpus_allowed, cleanup of synchronization logic * Solve issues described in 6f66cbc63081fd70e3191b4dbb796746780e5ae1 in a way that doesn't resort to set_cpus_allowed(); * in fact, only collect_cpu_info and apply_microcode callbacks must run on a target cpu, others will do just fine on any other. smp_call_function_single() (as suggested by Ingo) is used to run these callbacks on a target cpu. * cleanup of synchronization logic of the 'microcode_core' part The generic 'microcode_core' part guarantees that only a single cpu (be it a full-fledged cpu, one of the cores or HT) is being updated at any particular moment of time. In general, there is no need for any additional sync. mechanism in arch-specific parts (the patch removes existing spinlocks). See also the "Synchronization" section in microcode_core.c. * return -EINVAL instead of -1 (which is translated into -EPERM) in microcode_write(), reload_cpu() and mc_sysdev_add(). Other suggestions for an error code? * use 'enum ucode_state' as return value of request_microcode_{fw, user} to gain more flexibility by distinguishing between real error cases and situations when an appropriate ucode was not found (which is not an error per-se). * some minor cleanups Thanks a lot to Hugh Dickins for review/suggestions/testing! Reference: http://marc.info/?l=linux-kernel&m=124025889012541&w=2 [ Impact: refactor and clean up microcode driver locking code ] Signed-off-by: Dmitry Adamushko Acked-by: Hugh Dickins Cc: Andrew Morton Cc: Rusty Russell Cc: Andreas Herrmann Cc: Peter Oruba Cc: Arjan van de Ven LKML-Reference: <1242078507.5560.9.camel@earth> [ did some more cleanups ] Signed-off-by: Ingo Molnar arch/x86/include/asm/microcode.h | 25 ++ arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 326 +++++++++++++++++++++----------------- arch/x86/kernel/microcode_intel.c | 92 +++------- 4 files changed, 261 insertions(+), 240 deletions(-) (~20 new comment lines) --- arch/x86/include/asm/microcode.h | 25 ++- arch/x86/kernel/microcode_amd.c | 58 ++---- arch/x86/kernel/microcode_core.c | 333 +++++++++++++++++------------- arch/x86/kernel/microcode_intel.c | 92 +++------ 4 files changed, 265 insertions(+), 243 deletions(-) diff --git a/arch/x86/include/asm/microcode.h b/arch/x86/include/asm/microcode.h index c882664716c..ef51b501e22 100644 --- a/arch/x86/include/asm/microcode.h +++ b/arch/x86/include/asm/microcode.h @@ -9,20 +9,31 @@ struct cpu_signature { struct device; +enum ucode_state { UCODE_ERROR, UCODE_OK, UCODE_NFOUND }; + struct microcode_ops { - int (*request_microcode_user) (int cpu, const void __user *buf, size_t size); - int (*request_microcode_fw) (int cpu, struct device *device); + enum ucode_state (*request_microcode_user) (int cpu, + const void __user *buf, size_t size); - void (*apply_microcode) (int cpu); + enum ucode_state (*request_microcode_fw) (int cpu, + struct device *device); - int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); void (*microcode_fini_cpu) (int cpu); + + /* + * The generic 'microcode_core' part guarantees that + * the callbacks below run on a target cpu when they + * are being called. + * See also the "Synchronization" section in microcode_core.c. + */ + int (*apply_microcode) (int cpu); + int (*collect_cpu_info) (int cpu, struct cpu_signature *csig); }; struct ucode_cpu_info { - struct cpu_signature cpu_sig; - int valid; - void *mc; + struct cpu_signature cpu_sig; + int valid; + void *mc; }; extern struct ucode_cpu_info ucode_cpu_info[]; diff --git a/arch/x86/kernel/microcode_amd.c b/arch/x86/kernel/microcode_amd.c index 453b5795a5c..c8be20f1644 100644 --- a/arch/x86/kernel/microcode_amd.c +++ b/arch/x86/kernel/microcode_amd.c @@ -13,25 +13,13 @@ * Licensed under the terms of the GNU General Public * License version 2. See file COPYING for details. */ -#include -#include -#include #include -#include -#include #include #include #include #include #include -#include -#include -#include -#include -#include #include -#include -#include #include #include @@ -79,9 +67,6 @@ struct microcode_amd { #define UCODE_CONTAINER_SECTION_HDR 8 #define UCODE_CONTAINER_HEADER_SIZE 12 -/* serialize access to the physical write */ -static DEFINE_SPINLOCK(microcode_update_lock); - static struct equiv_cpu_entry *equiv_cpu_table; static int collect_cpu_info_amd(int cpu, struct cpu_signature *csig) @@ -144,9 +129,8 @@ static int get_matching_microcode(int cpu, void *mc, int rev) return 1; } -static void apply_microcode_amd(int cpu) +static int apply_microcode_amd(int cpu) { - unsigned long flags; u32 rev, dummy; int cpu_num = raw_smp_processor_id(); struct ucode_cpu_info *uci = ucode_cpu_info + cpu_num; @@ -156,25 +140,25 @@ static void apply_microcode_amd(int cpu) BUG_ON(cpu_num != cpu); if (mc_amd == NULL) - return; + return 0; - spin_lock_irqsave(µcode_update_lock, flags); wrmsrl(MSR_AMD64_PATCH_LOADER, (u64)(long)&mc_amd->hdr.data_code); /* get patch id after patching */ rdmsr(MSR_AMD64_PATCH_LEVEL, rev, dummy); - spin_unlock_irqrestore(µcode_update_lock, flags); /* check current patch id and patch's id for match */ if (rev != mc_amd->hdr.patch_id) { printk(KERN_ERR "microcode: CPU%d: update failed " "(for patch_level=0x%x)\n", cpu, mc_amd->hdr.patch_id); - return; + return -1; } printk(KERN_INFO "microcode: CPU%d: updated (new patch_level=0x%x)\n", cpu, rev); uci->cpu_sig.rev = rev; + + return 0; } static int get_ucode_data(void *to, const u8 *from, size_t n) @@ -263,7 +247,8 @@ static void free_equiv_cpu_table(void) } } -static int generic_load_microcode(int cpu, const u8 *data, size_t size) +static enum ucode_state +generic_load_microcode(int cpu, const u8 *data, size_t size) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; const u8 *ucode_ptr = data; @@ -272,12 +257,13 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) int new_rev = uci->cpu_sig.rev; unsigned int leftover; unsigned long offset; + enum ucode_state state = UCODE_OK; offset = install_equiv_cpu_table(ucode_ptr); if (!offset) { printk(KERN_ERR "microcode: failed to create " "equivalent cpu table\n"); - return -EINVAL; + return UCODE_ERROR; } ucode_ptr += offset; @@ -312,28 +298,27 @@ static int generic_load_microcode(int cpu, const u8 *data, size_t size) pr_debug("microcode: CPU%d found a matching microcode " "update with version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - } else + } else { vfree(new_mc); - } + state = UCODE_ERROR; + } + } else + state = UCODE_NFOUND; free_equiv_cpu_table(); - return (int)leftover; + return state; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { const char *fw_name = "amd-ucode/microcode_amd.bin"; const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - - ret = request_firmware(&firmware, fw_name, device); - if (ret) { + if (request_firmware(&firmware, fw_name, device)) { printk(KERN_ERR "microcode: failed to load file %s\n", fw_name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, firmware->data, firmware->size); @@ -343,11 +328,12 @@ static int request_microcode_fw(int cpu, struct device *device) return ret; } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { printk(KERN_INFO "microcode: AMD microcode update via " "/dev/cpu/microcode not supported\n"); - return -1; + return UCODE_ERROR; } static void microcode_fini_cpu_amd(int cpu) diff --git a/arch/x86/kernel/microcode_core.c b/arch/x86/kernel/microcode_core.c index 98c470c069d..9c4461501fc 100644 --- a/arch/x86/kernel/microcode_core.c +++ b/arch/x86/kernel/microcode_core.c @@ -71,27 +71,18 @@ * Thanks to Stuart Swales for pointing out this bug. */ #include -#include #include -#include +#include #include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include #include #include #include #include -#include MODULE_DESCRIPTION("Microcode Update Driver"); MODULE_AUTHOR("Tigran Aivazian "); @@ -101,36 +92,110 @@ MODULE_LICENSE("GPL"); static struct microcode_ops *microcode_ops; -/* no concurrent ->write()s are allowed on /dev/cpu/microcode */ +/* + * Synchronization. + * + * All non cpu-hotplug-callback call sites use: + * + * - microcode_mutex to synchronize with each other; + * - get/put_online_cpus() to synchronize with + * the cpu-hotplug-callback call sites. + * + * We guarantee that only a single cpu is being + * updated at any particular moment of time. + */ static DEFINE_MUTEX(microcode_mutex); struct ucode_cpu_info ucode_cpu_info[NR_CPUS]; EXPORT_SYMBOL_GPL(ucode_cpu_info); +/* + * Operations that are run on a target cpu: + */ + +struct cpu_info_ctx { + struct cpu_signature *cpu_sig; + int err; +}; + +static void collect_cpu_info_local(void *arg) +{ + struct cpu_info_ctx *ctx = arg; + + ctx->err = microcode_ops->collect_cpu_info(smp_processor_id(), + ctx->cpu_sig); +} + +static int collect_cpu_info_on_target(int cpu, struct cpu_signature *cpu_sig) +{ + struct cpu_info_ctx ctx = { .cpu_sig = cpu_sig, .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, collect_cpu_info_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + +static int collect_cpu_info(int cpu) +{ + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + int ret; + + memset(uci, 0, sizeof(*uci)); + + ret = collect_cpu_info_on_target(cpu, &uci->cpu_sig); + if (!ret) + uci->valid = 1; + + return ret; +} + +struct apply_microcode_ctx { + int err; +}; + +static void apply_microcode_local(void *arg) +{ + struct apply_microcode_ctx *ctx = arg; + + ctx->err = microcode_ops->apply_microcode(smp_processor_id()); +} + +static int apply_microcode_on_target(int cpu) +{ + struct apply_microcode_ctx ctx = { .err = 0 }; + int ret; + + ret = smp_call_function_single(cpu, apply_microcode_local, &ctx, 1); + if (!ret) + ret = ctx.err; + + return ret; +} + #ifdef CONFIG_MICROCODE_OLD_INTERFACE static int do_microcode_update(const void __user *buf, size_t size) { - cpumask_t old; int error = 0; int cpu; - old = current->cpus_allowed; - for_each_online_cpu(cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; if (!uci->valid) continue; - set_cpus_allowed_ptr(current, &cpumask_of_cpu(cpu)); - error = microcode_ops->request_microcode_user(cpu, buf, size); - if (error < 0) - goto out; - if (!error) - microcode_ops->apply_microcode(cpu); + ustate = microcode_ops->request_microcode_user(cpu, buf, size); + if (ustate == UCODE_ERROR) { + error = -1; + break; + } else if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); } -out: - set_cpus_allowed_ptr(current, &old); + return error; } @@ -143,19 +208,17 @@ static int microcode_open(struct inode *unused1, struct file *unused2) static ssize_t microcode_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { - ssize_t ret; + ssize_t ret = -EINVAL; if ((len >> PAGE_SHIFT) > num_physpages) { - printk(KERN_ERR "microcode: too much data (max %ld pages)\n", - num_physpages); - return -EINVAL; + pr_err("microcode: too much data (max %ld pages)\n", num_physpages); + return ret; } get_online_cpus(); mutex_lock(µcode_mutex); - ret = do_microcode_update(buf, len); - if (!ret) + if (do_microcode_update(buf, len) == 0) ret = (ssize_t)len; mutex_unlock(µcode_mutex); @@ -165,15 +228,15 @@ static ssize_t microcode_write(struct file *file, const char __user *buf, } static const struct file_operations microcode_fops = { - .owner = THIS_MODULE, - .write = microcode_write, - .open = microcode_open, + .owner = THIS_MODULE, + .write = microcode_write, + .open = microcode_open, }; static struct miscdevice microcode_dev = { - .minor = MICROCODE_MINOR, - .name = "microcode", - .fops = µcode_fops, + .minor = MICROCODE_MINOR, + .name = "microcode", + .fops = µcode_fops, }; static int __init microcode_dev_init(void) @@ -182,9 +245,7 @@ static int __init microcode_dev_init(void) error = misc_register(µcode_dev); if (error) { - printk(KERN_ERR - "microcode: can't misc_register on minor=%d\n", - MICROCODE_MINOR); + pr_err("microcode: can't misc_register on minor=%d\n", MICROCODE_MINOR); return error; } @@ -205,42 +266,51 @@ MODULE_ALIAS_MISCDEV(MICROCODE_MINOR); /* fake device for request_firmware */ static struct platform_device *microcode_pdev; -static long reload_for_cpu(void *unused) +static int reload_for_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; int err = 0; mutex_lock(µcode_mutex); if (uci->valid) { - err = microcode_ops->request_microcode_fw(smp_processor_id(), - µcode_pdev->dev); - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); + enum ucode_state ustate; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + if (ustate == UCODE_OK) + apply_microcode_on_target(cpu); + else + if (ustate == UCODE_ERROR) + err = -EINVAL; } mutex_unlock(µcode_mutex); + return err; } static ssize_t reload_store(struct sys_device *dev, struct sysdev_attribute *attr, - const char *buf, size_t sz) + const char *buf, size_t size) { - char *end; - unsigned long val = simple_strtoul(buf, &end, 0); - int err = 0; + unsigned long val; int cpu = dev->id; + int ret = 0; + char *end; + val = simple_strtoul(buf, &end, 0); if (end == buf) return -EINVAL; + if (val == 1) { get_online_cpus(); if (cpu_online(cpu)) - err = work_on_cpu(cpu, reload_for_cpu, NULL); + ret = reload_for_cpu(cpu); put_online_cpus(); } - if (err) - return err; - return sz; + + if (!ret) + ret = size; + + return ret; } static ssize_t version_show(struct sys_device *dev, @@ -271,11 +341,11 @@ static struct attribute *mc_default_attrs[] = { }; static struct attribute_group mc_attr_group = { - .attrs = mc_default_attrs, - .name = "microcode", + .attrs = mc_default_attrs, + .name = "microcode", }; -static void __microcode_fini_cpu(int cpu) +static void microcode_fini_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; @@ -283,103 +353,68 @@ static void __microcode_fini_cpu(int cpu) uci->valid = 0; } -static void microcode_fini_cpu(int cpu) -{ - mutex_lock(µcode_mutex); - __microcode_fini_cpu(cpu); - mutex_unlock(µcode_mutex); -} - -static void collect_cpu_info(int cpu) +static enum ucode_state microcode_resume_cpu(int cpu) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - memset(uci, 0, sizeof(*uci)); - if (!microcode_ops->collect_cpu_info(cpu, &uci->cpu_sig)) - uci->valid = 1; -} - -static int microcode_resume_cpu(int cpu) -{ - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; - struct cpu_signature nsig; - - pr_debug("microcode: CPU%d resumed\n", cpu); - if (!uci->mc) - return 1; + return UCODE_NFOUND; - /* - * Let's verify that the 'cached' ucode does belong - * to this cpu (a bit of paranoia): - */ - if (microcode_ops->collect_cpu_info(cpu, &nsig)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "failed to collect_cpu_info for resuming cpu #%d\n", - cpu); - return -1; - } + pr_debug("microcode: CPU%d updated upon resume\n", cpu); + apply_microcode_on_target(cpu); - if ((nsig.sig != uci->cpu_sig.sig) || (nsig.pf != uci->cpu_sig.pf)) { - __microcode_fini_cpu(cpu); - printk(KERN_ERR "cached ucode doesn't match the resuming cpu #%d\n", - cpu); - /* Should we look for a new ucode here? */ - return 1; - } - - return 0; + return UCODE_OK; } -static long microcode_update_cpu(void *unused) +static enum ucode_state microcode_init_cpu(int cpu) { - struct ucode_cpu_info *uci = ucode_cpu_info + smp_processor_id(); - int err = 0; + enum ucode_state ustate; - /* - * Check if the system resume is in progress (uci->valid != NULL), - * otherwise just request a firmware: - */ - if (uci->valid) { - err = microcode_resume_cpu(smp_processor_id()); - } else { - collect_cpu_info(smp_processor_id()); - if (uci->valid && system_state == SYSTEM_RUNNING) - err = microcode_ops->request_microcode_fw( - smp_processor_id(), - µcode_pdev->dev); + if (collect_cpu_info(cpu)) + return UCODE_ERROR; + + /* --dimm. Trigger a delayed update? */ + if (system_state != SYSTEM_RUNNING) + return UCODE_NFOUND; + + ustate = microcode_ops->request_microcode_fw(cpu, µcode_pdev->dev); + + if (ustate == UCODE_OK) { + pr_debug("microcode: CPU%d updated upon init\n", cpu); + apply_microcode_on_target(cpu); } - if (!err) - microcode_ops->apply_microcode(smp_processor_id()); - return err; + + return ustate; } -static int microcode_init_cpu(int cpu) +static enum ucode_state microcode_update_cpu(int cpu) { - int err; - mutex_lock(µcode_mutex); - err = work_on_cpu(cpu, microcode_update_cpu, NULL); - mutex_unlock(µcode_mutex); + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; + enum ucode_state ustate; - return err; + if (uci->valid) + ustate = microcode_resume_cpu(cpu); + else + ustate = microcode_init_cpu(cpu); + + return ustate; } static int mc_sysdev_add(struct sys_device *sys_dev) { int err, cpu = sys_dev->id; - struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; pr_debug("microcode: CPU%d added\n", cpu); - memset(uci, 0, sizeof(*uci)); err = sysfs_create_group(&sys_dev->kobj, &mc_attr_group); if (err) return err; - err = microcode_init_cpu(cpu); + if (microcode_init_cpu(cpu) == UCODE_ERROR) + err = -EINVAL; return err; } @@ -400,19 +435,30 @@ static int mc_sysdev_remove(struct sys_device *sys_dev) static int mc_sysdev_resume(struct sys_device *dev) { int cpu = dev->id; + struct ucode_cpu_info *uci = ucode_cpu_info + cpu; if (!cpu_online(cpu)) return 0; - /* only CPU 0 will apply ucode here */ - microcode_update_cpu(NULL); + /* + * All non-bootup cpus are still disabled, + * so only CPU 0 will apply ucode here. + * + * Moreover, there can be no concurrent + * updates from any other places at this point. + */ + WARN_ON(cpu != 0); + + if (uci->valid && uci->mc) + microcode_ops->apply_microcode(cpu); + return 0; } static struct sysdev_driver mc_sysdev_driver = { - .add = mc_sysdev_add, - .remove = mc_sysdev_remove, - .resume = mc_sysdev_resume, + .add = mc_sysdev_add, + .remove = mc_sysdev_remove, + .resume = mc_sysdev_resume, }; static __cpuinit int @@ -425,15 +471,12 @@ mc_cpu_callback(struct notifier_block *nb, unsigned long action, void *hcpu) switch (action) { case CPU_ONLINE: case CPU_ONLINE_FROZEN: - if (microcode_init_cpu(cpu)) - printk(KERN_ERR "microcode: failed to init CPU%d\n", - cpu); + microcode_update_cpu(cpu); case CPU_DOWN_FAILED: case CPU_DOWN_FAILED_FROZEN: pr_debug("microcode: CPU%d added\n", cpu); if (sysfs_create_group(&sys_dev->kobj, &mc_attr_group)) - printk(KERN_ERR "microcode: Failed to create the sysfs " - "group for CPU%d\n", cpu); + pr_err("microcode: Failed to create group for CPU%d\n", cpu); break; case CPU_DOWN_PREPARE: case CPU_DOWN_PREPARE_FROZEN: @@ -465,13 +508,10 @@ static int __init microcode_init(void) microcode_ops = init_amd_microcode(); if (!microcode_ops) { - printk(KERN_ERR "microcode: no support for this CPU vendor\n"); + pr_err("microcode: no support for this CPU vendor\n"); return -ENODEV; } - error = microcode_dev_init(); - if (error) - return error; microcode_pdev = platform_device_register_simple("microcode", -1, NULL, 0); if (IS_ERR(microcode_pdev)) { @@ -480,23 +520,31 @@ static int __init microcode_init(void) } get_online_cpus(); + mutex_lock(µcode_mutex); + error = sysdev_driver_register(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); + if (error) { - microcode_dev_exit(); platform_device_unregister(microcode_pdev); return error; } + error = microcode_dev_init(); + if (error) + return error; + register_hotcpu_notifier(&mc_cpu_notifier); - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " ," " Peter Oruba\n"); return 0; } +module_init(microcode_init); static void __exit microcode_exit(void) { @@ -505,16 +553,17 @@ static void __exit microcode_exit(void) unregister_hotcpu_notifier(&mc_cpu_notifier); get_online_cpus(); + mutex_lock(µcode_mutex); + sysdev_driver_unregister(&cpu_sysdev_class, &mc_sysdev_driver); + + mutex_unlock(µcode_mutex); put_online_cpus(); platform_device_unregister(microcode_pdev); microcode_ops = NULL; - printk(KERN_INFO - "Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); + pr_info("Microcode Update Driver: v" MICROCODE_VERSION " removed.\n"); } - -module_init(microcode_init); module_exit(microcode_exit); diff --git a/arch/x86/kernel/microcode_intel.c b/arch/x86/kernel/microcode_intel.c index 149b9ec7c1a..0d334ddd0a9 100644 --- a/arch/x86/kernel/microcode_intel.c +++ b/arch/x86/kernel/microcode_intel.c @@ -70,24 +70,11 @@ * Fix sigmatch() macro to handle old CPUs with pf == 0. * Thanks to Stuart Swales for pointing out this bug. */ -#include -#include -#include #include -#include -#include -#include #include -#include #include #include -#include -#include -#include -#include -#include -#include -#include +#include #include #include @@ -150,13 +137,9 @@ struct extended_sigtable { #define exttable_size(et) ((et)->count * EXT_SIGNATURE_SIZE + EXT_HEADER_SIZE) -/* serialize access to the physical write to MSR 0x79 */ -static DEFINE_SPINLOCK(microcode_update_lock); - static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) { struct cpuinfo_x86 *c = &cpu_data(cpu_num); - unsigned long flags; unsigned int val[2]; memset(csig, 0, sizeof(*csig)); @@ -176,18 +159,14 @@ static int collect_cpu_info(int cpu_num, struct cpu_signature *csig) csig->pf = 1 << ((val[1] >> 18) & 7); } - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); - wrmsr(MSR_IA32_UCODE_REV, 0, 0); /* see notes above for revision 1.07. Apparent chip bug */ sync_core(); /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], csig->rev); - spin_unlock_irqrestore(µcode_update_lock, flags); - pr_debug("microcode: collect_cpu_info : sig=0x%x, pf=0x%x, rev=0x%x\n", - csig->sig, csig->pf, csig->rev); + printk(KERN_INFO "microcode: CPU%d sig=0x%x, pf=0x%x, revision=0x%x\n", + cpu_num, csig->sig, csig->pf, csig->rev); return 0; } @@ -318,11 +297,10 @@ get_matching_microcode(struct cpu_signature *cpu_sig, void *mc, int rev) return 0; } -static void apply_microcode(int cpu) +static int apply_microcode(int cpu) { struct microcode_intel *mc_intel; struct ucode_cpu_info *uci; - unsigned long flags; unsigned int val[2]; int cpu_num; @@ -334,10 +312,7 @@ static void apply_microcode(int cpu) BUG_ON(cpu_num != cpu); if (mc_intel == NULL) - return; - - /* serialize access to the physical write to MSR 0x79 */ - spin_lock_irqsave(µcode_update_lock, flags); + return 0; /* write microcode via MSR 0x79 */ wrmsr(MSR_IA32_UCODE_WRITE, @@ -351,30 +326,32 @@ static void apply_microcode(int cpu) /* get the current revision from MSR 0x8B */ rdmsr(MSR_IA32_UCODE_REV, val[0], val[1]); - spin_unlock_irqrestore(µcode_update_lock, flags); if (val[1] != mc_intel->hdr.rev) { - printk(KERN_ERR "microcode: CPU%d update from revision " - "0x%x to 0x%x failed\n", - cpu_num, uci->cpu_sig.rev, val[1]); - return; + printk(KERN_ERR "microcode: CPU%d update " + "to revision 0x%x failed\n", + cpu_num, mc_intel->hdr.rev); + return -1; } - printk(KERN_INFO "microcode: CPU%d updated from revision " - "0x%x to 0x%x, date = %04x-%02x-%02x \n", - cpu_num, uci->cpu_sig.rev, val[1], + printk(KERN_INFO "microcode: CPU%d updated to revision " + "0x%x, date = %04x-%02x-%02x \n", + cpu_num, val[1], mc_intel->hdr.date & 0xffff, mc_intel->hdr.date >> 24, (mc_intel->hdr.date >> 16) & 0xff); uci->cpu_sig.rev = val[1]; + + return 0; } -static int generic_load_microcode(int cpu, void *data, size_t size, - int (*get_ucode_data)(void *, const void *, size_t)) +static enum ucode_state generic_load_microcode(int cpu, void *data, size_t size, + int (*get_ucode_data)(void *, const void *, size_t)) { struct ucode_cpu_info *uci = ucode_cpu_info + cpu; u8 *ucode_ptr = data, *new_mc = NULL, *mc; int new_rev = uci->cpu_sig.rev; unsigned int leftover = size; + enum ucode_state state = UCODE_OK; while (leftover) { struct microcode_header_intel mc_header; @@ -412,11 +389,15 @@ static int generic_load_microcode(int cpu, void *data, size_t size, leftover -= mc_size; } - if (!new_mc) - goto out; - if (leftover) { - vfree(new_mc); + if (new_mc) + vfree(new_mc); + state = UCODE_ERROR; + goto out; + } + + if (!new_mc) { + state = UCODE_NFOUND; goto out; } @@ -427,9 +408,8 @@ static int generic_load_microcode(int cpu, void *data, size_t size, pr_debug("microcode: CPU%d found a matching microcode update with" " version 0x%x (current=0x%x)\n", cpu, new_rev, uci->cpu_sig.rev); - - out: - return (int)leftover; +out: + return state; } static int get_ucode_fw(void *to, const void *from, size_t n) @@ -438,21 +418,19 @@ static int get_ucode_fw(void *to, const void *from, size_t n) return 0; } -static int request_microcode_fw(int cpu, struct device *device) +static enum ucode_state request_microcode_fw(int cpu, struct device *device) { char name[30]; struct cpuinfo_x86 *c = &cpu_data(cpu); const struct firmware *firmware; - int ret; + enum ucode_state ret; - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); - ret = request_firmware(&firmware, name, device); - if (ret) { + + if (request_firmware(&firmware, name, device)) { pr_debug("microcode: data file %s load failed\n", name); - return ret; + return UCODE_NFOUND; } ret = generic_load_microcode(cpu, (void *)firmware->data, @@ -468,11 +446,9 @@ static int get_ucode_user(void *to, const void *from, size_t n) return copy_from_user(to, from, n); } -static int request_microcode_user(int cpu, const void __user *buf, size_t size) +static enum ucode_state +request_microcode_user(int cpu, const void __user *buf, size_t size) { - /* We should bind the task to the CPU */ - BUG_ON(cpu != raw_smp_processor_id()); - return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } From 2b1ccc0ee918a653d0483fdad9dd6112ce8e9043 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 12 May 2009 11:11:48 +0200 Subject: [PATCH 1022/2061] splice: fix misleading comment Splice is tied to pipes by design, it'll not change. And now that the splice stuff is in splice.h (and note pipe.h), the rest of the comment is out-of-date as well. Signed-off-by: Jens Axboe --- include/linux/splice.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/include/linux/splice.h b/include/linux/splice.h index 5f3faa9d15a..18e7c7c0cae 100644 --- a/include/linux/splice.h +++ b/include/linux/splice.h @@ -11,8 +11,7 @@ #include /* - * splice is tied to pipes as a transport (at least for now), so we'll just - * add the splice flags here. + * Flags passed in from splice/tee/vmsplice */ #define SPLICE_F_MOVE (0x01) /* move pages instead of copying */ #define SPLICE_F_NONBLOCK (0x02) /* don't block on the pipe splicing (but */ From 9d62dcdfa6f6fc843f7d9b494bcd48f00b94f883 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 22:05:28 -0400 Subject: [PATCH 1023/2061] x86: merge process.c a bit Merge arch_align_stack() and arch_randomize_brk(), since they are the same. Tested on x86_64. [ Impact: cleanup ] Signed-off-by: Amerigo Wang Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 14 ++++++++++++++ arch/x86/kernel/process_32.c | 13 ------------- arch/x86/kernel/process_64.c | 13 ------------- 3 files changed, 14 insertions(+), 26 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index ca989158e84..2b9a8d0fb47 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -613,3 +614,16 @@ static int __init idle_setup(char *str) } early_param("idle", idle_setup); +unsigned long arch_align_stack(unsigned long sp) +{ + if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) + sp -= get_random_int() % 8192; + return sp & ~0xf; +} + +unsigned long arch_randomize_brk(struct mm_struct *mm) +{ + unsigned long range_end = mm->brk + 0x02000000; + return randomize_range(mm->brk, range_end, 0) ? : mm->brk; +} + diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 76f8f84043a..a3bb049ad08 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -33,7 +33,6 @@ #include #include #include -#include #include #include #include @@ -497,15 +496,3 @@ unsigned long get_wchan(struct task_struct *p) return 0; } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index b751a41392b..34386f72bdc 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -32,7 +32,6 @@ #include #include #include -#include #include #include #include @@ -660,15 +659,3 @@ long sys_arch_prctl(int code, unsigned long addr) return do_arch_prctl(current, code, addr); } -unsigned long arch_align_stack(unsigned long sp) -{ - if (!(current->personality & ADDR_NO_RANDOMIZE) && randomize_va_space) - sp -= get_random_int() % 8192; - return sp & ~0xf; -} - -unsigned long arch_randomize_brk(struct mm_struct *mm) -{ - unsigned long range_end = mm->brk + 0x02000000; - return randomize_range(mm->brk, range_end, 0) ? : mm->brk; -} From bf78ad69cd351798b9447a269c6bd41ce1f111f4 Mon Sep 17 00:00:00 2001 From: Amerigo Wang Date: Mon, 11 May 2009 23:29:09 -0400 Subject: [PATCH 1024/2061] x86: process.c, remove useless headers is not needed by these files, remove them. [ Impact: cleanup ] Signed-off-by: WANG Cong Cc: akpm@linux-foundation.org LKML-Reference: <20090512032956.5040.77055.sendpatchset@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/kernel/process_32.c | 2 -- arch/x86/kernel/process_64.c | 2 -- 2 files changed, 4 deletions(-) diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index a3bb049ad08..56d50b7d71d 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -9,8 +9,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 34386f72bdc..9d6b20e6cd8 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -14,8 +14,6 @@ * This file handles the architecture-dependent parts of process handling.. */ -#include - #include #include #include From ed077b58f6146684069975122b1728a9d248a501 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Tue, 12 May 2009 16:40:00 +0800 Subject: [PATCH 1025/2061] x86: make sparse mem work in non-NUMA mode With sparse memory, holes should not be marked present for memmap. This patch makes sure sparsemem really works on SMP mode (!NUMA). [ Impact: use less memory to map fragmented RAM, avoid boot-OOM/crash ] Signed-off-by: Shaohua Li Signed-off-by: Sheng Yang LKML-Reference: <1242117600.22431.0.camel@sli10-desk.sh.intel.com> Signed-off-by: Ingo Molnar --- arch/x86/mm/init_32.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index fef1d90d4f1..949708d7a48 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -706,15 +706,15 @@ void __init initmem_init(unsigned long start_pfn, highstart_pfn = highend_pfn = max_pfn; if (max_pfn > max_low_pfn) highstart_pfn = max_low_pfn; - memory_present(0, 0, highend_pfn); e820_register_active_regions(0, 0, highend_pfn); + sparse_memory_present_with_active_regions(0); printk(KERN_NOTICE "%ldMB HIGHMEM available.\n", pages_to_mb(highend_pfn - highstart_pfn)); num_physpages = highend_pfn; high_memory = (void *) __va(highstart_pfn * PAGE_SIZE - 1) + 1; #else - memory_present(0, 0, max_low_pfn); e820_register_active_regions(0, 0, max_low_pfn); + sparse_memory_present_with_active_regions(0); num_physpages = max_low_pfn; high_memory = (void *) __va(max_low_pfn * PAGE_SIZE - 1) + 1; #endif From 9fe5ee0efb1b1d4a0939bc4252a8427e3337d96a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 19:29:04 +0900 Subject: [PATCH 1026/2061] sh: clkfwk: Use arch_clk_init() for on-chip clock registration. CPUs registering on-chip clocks should be using arch_clk_init() with the new scheme so that the CPUs have the opportunity to establish the topology prior to the initial root clock rate propagation. This ensures that CPUs with on-chip clocks that use CLK_ENABLE_ON_INIT are properly enabled at the initial propagation time, without having to further poke the root clocks. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 8 +------- arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 3 +-- arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 3 +-- arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 3 +-- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 3 +-- arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 3 +-- arch/sh/kernel/cpu/sh4a/clock-shx3.c | 3 +-- 7 files changed, 7 insertions(+), 19 deletions(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 9e1fc133a47..f833843a194 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -437,13 +437,7 @@ void clk_put(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_put); -void __init __attribute__ ((weak)) -arch_init_clk_ops(struct clk_ops **ops, int type) -{ -} - -int __init __attribute__ ((weak)) -arch_clk_init(void) +int __init __weak arch_clk_init(void) { return 0; } diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index 435f4f12ffb..a72dc326d0b 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -150,7 +150,7 @@ static struct clk *sh4202_onchip_clocks[] = { &sh4202_shoc_clk, }; -static int __init sh4202_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -166,4 +166,3 @@ static int __init sh4202_clk_init(void) return ret; } -arch_initcall(sh4202_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index 0110da64a43..ce3d4e6319a 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -90,7 +90,7 @@ static struct clk *sh7763_onchip_clocks[] = { &sh7763_shyway_clk, }; -static int __init sh7763_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -106,4 +106,3 @@ static int __init sh7763_clk_init(void) return ret; } -arch_initcall(sh7763_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c index 0a22d50b109..38b8b1ddb28 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c @@ -96,7 +96,7 @@ static struct clk *sh7780_onchip_clocks[] = { &sh7780_shyway_clk, }; -static int __init sh7780_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -112,4 +112,3 @@ static int __init sh7780_clk_init(void) return ret; } -arch_initcall(sh7780_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index 4dcd1f6f0cb..fa14f35bc11 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -134,7 +134,7 @@ static struct clk *sh7785_onchip_clocks[] = { &sh7785_ram_clk, }; -static int __init sh7785_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -150,4 +150,3 @@ static int __init sh7785_clk_init(void) return ret; } -arch_initcall(sh7785_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c index 825556fe230..7907002fa1d 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c @@ -120,7 +120,7 @@ static struct clk *sh7786_onchip_clocks[] = { &sh7786_ddr_clk, }; -static int __init sh7786_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -136,4 +136,3 @@ static int __init sh7786_clk_init(void) return ret; } -arch_initcall(sh7786_clk_init); diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c index 1eb149b0fe6..c14553e3e13 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c @@ -107,7 +107,7 @@ static struct clk *shx3_onchip_clocks[] = { &shx3_shyway_clk, }; -static int __init shx3_clk_init(void) +int __init arch_clk_init(void) { struct clk *clk = clk_get(NULL, "master_clk"); int i, ret = 0; @@ -123,4 +123,3 @@ static int __init shx3_clk_init(void) return ret; } -arch_initcall(shx3_clk_init); From 0ee8b4d7c7d39d9a9913e27686ef786642c3ccbb Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:13:59 +0000 Subject: [PATCH 1027/2061] sh: TMU platform data for sh7763 This patch adds TMU platform data for sh7763. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 204 +++++++++++++++++++++++++ 1 file changed, 204 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c index bdf0f61ae1e..c91f34c9aa8 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -113,7 +114,195 @@ static struct platform_device usbf_device = { .resource = usbf_resources, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 28, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 29, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 30, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffd88008, + .end = 0xffd88013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 96, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffd88014, + .end = 0xffd8801f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 97, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffd88020, + .end = 0xffd8802b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 98, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + static struct platform_device *sh7763_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, &rtc_device, &sci_device, &usb_ohci_device, @@ -127,6 +316,21 @@ static int __init sh7763_devices_setup(void) } __initcall(sh7763_devices_setup); +static struct platform_device *sh7763_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7763_early_devices, + ARRAY_SIZE(sh7763_early_devices)); +} + enum { UNUSED = 0, From 5d8728a71f416fd38a0945271c71cb2933fc5734 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:18:13 +0000 Subject: [PATCH 1028/2061] sh: add sh7770_generic_defconfig This patch adds a generic sh7770 defconfig file. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/configs/sh7770_generic_defconfig | 700 +++++++++++++++++++++++ 1 file changed, 700 insertions(+) create mode 100644 arch/sh/configs/sh7770_generic_defconfig diff --git a/arch/sh/configs/sh7770_generic_defconfig b/arch/sh/configs/sh7770_generic_defconfig new file mode 100644 index 00000000000..d6489b46ca5 --- /dev/null +++ b/arch/sh/configs/sh7770_generic_defconfig @@ -0,0 +1,700 @@ +# +# Automatically generated make config: don't edit +# Linux kernel version: 2.6.30-rc4 +# Tue May 12 14:48:21 2009 +# +CONFIG_SUPERH=y +CONFIG_SUPERH32=y +# CONFIG_SUPERH64 is not set +CONFIG_ARCH_DEFCONFIG="arch/sh/configs/shx3_defconfig" +CONFIG_RWSEM_GENERIC_SPINLOCK=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_FIND_NEXT_BIT=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_GENERIC_HARDIRQS=y +CONFIG_GENERIC_HARDIRQS_NO__DO_IRQ=y +CONFIG_GENERIC_IRQ_PROBE=y +# CONFIG_GENERIC_GPIO is not set +CONFIG_GENERIC_TIME=y +CONFIG_GENERIC_CLOCKEVENTS=y +# CONFIG_ARCH_SUSPEND_POSSIBLE is not set +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_SYS_SUPPORTS_TMU=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_HAVE_LATENCYTOP_SUPPORT=y +# CONFIG_ARCH_HAS_ILOG2_U32 is not set +# CONFIG_ARCH_HAS_ILOG2_U64 is not set +CONFIG_ARCH_NO_VIRT_TO_BUS=y +CONFIG_ARCH_HAS_DEFAULT_IDLE=y +CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" + +# +# General setup +# +CONFIG_EXPERIMENTAL=y +CONFIG_BROKEN_ON_SMP=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +CONFIG_LOCALVERSION="" +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +# CONFIG_BSD_PROCESS_ACCT is not set + +# +# RCU Subsystem +# +# CONFIG_CLASSIC_RCU is not set +CONFIG_TREE_RCU=y +# CONFIG_PREEMPT_RCU is not set +# CONFIG_RCU_TRACE is not set +CONFIG_RCU_FANOUT=32 +# CONFIG_RCU_FANOUT_EXACT is not set +# CONFIG_TREE_RCU_TRACE is not set +# CONFIG_PREEMPT_RCU_TRACE is not set +# CONFIG_IKCONFIG is not set +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_GROUP_SCHED=y +CONFIG_FAIR_GROUP_SCHED=y +CONFIG_RT_GROUP_SCHED=y +CONFIG_USER_SCHED=y +# CONFIG_CGROUP_SCHED is not set +CONFIG_CGROUPS=y +# CONFIG_CGROUP_DEBUG is not set +# CONFIG_CGROUP_NS is not set +# CONFIG_CGROUP_FREEZER is not set +# CONFIG_CGROUP_DEVICE is not set +# CONFIG_CPUSETS is not set +# CONFIG_CGROUP_CPUACCT is not set +# CONFIG_RESOURCE_COUNTERS is not set +# CONFIG_RELAY is not set +# CONFIG_NAMESPACES is not set +# CONFIG_BLK_DEV_INITRD is not set +CONFIG_CC_OPTIMIZE_FOR_SIZE=y +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_EMBEDDED=y +# CONFIG_UID16 is not set +CONFIG_SYSCTL_SYSCALL=y +CONFIG_KALLSYMS=y +# CONFIG_KALLSYMS_EXTRA_PASS is not set +# CONFIG_STRIP_ASM_SYMS is not set +CONFIG_HOTPLUG=y +CONFIG_PRINTK=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_VM_EVENT_COUNTERS=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +# CONFIG_SLOB is not set +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +# CONFIG_MARKERS is not set +CONFIG_OPROFILE=y +CONFIG_HAVE_OPROFILE=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_DMA_API_DEBUG=y +# CONFIG_SLOW_WORK is not set +CONFIG_HAVE_GENERIC_DMA_COHERENT=y +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +# CONFIG_MODULES is not set +CONFIG_BLOCK=y +# CONFIG_LBD is not set +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_BLK_DEV_INTEGRITY is not set + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_AS=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_DEFAULT_AS=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="anticipatory" +CONFIG_FREEZER=y + +# +# System type +# +CONFIG_CPU_SH4=y +CONFIG_CPU_SH4A=y +# CONFIG_CPU_SUBTYPE_SH7619 is not set +# CONFIG_CPU_SUBTYPE_SH7201 is not set +# CONFIG_CPU_SUBTYPE_SH7203 is not set +# CONFIG_CPU_SUBTYPE_SH7206 is not set +# CONFIG_CPU_SUBTYPE_SH7263 is not set +# CONFIG_CPU_SUBTYPE_MXG is not set +# CONFIG_CPU_SUBTYPE_SH7705 is not set +# CONFIG_CPU_SUBTYPE_SH7706 is not set +# CONFIG_CPU_SUBTYPE_SH7707 is not set +# CONFIG_CPU_SUBTYPE_SH7708 is not set +# CONFIG_CPU_SUBTYPE_SH7709 is not set +# CONFIG_CPU_SUBTYPE_SH7710 is not set +# CONFIG_CPU_SUBTYPE_SH7712 is not set +# CONFIG_CPU_SUBTYPE_SH7720 is not set +# CONFIG_CPU_SUBTYPE_SH7721 is not set +# CONFIG_CPU_SUBTYPE_SH7750 is not set +# CONFIG_CPU_SUBTYPE_SH7091 is not set +# CONFIG_CPU_SUBTYPE_SH7750R is not set +# CONFIG_CPU_SUBTYPE_SH7750S is not set +# CONFIG_CPU_SUBTYPE_SH7751 is not set +# CONFIG_CPU_SUBTYPE_SH7751R is not set +# CONFIG_CPU_SUBTYPE_SH7760 is not set +# CONFIG_CPU_SUBTYPE_SH4_202 is not set +# CONFIG_CPU_SUBTYPE_SH7723 is not set +# CONFIG_CPU_SUBTYPE_SH7724 is not set +# CONFIG_CPU_SUBTYPE_SH7763 is not set +CONFIG_CPU_SUBTYPE_SH7770=y +# CONFIG_CPU_SUBTYPE_SH7780 is not set +# CONFIG_CPU_SUBTYPE_SH7785 is not set +# CONFIG_CPU_SUBTYPE_SH7786 is not set +# CONFIG_CPU_SUBTYPE_SHX3 is not set +# CONFIG_CPU_SUBTYPE_SH7343 is not set +# CONFIG_CPU_SUBTYPE_SH7722 is not set +# CONFIG_CPU_SUBTYPE_SH7366 is not set + +# +# Memory management options +# +CONFIG_QUICKLIST=y +CONFIG_MMU=y +CONFIG_PAGE_OFFSET=0x80000000 +CONFIG_MEMORY_START=0x08000000 +CONFIG_MEMORY_SIZE=0x04000000 +CONFIG_29BIT=y +CONFIG_VSYSCALL=y +CONFIG_ARCH_FLATMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_MAX_ACTIVE_REGIONS=1 +CONFIG_ARCH_POPULATES_NODE_MAP=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_PAGE_SIZE_4KB=y +# CONFIG_PAGE_SIZE_8KB is not set +# CONFIG_PAGE_SIZE_16KB is not set +# CONFIG_PAGE_SIZE_64KB is not set +CONFIG_SELECT_MEMORY_MODEL=y +# CONFIG_FLATMEM_MANUAL is not set +# CONFIG_DISCONTIGMEM_MANUAL is not set +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_STATIC=y + +# +# Memory hotplug is currently incompatible with Software Suspend +# +CONFIG_PAGEFLAGS_EXTENDED=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MIGRATION=y +# CONFIG_PHYS_ADDR_T_64BIT is not set +CONFIG_ZONE_DMA_FLAG=0 +CONFIG_NR_QUICK=2 +CONFIG_UNEVICTABLE_LRU=y +CONFIG_HAVE_MLOCK=y +CONFIG_HAVE_MLOCKED_PAGE_BIT=y + +# +# Cache configuration +# +CONFIG_CACHE_WRITEBACK=y +# CONFIG_CACHE_WRITETHROUGH is not set +# CONFIG_CACHE_OFF is not set + +# +# Processor features +# +CONFIG_CPU_LITTLE_ENDIAN=y +# CONFIG_CPU_BIG_ENDIAN is not set +CONFIG_SH_FPU=y +# CONFIG_SH_STORE_QUEUES is not set +CONFIG_CPU_HAS_INTEVT=y +CONFIG_CPU_HAS_SR_RB=y +CONFIG_CPU_HAS_FPU=y + +# +# Board support +# + +# +# Timer and clock configuration +# +CONFIG_SH_TMU=y +CONFIG_SH_TIMER_IRQ=16 +CONFIG_SH_PCLK_FREQ=41666666 +CONFIG_TICK_ONESHOT=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_GENERIC_CLOCKEVENTS_BUILD=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_TABLE=y +# CONFIG_CPU_FREQ_DEBUG is not set +CONFIG_CPU_FREQ_STAT=y +# CONFIG_CPU_FREQ_STAT_DETAILS is not set +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_GOV_CONSERVATIVE is not set +CONFIG_SH_CPU_FREQ=y + +# +# DMA support +# +# CONFIG_SH_DMA is not set + +# +# Companion Chips +# + +# +# Additional SuperH Device Drivers +# +# CONFIG_HEARTBEAT is not set +# CONFIG_PUSH_SWITCH is not set + +# +# Kernel features +# +# CONFIG_HZ_100 is not set +CONFIG_HZ_250=y +# CONFIG_HZ_300 is not set +# CONFIG_HZ_1000 is not set +CONFIG_HZ=250 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +# CONFIG_CRASH_DUMP is not set +CONFIG_KEXEC_JUMP=y +CONFIG_PREEMPT_NONE=y +# CONFIG_PREEMPT_VOLUNTARY is not set +# CONFIG_PREEMPT is not set +CONFIG_GUSA=y + +# +# Boot options +# +CONFIG_ZERO_PAGE_OFFSET=0x00001000 +CONFIG_BOOT_LINK_OFFSET=0x00800000 +CONFIG_ENTRY_OFFSET=0x00001000 +# CONFIG_CMDLINE_BOOL is not set + +# +# Bus options +# +# CONFIG_ARCH_SUPPORTS_MSI is not set +# CONFIG_PCCARD is not set + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +# CONFIG_HAVE_AOUT is not set +# CONFIG_BINFMT_MISC is not set + +# +# Power management options (EXPERIMENTAL) +# +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_SLEEP=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +# CONFIG_NET is not set + +# +# Device Drivers +# + +# +# Generic Driver Options +# +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_STANDALONE=y +# CONFIG_PREVENT_FIRMWARE_BUILD is not set +CONFIG_FW_LOADER=y +CONFIG_FIRMWARE_IN_KERNEL=y +CONFIG_EXTRA_FIRMWARE="" +# CONFIG_SYS_HYPERVISOR is not set +# CONFIG_MTD is not set +# CONFIG_PARPORT is not set +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_COW_COMMON is not set +# CONFIG_BLK_DEV_LOOP is not set +# CONFIG_BLK_DEV_RAM is not set +# CONFIG_CDROM_PKTCDVD is not set +# CONFIG_BLK_DEV_HD is not set +# CONFIG_MISC_DEVICES is not set +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +# CONFIG_RAID_ATTRS is not set +# CONFIG_SCSI is not set +# CONFIG_SCSI_DMA is not set +# CONFIG_SCSI_NETLINK is not set +# CONFIG_ATA is not set +# CONFIG_MD is not set +# CONFIG_PHONE is not set + +# +# Input device support +# +# CONFIG_INPUT is not set + +# +# Hardware I/O ports +# +# CONFIG_SERIO is not set +# CONFIG_GAMEPORT is not set + +# +# Character devices +# +# CONFIG_VT is not set +# CONFIG_DEVKMEM is not set +# CONFIG_SERIAL_NONSTANDARD is not set + +# +# Serial drivers +# +# CONFIG_SERIAL_8250 is not set + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_SH_SCI=y +CONFIG_SERIAL_SH_SCI_NR_UARTS=6 +CONFIG_SERIAL_SH_SCI_CONSOLE=y +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +# CONFIG_UNIX98_PTYS is not set +# CONFIG_LEGACY_PTYS is not set +# CONFIG_IPMI_HANDLER is not set +# CONFIG_HW_RANDOM is not set +# CONFIG_R3964 is not set +# CONFIG_RAW_DRIVER is not set +# CONFIG_TCG_TPM is not set +CONFIG_I2C=y +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_HELPER_AUTO=y + +# +# I2C Hardware Bus support +# + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +# CONFIG_I2C_OCORES is not set +CONFIG_I2C_SH_MOBILE=y +# CONFIG_I2C_SIMTEC is not set + +# +# External I2C/SMBus adapter drivers +# +# CONFIG_I2C_PARPORT_LIGHT is not set +# CONFIG_I2C_TAOS_EVM is not set + +# +# Other I2C/SMBus bus drivers +# +# CONFIG_I2C_PCA_PLATFORM is not set + +# +# Miscellaneous I2C Chip support +# +# CONFIG_DS1682 is not set +# CONFIG_SENSORS_PCF8574 is not set +# CONFIG_PCF8575 is not set +# CONFIG_SENSORS_PCA9539 is not set +# CONFIG_SENSORS_MAX6875 is not set +# CONFIG_SENSORS_TSL2550 is not set +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +# CONFIG_I2C_DEBUG_CHIP is not set +# CONFIG_SPI is not set +# CONFIG_W1 is not set +# CONFIG_POWER_SUPPLY is not set +# CONFIG_HWMON is not set +# CONFIG_THERMAL is not set +# CONFIG_THERMAL_HWMON is not set +# CONFIG_WATCHDOG is not set +CONFIG_SSB_POSSIBLE=y + +# +# Sonics Silicon Backplane +# +# CONFIG_SSB is not set + +# +# Multifunction device drivers +# +# CONFIG_MFD_CORE is not set +# CONFIG_MFD_SM501 is not set +# CONFIG_HTC_PASIC3 is not set +# CONFIG_TWL4030_CORE is not set +# CONFIG_MFD_TMIO is not set +# CONFIG_PMIC_DA903X is not set +# CONFIG_MFD_WM8400 is not set +# CONFIG_MFD_WM8350_I2C is not set +# CONFIG_MFD_PCF50633 is not set +# CONFIG_REGULATOR is not set + +# +# Multimedia devices +# + +# +# Multimedia core support +# +# CONFIG_VIDEO_DEV is not set +# CONFIG_VIDEO_MEDIA is not set + +# +# Multimedia drivers +# +# CONFIG_DAB is not set + +# +# Graphics support +# +# CONFIG_VGASTATE is not set +# CONFIG_VIDEO_OUTPUT_CONTROL is not set +# CONFIG_FB is not set +# CONFIG_BACKLIGHT_LCD_SUPPORT is not set + +# +# Display device support +# +# CONFIG_DISPLAY_SUPPORT is not set +# CONFIG_SOUND is not set +# CONFIG_USB_SUPPORT is not set +# CONFIG_MMC is not set +# CONFIG_MEMSTICK is not set +# CONFIG_NEW_LEDS is not set +# CONFIG_ACCESSIBILITY is not set +CONFIG_RTC_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set + +# +# RTC interfaces +# +CONFIG_RTC_INTF_DEV=y +# CONFIG_RTC_INTF_DEV_UIE_EMUL is not set +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +# CONFIG_RTC_DRV_DS1307 is not set +# CONFIG_RTC_DRV_DS1374 is not set +# CONFIG_RTC_DRV_DS1672 is not set +# CONFIG_RTC_DRV_MAX6900 is not set +# CONFIG_RTC_DRV_RS5C372 is not set +# CONFIG_RTC_DRV_ISL1208 is not set +# CONFIG_RTC_DRV_X1205 is not set +# CONFIG_RTC_DRV_PCF8563 is not set +# CONFIG_RTC_DRV_PCF8583 is not set +# CONFIG_RTC_DRV_M41T80 is not set +# CONFIG_RTC_DRV_S35390A is not set +# CONFIG_RTC_DRV_FM3130 is not set +# CONFIG_RTC_DRV_RX8581 is not set + +# +# SPI RTC drivers +# + +# +# Platform RTC drivers +# +# CONFIG_RTC_DRV_DS1286 is not set +# CONFIG_RTC_DRV_DS1511 is not set +# CONFIG_RTC_DRV_DS1553 is not set +# CONFIG_RTC_DRV_DS1742 is not set +# CONFIG_RTC_DRV_STK17TA8 is not set +# CONFIG_RTC_DRV_M48T86 is not set +# CONFIG_RTC_DRV_M48T35 is not set +# CONFIG_RTC_DRV_M48T59 is not set +# CONFIG_RTC_DRV_BQ4802 is not set +# CONFIG_RTC_DRV_V3020 is not set + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_SH=y +# CONFIG_RTC_DRV_GENERIC is not set +# CONFIG_DMADEVICES is not set +# CONFIG_AUXDISPLAY is not set +CONFIG_UIO=y +# CONFIG_UIO_PDRV is not set +CONFIG_UIO_PDRV_GENIRQ=y +# CONFIG_UIO_SMX is not set +# CONFIG_UIO_SERCOS3 is not set +# CONFIG_STAGING is not set + +# +# File systems +# +# CONFIG_EXT2_FS is not set +# CONFIG_EXT3_FS is not set +# CONFIG_EXT4_FS is not set +# CONFIG_REISERFS_FS is not set +# CONFIG_JFS_FS is not set +# CONFIG_FS_POSIX_ACL is not set +CONFIG_FILE_LOCKING=y +# CONFIG_XFS_FS is not set +# CONFIG_BTRFS_FS is not set +# CONFIG_DNOTIFY is not set +# CONFIG_INOTIFY is not set +# CONFIG_QUOTA is not set +# CONFIG_AUTOFS_FS is not set +# CONFIG_AUTOFS4_FS is not set +# CONFIG_FUSE_FS is not set + +# +# Caches +# +# CONFIG_FSCACHE is not set + +# +# CD-ROM/DVD Filesystems +# +# CONFIG_ISO9660_FS is not set +# CONFIG_UDF_FS is not set + +# +# DOS/FAT/NT Filesystems +# +# CONFIG_MSDOS_FS is not set +# CONFIG_VFAT_FS is not set +# CONFIG_NTFS_FS is not set + +# +# Pseudo filesystems +# +# CONFIG_PROC_FS is not set +# CONFIG_SYSFS is not set +# CONFIG_TMPFS is not set +# CONFIG_HUGETLBFS is not set +# CONFIG_HUGETLB_PAGE is not set +# CONFIG_MISC_FILESYSTEMS is not set + +# +# Partition Types +# +# CONFIG_PARTITION_ADVANCED is not set +CONFIG_MSDOS_PARTITION=y +# CONFIG_NLS is not set + +# +# Kernel hacking +# +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +# CONFIG_PRINTK_TIME is not set +# CONFIG_ENABLE_WARN_DEPRECATED is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=1024 +# CONFIG_MAGIC_SYSRQ is not set +# CONFIG_UNUSED_SYMBOLS is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_KERNEL is not set +CONFIG_STACKTRACE=y +# CONFIG_DEBUG_BUGVERBOSE is not set +# CONFIG_DEBUG_MEMORY_INIT is not set +# CONFIG_RCU_CPU_STALL_DETECTOR is not set +# CONFIG_LATENCYTOP is not set +# CONFIG_SYSCTL_SYSCALL_CHECK is not set +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_RING_BUFFER=y +CONFIG_TRACING=y +CONFIG_TRACING_SUPPORT=y + +# +# Tracers +# +# CONFIG_FUNCTION_TRACER is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_CONTEXT_SWITCH_TRACER is not set +# CONFIG_EVENT_TRACER is not set +# CONFIG_BOOT_TRACER is not set +# CONFIG_TRACE_BRANCH_PROFILING is not set +# CONFIG_STACK_TRACER is not set +# CONFIG_KMEMTRACE is not set +# CONFIG_WORKQUEUE_TRACER is not set +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_DYNAMIC_DEBUG is not set +# CONFIG_DMA_API_DEBUG is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_SH_STANDARD_BIOS is not set +# CONFIG_EARLY_SCIF_CONSOLE is not set + +# +# Security options +# +# CONFIG_KEYS is not set +# CONFIG_SECURITYFS is not set +# CONFIG_SECURITY_FILE_CAPABILITIES is not set +# CONFIG_CRYPTO is not set +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_GENERIC_FIND_LAST_BIT=y +# CONFIG_CRC_CCITT is not set +# CONFIG_CRC16 is not set +# CONFIG_CRC_T10DIF is not set +# CONFIG_CRC_ITU_T is not set +# CONFIG_CRC32 is not set +# CONFIG_CRC7 is not set +# CONFIG_LIBCRC32C is not set +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT=y +CONFIG_HAS_DMA=y From f251935e02e89aa203e0729bfd727175018cec6f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:20:08 +0000 Subject: [PATCH 1029/2061] sh: TMU platform data for sh7770 This patch adds TMU platform data for sh7770. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 300 +++++++++++++++++++++++++ 1 file changed, 300 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index b73578ee295..b61d6143aaa 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -11,6 +11,7 @@ #include #include #include +#include static struct plat_sci_port sci_platform_data[] = { { @@ -76,7 +77,288 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffd81008, + .end = 0xffd81013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 19, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffd81014, + .end = 0xffd8101f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 20, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffd81020, + .end = 0xffd8102f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 21, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + +static struct sh_timer_config tmu6_platform_data = { + .name = "TMU6", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", +}; + +static struct resource tmu6_resources[] = { + [0] = { + .name = "TMU6", + .start = 0xffd82008, + .end = 0xffd82013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 22, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu6_device = { + .name = "sh_tmu", + .id = 6, + .dev = { + .platform_data = &tmu6_platform_data, + }, + .resource = tmu6_resources, + .num_resources = ARRAY_SIZE(tmu6_resources), +}; + +static struct sh_timer_config tmu7_platform_data = { + .name = "TMU7", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource tmu7_resources[] = { + [0] = { + .name = "TMU7", + .start = 0xffd82014, + .end = 0xffd8201f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 23, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu7_device = { + .name = "sh_tmu", + .id = 7, + .dev = { + .platform_data = &tmu7_platform_data, + }, + .resource = tmu7_resources, + .num_resources = ARRAY_SIZE(tmu7_resources), +}; + +static struct sh_timer_config tmu8_platform_data = { + .name = "TMU8", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu8_resources[] = { + [0] = { + .name = "TMU8", + .start = 0xffd82020, + .end = 0xffd8202b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 24, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu8_device = { + .name = "sh_tmu", + .id = 8, + .dev = { + .platform_data = &tmu8_platform_data, + }, + .resource = tmu8_resources, + .num_resources = ARRAY_SIZE(tmu8_resources), +}; + static struct platform_device *sh7770_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, + &tmu6_device, + &tmu7_device, + &tmu8_device, &sci_device, }; @@ -87,6 +369,24 @@ static int __init sh7770_devices_setup(void) } __initcall(sh7770_devices_setup); +static struct platform_device *sh7770_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, + &tmu6_device, + &tmu7_device, + &tmu8_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7770_early_devices, + ARRAY_SIZE(sh7770_early_devices)); +} + void __init plat_irq_setup(void) { } From 5f8a29ba39d52b2eaaed907b3cb3016b949a8f9b Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:28:25 +0000 Subject: [PATCH 1030/2061] sh: TMU platform data for sh4-202 This patch adds TMU platform data for sh4-202. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 108 +++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index 7371abf64f8..034c069f715 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -11,6 +11,7 @@ #include #include #include +#include static struct plat_sci_port sci_platform_data[] = { { @@ -31,8 +32,103 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct platform_device *sh4202_devices[] __initdata = { &sci_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, }; static int __init sh4202_devices_setup(void) @@ -42,6 +138,18 @@ static int __init sh4202_devices_setup(void) } __initcall(sh4202_devices_setup); +static struct platform_device *sh4202_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh4202_early_devices, + ARRAY_SIZE(sh4202_early_devices)); +} + void __init plat_irq_setup(void) { /* do nothing - all IRL interrupts are handled by the board code */ From 67d889bd828a3cbf36ef27bd4a5c5f7ec76e20b9 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:34:26 +0000 Subject: [PATCH 1031/2061] sh: add sh4-202 INTC tables This patch adds INTC tables for sh4-202 with support for HUDI, TMU0, TMU1, TMU2, RTC, SCIF and WDT. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 56 +++++++++++++++++++++++++- 1 file changed, 55 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index 034c069f715..be79fa13625 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -2,6 +2,7 @@ * SH4-202 Setup * * Copyright (C) 2006 Paul Mundt + * Copyright (C) 2009 Magnus Damm * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -12,6 +13,7 @@ #include #include #include +#include static struct plat_sci_port sci_platform_data[] = { { @@ -150,7 +152,59 @@ void __init plat_early_device_setup(void) ARRAY_SIZE(sh4202_early_devices)); } +enum { + UNUSED = 0, + + /* interrupt sources */ + IRL0, IRL1, IRL2, IRL3, /* only IRLM mode supported */ + HUDI, TMU0, TMU1, TMU2, RTC, SCIF, WDT, +}; + +static struct intc_vect vectors[] __initdata = { + INTC_VECT(HUDI, 0x600), + INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420), + INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2, 0x460), + INTC_VECT(RTC, 0x480), INTC_VECT(RTC, 0x4a0), + INTC_VECT(RTC, 0x4c0), + INTC_VECT(SCIF, 0x700), INTC_VECT(SCIF, 0x720), + INTC_VECT(SCIF, 0x740), INTC_VECT(SCIF, 0x760), + INTC_VECT(WDT, 0x560), +}; + +static struct intc_prio_reg prio_registers[] __initdata = { + { 0xffd00004, 0, 16, 4, /* IPRA */ { TMU0, TMU1, TMU2, RTC } }, + { 0xffd00008, 0, 16, 4, /* IPRB */ { WDT, 0, 0, 0 } }, + { 0xffd0000c, 0, 16, 4, /* IPRC */ { 0, 0, SCIF, HUDI } }, + { 0xffd00010, 0, 16, 4, /* IPRD */ { IRL0, IRL1, IRL2, IRL3 } }, +}; + +static DECLARE_INTC_DESC(intc_desc, "sh4-202", vectors, NULL, + NULL, prio_registers, NULL); + +static struct intc_vect vectors_irlm[] __initdata = { + INTC_VECT(IRL0, 0x240), INTC_VECT(IRL1, 0x2a0), + INTC_VECT(IRL2, 0x300), INTC_VECT(IRL3, 0x360), +}; + +static DECLARE_INTC_DESC(intc_desc_irlm, "sh4-202_irlm", vectors_irlm, NULL, + NULL, prio_registers, NULL); + void __init plat_irq_setup(void) { - /* do nothing - all IRL interrupts are handled by the board code */ + register_intc_controller(&intc_desc); +} + +#define INTC_ICR 0xffd00000UL +#define INTC_ICR_IRLM (1<<7) + +void __init plat_irq_setup_pins(int mode) +{ + switch (mode) { + case IRQ_MODE_IRQ: /* individual interrupt mode for IRL3-0 */ + ctrl_outw(ctrl_inw(INTC_ICR) | INTC_ICR_IRLM, INTC_ICR); + register_intc_controller(&intc_desc_irlm); + break; + default: + BUG(); + } } From e37677a429a67cb45004b060ea3376e500bc73c4 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:39:06 +0000 Subject: [PATCH 1032/2061] sh: TMU platform data for sh7343 This patch adds TMU platform data for sh7343. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7343.c | 98 ++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c index b9c361e8cc8..51204dc7ca2 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7343.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7343.c @@ -173,6 +173,98 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu0", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu0", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu0", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xffe00000, @@ -213,6 +305,9 @@ static struct platform_device sci_device = { static struct platform_device *sh7343_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, &iic0_device, &iic1_device, &sci_device, @@ -240,6 +335,9 @@ __initcall(sh7343_devices_setup); static struct platform_device *sh7343_early_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, }; void __init plat_early_device_setup(void) From f2710ebcd0a0404ccca3bbad68fd3639fe87ba6f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:48:21 +0000 Subject: [PATCH 1033/2061] sh: TMU platform data for sh7366 This patch adds TMU platform data for sh7366. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7366.c | 98 ++++++++++++++++++++++++++ 1 file changed, 98 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c index d4a994be4a7..04de0fa8512 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7366.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7366.c @@ -180,6 +180,98 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu0", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu0", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu0", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct plat_sci_port sci_platform_data[] = { { .mapbase = 0xffe00000, @@ -202,6 +294,9 @@ static struct platform_device sci_device = { static struct platform_device *sh7366_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, &iic_device, &sci_device, &usb_host_device, @@ -229,6 +324,9 @@ __initcall(sh7366_devices_setup); static struct platform_device *sh7366_early_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, }; void __init plat_early_device_setup(void) From 6a3501b63d30643120566cc1efba5acd0e64b12d Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 09:50:46 +0000 Subject: [PATCH 1034/2061] sh: TMU platform data for sh7724 This patch adds TMU platform data for sh7724. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7724.c | 195 +++++++++++++++++++++++++ 1 file changed, 195 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c index a3039ce1a6a..852f8104f03 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7724.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7724.c @@ -268,8 +268,197 @@ static struct platform_device cmt_device = { .num_resources = ARRAY_SIZE(cmt_resources), }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu0", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xffd80008, + .end = 0xffd80013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu0", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xffd80014, + .end = 0xffd8001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu0", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xffd80020, + .end = 0xffd8002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + + +static struct sh_timer_config tmu3_platform_data = { + .name = "TMU3", + .channel_offset = 0x04, + .timer_bit = 0, + .clk = "tmu1", +}; + +static struct resource tmu3_resources[] = { + [0] = { + .name = "TMU3", + .start = 0xffd90008, + .end = 0xffd90013, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 57, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu3_device = { + .name = "sh_tmu", + .id = 3, + .dev = { + .platform_data = &tmu3_platform_data, + }, + .resource = tmu3_resources, + .num_resources = ARRAY_SIZE(tmu3_resources), +}; + +static struct sh_timer_config tmu4_platform_data = { + .name = "TMU4", + .channel_offset = 0x10, + .timer_bit = 1, + .clk = "tmu1", +}; + +static struct resource tmu4_resources[] = { + [0] = { + .name = "TMU4", + .start = 0xffd90014, + .end = 0xffd9001f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 58, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu4_device = { + .name = "sh_tmu", + .id = 4, + .dev = { + .platform_data = &tmu4_platform_data, + }, + .resource = tmu4_resources, + .num_resources = ARRAY_SIZE(tmu4_resources), +}; + +static struct sh_timer_config tmu5_platform_data = { + .name = "TMU5", + .channel_offset = 0x1c, + .timer_bit = 2, + .clk = "tmu1", +}; + +static struct resource tmu5_resources[] = { + [0] = { + .name = "TMU5", + .start = 0xffd90020, + .end = 0xffd9002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 57, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu5_device = { + .name = "sh_tmu", + .id = 5, + .dev = { + .platform_data = &tmu5_platform_data, + }, + .resource = tmu5_resources, + .num_resources = ARRAY_SIZE(tmu5_resources), +}; + static struct platform_device *sh7724_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, &sci_device, &rtc_device, &iic0_device, @@ -296,6 +485,12 @@ device_initcall(sh7724_devices_setup); static struct platform_device *sh7724_early_devices[] __initdata = { &cmt_device, + &tmu0_device, + &tmu1_device, + &tmu2_device, + &tmu3_device, + &tmu4_device, + &tmu5_device, }; void __init plat_early_device_setup(void) From acd664ab54a449a7257dcb3c7ef9cd1e310b4c4f Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:01:01 +0000 Subject: [PATCH 1035/2061] sh: TMU platform data for sh7705 This patch adds TMU platform data for sh7705. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh3/setup-sh7705.c | 108 ++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c index 63b67badd67..39513664d5d 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c @@ -13,6 +13,7 @@ #include #include #include +#include #include enum { @@ -116,7 +117,102 @@ static struct platform_device rtc_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xfffffe94, + .end = 0xfffffe9f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0xe, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xfffffea0, + .end = 0xfffffeab, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1a, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xfffffeac, + .end = 0xfffffebb, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct platform_device *sh7705_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, &sci_device, &rtc_device, }; @@ -128,6 +224,18 @@ static int __init sh7705_devices_setup(void) } __initcall(sh7705_devices_setup); +static struct platform_device *sh7705_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7705_early_devices, + ARRAY_SIZE(sh7705_early_devices)); +} + void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); From c8a9011bce6c43f4988cbcf1db8fc31433e69964 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:04:49 +0000 Subject: [PATCH 1036/2061] sh: TMU platform data for sh7706/sh7707/sh7708/sh7709 Add TMU platform data for sh7706/sh7707/sh7708/sh7709. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh3/setup-sh770x.c | 108 ++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c index a74f960b5e7..9412d915b84 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c @@ -18,6 +18,7 @@ #include #include #include +#include enum { UNUSED = 0, @@ -144,7 +145,102 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xfffffe94, + .end = 0xfffffe9f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0xe, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xfffffea0, + .end = 0xfffffeab, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1a, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xfffffeac, + .end = 0xfffffebb, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct platform_device *sh770x_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, &sci_device, &rtc_device, }; @@ -156,6 +252,18 @@ static int __init sh770x_devices_setup(void) } __initcall(sh770x_devices_setup); +static struct platform_device *sh770x_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh770x_early_devices, + ARRAY_SIZE(sh770x_early_devices)); +} + void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); From e5ad00896a381937326ac55fc173630fe731d041 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:15:30 +0000 Subject: [PATCH 1037/2061] sh: TMU platform data for sh7710/sh7712 This patch adds TMU platform data for sh7710 and sh7712. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh3/setup-sh7710.c | 108 ++++++++++++++++++++++++++ 1 file changed, 108 insertions(+) diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c index 335098b66e2..07ff38d055a 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c @@ -13,6 +13,7 @@ #include #include #include +#include #include enum { @@ -120,7 +121,102 @@ static struct platform_device sci_device = { }, }; +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xa412fe94, + .end = 0xa412fe9f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0xe, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xa412fea0, + .end = 0xa412feab, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1a, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xa412feac, + .end = 0xa412feb5, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct platform_device *sh7710_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, &sci_device, &rtc_device, }; @@ -132,6 +228,18 @@ static int __init sh7710_devices_setup(void) } __initcall(sh7710_devices_setup); +static struct platform_device *sh7710_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7710_early_devices, + ARRAY_SIZE(sh7710_early_devices)); +} + void __init plat_irq_setup(void) { register_intc_controller(&intc_desc); From 4a1a5a2f60ceabc026ba28cdbf81d7d47603b480 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:17:52 +0000 Subject: [PATCH 1038/2061] sh: TMU platform data for sh7720/sh7721 This patch adds TMU platform data for sh7720 and sh7721. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh3/setup-sh7720.c | 109 ++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index 003874a2fd2..c09619c25d9 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -18,6 +18,7 @@ #include #include #include +#include #include static struct resource rtc_resources[] = { @@ -123,7 +124,103 @@ static struct platform_device usbf_device = { .resource = usbf_resources, }; + +static struct sh_timer_config tmu0_platform_data = { + .name = "TMU0", + .channel_offset = 0x02, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 200, +}; + +static struct resource tmu0_resources[] = { + [0] = { + .name = "TMU0", + .start = 0xa412fe94, + .end = 0xa412fe9f, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 16, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu0_device = { + .name = "sh_tmu", + .id = 0, + .dev = { + .platform_data = &tmu0_platform_data, + }, + .resource = tmu0_resources, + .num_resources = ARRAY_SIZE(tmu0_resources), +}; + +static struct sh_timer_config tmu1_platform_data = { + .name = "TMU1", + .channel_offset = 0xe, + .timer_bit = 1, + .clk = "module_clk", + .clocksource_rating = 200, +}; + +static struct resource tmu1_resources[] = { + [0] = { + .name = "TMU1", + .start = 0xa412fea0, + .end = 0xa412feab, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 17, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu1_device = { + .name = "sh_tmu", + .id = 1, + .dev = { + .platform_data = &tmu1_platform_data, + }, + .resource = tmu1_resources, + .num_resources = ARRAY_SIZE(tmu1_resources), +}; + +static struct sh_timer_config tmu2_platform_data = { + .name = "TMU2", + .channel_offset = 0x1a, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource tmu2_resources[] = { + [0] = { + .name = "TMU2", + .start = 0xa412feac, + .end = 0xa412feb5, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 18, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device tmu2_device = { + .name = "sh_tmu", + .id = 2, + .dev = { + .platform_data = &tmu2_platform_data, + }, + .resource = tmu2_resources, + .num_resources = ARRAY_SIZE(tmu2_resources), +}; + static struct platform_device *sh7720_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, &rtc_device, &sci_device, &usb_ohci_device, @@ -137,6 +234,18 @@ static int __init sh7720_devices_setup(void) } __initcall(sh7720_devices_setup); +static struct platform_device *sh7720_early_devices[] __initdata = { + &tmu0_device, + &tmu1_device, + &tmu2_device, +}; + +void __init plat_early_device_setup(void) +{ + early_platform_add_devices(sh7720_early_devices, + ARRAY_SIZE(sh7720_early_devices)); +} + enum { UNUSED = 0, From 2b23a8826a60268ec52302729911dd7ac6b10776 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:21:11 +0000 Subject: [PATCH 1039/2061] sh: CMT platform data for sh7720/sh7721 This patch adds CMT platform data for sh7720 and sh7721. All 5 32-bit CMT channels unfortunately share a single IRQ. Both clockevent and clocksource support is enabled. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 2 + arch/sh/kernel/cpu/sh3/setup-sh7720.c | 161 ++++++++++++++++++++++++++ 2 files changed, 163 insertions(+) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index a9dee13ddd8..5391746be1e 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -302,6 +302,7 @@ config CPU_SUBTYPE_SH7720 bool "Support SH7720 processor" select CPU_SH3 select CPU_HAS_DSP + select SYS_SUPPORTS_CMT help Select SH7720 if you have a SH3-DSP SH7720 CPU. @@ -309,6 +310,7 @@ config CPU_SUBTYPE_SH7721 bool "Support SH7721 processor" select CPU_SH3 select CPU_HAS_DSP + select SYS_SUPPORTS_CMT help Select SH7721 if you have a SH3-DSP SH7721 CPU. diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index c09619c25d9..d8b46f5dff6 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -124,6 +124,157 @@ static struct platform_device usbf_device = { .resource = usbf_resources, }; +static struct sh_timer_config cmt0_platform_data = { + .name = "CMT0", + .channel_offset = 0x10, + .timer_bit = 0, + .clk = "module_clk", + .clockevent_rating = 125, + .clocksource_rating = 125, +}; + +static struct resource cmt0_resources[] = { + [0] = { + .name = "CMT0", + .start = 0x044a0010, + .end = 0x044a001b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt0_device = { + .name = "sh_cmt", + .id = 0, + .dev = { + .platform_data = &cmt0_platform_data, + }, + .resource = cmt0_resources, + .num_resources = ARRAY_SIZE(cmt0_resources), +}; + +static struct sh_timer_config cmt1_platform_data = { + .name = "CMT1", + .channel_offset = 0x20, + .timer_bit = 1, + .clk = "module_clk", +}; + +static struct resource cmt1_resources[] = { + [0] = { + .name = "CMT1", + .start = 0x044a0020, + .end = 0x044a002b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt1_device = { + .name = "sh_cmt", + .id = 1, + .dev = { + .platform_data = &cmt1_platform_data, + }, + .resource = cmt1_resources, + .num_resources = ARRAY_SIZE(cmt1_resources), +}; + +static struct sh_timer_config cmt2_platform_data = { + .name = "CMT2", + .channel_offset = 0x30, + .timer_bit = 2, + .clk = "module_clk", +}; + +static struct resource cmt2_resources[] = { + [0] = { + .name = "CMT2", + .start = 0x044a0030, + .end = 0x044a003b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt2_device = { + .name = "sh_cmt", + .id = 2, + .dev = { + .platform_data = &cmt2_platform_data, + }, + .resource = cmt2_resources, + .num_resources = ARRAY_SIZE(cmt2_resources), +}; + +static struct sh_timer_config cmt3_platform_data = { + .name = "CMT3", + .channel_offset = 0x40, + .timer_bit = 3, + .clk = "module_clk", +}; + +static struct resource cmt3_resources[] = { + [0] = { + .name = "CMT3", + .start = 0x044a0040, + .end = 0x044a004b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt3_device = { + .name = "sh_cmt", + .id = 3, + .dev = { + .platform_data = &cmt3_platform_data, + }, + .resource = cmt3_resources, + .num_resources = ARRAY_SIZE(cmt3_resources), +}; + +static struct sh_timer_config cmt4_platform_data = { + .name = "CMT4", + .channel_offset = 0x50, + .timer_bit = 4, + .clk = "module_clk", +}; + +static struct resource cmt4_resources[] = { + [0] = { + .name = "CMT4", + .start = 0x044a0050, + .end = 0x044a005b, + .flags = IORESOURCE_MEM, + }, + [1] = { + .start = 104, + .flags = IORESOURCE_IRQ, + }, +}; + +static struct platform_device cmt4_device = { + .name = "sh_cmt", + .id = 4, + .dev = { + .platform_data = &cmt4_platform_data, + }, + .resource = cmt4_resources, + .num_resources = ARRAY_SIZE(cmt4_resources), +}; static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", @@ -218,6 +369,11 @@ static struct platform_device tmu2_device = { }; static struct platform_device *sh7720_devices[] __initdata = { + &cmt0_device, + &cmt1_device, + &cmt2_device, + &cmt3_device, + &cmt4_device, &tmu0_device, &tmu1_device, &tmu2_device, @@ -235,6 +391,11 @@ static int __init sh7720_devices_setup(void) __initcall(sh7720_devices_setup); static struct platform_device *sh7720_early_devices[] __initdata = { + &cmt0_device, + &cmt1_device, + &cmt2_device, + &cmt3_device, + &cmt4_device, &tmu0_device, &tmu1_device, &tmu2_device, From f19900b2e608b604777a74d6d711bbf744657756 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Tue, 12 May 2009 10:25:54 +0000 Subject: [PATCH 1040/2061] sh: remove old TMU driver This patch removes the old TMU driver (CONFIG_SH_TMU/timer-tmu.c) As replacement, select the sh_tmu driver with CONFIG_SH_TIMER_TMU and configure timer channel using platform data. If multiple TMU channels are enabled using platform data, use the earlytimer parameter on the kernel command line to select channel. For instance, use "earlytimer=sh_tmu.0" to select the first channel. To verify which timer is being used, look at printouts or the timer irq count in /proc/interrupts. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 9 +- arch/sh/include/asm/timer.h | 1 - arch/sh/include/cpu-sh3/cpu/timer.h | 67 ------- arch/sh/include/cpu-sh4/cpu/timer.h | 60 ------ arch/sh/kernel/timers/Makefile | 2 - arch/sh/kernel/timers/timer-tmu.c | 297 ---------------------------- arch/sh/kernel/timers/timer.c | 3 - 7 files changed, 1 insertion(+), 438 deletions(-) delete mode 100644 arch/sh/include/cpu-sh3/cpu/timer.h delete mode 100644 arch/sh/include/cpu-sh4/cpu/timer.h delete mode 100644 arch/sh/kernel/timers/timer-tmu.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index 5391746be1e..e7b6406c96b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -473,16 +473,9 @@ source "arch/sh/boards/Kconfig" menu "Timer and clock configuration" -config SH_TMU - bool "TMU timer support" - depends on CPU_SH3 || CPU_SH4 - default y - help - This enables the use of the TMU as the system timer. - config SH_TIMER_TMU bool "TMU timer driver" - depends on !SH_TMU && SYS_SUPPORTS_TMU + depends on SYS_SUPPORTS_TMU default y help This enables the build of the TMU timer driver. diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h index 6c0851188c0..f27a88bfdcc 100644 --- a/arch/sh/include/asm/timer.h +++ b/arch/sh/include/asm/timer.h @@ -18,7 +18,6 @@ struct sys_timer { struct sys_timer_ops *ops; }; -extern struct sys_timer tmu_timer; extern struct sys_timer *sys_timer; /* arch/sh/kernel/timers/timer.c */ diff --git a/arch/sh/include/cpu-sh3/cpu/timer.h b/arch/sh/include/cpu-sh3/cpu/timer.h deleted file mode 100644 index 793acf12aa0..00000000000 --- a/arch/sh/include/cpu-sh3/cpu/timer.h +++ /dev/null @@ -1,67 +0,0 @@ -/* - * include/asm-sh/cpu-sh3/timer.h - * - * Copyright (C) 2004 Lineo Solutions, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#ifndef __ASM_CPU_SH3_TIMER_H -#define __ASM_CPU_SH3_TIMER_H - -/* - * --------------------------------------------------------------------------- - * TMU Common definitions for SH3 processors - * SH7706 - * SH7709S - * SH7727 - * SH7729R - * SH7710 - * SH7720 - * SH7710 - * --------------------------------------------------------------------------- - */ - -#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721) -#define TMU_TOCR 0xfffffe90 /* Byte access */ -#endif - -#if defined(CONFIG_CPU_SUBTYPE_SH7710) || \ - defined(CONFIG_CPU_SUBTYPE_SH7720) || \ - defined(CONFIG_CPU_SUBTYPE_SH7721) -#define TMU_012_TSTR 0xa412fe92 /* Byte access */ - -#define TMU0_TCOR 0xa412fe94 /* Long access */ -#define TMU0_TCNT 0xa412fe98 /* Long access */ -#define TMU0_TCR 0xa412fe9c /* Word access */ - -#define TMU1_TCOR 0xa412fea0 /* Long access */ -#define TMU1_TCNT 0xa412fea4 /* Long access */ -#define TMU1_TCR 0xa412fea8 /* Word access */ - -#define TMU2_TCOR 0xa412feac /* Long access */ -#define TMU2_TCNT 0xa412feb0 /* Long access */ -#define TMU2_TCR 0xa412feb4 /* Word access */ - -#else -#define TMU_012_TSTR 0xfffffe92 /* Byte access */ - -#define TMU0_TCOR 0xfffffe94 /* Long access */ -#define TMU0_TCNT 0xfffffe98 /* Long access */ -#define TMU0_TCR 0xfffffe9c /* Word access */ - -#define TMU1_TCOR 0xfffffea0 /* Long access */ -#define TMU1_TCNT 0xfffffea4 /* Long access */ -#define TMU1_TCR 0xfffffea8 /* Word access */ - -#define TMU2_TCOR 0xfffffeac /* Long access */ -#define TMU2_TCNT 0xfffffeb0 /* Long access */ -#define TMU2_TCR 0xfffffeb4 /* Word access */ -#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && !defined(CONFIG_CPU_SUBTYPE_SH7721) -#define TMU2_TCPR2 0xfffffeb8 /* Long access */ -#endif -#endif - -#endif /* __ASM_CPU_SH3_TIMER_H */ - diff --git a/arch/sh/include/cpu-sh4/cpu/timer.h b/arch/sh/include/cpu-sh4/cpu/timer.h deleted file mode 100644 index d1e796b9688..00000000000 --- a/arch/sh/include/cpu-sh4/cpu/timer.h +++ /dev/null @@ -1,60 +0,0 @@ -/* - * include/asm-sh/cpu-sh4/timer.h - * - * Copyright (C) 2004 Lineo Solutions, Inc. - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#ifndef __ASM_CPU_SH4_TIMER_H -#define __ASM_CPU_SH4_TIMER_H - -/* - * --------------------------------------------------------------------------- - * TMU Common definitions for SH4 processors - * SH7750S/SH7750R - * SH7751/SH7751R - * SH7760 - * SH-X3 - * --------------------------------------------------------------------------- - */ -#ifdef CONFIG_CPU_SUBTYPE_SHX3 -#define TMU_012_BASE 0xffc10000 -#define TMU_345_BASE 0xffc20000 -#else -#define TMU_012_BASE 0xffd80000 -#define TMU_345_BASE 0xfe100000 -#endif - -#define TMU_TOCR TMU_012_BASE /* Not supported on all CPUs */ - -#define TMU_012_TSTR (TMU_012_BASE + 0x04) -#define TMU_345_TSTR (TMU_345_BASE + 0x04) - -#define TMU0_TCOR (TMU_012_BASE + 0x08) -#define TMU0_TCNT (TMU_012_BASE + 0x0c) -#define TMU0_TCR (TMU_012_BASE + 0x10) - -#define TMU1_TCOR (TMU_012_BASE + 0x14) -#define TMU1_TCNT (TMU_012_BASE + 0x18) -#define TMU1_TCR (TMU_012_BASE + 0x1c) - -#define TMU2_TCOR (TMU_012_BASE + 0x20) -#define TMU2_TCNT (TMU_012_BASE + 0x24) -#define TMU2_TCR (TMU_012_BASE + 0x28) -#define TMU2_TCPR (TMU_012_BASE + 0x2c) - -#define TMU3_TCOR (TMU_345_BASE + 0x08) -#define TMU3_TCNT (TMU_345_BASE + 0x0c) -#define TMU3_TCR (TMU_345_BASE + 0x10) - -#define TMU4_TCOR (TMU_345_BASE + 0x14) -#define TMU4_TCNT (TMU_345_BASE + 0x18) -#define TMU4_TCR (TMU_345_BASE + 0x1c) - -#define TMU5_TCOR (TMU_345_BASE + 0x20) -#define TMU5_TCNT (TMU_345_BASE + 0x24) -#define TMU5_TCR (TMU_345_BASE + 0x28) - -#endif /* __ASM_CPU_SH4_TIMER_H */ diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile index 8bceba59e76..fefd7edd413 100644 --- a/arch/sh/kernel/timers/Makefile +++ b/arch/sh/kernel/timers/Makefile @@ -3,5 +3,3 @@ # obj-y := timer.o - -obj-$(CONFIG_SH_TMU) += timer-tmu.o diff --git a/arch/sh/kernel/timers/timer-tmu.c b/arch/sh/kernel/timers/timer-tmu.c deleted file mode 100644 index fe8d8930ccb..00000000000 --- a/arch/sh/kernel/timers/timer-tmu.c +++ /dev/null @@ -1,297 +0,0 @@ -/* - * arch/sh/kernel/timers/timer-tmu.c - TMU Timer Support - * - * Copyright (C) 2005 - 2007 Paul Mundt - * - * TMU handling code hacked out of arch/sh/kernel/time.c - * - * Copyright (C) 1999 Tetsuya Okada & Niibe Yutaka - * Copyright (C) 2000 Philipp Rumpf - * Copyright (C) 2002, 2003, 2004 Paul Mundt - * Copyright (C) 2002 M. R. Brown - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#define TMU_TOCR_INIT 0x00 -#define TMU_TCR_INIT 0x0020 - -#define TMU0 (0) -#define TMU1 (1) - -static inline void _tmu_start(int tmu_num) -{ - ctrl_outb(ctrl_inb(TMU_012_TSTR) | (0x1<mode == CLOCK_EVT_MODE_PERIODIC); - _tmu_set_irq(TMU0,1); - return 0; -} - -static void tmu_set_mode(enum clock_event_mode mode, - struct clock_event_device *evt) -{ - switch (mode) { - case CLOCK_EVT_MODE_PERIODIC: - ctrl_outl(tmu_latest_interval[TMU0], TMU0_TCOR); - break; - case CLOCK_EVT_MODE_ONESHOT: - ctrl_outl(0, TMU0_TCOR); - break; - case CLOCK_EVT_MODE_UNUSED: - case CLOCK_EVT_MODE_SHUTDOWN: - case CLOCK_EVT_MODE_RESUME: - break; - } -} - -static struct clock_event_device tmu0_clockevent = { - .name = "tmu0", - .shift = 32, - .features = CLOCK_EVT_FEAT_PERIODIC | CLOCK_EVT_FEAT_ONESHOT, - .set_mode = tmu_set_mode, - .set_next_event = tmu_set_next_event, -}; - -static irqreturn_t tmu_timer_interrupt(int irq, void *dummy) -{ - struct clock_event_device *evt = &tmu0_clockevent; - _tmu_clear_status(TMU0); - _tmu_set_irq(TMU0,tmu0_clockevent.mode != CLOCK_EVT_MODE_ONESHOT); - - switch (tmu0_clockevent.mode) { - case CLOCK_EVT_MODE_ONESHOT: - case CLOCK_EVT_MODE_PERIODIC: - evt->event_handler(evt); - break; - default: - break; - } - - return IRQ_HANDLED; -} - -static struct irqaction tmu0_irq = { - .name = "periodic/oneshot timer", - .handler = tmu_timer_interrupt, - .flags = IRQF_DISABLED | IRQF_TIMER | IRQF_IRQPOLL, -}; - -static void __init tmu_clk_init(struct clk *clk) -{ - u8 divisor = TMU_TCR_INIT & 0x7; - int tmu_num = clk->name[3]-'0'; - ctrl_outw(TMU_TCR_INIT, TMU0_TCR+(tmu_num*0xC)); - clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1)); -} - -static void tmu_clk_recalc(struct clk *clk) -{ - int tmu_num = clk->name[3]-'0'; - unsigned long prev_rate = clk_get_rate(clk); - unsigned long flags; - u8 divisor = ctrl_inw(TMU0_TCR+tmu_num*0xC) & 0x7; - clk->rate = clk_get_rate(clk->parent) / (4 << (divisor << 1)); - - if(prev_rate==clk_get_rate(clk)) - return; - - if(tmu_num) - return; /* No more work on TMU1 */ - - local_irq_save(flags); - tmus_are_scaled = (prev_rate > clk->rate); - - _tmu_stop(TMU0); - - tmu0_clockevent.mult = div_sc(clk->rate, NSEC_PER_SEC, - tmu0_clockevent.shift); - tmu0_clockevent.max_delta_ns = - clockevent_delta2ns(-1, &tmu0_clockevent); - tmu0_clockevent.min_delta_ns = - clockevent_delta2ns(1, &tmu0_clockevent); - - if (tmus_are_scaled) - tmu_latest_interval[TMU0] >>= 1; - else - tmu_latest_interval[TMU0] <<= 1; - - tmu_timer_set_interval(TMU0, - tmu_latest_interval[TMU0], - tmu0_clockevent.mode == CLOCK_EVT_MODE_PERIODIC); - - _tmu_start(TMU0); - - local_irq_restore(flags); -} - -static struct clk_ops tmu_clk_ops = { - .init = tmu_clk_init, - .recalc = tmu_clk_recalc, -}; - -static struct clk tmu0_clk = { - .name = "tmu0_clk", - .ops = &tmu_clk_ops, -}; - -static struct clk tmu1_clk = { - .name = "tmu1_clk", - .ops = &tmu_clk_ops, -}; - -static int tmu_timer_init(void) -{ - unsigned long interval; - unsigned long frequency; - - setup_irq(CONFIG_SH_TIMER_IRQ, &tmu0_irq); - - tmu0_clk.parent = clk_get(NULL, "module_clk"); - tmu1_clk.parent = clk_get(NULL, "module_clk"); - - tmu_timer_stop(); - -#if !defined(CONFIG_CPU_SUBTYPE_SH7720) && \ - !defined(CONFIG_CPU_SUBTYPE_SH7721) && \ - !defined(CONFIG_CPU_SUBTYPE_SH7760) && \ - !defined(CONFIG_CPU_SUBTYPE_SH7785) && \ - !defined(CONFIG_CPU_SUBTYPE_SH7786) && \ - !defined(CONFIG_CPU_SUBTYPE_SHX3) - ctrl_outb(TMU_TOCR_INIT, TMU_TOCR); -#endif - - clk_register(&tmu0_clk); - clk_register(&tmu1_clk); - clk_enable(&tmu0_clk); - clk_enable(&tmu1_clk); - - frequency = clk_get_rate(&tmu0_clk); - interval = (frequency + HZ / 2) / HZ; - - tmu_timer_set_interval(TMU0,interval, 1); - tmu_timer_set_interval(TMU1,~0,1); - - _tmu_start(TMU1); - - clocksource_sh.rating = 200; - clocksource_sh.mask = CLOCKSOURCE_MASK(32); - clocksource_sh.read = tmu_timer_read; - clocksource_sh.shift = 10; - clocksource_sh.mult = clocksource_hz2mult(clk_get_rate(&tmu1_clk), - clocksource_sh.shift); - clocksource_sh.flags = CLOCK_SOURCE_IS_CONTINUOUS; - clocksource_register(&clocksource_sh); - - tmu0_clockevent.mult = div_sc(frequency, NSEC_PER_SEC, - tmu0_clockevent.shift); - tmu0_clockevent.max_delta_ns = - clockevent_delta2ns(-1, &tmu0_clockevent); - tmu0_clockevent.min_delta_ns = - clockevent_delta2ns(1, &tmu0_clockevent); - - tmu0_clockevent.cpumask = cpumask_of(0); - tmu0_clockevent.rating = 100; - - clockevents_register_device(&tmu0_clockevent); - - return 0; -} - -static struct sys_timer_ops tmu_timer_ops = { - .init = tmu_timer_init, - .start = tmu_timer_start, - .stop = tmu_timer_stop, -}; - -struct sys_timer tmu_timer = { - .name = "tmu", - .ops = &tmu_timer_ops, -}; diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c index 920891acb77..f8812bd3b79 100644 --- a/arch/sh/kernel/timers/timer.c +++ b/arch/sh/kernel/timers/timer.c @@ -14,9 +14,6 @@ #include static struct sys_timer *sys_timers[] = { -#ifdef CONFIG_SH_TMU - &tmu_timer, -#endif NULL, }; From 8be5f1a68f2c14082939dd54e7037dcee2eb54f8 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 12 May 2009 19:53:55 +0900 Subject: [PATCH 1041/2061] sh: Kill off the remnants of the old timer code. Now with all of the TMU users moved over to the new TMU driver, and the old TMU driver killed off, the left-over infrastructure can go along with it. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 11 -------- arch/sh/include/asm/timer.h | 28 --------------------- arch/sh/kernel/Makefile_32 | 2 +- arch/sh/kernel/Makefile_64 | 2 +- arch/sh/kernel/cpu/clock.c | 1 - arch/sh/kernel/time.c | 29 ++------------------- arch/sh/kernel/timers/Makefile | 5 ---- arch/sh/kernel/timers/timer.c | 46 ---------------------------------- 8 files changed, 4 insertions(+), 120 deletions(-) delete mode 100644 arch/sh/include/asm/timer.h delete mode 100644 arch/sh/kernel/timers/Makefile delete mode 100644 arch/sh/kernel/timers/timer.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index e7b6406c96b..fb75c2d1928 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -494,17 +494,6 @@ config SH_TIMER_MTU2 help This enables build of the MTU2 timer driver. -config SH_TIMER_IRQ - int - default "28" if CPU_SUBTYPE_SH7780 || CPU_SUBTYPE_SH7785 || \ - CPU_SUBTYPE_SH7763 - default "86" if CPU_SUBTYPE_SH7619 - default "140" if CPU_SUBTYPE_SH7206 - default "142" if CPU_SUBTYPE_SH7203 && SH_CMT - default "153" if CPU_SUBTYPE_SH7203 && SH_MTU2 - default "238" if CPU_SUBTYPE_MXG - default "16" - config SH_PCLK_FREQ int "Peripheral clock frequency (in Hz)" default "27000000" if CPU_SUBTYPE_SH7343 diff --git a/arch/sh/include/asm/timer.h b/arch/sh/include/asm/timer.h deleted file mode 100644 index f27a88bfdcc..00000000000 --- a/arch/sh/include/asm/timer.h +++ /dev/null @@ -1,28 +0,0 @@ -#ifndef __ASM_SH_TIMER_H -#define __ASM_SH_TIMER_H - -#include -#include -#include - -struct sys_timer_ops { - int (*init)(void); - int (*start)(void); - int (*stop)(void); -}; - -struct sys_timer { - const char *name; - - struct sys_device dev; - struct sys_timer_ops *ops; -}; - -extern struct sys_timer *sys_timer; - -/* arch/sh/kernel/timers/timer.c */ -struct sys_timer *get_sys_timer(void); - -extern struct clocksource clocksource_sh; - -#endif /* __ASM_SH_TIMER_H */ diff --git a/arch/sh/kernel/Makefile_32 b/arch/sh/kernel/Makefile_32 index aee08afe5ff..9411e3e31e6 100644 --- a/arch/sh/kernel/Makefile_32 +++ b/arch/sh/kernel/Makefile_32 @@ -14,7 +14,7 @@ obj-y := debugtraps.o idle.o io.o io_generic.o irq.o \ sys_sh.o sys_sh32.o syscalls_32.o time.o topology.o \ traps.o traps_32.o -obj-y += cpu/ timers/ +obj-y += cpu/ obj-$(CONFIG_VSYSCALL) += vsyscall/ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o diff --git a/arch/sh/kernel/Makefile_64 b/arch/sh/kernel/Makefile_64 index c845a0d6197..67b9f6c6326 100644 --- a/arch/sh/kernel/Makefile_64 +++ b/arch/sh/kernel/Makefile_64 @@ -4,7 +4,7 @@ obj-y := debugtraps.o idle.o io.o io_generic.o irq.o machvec.o process_64.o \ ptrace_64.o setup.o signal_64.o sys_sh.o sys_sh64.o \ syscalls_64.o time.o topology.o traps.o traps_64.o -obj-y += cpu/ timers/ +obj-y += cpu/ obj-$(CONFIG_SMP) += smp.o obj-$(CONFIG_SH_CPU_FREQ) += cpufreq.o obj-$(CONFIG_MODULES) += sh_ksyms_64.o module.o diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 133dbe40334..f54769f455b 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -26,7 +26,6 @@ #include #include #include -#include static LIST_HEAD(clock_list); static DEFINE_SPINLOCK(clock_lock); diff --git a/arch/sh/kernel/time.c b/arch/sh/kernel/time.c index a77838f539f..2edde32c764 100644 --- a/arch/sh/kernel/time.c +++ b/arch/sh/kernel/time.c @@ -22,9 +22,6 @@ #include #include #include -#include - -struct sys_timer *sys_timer; /* Dummy RTC ops */ static void null_rtc_get_time(struct timespec *tv) @@ -94,20 +91,9 @@ module_init(rtc_generic_init); void (*board_time_init)(void); -struct clocksource clocksource_sh = { - .name = "SuperH", -}; - unsigned long long sched_clock(void) { - unsigned long long cycles; - - /* jiffies based sched_clock if no clocksource is installed */ - if (!clocksource_sh.rating) - return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); - - cycles = clocksource_sh.read(&clocksource_sh); - return cyc2ns(&clocksource_sh, cycles); + return (jiffies_64 - INITIAL_JIFFIES) * (NSEC_PER_SEC / HZ); } static void __init sh_late_time_init(void) @@ -117,18 +103,7 @@ static void __init sh_late_time_init(void) * Run probe() for one "earlytimer" device. */ early_platform_driver_register_all("earlytimer"); - if (early_platform_driver_probe("earlytimer", 1, 0)) - return; - - /* - * Find the timer to use as the system timer, it will be - * initialized for us. - */ - sys_timer = get_sys_timer(); - if (unlikely(!sys_timer)) - panic("System timer missing.\n"); - - printk(KERN_INFO "Using %s for system timer\n", sys_timer->name); + early_platform_driver_probe("earlytimer", 1, 0); } void __init time_init(void) diff --git a/arch/sh/kernel/timers/Makefile b/arch/sh/kernel/timers/Makefile deleted file mode 100644 index fefd7edd413..00000000000 --- a/arch/sh/kernel/timers/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -# -# Makefile for the various Linux/SuperH timers -# - -obj-y := timer.o diff --git a/arch/sh/kernel/timers/timer.c b/arch/sh/kernel/timers/timer.c deleted file mode 100644 index f8812bd3b79..00000000000 --- a/arch/sh/kernel/timers/timer.c +++ /dev/null @@ -1,46 +0,0 @@ -/* - * arch/sh/kernel/timers/timer.c - Common timer code - * - * Copyright (C) 2005 Paul Mundt - * - * This file is subject to the terms and conditions of the GNU General Public - * License. See the file "COPYING" in the main directory of this archive - * for more details. - */ -#include -#include -#include -#include -#include - -static struct sys_timer *sys_timers[] = { - NULL, -}; - -static char timer_override[10]; -static int __init timer_setup(char *str) -{ - if (str) - strlcpy(timer_override, str, sizeof(timer_override)); - return 1; -} -__setup("timer=", timer_setup); - -struct sys_timer *get_sys_timer(void) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(sys_timers); i++) { - struct sys_timer *t = sys_timers[i]; - - if (unlikely(!t)) - break; - if (unlikely(timer_override[0])) - if ((strcmp(timer_override, t->name) != 0)) - continue; - if (likely(t->ops->init() == 0)) - return t; - } - - return NULL; -} From e758a33d6fc5b9d6a3ae489863d04fcecad8120b Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 May 2009 21:59:01 +1000 Subject: [PATCH 1042/2061] perf_counter: call hw_perf_save_disable/restore around group_sched_in I noticed that when enabling a group via the PERF_COUNTER_IOC_ENABLE ioctl on the group leader, the counters weren't enabled and counting immediately on return from the ioctl, but did start counting a little while later (presumably after a context switch). The reason was that __perf_counter_enable calls group_sched_in which calls hw_perf_group_sched_in, which on powerpc assumes that the caller has called hw_perf_save_disable already. Until commit 46d686c6 ("perf_counter: put whole group on when enabling group leader") it was true that all callers of group_sched_in had called hw_perf_save_disable first, and the powerpc hw_perf_group_sched_in relies on that (there isn't an x86 version). This fixes the problem by putting calls to hw_perf_save_disable / hw_perf_restore around the calls to group_sched_in and counter_sched_in in __perf_counter_enable. Having the calls to hw_perf_save_disable/restore around the counter_sched_in call is harmless and makes this call consistent with the other call sites of counter_sched_in, which have all called hw_perf_save_disable first. [ Impact: more precise counter group disable/enable functionality ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18953.25733.53359.147452@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5ea0240adab..ff166c11b69 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -663,6 +663,7 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; + unsigned long pmuflags; unsigned long flags; int err; @@ -689,14 +690,18 @@ static void __perf_counter_enable(void *info) if (leader != counter && leader->state != PERF_COUNTER_STATE_ACTIVE) goto unlock; - if (!group_can_go_on(counter, cpuctx, 1)) + if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; - else if (counter == leader) - err = group_sched_in(counter, cpuctx, ctx, - smp_processor_id()); - else - err = counter_sched_in(counter, cpuctx, ctx, - smp_processor_id()); + } else { + pmuflags = hw_perf_save_disable(); + if (counter == leader) + err = group_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + else + err = counter_sched_in(counter, cpuctx, ctx, + smp_processor_id()); + hw_perf_restore(pmuflags); + } if (err) { /* From 7537d81aa7b7cd31b0caeac8091456e93d96fa8d Mon Sep 17 00:00:00 2001 From: Abhijith Das Date: Tue, 12 May 2009 11:16:20 -0500 Subject: [PATCH 1043/2061] GFS2: Fix timestamps on write This patch copies the timestamps from the vfs inode into gfs2 and syncs it to the disk inode during writes. Signed-off-by: Abhijith Das Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_address.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/ops_address.c index a6dde1751e1..e5664210f0d 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/ops_address.c @@ -781,10 +781,12 @@ static int gfs2_stuffed_write_end(struct inode *inode, struct buffer_head *dibh, unlock_page(page); page_cache_release(page); - if (inode->i_size < to) { - i_size_write(inode, to); - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (copied) { + if (inode->i_size < to) { + i_size_write(inode, to); + ip->i_disksize = inode->i_size; + } + gfs2_dinode_out(ip, di); mark_inode_dirty(inode); } @@ -824,7 +826,6 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, struct gfs2_sbd *sdp = GFS2_SB(inode); struct buffer_head *dibh; struct gfs2_alloc *al = ip->i_alloc; - struct gfs2_dinode *di; unsigned int from = pos & (PAGE_CACHE_SIZE - 1); unsigned int to = from + len; int ret; @@ -847,11 +848,10 @@ static int gfs2_write_end(struct file *file, struct address_space *mapping, gfs2_page_add_databufs(ip, page, from, to); ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); - - if (likely(ret >= 0) && (inode->i_size > ip->i_disksize)) { - di = (struct gfs2_dinode *)dibh->b_data; - ip->i_disksize = inode->i_size; - di->di_size = cpu_to_be64(inode->i_size); + if (ret > 0) { + if (inode->i_size > ip->i_disksize) + ip->i_disksize = inode->i_size; + gfs2_dinode_out(ip, dibh->b_data); mark_inode_dirty(inode); } From 12b7ac176831df1aa58a787e67c3e5d698b30163 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:57:44 -0400 Subject: [PATCH 1044/2061] ext4: Rename ext4_get_blocks_wrap() to be ext4_get_blocks() Another function rename for clarity's sake. The _wrap prefix simply confuses people, and didn't add much people trying to follow the code paths. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 3 +-- fs/ext4/ext4.h | 8 ++++---- fs/ext4/extents.c | 6 +++--- fs/ext4/inode.c | 35 +++++++++++++++++------------------ 4 files changed, 25 insertions(+), 27 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index b64789929a6..052d6378f99 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,8 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks_wrap(NULL, inode, blk, 1, &map_bh, - 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 89190ae671f..5dc8368e46b 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1617,10 +1617,10 @@ extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); -extern int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, - sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); +extern int ext4_get_blocks(handle_t *handle, struct inode *inode, + sector_t block, unsigned int max_blocks, + struct buffer_head *bh, int create, + int extend_disksize, int flag); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 4fec6b74638..7e7d02dd273 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3151,9 +3151,9 @@ retry: break; } map_bh.b_state = 0; - ret = ext4_get_blocks_wrap(handle, inode, block, - max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + ret = ext4_get_blocks(handle, inode, block, + max_blocks, &map_bh, + EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index f758e8021d1..a9a9b9b77e8 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1121,7 +1121,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) } /* - * The ext4_get_blocks_wrap() function try to look up the requested blocks, + * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. * * Otherwise it takes the write lock of the i_data_sem and allocate blocks @@ -1142,9 +1142,9 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) * * It returns the error in case of allocation failure. */ -int ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block, - unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) +int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, + unsigned int max_blocks, struct buffer_head *bh, + int create, int extend_disksize, int flag) { int retval; @@ -1268,8 +1268,8 @@ int ext4_get_block(struct inode *inode, sector_t iblock, started = 1; } - ret = ext4_get_blocks_wrap(handle, inode, iblock, - max_blocks, bh_result, create, 0, 0); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, + create, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1294,10 +1294,9 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks_wrap(handle, inode, block, 1, - &dummy, create, 1, 0); + err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); /* - * ext4_get_blocks_wrap() returns number of blocks + * ext4_get_blocks() returns number of blocks * mapped. 0 in case of a HOLE. */ if (err > 0) { @@ -2009,8 +2008,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); - ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + ret = ext4_get_blocks(handle, inode, iblock, max_blocks, + bh_result, 1, 0, EXT4_DELALLOC_RSVED); if (ret <= 0) return ret; @@ -2067,11 +2066,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls - * ext4_get_blocks_wrap() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks_wrap() to call + * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. + * This flag causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks_wrap(), we should pass in + * the interfaces to ext4_get_blocks(), we should pass in * a separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's @@ -2363,7 +2362,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2407,8 +2406,8 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks_wrap(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, + bh_result, 0, 0, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -5034,7 +5033,7 @@ int ext4_writepage_trans_blocks(struct inode *inode) * Calculate the journal credits for a chunk of data modification. * * This is called from DIO, fallocate or whoever calling - * ext4_get_blocks_wrap() to map/allocate a chunk of contigous disk blocks. + * ext4_get_blocks() to map/allocate a chunk of contigous disk blocks. * * journal buffers for data blocks are not included here, as DIO * and fallocate do no need to journal data buffers. From c21770573319922e3f3fcb331cfaa290c49f1c81 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:58:52 -0400 Subject: [PATCH 1045/2061] ext4: Define a new set of flags for ext4_get_blocks() The functions ext4_get_blocks(), ext4_ext_get_blocks(), and ext4_ind_get_blocks() used an ad-hoc set of integer variables used as boolean flags passed in as arguments. Use a single flags parameter and a setandard set of bitfield flags instead. This saves space on the call stack, and it also makes the code a bit more understandable. Signed-off-by: "Theodore Ts'o" --- fs/ext4/dir.c | 2 +- fs/ext4/ext4.h | 22 ++++++++++++------ fs/ext4/extents.c | 22 +++++++++--------- fs/ext4/inode.c | 57 +++++++++++++++++++++++++---------------------- 4 files changed, 57 insertions(+), 46 deletions(-) diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 052d6378f99..9dc93168e26 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -131,7 +131,7 @@ static int ext4_readdir(struct file *filp, struct buffer_head *bh = NULL; map_bh.b_state = 0; - err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0, 0, 0); + err = ext4_get_blocks(NULL, inode, blk, 1, &map_bh, 0); if (err > 0) { pgoff_t index = map_bh.b_blocknr >> (PAGE_CACHE_SHIFT - inode->i_blkbits); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5dc8368e46b..17feb4ac633 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -314,10 +314,20 @@ struct ext4_new_group_data { }; /* - * Following is used by preallocation code to tell get_blocks() that we - * want uninitialzed extents. + * Flags used by ext4_get_blocks() */ -#define EXT4_CREATE_UNINITIALIZED_EXT 2 + /* Allocate any needed blocks and/or convert an unitialized + extent to be an initialized ext4 */ +#define EXT4_GET_BLOCKS_CREATE 1 + /* Request the creation of an unitialized extent */ +#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ + EXT4_GET_BLOCKS_CREATE) + /* Update the ext4_inode_info i_disksize field */ +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 + /* Caller is from the delayed allocation writeout path, + so the filesystem blocks have already been accounted for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 /* * ioctl commands @@ -1610,8 +1620,7 @@ extern int ext4_ext_index_trans_blocks(struct inode *inode, int nrblocks, int chunk); extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, - struct buffer_head *bh_result, - int create, int extend_disksize); + struct buffer_head *bh_result, int flags); extern void ext4_ext_truncate(struct inode *); extern void ext4_ext_init(struct super_block *); extern void ext4_ext_release(struct super_block *); @@ -1619,8 +1628,7 @@ extern long ext4_fallocate(struct inode *inode, int mode, loff_t offset, loff_t len); extern int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, - struct buffer_head *bh, int create, - int extend_disksize, int flag); + struct buffer_head *bh, int flags); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 7e7d02dd273..27c383c7b43 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -2784,7 +2784,7 @@ fix_extent_len: int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int max_blocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { struct ext4_ext_path *path = NULL; struct ext4_extent_header *eh; @@ -2803,7 +2803,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, cache_type = ext4_ext_in_cache(inode, iblock, &newex); if (cache_type) { if (cache_type == EXT4_EXT_CACHE_GAP) { - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * block isn't allocated yet and * user doesn't want to allocate it @@ -2869,9 +2869,9 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, EXT4_EXT_CACHE_EXTENT); goto out; } - if (create == EXT4_CREATE_UNINITIALIZED_EXT) + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) goto out; - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { if (allocated > max_blocks) allocated = max_blocks; /* @@ -2903,7 +2903,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * requested block isn't allocated yet; * we couldn't try to create block if create flag is zero */ - if (!create) { + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) { /* * put just found gap into cache to speed up * subsequent requests @@ -2932,10 +2932,10 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, * EXT_UNINIT_MAX_LEN. */ if (max_blocks > EXT_INIT_MAX_LEN && - create != EXT4_CREATE_UNINITIALIZED_EXT) + !(flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_INIT_MAX_LEN; else if (max_blocks > EXT_UNINIT_MAX_LEN && - create == EXT4_CREATE_UNINITIALIZED_EXT) + (flags & EXT4_GET_BLOCKS_UNINIT_EXT)) max_blocks = EXT_UNINIT_MAX_LEN; /* Check if we can really insert (iblock)::(iblock+max_blocks) extent */ @@ -2966,7 +2966,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, /* try to insert new extent into found leaf and return */ ext4_ext_store_pblock(&newex, newblock); newex.ee_len = cpu_to_le16(ar.len); - if (create == EXT4_CREATE_UNINITIALIZED_EXT) /* Mark uninitialized */ + if (flags & EXT4_GET_BLOCKS_UNINIT_EXT) /* Mark uninitialized */ ext4_ext_mark_uninitialized(&newex); err = ext4_ext_insert_extent(handle, inode, path, &newex); if (err) { @@ -2983,7 +2983,7 @@ int ext4_ext_get_blocks(handle_t *handle, struct inode *inode, newblock = ext_pblock(&newex); allocated = ext4_ext_get_actual_len(&newex); outnew: - if (extend_disksize) { + if (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE) { disksize = ((loff_t) iblock + ar.len) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -2994,7 +2994,7 @@ outnew: set_buffer_new(bh_result); /* Cache only when it is _not_ an uninitialized extent */ - if (create != EXT4_CREATE_UNINITIALIZED_EXT) + if ((flags & EXT4_GET_BLOCKS_UNINIT_EXT) == 0) ext4_ext_put_in_cache(inode, iblock, allocated, newblock, EXT4_EXT_CACHE_EXTENT); out: @@ -3153,7 +3153,7 @@ retry: map_bh.b_state = 0; ret = ext4_get_blocks(handle, inode, block, max_blocks, &map_bh, - EXT4_CREATE_UNINITIALIZED_EXT, 0, 0); + EXT4_GET_BLOCKS_CREATE_UNINIT_EXT); if (ret <= 0) { #ifdef EXT4FS_DEBUG WARN_ON(ret <= 0); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index a9a9b9b77e8..8b7564dfacd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -917,7 +917,7 @@ err_out: static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, struct buffer_head *bh_result, - int create, int extend_disksize) + int flags) { int err = -EIO; ext4_lblk_t offsets[4]; @@ -934,7 +934,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, J_ASSERT(!(EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)); - J_ASSERT(handle != NULL || create == 0); + J_ASSERT(handle != NULL || (flags & EXT4_GET_BLOCKS_CREATE) == 0); depth = ext4_block_to_path(inode, iblock, offsets, &blocks_to_boundary); @@ -963,7 +963,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, } /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0 || err == -EIO) goto cleanup; /* @@ -1002,7 +1002,7 @@ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, * protect it if you're about to implement concurrent * ext4_get_block() -bzzz */ - if (!err && extend_disksize) { + if (!err && (flags & EXT4_GET_BLOCKS_EXTEND_DISKSIZE)) { disksize = ((loff_t) iblock + count) << inode->i_blkbits; if (disksize > i_size_read(inode)) disksize = i_size_read(inode); @@ -1144,7 +1144,7 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) */ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, unsigned int max_blocks, struct buffer_head *bh, - int create, int extend_disksize, int flag) + int flags) { int retval; @@ -1158,15 +1158,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } else { retval = ext4_ind_get_blocks(handle, inode, block, max_blocks, - bh, 0, 0); + bh, 0); } up_read((&EXT4_I(inode)->i_data_sem)); /* If it is only a block(s) look up */ - if (!create) + if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; /* @@ -1205,7 +1205,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, * let the underlying get_block() function know to * avoid double accounting */ - if (flag) + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 1; /* * We need to check for EXT4 here because migrate @@ -1213,10 +1213,10 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, */ if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { retval = ext4_ext_get_blocks(handle, inode, block, max_blocks, - bh, create, extend_disksize); + bh, flags); } else { retval = ext4_ind_get_blocks(handle, inode, block, - max_blocks, bh, create, extend_disksize); + max_blocks, bh, flags); if (retval > 0 && buffer_new(bh)) { /* @@ -1229,7 +1229,7 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flag) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { EXT4_I(inode)->i_delalloc_reserved_flag = 0; /* * Update reserved blocks/metadata blocks @@ -1269,7 +1269,7 @@ int ext4_get_block(struct inode *inode, sector_t iblock, } ret = ext4_get_blocks(handle, inode, iblock, max_blocks, bh_result, - create, 0, 0); + create ? EXT4_GET_BLOCKS_CREATE : 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; @@ -1288,16 +1288,19 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, { struct buffer_head dummy; int fatal = 0, err; + int flags = EXT4_GET_BLOCKS_EXTEND_DISKSIZE; J_ASSERT(handle != NULL || create == 0); dummy.b_state = 0; dummy.b_blocknr = -1000; buffer_trace_init(&dummy.b_history); - err = ext4_get_blocks(handle, inode, block, 1, &dummy, create, 1, 0); + if (create) + flags |= EXT4_GET_BLOCKS_CREATE; + err = ext4_get_blocks(handle, inode, block, 1, &dummy, flags); /* - * ext4_get_blocks() returns number of blocks - * mapped. 0 in case of a HOLE. + * ext4_get_blocks() returns number of blocks mapped. 0 in + * case of a HOLE. */ if (err > 0) { if (err > 1) @@ -1997,7 +2000,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -#define EXT4_DELALLOC_RSVED 1 static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2009,7 +2011,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, handle = ext4_journal_current_handle(); BUG_ON(!handle); ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, 1, 0, EXT4_DELALLOC_RSVED); + bh_result, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); if (ret <= 0) return ret; @@ -2065,16 +2068,17 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) return 0; /* * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls - * ext4_get_blocks() with the EXT4_DELALLOC_RSVED flag. - * This flag causes ext4_get_blocks() to call + * ext4_da_get_block_write(), since it calls ext4_get_blocks() + * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag + * causes ext4_get_blocks() to call * ext4_da_update_reserve_space() if the passed buffer head * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in - * a separate flag which requests that the delayed allocation + * the interfaces to ext4_get_blocks(), we should pass in a + * separate flag which requests that the delayed allocation * statistics should be updated, instead of depending on the * state information getting passed down via the map_bh's - * state bitmasks plus the magic EXT4_DELALLOC_RSVED flag. + * state bitmasks plus the magic + * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); new.b_blocknr = 0; @@ -2362,7 +2366,7 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, * preallocated blocks are unmapped but should treated * the same as allocated blocks. */ - ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, 1, bh_result, 0); if ((ret == 0) && !buffer_delay(bh_result)) { /* the block isn't (pre)allocated yet, let's reserve space */ /* @@ -2406,8 +2410,7 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ - ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, - bh_result, 0, 0, 0); + ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; From b920c75502cb2c48654ef196d647c8eb81ab608a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 00:54:29 -0400 Subject: [PATCH 1046/2061] ext4: Add documentation to the ext4_*get_block* functions This adds more documentation to various internal functions in fs/ext4/inode.c, most notably ext4_ind_get_blocks(), ext4_da_get_block_write(), ext4_da_get_block_prep(), ext4_normal_get_block_write(). In addition, the static function ext4_normal_get_block_write() has been renamed noalloc_get_block_write(), since it is used in many places far beyond ext4_normal_writepage(). Plenty of warnings have been added to the noalloc_get_block_write() function, since the way it is used is amazingly fragile. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 86 +++++++++++++++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 31 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 8b7564dfacd..fd5f27a9b81 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -892,6 +892,10 @@ err_out: } /* + * The ext4_ind_get_blocks() function handles non-extents inodes + * (i.e., using the traditional indirect/double-indirect i_blocks + * scheme) for ext4_get_blocks(). + * * Allocation strategy is simple: if we have to allocate something, we will * have to go the whole way to leaf. So let's do it before attaching anything * to tree, set linkage between the newborn blocks, write them if sync is @@ -909,10 +913,11 @@ err_out: * return = 0, if plain lookup failed. * return < 0, error case. * - * - * Need to be called with - * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system block - * (ie, create is zero). Otherwise down_write(&EXT4_I(inode)->i_data_sem) + * The ext4_ind_get_blocks() function should be called with + * down_write(&EXT4_I(inode)->i_data_sem) if allocating filesystem + * blocks (i.e., flags has EXT4_GET_BLOCKS_CREATE set) or + * down_read(&EXT4_I(inode)->i_data_sem) if not allocating file system + * blocks. */ static int ext4_ind_get_blocks(handle_t *handle, struct inode *inode, ext4_lblk_t iblock, unsigned int maxblocks, @@ -1152,8 +1157,8 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, clear_buffer_unwritten(bh); /* - * Try to see if we can get the block without requesting - * for new file system block. + * Try to see if we can get the block without requesting a new + * file system block. */ down_read((&EXT4_I(inode)->i_data_sem)); if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) { @@ -2000,6 +2005,12 @@ static void ext4_print_free_blocks(struct inode *inode) return; } +/* + * This function is used by mpage_da_map_blocks(). We separate it out + * as a separate function just to make life easier, and because + * mpage_da_map_blocks() used to be a generic function that took a + * get_block_t. + */ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result) { @@ -2031,8 +2042,8 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, /* * Update on-disk size along with block allocation we don't - * use 'extend_disksize' as size may change within already - * allocated block -bzzz + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz */ disksize = ((loff_t) iblock + ret) << inode->i_blkbits; if (disksize > i_size_read(inode)) @@ -2338,8 +2349,9 @@ static int __mpage_da_writepage(struct page *page, } /* - * this is a special callback for ->write_begin() only - * it's intention is to return mapped block or reserve space + * This is a special get_blocks_t callback which is used by + * ext4_da_write_begin(). It will either return mapped block or + * reserve space for a single block. * * For delayed buffer_head we have BH_Mapped, BH_New, BH_Delay set. * We also have b_blocknr = -1 and b_bdev initialized properly @@ -2347,7 +2359,6 @@ static int __mpage_da_writepage(struct page *page, * For unwritten buffer_head we have BH_Mapped, BH_New, BH_Unwritten set. * We also have b_blocknr = physicalblock mapping unwritten extent and b_bdev * initialized properly. - * */ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) @@ -2400,7 +2411,23 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, return ret; } -static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, +/* + * This function is used as a standard get_block_t calback function + * when there is no desire to allocate any blocks. It is used as a + * callback function for block_prepare_write(), nobh_writepage(), and + * block_write_full_page(). These functions should only try to map a + * single block at a time. + * + * Since this function doesn't do block allocations even if the caller + * requests it by passing in create=1, it is critically important that + * any caller checks to make sure that any buffer heads are returned + * by this function are either all already mapped or marked for + * delayed allocation before calling nobh_writepage() or + * block_write_full_page(). Otherwise, b_blocknr could be left + * unitialized, and the page write functions will be taken by + * surprise. + */ +static int noalloc_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { int ret = 0; @@ -2419,10 +2446,11 @@ static int ext4_normal_get_block_write(struct inode *inode, sector_t iblock, } /* - * get called vi ext4_da_writepages after taking page lock (have journal handle) - * get called via journal_submit_inode_data_buffers (no journal handle) - * get called via shrink_page_list via pdflush (no journal handle) - * or grab_page_cache when doing write_begin (have journal handle) + * This function can get called via... + * - ext4_da_writepages after taking page lock (have journal handle) + * - journal_submit_inode_data_buffers (no journal handle) + * - shrink_page_list via pdflush (no journal handle) + * - grab_page_cache when doing write_begin (have journal handle) */ static int ext4_da_writepage(struct page *page, struct writeback_control *wbc) @@ -2473,7 +2501,7 @@ static int ext4_da_writepage(struct page *page, * do block allocation here. */ ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (!ret) { page_bufs = page_buffers(page); /* check whether all are mapped and non delay */ @@ -2498,11 +2526,10 @@ static int ext4_da_writepage(struct page *page, } if (test_opt(inode->i_sb, NOBH) && ext4_should_writeback_data(inode)) - ret = nobh_writepage(page, ext4_normal_get_block_write, wbc); + ret = nobh_writepage(page, noalloc_get_block_write, wbc); else - ret = block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + ret = block_write_full_page(page, noalloc_get_block_write, + wbc); return ret; } @@ -2814,7 +2841,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_da_get_block_prep); + ext4_da_get_block_prep); if (ret < 0) { unlock_page(page); ext4_journal_stop(handle); @@ -3122,12 +3149,10 @@ static int __ext4_normal_writepage(struct page *page, struct inode *inode = page->mapping->host; if (test_opt(inode->i_sb, NOBH)) - return nobh_writepage(page, - ext4_normal_get_block_write, wbc); + return nobh_writepage(page, noalloc_get_block_write, wbc); else - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } static int ext4_normal_writepage(struct page *page, @@ -3179,7 +3204,7 @@ static int __ext4_journalled_writepage(struct page *page, int err; ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE, - ext4_normal_get_block_write); + noalloc_get_block_write); if (ret != 0) goto out_unlock; @@ -3264,9 +3289,8 @@ static int ext4_journalled_writepage(struct page *page, * really know unless we go poke around in the buffer_heads. * But block_write_full_page will do the right thing. */ - return block_write_full_page(page, - ext4_normal_get_block_write, - wbc); + return block_write_full_page(page, noalloc_get_block_write, + wbc); } no_write: redirty_page_for_writepage(wbc, page); From a2dc52b5d1d8cc280b3e795abf1c80ac8c49f30c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 12 May 2009 13:51:29 -0400 Subject: [PATCH 1047/2061] ext4: Add BUG_ON debugging checks to noalloc_get_block_write() Enforce that noalloc_get_block_write() is only called to map one block at a time, and that it always is successful in finding a mapping for given an inode's logical block block number if it is called with create == 1. Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index fd5f27a9b81..e6113c3a126 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2433,11 +2433,14 @@ static int noalloc_get_block_write(struct inode *inode, sector_t iblock, int ret = 0; unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; + BUG_ON(bh_result->b_size != inode->i_sb->s_blocksize); + /* * we don't want to do block allocation in writepage * so call get_block_wrap with create = 0 */ ret = ext4_get_blocks(NULL, inode, iblock, max_blocks, bh_result, 0); + BUG_ON(create && ret == 0); if (ret > 0) { bh_result->b_size = (ret << inode->i_blkbits); ret = 0; From 7ed42a28b269f8682eefae27f5c11187eb56e63b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 May 2009 11:33:08 -0700 Subject: [PATCH 1048/2061] x86, boot: correct sanity checks in boot/compressed/misc.c arch/x86/boot/compressed/misc.c contains several sanity checks on the output address. Correct constraints that are no longer correct: - the alignment test should be MIN_KERNEL_ALIGN on both 32 and 64 bits. - the 64 bit maximum address was set to 2^40, which was the limit of one specific x86-64 implementation. Change the test to 2^46, the current Linux limit, and at least try to test the end rather than the beginning. - for non-relocatable kernels, test against LOAD_PHYSICAL_ADDR on both 32 and 64 bits. [ Impact: fix potential boot failure due to invalid tests ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/compressed/misc.c | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/arch/x86/boot/compressed/misc.c b/arch/x86/boot/compressed/misc.c index e45be73684f..842b2a36174 100644 --- a/arch/x86/boot/compressed/misc.c +++ b/arch/x86/boot/compressed/misc.c @@ -325,20 +325,18 @@ asmlinkage void decompress_kernel(void *rmode, memptr heap, free_mem_ptr = heap; /* Heap */ free_mem_end_ptr = heap + BOOT_HEAP_SIZE; + if ((unsigned long)output & (MIN_KERNEL_ALIGN - 1)) + error("Destination address inappropriately aligned"); #ifdef CONFIG_X86_64 - if ((unsigned long)output & (__KERNEL_ALIGN - 1)) - error("Destination address not 2M aligned"); - if ((unsigned long)output >= 0xffffffffffUL) + if (heap > 0x3fffffffffffUL) error("Destination address too large"); #else - if ((u32)output & (CONFIG_PHYSICAL_ALIGN - 1)) - error("Destination address not CONFIG_PHYSICAL_ALIGN aligned"); if (heap > ((-__PAGE_OFFSET-(512<<20)-1) & 0x7fffffff)) error("Destination address too large"); -#ifndef CONFIG_RELOCATABLE - if ((u32)output != LOAD_PHYSICAL_ADDR) - error("Wrong destination address"); #endif +#ifndef CONFIG_RELOCATABLE + if ((unsigned long)output != LOAD_PHYSICAL_ADDR) + error("Wrong destination address"); #endif if (!quiet) From c4f68236e41641494f9c8a418ccc0678c335bbb5 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Tue, 12 May 2009 11:37:34 -0700 Subject: [PATCH 1049/2061] x86-64: align __PHYSICAL_START, remove __KERNEL_ALIGN Handle the misconfiguration where CONFIG_PHYSICAL_START is incompatible with CONFIG_PHYSICAL_ALIGN. This is a configuration error, but one which arises easily since Kconfig doesn't have the smarts to express the true relationship between these two variables. Hence, align __PHYSICAL_START the same way we align LOAD_PHYSICAL_ADDR in . For non-relocatable kernels, this would cause the boot to fail. [ Impact: fix boot failures for non-relocatable kernels ] Reported-by: Ingo Molnar Signed-off-by: H. Peter Anvin --- arch/x86/include/asm/page_64_types.h | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/arch/x86/include/asm/page_64_types.h b/arch/x86/include/asm/page_64_types.h index d38c91b7024..e11900f2500 100644 --- a/arch/x86/include/asm/page_64_types.h +++ b/arch/x86/include/asm/page_64_types.h @@ -32,17 +32,9 @@ */ #define __PAGE_OFFSET _AC(0xffff880000000000, UL) -#define __PHYSICAL_START CONFIG_PHYSICAL_START -#define __KERNEL_ALIGN 0x200000 - -/* - * Make sure kernel is aligned to 2MB address. Catching it at compile - * time is better. Change your config file and compile the kernel - * for a 2MB aligned address (CONFIG_PHYSICAL_START) - */ -#if (CONFIG_PHYSICAL_START % __KERNEL_ALIGN) != 0 -#error "CONFIG_PHYSICAL_START must be a multiple of 2MB" -#endif +#define __PHYSICAL_START ((CONFIG_PHYSICAL_START + \ + (CONFIG_PHYSICAL_ALIGN - 1)) & \ + ~(CONFIG_PHYSICAL_ALIGN - 1)) #define __START_KERNEL (__START_KERNEL_map + __PHYSICAL_START) #define __START_KERNEL_map _AC(0xffffffff80000000, UL) From 5bb9efe33ea4001a17ab98186a40a134a3061d67 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 08:12:51 +0200 Subject: [PATCH 1050/2061] perf_counter: fix print debug irq disable inconsistent {IN-HARDIRQ-W} -> {HARDIRQ-ON-W} usage. bash/15802 [HC0[0]:SC0[0]:HE1:SE1] takes: (sysrq_key_table_lock){?.....}, Don't unconditionally enable interrupts in the perf_counter_print_debug() path. [ Impact: fix potential deadlock pointed out by lockdep ] LKML-Reference: Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index da27419923a..f7772ff7936 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -621,12 +621,13 @@ void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; struct cpu_hw_counters *cpuc; + unsigned long flags; int cpu, idx; if (!x86_pmu.num_counters) return; - local_irq_disable(); + local_irq_save(flags); cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); @@ -664,7 +665,7 @@ void perf_counter_print_debug(void) pr_info("CPU#%d: fixed-PMC%d count: %016llx\n", cpu, idx, pmc_count); } - local_irq_enable(); + local_irq_restore(flags); } static void x86_pmu_disable(struct perf_counter *counter) From 4f23122858a27ba97444b9b37a066d83edebd4c8 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 13 May 2009 08:35:35 +0200 Subject: [PATCH 1051/2061] splice: fix repeated kmap()'s in default_file_splice_read() We cannot reliably map more than one page at the time, or we risk deadlocking. Just allocate the pages from low mem instead. Reported-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index eefd96b1d7f..c5e3c79b95a 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -580,13 +580,13 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, for (i = 0; i < nr_pages && i < PIPE_BUFFERS && len; i++) { struct page *page; - page = alloc_page(GFP_HIGHUSER); + page = alloc_page(GFP_USER); error = -ENOMEM; if (!page) goto err; this_len = min_t(size_t, len, PAGE_CACHE_SIZE - offset); - vec[i].iov_base = (void __user *) kmap(page); + vec[i].iov_base = (void __user *) page_address(page); vec[i].iov_len = this_len; pages[i] = page; spd.nr_pages++; @@ -604,7 +604,6 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, nr_freed = 0; for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); this_len = min_t(size_t, vec[i].iov_len, res); partial[i].offset = 0; partial[i].len = this_len; @@ -624,10 +623,9 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, return res; err: - for (i = 0; i < spd.nr_pages; i++) { - kunmap(pages[i]); + for (i = 0; i < spd.nr_pages; i++) __free_page(pages[i]); - } + return error; } EXPORT_SYMBOL(default_file_splice_read); From af777ce42d3d51cdef353ce296d6f99dc503feef Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 16:59:40 +0900 Subject: [PATCH 1052/2061] sh: clkfwk: module_clk -> peripheral_clk rename. For consistenct naming, and to allow us to fix up some confusion in the SH-Mobile clock framework, amongst other places. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 6 +++--- arch/sh/kernel/cpu/sh2/setup-sh7619.c | 4 ++-- arch/sh/kernel/cpu/sh2a/setup-mxg.c | 6 +++--- arch/sh/kernel/cpu/sh2a/setup-sh7201.c | 6 +++--- arch/sh/kernel/cpu/sh2a/setup-sh7203.c | 8 ++++---- arch/sh/kernel/cpu/sh2a/setup-sh7206.c | 10 +++++----- arch/sh/kernel/cpu/sh3/setup-sh7705.c | 6 +++--- arch/sh/kernel/cpu/sh3/setup-sh770x.c | 6 +++--- arch/sh/kernel/cpu/sh3/setup-sh7710.c | 6 +++--- arch/sh/kernel/cpu/sh3/setup-sh7720.c | 16 ++++++++-------- arch/sh/kernel/cpu/sh4/setup-sh4-202.c | 6 +++--- arch/sh/kernel/cpu/sh4/setup-sh7750.c | 10 +++++----- arch/sh/kernel/cpu/sh4/setup-sh7760.c | 6 +++--- arch/sh/kernel/cpu/sh4a/setup-sh7763.c | 12 ++++++------ arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 18 +++++++++--------- arch/sh/kernel/cpu/sh4a/setup-sh7780.c | 12 ++++++------ arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 12 ++++++------ arch/sh/kernel/cpu/sh4a/setup-sh7786.c | 24 ++++++++++++------------ arch/sh/kernel/cpu/sh4a/setup-shx3.c | 12 ++++++------ arch/sh/kernel/cpu/sh5/setup-sh5.c | 6 +++--- drivers/i2c/busses/i2c-sh7760.c | 2 +- drivers/serial/sh-sci.c | 2 +- sound/oss/sh_dac_audio.c | 10 +++++----- 23 files changed, 103 insertions(+), 103 deletions(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 61ff227561d..0eedf939264 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -50,8 +50,8 @@ static struct clk master_clk = { .rate = CONFIG_SH_PCLK_FREQ, }; -static struct clk module_clk = { - .name = "module_clk", +static struct clk peripheral_clk = { + .name = "peripheral_clk", .parent = &master_clk, .flags = CLK_ENABLE_ON_INIT, }; @@ -73,7 +73,7 @@ static struct clk cpu_clk = { */ static struct clk *onchip_clocks[] = { &master_clk, - &module_clk, + &peripheral_clk, &bus_clk, &cpu_clk, }; diff --git a/arch/sh/kernel/cpu/sh2/setup-sh7619.c b/arch/sh/kernel/cpu/sh2/setup-sh7619.c index 94ac27fc223..13798733f2d 100644 --- a/arch/sh/kernel/cpu/sh2/setup-sh7619.c +++ b/arch/sh/kernel/cpu/sh2/setup-sh7619.c @@ -115,7 +115,7 @@ static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; @@ -147,7 +147,7 @@ static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; diff --git a/arch/sh/kernel/cpu/sh2a/setup-mxg.c b/arch/sh/kernel/cpu/sh2a/setup-mxg.c index a452d964906..869c2da4820 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-mxg.c +++ b/arch/sh/kernel/cpu/sh2a/setup-mxg.c @@ -118,7 +118,7 @@ static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -149,7 +149,7 @@ static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -180,7 +180,7 @@ static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c index 772358b7685..d8febe12806 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7201.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7201.c @@ -255,7 +255,7 @@ static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -286,7 +286,7 @@ static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -317,7 +317,7 @@ static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c index d7493418ba6..62e3039d239 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7203.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7203.c @@ -211,7 +211,7 @@ static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; @@ -243,7 +243,7 @@ static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; @@ -275,7 +275,7 @@ static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -306,7 +306,7 @@ static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; diff --git a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c index 2fc6bff5c5f..3e6f3d7a58b 100644 --- a/arch/sh/kernel/cpu/sh2a/setup-sh7206.c +++ b/arch/sh/kernel/cpu/sh2a/setup-sh7206.c @@ -171,7 +171,7 @@ static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; @@ -203,7 +203,7 @@ static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x08, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 0, /* disabled due to code generation issues */ }; @@ -235,7 +235,7 @@ static struct sh_timer_config mtu2_0_platform_data = { .name = "MTU2_0", .channel_offset = -0x80, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -266,7 +266,7 @@ static struct sh_timer_config mtu2_1_platform_data = { .name = "MTU2_1", .channel_offset = -0x100, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -297,7 +297,7 @@ static struct sh_timer_config mtu2_2_platform_data = { .name = "MTU2_2", .channel_offset = 0x80, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7705.c b/arch/sh/kernel/cpu/sh3/setup-sh7705.c index 39513664d5d..88f742fed9e 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7705.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7705.c @@ -121,7 +121,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -152,7 +152,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0xe, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -183,7 +183,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1a, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh770x.c b/arch/sh/kernel/cpu/sh3/setup-sh770x.c index 9412d915b84..c5630679858 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh770x.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh770x.c @@ -149,7 +149,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -180,7 +180,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0xe, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -211,7 +211,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1a, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7710.c b/arch/sh/kernel/cpu/sh3/setup-sh7710.c index 07ff38d055a..efa76c8148f 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7710.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7710.c @@ -125,7 +125,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -156,7 +156,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0xe, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -187,7 +187,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1a, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh3/setup-sh7720.c b/arch/sh/kernel/cpu/sh3/setup-sh7720.c index d8b46f5dff6..5b2107798ed 100644 --- a/arch/sh/kernel/cpu/sh3/setup-sh7720.c +++ b/arch/sh/kernel/cpu/sh3/setup-sh7720.c @@ -128,7 +128,7 @@ static struct sh_timer_config cmt0_platform_data = { .name = "CMT0", .channel_offset = 0x10, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 125, .clocksource_rating = 125, }; @@ -160,7 +160,7 @@ static struct sh_timer_config cmt1_platform_data = { .name = "CMT1", .channel_offset = 0x20, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource cmt1_resources[] = { @@ -190,7 +190,7 @@ static struct sh_timer_config cmt2_platform_data = { .name = "CMT2", .channel_offset = 0x30, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource cmt2_resources[] = { @@ -220,7 +220,7 @@ static struct sh_timer_config cmt3_platform_data = { .name = "CMT3", .channel_offset = 0x40, .timer_bit = 3, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource cmt3_resources[] = { @@ -250,7 +250,7 @@ static struct sh_timer_config cmt4_platform_data = { .name = "CMT4", .channel_offset = 0x50, .timer_bit = 4, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource cmt4_resources[] = { @@ -280,7 +280,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x02, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -311,7 +311,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0xe, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -342,7 +342,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1a, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c index be79fa13625..6d088d12359 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh4-202.c @@ -38,7 +38,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -69,7 +69,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -100,7 +100,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7750.c b/arch/sh/kernel/cpu/sh4/setup-sh7750.c index 09da0c187d4..851672d15cf 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7750.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7750.c @@ -65,7 +65,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -96,7 +96,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -127,7 +127,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -162,7 +162,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -192,7 +192,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4/setup-sh7760.c b/arch/sh/kernel/cpu/sh4/setup-sh7760.c index cd097335758..5b822519bd9 100644 --- a/arch/sh/kernel/cpu/sh4/setup-sh7760.c +++ b/arch/sh/kernel/cpu/sh4/setup-sh7760.c @@ -164,7 +164,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -195,7 +195,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -226,7 +226,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c index c91f34c9aa8..f1e0c0d36da 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7763.c @@ -118,7 +118,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -149,7 +149,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -180,7 +180,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -210,7 +210,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -240,7 +240,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -270,7 +270,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index b61d6143aaa..12fb8752d4f 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -81,7 +81,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -112,7 +112,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -143,7 +143,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -173,7 +173,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -203,7 +203,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -233,7 +233,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { @@ -263,7 +263,7 @@ static struct sh_timer_config tmu6_platform_data = { .name = "TMU6", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu6_resources[] = { @@ -293,7 +293,7 @@ static struct sh_timer_config tmu7_platform_data = { .name = "TMU7", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu7_resources[] = { @@ -323,7 +323,7 @@ static struct sh_timer_config tmu8_platform_data = { .name = "TMU8", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu8_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c index f1df0209506..715e05b431e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7780.c @@ -18,7 +18,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -49,7 +49,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -80,7 +80,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -110,7 +110,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -140,7 +140,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -170,7 +170,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index dc5d3e507a2..d7e77bc77e2 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c index 2c464bf5a89..93e0d2c017e 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7786.c @@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -167,7 +167,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -197,7 +197,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -227,7 +227,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { @@ -257,7 +257,7 @@ static struct sh_timer_config tmu6_platform_data = { .name = "TMU6", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu6_resources[] = { @@ -287,7 +287,7 @@ static struct sh_timer_config tmu7_platform_data = { .name = "TMU7", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu7_resources[] = { @@ -317,7 +317,7 @@ static struct sh_timer_config tmu8_platform_data = { .name = "TMU8", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu8_resources[] = { @@ -347,7 +347,7 @@ static struct sh_timer_config tmu9_platform_data = { .name = "TMU9", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu9_resources[] = { @@ -377,7 +377,7 @@ static struct sh_timer_config tmu10_platform_data = { .name = "TMU10", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu10_resources[] = { @@ -407,7 +407,7 @@ static struct sh_timer_config tmu11_platform_data = { .name = "TMU11", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu11_resources[] = { diff --git a/arch/sh/kernel/cpu/sh4a/setup-shx3.c b/arch/sh/kernel/cpu/sh4a/setup-shx3.c index 9d5185b42f1..53c65fd9cce 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/setup-shx3.c @@ -53,7 +53,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -84,7 +84,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -115,7 +115,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { @@ -145,7 +145,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu3_resources[] = { @@ -175,7 +175,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu4_resources[] = { @@ -205,7 +205,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu5_resources[] = { diff --git a/arch/sh/kernel/cpu/sh5/setup-sh5.c b/arch/sh/kernel/cpu/sh5/setup-sh5.c index 678d69bdebb..f5ff1ac57fc 100644 --- a/arch/sh/kernel/cpu/sh5/setup-sh5.c +++ b/arch/sh/kernel/cpu/sh5/setup-sh5.c @@ -75,7 +75,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "module_clk", + .clk = "peripheral_clk", .clockevent_rating = 200, }; @@ -106,7 +106,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "module_clk", + .clk = "peripheral_clk", .clocksource_rating = 200, }; @@ -137,7 +137,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "module_clk", + .clk = "peripheral_clk", }; static struct resource tmu2_resources[] = { diff --git a/drivers/i2c/busses/i2c-sh7760.c b/drivers/i2c/busses/i2c-sh7760.c index baa28b73ae4..b9680f50f54 100644 --- a/drivers/i2c/busses/i2c-sh7760.c +++ b/drivers/i2c/busses/i2c-sh7760.c @@ -396,7 +396,7 @@ static int __devinit calc_CCR(unsigned long scl_hz) signed char cdf, cdfm; int scgd, scgdm, scgds; - mclk = clk_get(NULL, "module_clk"); + mclk = clk_get(NULL, "peripheral_clk"); if (IS_ERR(mclk)) { return PTR_ERR(mclk); } else { diff --git a/drivers/serial/sh-sci.c b/drivers/serial/sh-sci.c index 4e3248d5860..fa4d52a6c03 100644 --- a/drivers/serial/sh-sci.c +++ b/drivers/serial/sh-sci.c @@ -1084,7 +1084,7 @@ static void __devinit sci_init_single(struct platform_device *dev, sci_port->port.uartclk = CONFIG_CPU_CLOCK; #elif defined(CONFIG_HAVE_CLK) sci_port->iclk = p->clk ? clk_get(&dev->dev, p->clk) : NULL; - sci_port->dclk = clk_get(&dev->dev, "module_clk"); + sci_port->dclk = clk_get(&dev->dev, "peripheral_clk"); sci_port->enable = sci_clk_enable; sci_port->disable = sci_clk_disable; #else diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c index 78cfb66e4c5..6fa3140720b 100644 --- a/sound/oss/sh_dac_audio.c +++ b/sound/oss/sh_dac_audio.c @@ -95,18 +95,18 @@ static void dac_audio_stop(void) outw(v, HD64461_GPADR); } - sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); + sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); } static void dac_audio_set_rate(void) { unsigned long interval; - struct clk *clk; + struct clk *clk; - clk = clk_get(NULL, "module_clk"); - interval = (clk_get_rate(clk) / 4) / rate; - clk_put(clk); + clk = clk_get(NULL, "peripheral_clk"); + interval = (clk_get_rate(clk) / 4) / rate; + clk_put(clk); ctrl_outl(interval, TMU1_TCOR); ctrl_outl(interval, TMU1_TCNT); } From d672fef02738582bdeae6e77176e141eeb9169bc Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 17:03:09 +0900 Subject: [PATCH 1053/2061] sh: clkfwk: Handle NULL clkops for root clocks. root clocks may simply be placeholders for rate and ancestry information, and have no real associated operations of their own. Account for this, so we are still able to use these sorts of clocks for rate propagation. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 0eedf939264..2ced20f870d 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -103,7 +103,7 @@ void propagate_rate(struct clk *tclk) struct clk *clkp; list_for_each_entry(clkp, &tclk->children, sibling) { - if (clkp->ops->recalc) + if (clkp->ops && clkp->ops->recalc) clkp->rate = clkp->ops->recalc(clkp); propagate_rate(clkp); } @@ -196,7 +196,7 @@ void recalculate_root_clocks(void) struct clk *clkp; list_for_each_entry(clkp, &root_clks, sibling) { - if (clkp->ops->recalc) + if (clkp->ops && clkp->ops->recalc) clkp->rate = clkp->ops->recalc(clkp); propagate_rate(clkp); } @@ -224,7 +224,7 @@ int clk_register(struct clk *clk) list_add(&clk->sibling, &root_clks); list_add(&clk->node, &clock_list); - if (clk->ops->init) + if (clk->ops && clk->ops->init) clk->ops->init(clk); mutex_unlock(&clock_list_sem); From 100890c55e326a9acb4429593c5ad2012c194564 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 17:05:51 +0900 Subject: [PATCH 1054/2061] sh: clkfwk: Provide a generic clk_set_rate_ex() path for root clocks. In the case of root clocks (such as clkin oscillators, extal, etc.), the rate information is entirely platform dependent and needs to be lazily set and propagated from the platform code. This provides a method for establishing the rate update on these types of clocks that define no set_rate() op of their own. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/clock.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 2ced20f870d..0a7755cc8a2 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -265,20 +265,27 @@ EXPORT_SYMBOL_GPL(clk_set_rate); int clk_set_rate_ex(struct clk *clk, unsigned long rate, int algo_id) { int ret = -EOPNOTSUPP; + unsigned long flags; + + spin_lock_irqsave(&clock_lock, flags); if (likely(clk->ops && clk->ops->set_rate)) { - unsigned long flags; - - spin_lock_irqsave(&clock_lock, flags); ret = clk->ops->set_rate(clk, rate, algo_id); - if (ret == 0) { - if (clk->ops->recalc) - clk->rate = clk->ops->recalc(clk); - propagate_rate(clk); - } - spin_unlock_irqrestore(&clock_lock, flags); + if (ret != 0) + goto out_unlock; + } else { + clk->rate = rate; + ret = 0; } + if (clk->ops && clk->ops->recalc) + clk->rate = clk->ops->recalc(clk); + + propagate_rate(clk); + +out_unlock: + spin_unlock_irqrestore(&clock_lock, flags); + return ret; } EXPORT_SYMBOL_GPL(clk_set_rate_ex); From 253b0887b3736160feac9ccdcf146a2073e41463 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 17:38:11 +0900 Subject: [PATCH 1055/2061] sh: clkfwk: Rework legacy CPG clock handling. This moves out the old legacy CPG clocks to their own file, and converts over the existing users. With these clocks going away and each CPU dealing with them on their own, CPUs can gradually move over to the new interface. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 3 ++ arch/sh/include/asm/clock.h | 5 +- arch/sh/include/asm/machvec.h | 2 + arch/sh/kernel/cpu/Makefile | 1 + arch/sh/kernel/cpu/clock-cpg.c | 60 ++++++++++++++++++++++ arch/sh/kernel/cpu/clock.c | 70 ++++++-------------------- arch/sh/kernel/cpu/sh4/clock-sh4-202.c | 5 +- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 + arch/sh/kernel/cpu/sh4a/clock-sh7763.c | 5 +- arch/sh/kernel/cpu/sh4a/clock-sh7780.c | 5 +- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 5 +- arch/sh/kernel/cpu/sh4a/clock-sh7786.c | 5 +- arch/sh/kernel/cpu/sh4a/clock-shx3.c | 5 +- 13 files changed, 110 insertions(+), 63 deletions(-) create mode 100644 arch/sh/kernel/cpu/clock-cpg.c diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index fb75c2d1928..d7990cd2f8d 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -513,6 +513,9 @@ config SH_PCLK_FREQ This is necessary for determining the reference clock value on platforms lacking an RTC. +config SH_CLK_CPG_LEGACY + def_bool y + config SH_CLK_MD int "CPU Mode Pin Setting" depends on CPU_SH2 diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index e2f5bf1b4a9..c27e844db0d 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -47,7 +47,7 @@ struct clk_lookup { #define CLK_ENABLE_ON_INIT (1 << 0) /* Should be defined by processor-specific code */ -void arch_init_clk_ops(struct clk_ops **, int type); +void __deprecated arch_init_clk_ops(struct clk_ops **, int type); int __init arch_clk_init(void); /* arch/sh/kernel/cpu/clock.c */ @@ -59,6 +59,9 @@ int clk_reparent(struct clk *child, struct clk *parent); int clk_register(struct clk *); void clk_unregister(struct clk *); +/* arch/sh/kernel/cpu/clock-cpg.c */ +int __init __deprecated cpg_clk_init(void); + /* the exported API, in addition to clk_set_rate */ /** * clk_set_rate_ex - set the clock rate for a clock source, with additional parameter diff --git a/arch/sh/include/asm/machvec.h b/arch/sh/include/asm/machvec.h index 64b1c16a0f0..73d6d16fa06 100644 --- a/arch/sh/include/asm/machvec.h +++ b/arch/sh/include/asm/machvec.h @@ -46,6 +46,8 @@ struct sh_machine_vector { void __iomem *(*mv_ioport_map)(unsigned long port, unsigned int size); void (*mv_ioport_unmap)(void __iomem *); + + int (*mv_clk_init)(void); }; extern struct sh_machine_vector sh_mv; diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile index 2600641a483..05105b980c2 100644 --- a/arch/sh/kernel/cpu/Makefile +++ b/arch/sh/kernel/cpu/Makefile @@ -17,5 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE) += shmobile/ obj-$(CONFIG_UBC_WAKEUP) += ubc.o obj-$(CONFIG_SH_ADC) += adc.o +obj-$(CONFIG_SH_CLK_CPG_LEGACY) += clock-cpg.o obj-y += irq/ init.o clock.o diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c new file mode 100644 index 00000000000..b4ca2004844 --- /dev/null +++ b/arch/sh/kernel/cpu/clock-cpg.c @@ -0,0 +1,60 @@ +#include +#include +#include + +static struct clk master_clk = { + .name = "master_clk", + .flags = CLK_ENABLE_ON_INIT, + .rate = CONFIG_SH_PCLK_FREQ, +}; + +static struct clk peripheral_clk = { + .name = "peripheral_clk", + .parent = &master_clk, + .flags = CLK_ENABLE_ON_INIT, +}; + +static struct clk bus_clk = { + .name = "bus_clk", + .parent = &master_clk, + .flags = CLK_ENABLE_ON_INIT, +}; + +static struct clk cpu_clk = { + .name = "cpu_clk", + .parent = &master_clk, + .flags = CLK_ENABLE_ON_INIT, +}; + +/* + * The ordering of these clocks matters, do not change it. + */ +static struct clk *onchip_clocks[] = { + &master_clk, + &peripheral_clk, + &bus_clk, + &cpu_clk, +}; + +int __init __deprecated cpg_clk_init(void) +{ + int i, ret = 0; + + for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) { + struct clk *clk = onchip_clocks[i]; + arch_init_clk_ops(&clk->ops, i); + if (clk->ops) + ret |= clk_register(clk); + } + + return ret; +} + +/* + * Placeholder for compatability, until the lazy CPUs do this + * on their own. + */ +int __init __weak arch_clk_init(void) +{ + return cpg_clk_init(); +} diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 0a7755cc8a2..033f4662b59 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -30,54 +30,12 @@ #include #include #include +#include static LIST_HEAD(clock_list); static DEFINE_SPINLOCK(clock_lock); static DEFINE_MUTEX(clock_list_sem); -/* - * Each subtype is expected to define the init routines for these clocks, - * as each subtype (or processor family) will have these clocks at the - * very least. These are all provided through the CPG, which even some of - * the more quirky parts (such as ST40, SH4-202, etc.) still have. - * - * The processor-specific code is expected to register any additional - * clock sources that are of interest. - */ -static struct clk master_clk = { - .name = "master_clk", - .flags = CLK_ENABLE_ON_INIT, - .rate = CONFIG_SH_PCLK_FREQ, -}; - -static struct clk peripheral_clk = { - .name = "peripheral_clk", - .parent = &master_clk, - .flags = CLK_ENABLE_ON_INIT, -}; - -static struct clk bus_clk = { - .name = "bus_clk", - .parent = &master_clk, - .flags = CLK_ENABLE_ON_INIT, -}; - -static struct clk cpu_clk = { - .name = "cpu_clk", - .parent = &master_clk, - .flags = CLK_ENABLE_ON_INIT, -}; - -/* - * The ordering of these clocks matters, do not change it. - */ -static struct clk *onchip_clocks[] = { - &master_clk, - &peripheral_clk, - &bus_clk, - &cpu_clk, -}; - /* Used for clocks that always have same value as the parent clock */ unsigned long followparent_recalc(struct clk *clk) { @@ -443,10 +401,6 @@ void clk_put(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_put); -int __init __weak arch_clk_init(void) -{ - return 0; -} static int show_clocks(char *buf, char **start, off_t off, int len, int *eof, void *data) @@ -533,18 +487,22 @@ subsys_initcall(clk_sysdev_init); int __init clk_init(void) { - int i, ret = 0; + int ret; - BUG_ON(!master_clk.rate); - - for (i = 0; i < ARRAY_SIZE(onchip_clocks); i++) { - struct clk *clk = onchip_clocks[i]; - - arch_init_clk_ops(&clk->ops, i); - ret |= clk_register(clk); + ret = arch_clk_init(); + if (unlikely(ret)) { + pr_err("%s: CPU clock registration failed.\n", __func__); + return ret; } - ret |= arch_clk_init(); + if (sh_mv.mv_clk_init) { + ret = sh_mv.mv_clk_init(); + if (unlikely(ret)) { + pr_err("%s: machvec clock initialization failed.\n", + __func__); + return ret; + } + } /* Kick the child clocks.. */ recalculate_root_clocks(); diff --git a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c index a72dc326d0b..21421e34e7d 100644 --- a/arch/sh/kernel/cpu/sh4/clock-sh4-202.c +++ b/arch/sh/kernel/cpu/sh4/clock-sh4-202.c @@ -152,9 +152,12 @@ static struct clk *sh4202_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh4202_onchip_clocks); i++) { struct clk *clkp = sh4202_onchip_clocks[i]; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index 426065b4326..a98d7da67b9 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -892,6 +892,8 @@ int __init arch_clk_init(void) struct clk *clk; int i; + clk_cpg_init(); + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) { pr_debug( "Registering clock '%s'\n", sh7722_clocks[i]->name); diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c index ce3d4e6319a..370cd47642e 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7763.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7763.c @@ -92,9 +92,12 @@ static struct clk *sh7763_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7763_onchip_clocks); i++) { struct clk *clkp = sh7763_onchip_clocks[i]; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c index 38b8b1ddb28..a249d823578 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7780.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7780.c @@ -98,9 +98,12 @@ static struct clk *sh7780_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7780_onchip_clocks); i++) { struct clk *clkp = sh7780_onchip_clocks[i]; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index fa14f35bc11..bf5a0dacf8e 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -136,9 +136,12 @@ static struct clk *sh7785_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) { struct clk *clkp = sh7785_onchip_clocks[i]; diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c index 7907002fa1d..a0e8869071a 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7786.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7786.c @@ -122,9 +122,12 @@ static struct clk *sh7786_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7786_onchip_clocks); i++) { struct clk *clkp = sh7786_onchip_clocks[i]; diff --git a/arch/sh/kernel/cpu/sh4a/clock-shx3.c b/arch/sh/kernel/cpu/sh4a/clock-shx3.c index c14553e3e13..23c27d32d98 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-shx3.c +++ b/arch/sh/kernel/cpu/sh4a/clock-shx3.c @@ -109,9 +109,12 @@ static struct clk *shx3_onchip_clocks[] = { int __init arch_clk_init(void) { - struct clk *clk = clk_get(NULL, "master_clk"); + struct clk *clk; int i, ret = 0; + cpg_clk_init(); + + clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(shx3_onchip_clocks); i++) { struct clk *clkp = shx3_onchip_clocks[i]; From a77b5ac0ea8e47c77008d3a9a9976dcfbc01c42a Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 17:55:00 +0900 Subject: [PATCH 1056/2061] sh: clkfwk: Update SH7785 for refactored clock framework. This updates the SH7785 CPU code as well as the SH7785LCR board support code for making use of the newly refactored clock framework. Support for the legacy CPG clocks is dropped at this point, with the extal frequency fed in from the board code. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 2 +- arch/sh/boards/board-sh7785lcr.c | 22 ++- arch/sh/include/asm/clock.h | 2 + arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 247 +++++++++++++------------ 4 files changed, 147 insertions(+), 126 deletions(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index d7990cd2f8d..df764c56b05 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -514,7 +514,7 @@ config SH_PCLK_FREQ platforms lacking an RTC. config SH_CLK_CPG_LEGACY - def_bool y + def_bool y if !CPU_SUBTYPE_SH7785 config SH_CLK_MD int "CPU Mode Pin Setting" diff --git a/arch/sh/boards/board-sh7785lcr.c b/arch/sh/boards/board-sh7785lcr.c index 6f94f17adc4..33b194b0454 100644 --- a/arch/sh/boards/board-sh7785lcr.c +++ b/arch/sh/boards/board-sh7785lcr.c @@ -2,12 +2,12 @@ * Renesas Technology Corp. R0P7785LC0011RL Support. * * Copyright (C) 2008 Yoshihiro Shimoda + * Copyright (C) 2009 Paul Mundt * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive * for more details. */ - #include #include #include @@ -19,8 +19,11 @@ #include #include #include -#include +#include +#include #include +#include +#include /* * NOTE: This board has 2 physical memory maps. @@ -273,6 +276,20 @@ void __init init_sh7785lcr_IRQ(void) plat_irq_setup_pins(IRQ_MODE_IRQ3210); } +static int sh7785lcr_clk_init(void) +{ + struct clk *clk; + int ret; + + clk = clk_get(NULL, "extal"); + if (!clk || IS_ERR(clk)) + return PTR_ERR(clk); + ret = clk_set_rate(clk, 33333333); + clk_put(clk); + + return ret; +} + static void sh7785lcr_power_off(void) { unsigned char *p; @@ -309,6 +326,7 @@ static void __init sh7785lcr_setup(char **cmdline_p) static struct sh_machine_vector mv_sh7785lcr __initmv = { .mv_name = "SH7785LCR", .mv_setup = sh7785lcr_setup, + .mv_clk_init = sh7785lcr_clk_init, .mv_init_irq = init_sh7785lcr_IRQ, }; diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index c27e844db0d..40cf3c07d7e 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -34,7 +34,9 @@ struct clk { unsigned long rate; unsigned long flags; + unsigned long arch_flags; + void *priv; }; struct clk_lookup { diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index bf5a0dacf8e..87584dc8192 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -3,7 +3,7 @@ * * SH7785 support for the clock framework * - * Copyright (C) 2007 Paul Mundt + * Copyright (C) 2007 - 2009 Paul Mundt * * This file is subject to the terms and conditions of the GNU General Public * License. See the file "COPYING" in the main directory of this archive @@ -11,145 +11,146 @@ */ #include #include +#include +#include #include #include -#include -static int ifc_divisors[] = { 1, 2, 4, 6 }; -static int ufc_divisors[] = { 1, 1, 4, 6 }; -static int sfc_divisors[] = { 1, 1, 4, 6 }; -static int bfc_divisors[] = { 1, 1, 1, 1, 1, 12, 16, 18, - 24, 32, 36, 48, 1, 1, 1, 1 }; -static int mfc_divisors[] = { 1, 1, 4, 6 }; -static int pfc_divisors[] = { 1, 1, 1, 1, 1, 1, 1, 18, - 24, 32, 36, 48, 1, 1, 1, 1 }; +static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18, + 24, 32, 36, 48 }; +struct clk_priv { + unsigned int shift; +}; -static void master_clk_init(struct clk *clk) +#define FRQMR_CLK_DATA(_name, _shift) \ +static struct clk_priv _name##_data = { .shift = _shift, } + +FRQMR_CLK_DATA(pfc, 0); +FRQMR_CLK_DATA(s3fc, 4); +FRQMR_CLK_DATA(s2fc, 8); +FRQMR_CLK_DATA(mfc, 12); +FRQMR_CLK_DATA(bfc, 16); +FRQMR_CLK_DATA(sfc, 20); +FRQMR_CLK_DATA(ufc, 24); +FRQMR_CLK_DATA(ifc, 28); + +static unsigned long frqmr_clk_recalc(struct clk *clk) { - clk->rate *= pfc_divisors[ctrl_inl(FRQMR1) & 0x000f]; + struct clk_priv *data = clk->priv; + unsigned int idx; + + idx = (__raw_readl(FRQMR1) >> data->shift) & 0x000f; + + /* + * XXX: PLL1 multiplier is locked for the default clock mode, + * when mode pin detection and configuration support is added, + * select the multiplier dynamically. + */ + return clk->parent->rate * 36 / div2[idx]; } -static struct clk_ops sh7785_master_clk_ops = { - .init = master_clk_init, -}; - -static unsigned long module_clk_recalc(struct clk *clk) -{ - int idx = (ctrl_inl(FRQMR1) & 0x000f); - return clk->parent->rate / pfc_divisors[idx]; -} - -static struct clk_ops sh7785_module_clk_ops = { - .recalc = module_clk_recalc, -}; - -static unsigned long bus_clk_recalc(struct clk *clk) -{ - int idx = ((ctrl_inl(FRQMR1) >> 16) & 0x000f); - return clk->parent->rate / bfc_divisors[idx]; -} - -static struct clk_ops sh7785_bus_clk_ops = { - .recalc = bus_clk_recalc, -}; - -static unsigned long cpu_clk_recalc(struct clk *clk) -{ - int idx = ((ctrl_inl(FRQMR1) >> 28) & 0x0003); - return clk->parent->rate / ifc_divisors[idx]; -} - -static struct clk_ops sh7785_cpu_clk_ops = { - .recalc = cpu_clk_recalc, -}; - -static struct clk_ops *sh7785_clk_ops[] = { - &sh7785_master_clk_ops, - &sh7785_module_clk_ops, - &sh7785_bus_clk_ops, - &sh7785_cpu_clk_ops, -}; - -void __init arch_init_clk_ops(struct clk_ops **ops, int idx) -{ - if (idx < ARRAY_SIZE(sh7785_clk_ops)) - *ops = sh7785_clk_ops[idx]; -} - -static unsigned long shyway_clk_recalc(struct clk *clk) -{ - int idx = ((ctrl_inl(FRQMR1) >> 20) & 0x0003); - return clk->parent->rate / sfc_divisors[idx]; -} - -static struct clk_ops sh7785_shyway_clk_ops = { - .recalc = shyway_clk_recalc, -}; - -static struct clk sh7785_shyway_clk = { - .name = "shyway_clk", - .flags = CLK_ENABLE_ON_INIT, - .ops = &sh7785_shyway_clk_ops, -}; - -static unsigned long ddr_clk_recalc(struct clk *clk) -{ - int idx = ((ctrl_inl(FRQMR1) >> 12) & 0x0003); - return clk->parent->rate / mfc_divisors[idx]; -} - -static struct clk_ops sh7785_ddr_clk_ops = { - .recalc = ddr_clk_recalc, -}; - -static struct clk sh7785_ddr_clk = { - .name = "ddr_clk", - .flags = CLK_ENABLE_ON_INIT, - .ops = &sh7785_ddr_clk_ops, -}; - -static unsigned long ram_clk_recalc(struct clk *clk) -{ - int idx = ((ctrl_inl(FRQMR1) >> 24) & 0x0003); - return clk->parent->rate / ufc_divisors[idx]; -} - -static struct clk_ops sh7785_ram_clk_ops = { - .recalc = ram_clk_recalc, -}; - -static struct clk sh7785_ram_clk = { - .name = "ram_clk", - .flags = CLK_ENABLE_ON_INIT, - .ops = &sh7785_ram_clk_ops, +static struct clk_ops frqmr_clk_ops = { + .recalc = frqmr_clk_recalc, }; /* - * Additional SH7785-specific on-chip clocks that aren't already part of the - * clock framework + * Default rate for the root input clock, reset this with clk_set_rate() + * from the platform code. */ -static struct clk *sh7785_onchip_clocks[] = { - &sh7785_shyway_clk, - &sh7785_ddr_clk, - &sh7785_ram_clk, +static struct clk extal_clk = { + .name = "extal", + .id = -1, + .rate = 33333333, +}; + +static struct clk cpu_clk = { + .name = "cpu_clk", /* Ick */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &ifc_data, +}; + +static struct clk shyway_clk = { + .name = "shyway_clk", /* SHck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &sfc_data, +}; + +static struct clk peripheral_clk = { + .name = "peripheral_clk", /* Pck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &pfc_data, +}; + +static struct clk ddr_clk = { + .name = "ddr_clk", /* DDRck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &mfc_data, +}; + +static struct clk bus_clk = { + .name = "bus_clk", /* Bck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &bfc_data, +}; + +static struct clk ga_clk = { + .name = "ga_clk", /* GAck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .priv = &s2fc_data, +}; + +static struct clk du_clk = { + .name = "du_clk", /* DUck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .priv = &s3fc_data, +}; + +static struct clk umem_clk = { + .name = "umem_clk", /* uck */ + .id = -1, + .ops = &frqmr_clk_ops, + .parent = &extal_clk, + .flags = CLK_ENABLE_ON_INIT, + .priv = &ufc_data, +}; + +static struct clk *clks[] = { + &extal_clk, + &cpu_clk, + ­way_clk, + &peripheral_clk, + &ddr_clk, + &bus_clk, + &ga_clk, + &du_clk, + &umem_clk, }; int __init arch_clk_init(void) { - struct clk *clk; int i, ret = 0; - cpg_clk_init(); - - clk = clk_get(NULL, "master_clk"); - for (i = 0; i < ARRAY_SIZE(sh7785_onchip_clocks); i++) { - struct clk *clkp = sh7785_onchip_clocks[i]; - - clkp->parent = clk; - ret |= clk_register(clkp); - } - - clk_put(clk); + for (i = 0; i < ARRAY_SIZE(clks); i++) + ret |= clk_register(clks[i]); return ret; } From a1c0643ff9f360a30644f6e3cd643ca2a5083aea Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 10:56:52 +0100 Subject: [PATCH 1057/2061] GFS2: Move journal live test at transaction start There seems little point grabbing the transaction glock only to have to release it again if the journal isn't live. This moves the test earlier to avoid grabbing the lock when we don't need it in the first place. Signed-off-by: Steven Whitehouse --- fs/gfs2/trans.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 053752d4b27..4ef0e9fa354 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -33,6 +33,9 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, BUG_ON(current->journal_info); BUG_ON(blocks == 0 && revokes == 0); + if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) + return -EROFS; + tr = kzalloc(sizeof(struct gfs2_trans), GFP_NOFS); if (!tr) return -ENOMEM; @@ -54,12 +57,6 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (error) goto fail_holder_uninit; - if (!test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { - tr->tr_t_gh.gh_flags |= GL_NOCACHE; - error = -EROFS; - goto fail_gunlock; - } - error = gfs2_log_reserve(sdp, tr->tr_reserved); if (error) goto fail_gunlock; From cc96eace48fdf0f8925a74c6c1f7ffa512e458d2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 20:28:15 +0900 Subject: [PATCH 1058/2061] sh: clkfwk: rate table construction and rounding for SH7785. This adds support for constructing a rate table by looking at potential divisors for a specified clock. Each FQRMR clock is given its own table. Presently each table is rebuilt when the parent propagates down a new rate, so some more logic needs to be added to do this more intelligently. Additionally, a fairly generic round_rate() implementation is then layered on top of it, which subsequently provides us with cpufreq support. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 1 + arch/sh/kernel/cpu/clock.c | 3 + arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 109 ++++++++++++++++++++++--- 3 files changed, 100 insertions(+), 13 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index 40cf3c07d7e..da681de1500 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -16,6 +16,7 @@ struct clk_ops { int (*set_rate)(struct clk *clk, unsigned long rate, int algo_id); int (*set_parent)(struct clk *clk, struct clk *parent); long (*round_rate)(struct clk *clk, unsigned long rate); + void (*build_rate_table)(struct clk *clk); }; struct clk { diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 033f4662b59..56c6e11fa83 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -63,6 +63,9 @@ void propagate_rate(struct clk *tclk) list_for_each_entry(clkp, &tclk->children, sibling) { if (clkp->ops && clkp->ops->recalc) clkp->rate = clkp->ops->recalc(clkp); + if (clkp->ops && clkp->ops->build_rate_table) + clkp->ops->build_rate_table(clkp); + propagate_rate(clkp); } } diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index 87584dc8192..b7a32dd1b2d 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -13,28 +13,43 @@ #include #include #include +#include #include #include static unsigned int div2[] = { 1, 2, 4, 6, 8, 12, 16, 18, 24, 32, 36, 48 }; struct clk_priv { - unsigned int shift; + unsigned int shift; + + /* allowable divisor bitmap */ + unsigned long div_bitmap; + + /* Supportable frequencies + termination entry */ + struct cpufreq_frequency_table freq_table[ARRAY_SIZE(div2)+1]; }; -#define FRQMR_CLK_DATA(_name, _shift) \ -static struct clk_priv _name##_data = { .shift = _shift, } +#define FRQMR_CLK_DATA(_name, _shift, _div_bitmap) \ +static struct clk_priv _name##_data = { \ + .shift = _shift, \ + .div_bitmap = _div_bitmap, \ + \ + .freq_table[0] = { \ + .index = 0, \ + .frequency = CPUFREQ_TABLE_END, \ + }, \ +} -FRQMR_CLK_DATA(pfc, 0); -FRQMR_CLK_DATA(s3fc, 4); -FRQMR_CLK_DATA(s2fc, 8); -FRQMR_CLK_DATA(mfc, 12); -FRQMR_CLK_DATA(bfc, 16); -FRQMR_CLK_DATA(sfc, 20); -FRQMR_CLK_DATA(ufc, 24); -FRQMR_CLK_DATA(ifc, 28); +FRQMR_CLK_DATA(pfc, 0, 0x0f80); +FRQMR_CLK_DATA(s3fc, 4, 0x0ff0); +FRQMR_CLK_DATA(s2fc, 8, 0x0030); +FRQMR_CLK_DATA(mfc, 12, 0x000c); +FRQMR_CLK_DATA(bfc, 16, 0x0fe0); +FRQMR_CLK_DATA(sfc, 20, 0x000c); +FRQMR_CLK_DATA(ufc, 24, 0x000c); +FRQMR_CLK_DATA(ifc, 28, 0x000e); -static unsigned long frqmr_clk_recalc(struct clk *clk) +static unsigned long frqmr_recalc(struct clk *clk) { struct clk_priv *data = clk->priv; unsigned int idx; @@ -49,8 +64,76 @@ static unsigned long frqmr_clk_recalc(struct clk *clk) return clk->parent->rate * 36 / div2[idx]; } +static void frqmr_build_rate_table(struct clk *clk) +{ + struct clk_priv *data = clk->priv; + int i, entry; + + for (i = entry = 0; i < ARRAY_SIZE(div2); i++) { + if ((data->div_bitmap & (1 << i)) == 0) + continue; + + data->freq_table[entry].index = entry; + data->freq_table[entry].frequency = + clk->parent->rate * 36 / div2[i]; + + entry++; + } + + if (entry == 0) { + pr_warning("clkfwk: failed to build frequency table " + "for \"%s\" clk!\n", clk->name); + return; + } + + /* Termination entry */ + data->freq_table[entry].index = entry; + data->freq_table[entry].frequency = CPUFREQ_TABLE_END; +} + +static long frqmr_round_rate(struct clk *clk, unsigned long rate) +{ + struct clk_priv *data = clk->priv; + unsigned long rate_error, rate_error_prev = ~0UL; + unsigned long rate_best_fit = rate; + unsigned long highest, lowest; + int i; + + highest = lowest = 0; + + for (i = 0; data->freq_table[i].frequency != CPUFREQ_TABLE_END; i++) { + unsigned long freq = data->freq_table[i].frequency; + + if (freq == CPUFREQ_ENTRY_INVALID) + continue; + + if (freq > highest) + highest = freq; + if (freq < lowest) + lowest = freq; + + rate_error = abs(freq - rate); + if (rate_error < rate_error_prev) { + rate_best_fit = freq; + rate_error_prev = rate_error; + } + + if (rate_error == 0) + break; + } + + if (rate >= highest) + rate_best_fit = highest; + if (rate <= lowest) + rate_best_fit = lowest; + + return rate_best_fit; +} + static struct clk_ops frqmr_clk_ops = { - .recalc = frqmr_clk_recalc, + .recalc = frqmr_recalc, + .build_rate_table = frqmr_build_rate_table, + .round_rate = frqmr_round_rate, }; /* From cedcf3366f2191885aff92d33d6078ef08203e52 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 13 May 2009 21:51:28 +0900 Subject: [PATCH 1059/2061] sh: clkfwk: Map tree hierarchy in debugfs. This adopts the OMAP clock framework debugfs bits and replaces the aging procfs bits. The procfs clocks entry was primarily a debugging aid, and used to be tied in to cpuinfo before the clock list grew too unweildly. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 1 + arch/sh/kernel/cpu/clock.c | 112 +++++++++++++++++++++++++++--------- 2 files changed, 86 insertions(+), 27 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index da681de1500..c499d470b8c 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -38,6 +38,7 @@ struct clk { unsigned long arch_flags; void *priv; + struct dentry *dentry; }; struct clk_lookup { diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 56c6e11fa83..686477f8ae5 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -404,24 +404,6 @@ void clk_put(struct clk *clk) } EXPORT_SYMBOL_GPL(clk_put); - -static int show_clocks(char *buf, char **start, off_t off, - int len, int *eof, void *data) -{ - struct clk *clk; - char *p = buf; - - list_for_each_entry_reverse(clk, &clock_list, node) { - unsigned long rate = clk_get_rate(clk); - - p += sprintf(p, "%-12s\t: %ld.%02ldMHz\t%s\n", clk->name, - rate / 1000000, (rate % 1000000) / 10000, - (clk->usecount > 0) ? "enabled" : "disabled"); - } - - return p - buf; -} - #ifdef CONFIG_PM static int clks_sysdev_suspend(struct sys_device *dev, pm_message_t state) { @@ -516,14 +498,90 @@ int __init clk_init(void) return ret; } -static int __init clk_proc_init(void) -{ - struct proc_dir_entry *p; - p = create_proc_read_entry("clocks", S_IRUSR, NULL, - show_clocks, NULL); - if (unlikely(!p)) - return -EINVAL; +/* + * debugfs support to trace clock tree hierarchy and attributes + */ +static struct dentry *clk_debugfs_root; +static int clk_debugfs_register_one(struct clk *c) +{ + int err; + struct dentry *d, *child; + struct clk *pa = c->parent; + char s[255]; + char *p = s; + + p += sprintf(p, "%s", c->name); + if (c->id > 0) + sprintf(p, ":%d", c->id); + d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root); + if (!d) + return -ENOMEM; + c->dentry = d; + + d = debugfs_create_u8("usecount", S_IRUGO, c->dentry, (u8 *)&c->usecount); + if (!d) { + err = -ENOMEM; + goto err_out; + } + d = debugfs_create_u32("rate", S_IRUGO, c->dentry, (u32 *)&c->rate); + if (!d) { + err = -ENOMEM; + goto err_out; + } + d = debugfs_create_x32("flags", S_IRUGO, c->dentry, (u32 *)&c->flags); + if (!d) { + err = -ENOMEM; + goto err_out; + } + return 0; + +err_out: + d = c->dentry; + list_for_each_entry(child, &d->d_subdirs, d_u.d_child) + debugfs_remove(child); + debugfs_remove(c->dentry); + return err; +} + +static int clk_debugfs_register(struct clk *c) +{ + int err; + struct clk *pa = c->parent; + + if (pa && !pa->dentry) { + err = clk_debugfs_register(pa); + if (err) + return err; + } + + if (!c->dentry) { + err = clk_debugfs_register_one(c); + if (err) + return err; + } return 0; } -subsys_initcall(clk_proc_init); + +static int __init clk_debugfs_init(void) +{ + struct clk *c; + struct dentry *d; + int err; + + d = debugfs_create_dir("clock", NULL); + if (!d) + return -ENOMEM; + clk_debugfs_root = d; + + list_for_each_entry(c, &clock_list, node) { + err = clk_debugfs_register(c); + if (err) + goto err_out; + } + return 0; +err_out: + debugfs_remove(clk_debugfs_root); /* REVISIT: Cleanup correctly */ + return err; +} +late_initcall(clk_debugfs_init); From 48c2b613616235d7c97fda5982f50100a6c79166 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 14:49:48 +0100 Subject: [PATCH 1060/2061] GFS2: Add commit= mount option It has always been possible to adjust the gfs2 log commit interval, but only from the sysfs interface. This adds a mount option, commit=, which will be familar to ext3 users. The sysfs interface continues to be available as well, although this might be removed in the future. Also this patch cleans up some duplicated structures in the GFS2 sysfs code. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 1 + fs/gfs2/mount.c | 10 +++++ fs/gfs2/ops_fstype.c | 4 +- fs/gfs2/ops_super.c | 13 ++++++- fs/gfs2/sys.c | 92 ++++++++++++++++---------------------------- 5 files changed, 60 insertions(+), 60 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 399d1b97804..65f438e9537 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -418,6 +418,7 @@ struct gfs2_args { unsigned int ar_data:2; /* ordered/writeback */ unsigned int ar_meta:1; /* mount metafs */ unsigned int ar_discard:1; /* discard requests */ + int ar_commit; /* Commit interval */ }; struct gfs2_tune { diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c index f7e8527a21e..947af151fa2 100644 --- a/fs/gfs2/mount.c +++ b/fs/gfs2/mount.c @@ -45,6 +45,7 @@ enum { Opt_meta, Opt_discard, Opt_nodiscard, + Opt_commit, Opt_err, }; @@ -73,6 +74,7 @@ static const match_table_t tokens = { {Opt_meta, "meta"}, {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, {Opt_err, NULL} }; @@ -89,6 +91,7 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) char *o; int token; substring_t tmp[MAX_OPT_ARGS]; + int rv; /* Split the options into tokens with the "," character and process them */ @@ -173,6 +176,13 @@ int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) case Opt_nodiscard: args->ar_discard = 0; break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; case Opt_err: default: fs_info(sdp, "invalid mount option: %s\n", o); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea75..7981fbc9fc3 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -55,7 +55,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_log_flush_secs = 60; gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; @@ -1165,6 +1164,7 @@ static int fill_super(struct super_block *sb, void *data, int silent) sdp->sd_args.ar_quota = GFS2_QUOTA_DEFAULT; sdp->sd_args.ar_data = GFS2_DATA_DEFAULT; + sdp->sd_args.ar_commit = 60; error = gfs2_mount_args(sdp, &sdp->sd_args, data); if (error) { @@ -1191,6 +1191,8 @@ static int fill_super(struct super_block *sb, void *data, int silent) GFS2_BASIC_BLOCK_SHIFT; sdp->sd_fsb2bb = 1 << sdp->sd_fsb2bb_shift; + sdp->sd_tune.gt_log_flush_secs = sdp->sd_args.ar_commit; + error = init_names(sdp, silent); if (error) goto fail; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 458019569dc..0677a837856 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -436,8 +436,12 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) { struct gfs2_sbd *sdp = sb->s_fs_info; struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; int error; + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); error = gfs2_mount_args(sdp, &args, data); if (error) return error; @@ -473,6 +477,10 @@ static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) sb->s_flags |= MS_POSIXACL; else sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + return 0; } @@ -550,6 +558,7 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) { struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; struct gfs2_args *args = &sdp->sd_args; + int lfsecs; if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) seq_printf(s, ",meta"); @@ -610,7 +619,9 @@ static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) } if (args->ar_discard) seq_printf(s, ",discard"); - + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); return 0; } diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 7655f5025fe..d53b22edc98 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -26,6 +26,36 @@ #include "util.h" #include "glops.h" +struct gfs2_attr { + struct attribute attr; + ssize_t (*show)(struct gfs2_sbd *, char *); + ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); +}; + +static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, + char *buf) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->show ? a->show(sdp, buf) : 0; +} + +static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); + struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); + return a->store ? a->store(sdp, buf, len) : len; +} + +static struct sysfs_ops gfs2_attr_ops = { + .show = gfs2_attr_show, + .store = gfs2_attr_store, +}; + + +static struct kset *gfs2_kset; + static ssize_t id_show(struct gfs2_sbd *sdp, char *buf) { return snprintf(buf, PAGE_SIZE, "%u:%u\n", @@ -212,11 +242,6 @@ static ssize_t demote_rq_store(struct gfs2_sbd *sdp, const char *buf, size_t len return len; } -struct gfs2_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; #define GFS2_ATTR(name, mode, show, store) \ static struct gfs2_attr gfs2_attr_##name = __ATTR(name, mode, show, store) @@ -246,49 +271,21 @@ static struct attribute *gfs2_attrs[] = { NULL, }; -static ssize_t gfs2_attr_show(struct kobject *kobj, struct attribute *attr, - char *buf) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->show ? a->show(sdp, buf) : 0; -} - -static ssize_t gfs2_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct gfs2_sbd *sdp = container_of(kobj, struct gfs2_sbd, sd_kobj); - struct gfs2_attr *a = container_of(attr, struct gfs2_attr, attr); - return a->store ? a->store(sdp, buf, len) : len; -} - -static struct sysfs_ops gfs2_attr_ops = { - .show = gfs2_attr_show, - .store = gfs2_attr_store, -}; - static struct kobj_type gfs2_ktype = { .default_attrs = gfs2_attrs, .sysfs_ops = &gfs2_attr_ops, }; -static struct kset *gfs2_kset; - /* * display struct lm_lockstruct fields */ -struct lockstruct_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define LOCKSTRUCT_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_lockstruct.ls_##name); \ } \ -static struct lockstruct_attr lockstruct_attr_##name = __ATTR_RO(name) +static struct gfs2_attr lockstruct_attr_##name = __ATTR_RO(name) LOCKSTRUCT_ATTR(jid, "%u\n"); LOCKSTRUCT_ATTR(first, "%u\n"); @@ -401,14 +398,8 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_recover_jid_status); } -struct gdlm_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *sdp, char *); - ssize_t (*store)(struct gfs2_sbd *sdp, const char *, size_t); -}; - #define GDLM_ATTR(_name,_mode,_show,_store) \ -static struct gdlm_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) +static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); GDLM_ATTR(block, 0644, block_show, block_store); @@ -434,21 +425,12 @@ static struct attribute *lock_module_attrs[] = { NULL, }; -/* - * display struct gfs2_args fields - */ - -struct args_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); -}; - #define ARGS_ATTR(name, fmt) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ { \ return snprintf(buf, PAGE_SIZE, fmt, sdp->sd_args.ar_##name); \ } \ -static struct args_attr args_attr_##name = __ATTR_RO(name) +static struct gfs2_attr args_attr_##name = __ATTR_RO(name) ARGS_ATTR(lockproto, "%s\n"); ARGS_ATTR(locktable, "%s\n"); @@ -531,14 +513,8 @@ static ssize_t tune_set(struct gfs2_sbd *sdp, unsigned int *field, return len; } -struct tune_attr { - struct attribute attr; - ssize_t (*show)(struct gfs2_sbd *, char *); - ssize_t (*store)(struct gfs2_sbd *, const char *, size_t); -}; - #define TUNE_ATTR_3(name, show, store) \ -static struct tune_attr tune_attr_##name = __ATTR(name, 0644, show, store) +static struct gfs2_attr tune_attr_##name = __ATTR(name, 0644, show, store) #define TUNE_ATTR_2(name, store) \ static ssize_t name##_show(struct gfs2_sbd *sdp, char *buf) \ From 9582d41135c0d362f04ed6bf3dc8d693a7eafee2 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 13 May 2009 15:06:25 +0100 Subject: [PATCH 1061/2061] GFS2: Remove a couple of unused sysfs entries These two tunables are pointless and would never need to be changed anyway. There is also a race between them and umount as the deamons which they refer to might have gone away. The easiest way to fix the race is to remove the interface. Signed-off-by: Steven Whitehouse --- fs/gfs2/sys.c | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index d53b22edc98..894bf773ec9 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -530,15 +530,6 @@ static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ } \ TUNE_ATTR_2(name, name##_store) -#define TUNE_ATTR_DAEMON(name, process) \ -static ssize_t name##_store(struct gfs2_sbd *sdp, const char *buf, size_t len)\ -{ \ - ssize_t r = tune_set(sdp, &sdp->sd_tune.gt_##name, 1, buf, len); \ - wake_up_process(sdp->sd_##process); \ - return r; \ -} \ -TUNE_ATTR_2(name, name##_store) - TUNE_ATTR(incore_log_blocks, 0); TUNE_ATTR(log_flush_secs, 0); TUNE_ATTR(quota_warn_period, 0); @@ -550,8 +541,6 @@ TUNE_ATTR(new_files_jdata, 0); TUNE_ATTR(quota_simul_sync, 1); TUNE_ATTR(stall_secs, 1); TUNE_ATTR(statfs_quantum, 1); -TUNE_ATTR_DAEMON(recoverd_secs, recoverd_process); -TUNE_ATTR_DAEMON(logd_secs, logd_process); TUNE_ATTR_3(quota_scale, quota_scale_show, quota_scale_store); static struct attribute *tune_attrs[] = { @@ -565,8 +554,6 @@ static struct attribute *tune_attrs[] = { &tune_attr_quota_simul_sync.attr, &tune_attr_stall_secs.attr, &tune_attr_statfs_quantum.attr, - &tune_attr_recoverd_secs.attr, - &tune_attr_logd_secs.attr, &tune_attr_quota_scale.attr, &tune_attr_new_files_jdata.attr, NULL, From b103387037cea2ba0f04b44d408d54c53f678061 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Wed, 13 May 2009 12:50:40 -0400 Subject: [PATCH 1062/2061] TPM: get_event_name stack corruption get_event_name uses sprintf to fill a buffer declared on the stack. It fills the buffer 2 bytes at a time. What the code doesn't take into account is that sprintf(buf, "%02x", data) actually writes 3 bytes. 2 bytes for the data and then it nul terminates the string. Since we declare buf to be 40 characters long and then we write 40 bytes of data into buf sprintf is going to write 41 characters. The fix is to leave room in buf for the nul terminator. Signed-off-by: Eric Paris Signed-off-by: James Morris --- drivers/char/tpm/tpm_bios.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c index ed306eb1057..0c2f55a38b9 100644 --- a/drivers/char/tpm/tpm_bios.c +++ b/drivers/char/tpm/tpm_bios.c @@ -212,7 +212,8 @@ static int get_event_name(char *dest, struct tcpa_event *event, unsigned char * event_entry) { const char *name = ""; - char data[40] = ""; + /* 41 so there is room for 40 data and 1 nul */ + char data[41] = ""; int i, n_len = 0, d_len = 0; struct tcpa_pc_event *pc_event; From 77f6bf57ba9d2c50173536dbfdacdab27cb867ca Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 14 May 2009 09:49:44 +0200 Subject: [PATCH 1063/2061] splice: fix error return code fs/splice.c: In function 'default_file_splice_read': fs/splice.c:566: warning: 'error' may be used uninitialized in this function which is sort-of true. The code will in fact return -ENOMEM instead of the kernel_readv() return value. Cc: Miklos Szeredi Signed-off-by: Andrew Morton Signed-off-by: Jens Axboe --- fs/splice.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/splice.c b/fs/splice.c index c5e3c79b95a..41179c0a655 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -595,8 +595,10 @@ ssize_t default_file_splice_read(struct file *in, loff_t *ppos, } res = kernel_readv(in, vec, spd.nr_pages, *ppos); - if (res < 0) + if (res < 0) { + error = res; goto err; + } error = 0; if (!res) From 549b5e358d17a8c04953ed80896ce07d37722451 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 14 May 2009 17:38:46 +0900 Subject: [PATCH 1064/2061] sh: clkfwk: Add MSTP bits to SH7785 clock framework. This plugs in all of the MSTP functions in to the clock framework, and hands them off to the platform devices that want them. Signed-off-by: Paul Mundt --- arch/sh/include/asm/clock.h | 3 ++ arch/sh/kernel/cpu/clock.c | 4 +- arch/sh/kernel/cpu/sh4a/clock-sh7785.c | 63 ++++++++++++++++++++++++++ arch/sh/kernel/cpu/sh4a/setup-sh7785.c | 18 +++++--- 4 files changed, 80 insertions(+), 8 deletions(-) diff --git a/arch/sh/include/asm/clock.h b/arch/sh/include/asm/clock.h index c499d470b8c..64c93cb3d68 100644 --- a/arch/sh/include/asm/clock.h +++ b/arch/sh/include/asm/clock.h @@ -36,6 +36,9 @@ struct clk { unsigned long rate; unsigned long flags; + void __iomem *enable_reg; + unsigned int enable_bit; + unsigned long arch_flags; void *priv; struct dentry *dentry; diff --git a/arch/sh/kernel/cpu/clock.c b/arch/sh/kernel/cpu/clock.c index 686477f8ae5..012d23476a7 100644 --- a/arch/sh/kernel/cpu/clock.c +++ b/arch/sh/kernel/cpu/clock.c @@ -39,7 +39,7 @@ static DEFINE_MUTEX(clock_list_sem); /* Used for clocks that always have same value as the parent clock */ unsigned long followparent_recalc(struct clk *clk) { - return clk->parent->rate; + return clk->parent ? clk->parent->rate : 0; } int clk_reparent(struct clk *child, struct clk *parent) @@ -512,7 +512,7 @@ static int clk_debugfs_register_one(struct clk *c) char *p = s; p += sprintf(p, "%s", c->name); - if (c->id > 0) + if (c->id >= 0) sprintf(p, ":%d", c->id); d = debugfs_create_dir(s, pa ? pa->dentry : clk_debugfs_root); if (!d) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c index b7a32dd1b2d..cf042b53b3a 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7785.c @@ -228,12 +228,75 @@ static struct clk *clks[] = { &umem_clk, }; +static int mstpcr_clk_enable(struct clk *clk) +{ + __raw_writel(__raw_readl(clk->enable_reg) & ~(1 << clk->enable_bit), + clk->enable_reg); + return 0; +} + +static void mstpcr_clk_disable(struct clk *clk) +{ + __raw_writel(__raw_readl(clk->enable_reg) | (1 << clk->enable_bit), + clk->enable_reg); +} + +static struct clk_ops mstpcr_clk_ops = { + .enable = mstpcr_clk_enable, + .disable = mstpcr_clk_disable, + .recalc = followparent_recalc, +}; + +#define MSTPCR0 0xffc80030 +#define MSTPCR1 0xffc80034 + +#define CLK(_name, _id, _parent, _enable_reg, \ + _enable_bit, _flags) \ +{ \ + .name = _name, \ + .id = _id, \ + .parent = _parent, \ + .enable_reg = (void __iomem *)_enable_reg, \ + .enable_bit = _enable_bit, \ + .flags = _flags, \ + .ops = &mstpcr_clk_ops, \ +} + +static struct clk mstpcr_clks[] = { + /* MSTPCR0 */ + CLK("scif_fck", 5, &peripheral_clk, MSTPCR0, 29, 0), + CLK("scif_fck", 4, &peripheral_clk, MSTPCR0, 28, 0), + CLK("scif_fck", 3, &peripheral_clk, MSTPCR0, 27, 0), + CLK("scif_fck", 2, &peripheral_clk, MSTPCR0, 26, 0), + CLK("scif_fck", 1, &peripheral_clk, MSTPCR0, 25, 0), + CLK("scif_fck", 0, &peripheral_clk, MSTPCR0, 24, 0), + CLK("ssi_fck", 1, &peripheral_clk, MSTPCR0, 21, 0), + CLK("ssi_fck", 0, &peripheral_clk, MSTPCR0, 20, 0), + CLK("hac_fck", 1, &peripheral_clk, MSTPCR0, 17, 0), + CLK("hac_fck", 0, &peripheral_clk, MSTPCR0, 16, 0), + CLK("mmcif_fck", -1, &peripheral_clk, MSTPCR0, 13, 0), + CLK("flctl_fck", -1, &peripheral_clk, MSTPCR0, 12, 0), + CLK("tmu345_fck", -1, &peripheral_clk, MSTPCR0, 9, 0), + CLK("tmu012_fck", -1, &peripheral_clk, MSTPCR0, 8, 0), + CLK("siof_fck", -1, &peripheral_clk, MSTPCR0, 3, 0), + CLK("hspi_fck", -1, &peripheral_clk, MSTPCR0, 2, 0), + + /* MSTPCR1 */ + CLK("hudi_fck", -1, NULL, MSTPCR1, 19, 0), + CLK("ubc_fck", -1, NULL, MSTPCR1, 17, 0), + CLK("dmac_11_6_fck", -1, NULL, MSTPCR1, 5, 0), + CLK("dmac_5_0_fck", -1, NULL, MSTPCR1, 4, 0), + CLK("gdta_fck", -1, NULL, MSTPCR1, 0, 0), +}; + int __init arch_clk_init(void) { int i, ret = 0; for (i = 0; i < ARRAY_SIZE(clks); i++) ret |= clk_register(clks[i]); + for (i = 0; i < ARRAY_SIZE(mstpcr_clks); i++) + ret |= clk_register(&mstpcr_clks[i]); return ret; } diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c index d7e77bc77e2..af561402570 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7785.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7785.c @@ -20,7 +20,7 @@ static struct sh_timer_config tmu0_platform_data = { .name = "TMU0", .channel_offset = 0x04, .timer_bit = 0, - .clk = "peripheral_clk", + .clk = "tmu012_fck", .clockevent_rating = 200, }; @@ -51,7 +51,7 @@ static struct sh_timer_config tmu1_platform_data = { .name = "TMU1", .channel_offset = 0x10, .timer_bit = 1, - .clk = "peripheral_clk", + .clk = "tmu012_fck", .clocksource_rating = 200, }; @@ -82,7 +82,7 @@ static struct sh_timer_config tmu2_platform_data = { .name = "TMU2", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "peripheral_clk", + .clk = "tmu012_fck", }; static struct resource tmu2_resources[] = { @@ -112,7 +112,7 @@ static struct sh_timer_config tmu3_platform_data = { .name = "TMU3", .channel_offset = 0x04, .timer_bit = 0, - .clk = "peripheral_clk", + .clk = "tmu345_fck", }; static struct resource tmu3_resources[] = { @@ -142,7 +142,7 @@ static struct sh_timer_config tmu4_platform_data = { .name = "TMU4", .channel_offset = 0x10, .timer_bit = 1, - .clk = "peripheral_clk", + .clk = "tmu345_fck", }; static struct resource tmu4_resources[] = { @@ -172,7 +172,7 @@ static struct sh_timer_config tmu5_platform_data = { .name = "TMU5", .channel_offset = 0x1c, .timer_bit = 2, - .clk = "peripheral_clk", + .clk = "tmu345_fck", }; static struct resource tmu5_resources[] = { @@ -204,31 +204,37 @@ static struct plat_sci_port sci_platform_data[] = { .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 40, 40, 40, 40 }, + .clk = "scif_fck", }, { .mapbase = 0xffeb0000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 44, 44, 44, 44 }, + .clk = "scif_fck", }, { .mapbase = 0xffec0000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 60, 60, 60, 60 }, + .clk = "scif_fck", }, { .mapbase = 0xffed0000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 61, 61, 61, 61 }, + .clk = "scif_fck", }, { .mapbase = 0xffee0000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 62, 62, 62, 62 }, + .clk = "scif_fck", }, { .mapbase = 0xffef0000, .flags = UPF_BOOT_AUTOCONF, .type = PORT_SCIF, .irqs = { 63, 63, 63, 63 }, + .clk = "scif_fck", }, { .flags = 0, } From ad3256e361663923342bff7f292dd289f794aa33 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Thu, 14 May 2009 17:40:08 +0900 Subject: [PATCH 1065/2061] sh: Provide FORCE_MAX_ZONEORDER. Several platforms want to be able to do large physically contiguous allocations (primarily nommu and video codecs on SH-Mobile), provide a MAX_ORDER override for those cases. Tested-by: Conrad Parker Signed-off-by: Paul Mundt --- arch/sh/mm/Kconfig | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/arch/sh/mm/Kconfig b/arch/sh/mm/Kconfig index b900d2cd18f..2795618e4f0 100644 --- a/arch/sh/mm/Kconfig +++ b/arch/sh/mm/Kconfig @@ -21,6 +21,29 @@ config PAGE_OFFSET default "0x20000000" if MMU && SUPERH64 default "0x00000000" +config FORCE_MAX_ZONEORDER + int "Maximum zone order" + range 9 64 if PAGE_SIZE_16KB + default "9" if PAGE_SIZE_16KB + range 7 64 if PAGE_SIZE_64KB + default "7" if PAGE_SIZE_64KB + range 11 64 + default "14" if !MMU + default "11" + help + The kernel memory allocator divides physically contiguous memory + blocks into "zones", where each zone is a power of two number of + pages. This option selects the largest power of two that the kernel + keeps in the memory allocator. If you need to allocate very large + blocks of physically contiguous memory, then you may need to + increase this value. + + This config option is actually maximum order plus one. For example, + a value of 11 means that the largest free memory block is 2^10 pages. + + The page size is not necessarily 4KB. Keep this in mind when + choosing a value for this option. + config MEMORY_START hex "Physical memory start address" default "0x08000000" From 9304d0ccf1922b9b954b6e9223d1b2bc83198ad2 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 14 May 2009 07:53:39 +0000 Subject: [PATCH 1066/2061] sh: intc tables for sh7770 This patch adds INTC tables for sh7770, thanks goes to Paul for the first prototype version. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/setup-sh7770.c | 246 +++++++++++++++++++++++++ 1 file changed, 246 insertions(+) diff --git a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c index b61d6143aaa..0feba41d218 100644 --- a/arch/sh/kernel/cpu/sh4a/setup-sh7770.c +++ b/arch/sh/kernel/cpu/sh4a/setup-sh7770.c @@ -12,6 +12,7 @@ #include #include #include +#include static struct plat_sci_port sci_platform_data[] = { { @@ -387,6 +388,251 @@ void __init plat_early_device_setup(void) ARRAY_SIZE(sh7770_early_devices)); } +enum { + UNUSED = 0, + + /* interrupt sources */ + IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH, + IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH, + IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH, + IRL_HHLL, IRL_HHLH, IRL_HHHL, + + IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, + + GPIO, + TMU0, TMU1, TMU2, TMU2_TICPI, + TMU3, TMU4, TMU5, TMU5_TICPI, + TMU6, TMU7, TMU8, + HAC, IPI, SPDIF, HUDI, I2C, + DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2, + I2S0, I2S1, I2S2, I2S3, + SRC_RX, SRC_TX, SRC_SPDIF, + DU, VIDEO_IN, REMOTE, YUV, USB, ATAPI, CAN, GPS, GFX2D, + GFX3D_MBX, GFX3D_DMAC, + EXBUS_ATA, + SPI0, SPI1, + SCIF089, SCIF1234, SCIF567, + ADC, + BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14, + BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27, + BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31, + + /* interrupt groups */ + TMU, DMAC, I2S, SRC, GFX3D, SPI, SCIF, BBDMAC, +}; + +static struct intc_vect vectors[] __initdata = { + INTC_VECT(GPIO, 0x3e0), + INTC_VECT(TMU0, 0x400), INTC_VECT(TMU1, 0x420), + INTC_VECT(TMU2, 0x440), INTC_VECT(TMU2_TICPI, 0x460), + INTC_VECT(TMU3, 0x480), INTC_VECT(TMU4, 0x4a0), + INTC_VECT(TMU5, 0x4c0), INTC_VECT(TMU5_TICPI, 0x4e0), + INTC_VECT(TMU6, 0x500), INTC_VECT(TMU7, 0x520), + INTC_VECT(TMU8, 0x540), + INTC_VECT(HAC, 0x580), INTC_VECT(IPI, 0x5c0), + INTC_VECT(SPDIF, 0x5e0), + INTC_VECT(HUDI, 0x600), INTC_VECT(I2C, 0x620), + INTC_VECT(DMAC0_DMINT0, 0x640), INTC_VECT(DMAC0_DMINT1, 0x660), + INTC_VECT(DMAC0_DMINT2, 0x680), + INTC_VECT(I2S0, 0x6a0), INTC_VECT(I2S1, 0x6c0), + INTC_VECT(I2S2, 0x6e0), INTC_VECT(I2S3, 0x700), + INTC_VECT(SRC_RX, 0x720), INTC_VECT(SRC_TX, 0x740), + INTC_VECT(SRC_SPDIF, 0x760), + INTC_VECT(DU, 0x780), INTC_VECT(VIDEO_IN, 0x7a0), + INTC_VECT(REMOTE, 0x7c0), INTC_VECT(YUV, 0x7e0), + INTC_VECT(USB, 0x840), INTC_VECT(ATAPI, 0x860), + INTC_VECT(CAN, 0x880), INTC_VECT(GPS, 0x8a0), + INTC_VECT(GFX2D, 0x8c0), + INTC_VECT(GFX3D_MBX, 0x900), INTC_VECT(GFX3D_DMAC, 0x920), + INTC_VECT(EXBUS_ATA, 0x940), + INTC_VECT(SPI0, 0x960), INTC_VECT(SPI1, 0x980), + INTC_VECT(SCIF089, 0x9a0), INTC_VECT(SCIF1234, 0x9c0), + INTC_VECT(SCIF1234, 0x9e0), INTC_VECT(SCIF1234, 0xa00), + INTC_VECT(SCIF1234, 0xa20), INTC_VECT(SCIF567, 0xa40), + INTC_VECT(SCIF567, 0xa60), INTC_VECT(SCIF567, 0xa80), + INTC_VECT(SCIF089, 0xaa0), INTC_VECT(SCIF089, 0xac0), + INTC_VECT(ADC, 0xb20), + INTC_VECT(BBDMAC_0_3, 0xba0), INTC_VECT(BBDMAC_0_3, 0xbc0), + INTC_VECT(BBDMAC_0_3, 0xbe0), INTC_VECT(BBDMAC_0_3, 0xc00), + INTC_VECT(BBDMAC_4_7, 0xc20), INTC_VECT(BBDMAC_4_7, 0xc40), + INTC_VECT(BBDMAC_4_7, 0xc60), INTC_VECT(BBDMAC_4_7, 0xc80), + INTC_VECT(BBDMAC_8_10, 0xca0), INTC_VECT(BBDMAC_8_10, 0xcc0), + INTC_VECT(BBDMAC_8_10, 0xce0), INTC_VECT(BBDMAC_11_14, 0xd00), + INTC_VECT(BBDMAC_11_14, 0xd20), INTC_VECT(BBDMAC_11_14, 0xd40), + INTC_VECT(BBDMAC_11_14, 0xd60), INTC_VECT(BBDMAC_15_18, 0xd80), + INTC_VECT(BBDMAC_15_18, 0xda0), INTC_VECT(BBDMAC_15_18, 0xdc0), + INTC_VECT(BBDMAC_15_18, 0xde0), INTC_VECT(BBDMAC_19_22, 0xe00), + INTC_VECT(BBDMAC_19_22, 0xe20), INTC_VECT(BBDMAC_19_22, 0xe40), + INTC_VECT(BBDMAC_19_22, 0xe60), INTC_VECT(BBDMAC_23_26, 0xe80), + INTC_VECT(BBDMAC_23_26, 0xea0), INTC_VECT(BBDMAC_23_26, 0xec0), + INTC_VECT(BBDMAC_23_26, 0xee0), INTC_VECT(BBDMAC_27, 0xf00), + INTC_VECT(BBDMAC_28, 0xf20), INTC_VECT(BBDMAC_29, 0xf40), + INTC_VECT(BBDMAC_30, 0xf60), INTC_VECT(BBDMAC_31, 0xf80), +}; + +static struct intc_group groups[] __initdata = { + INTC_GROUP(TMU, TMU0, TMU1, TMU2, TMU2_TICPI, TMU3, TMU4, TMU5, + TMU5_TICPI, TMU6, TMU7, TMU8), + INTC_GROUP(DMAC, DMAC0_DMINT0, DMAC0_DMINT1, DMAC0_DMINT2), + INTC_GROUP(I2S, I2S0, I2S1, I2S2, I2S3), + INTC_GROUP(SRC, SRC_RX, SRC_TX, SRC_SPDIF), + INTC_GROUP(GFX3D, GFX3D_MBX, GFX3D_DMAC), + INTC_GROUP(SPI, SPI0, SPI1), + INTC_GROUP(SCIF, SCIF089, SCIF1234, SCIF567), + INTC_GROUP(BBDMAC, + BBDMAC_0_3, BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14, + BBDMAC_15_18, BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27, + BBDMAC_28, BBDMAC_29, BBDMAC_30, BBDMAC_31), +}; + +static struct intc_mask_reg mask_registers[] __initdata = { + { 0xffe00040, 0xffe00044, 32, /* INT2MSKR / INT2MSKCR */ + { 0, BBDMAC, ADC, SCIF, SPI, EXBUS_ATA, GFX3D, GFX2D, + GPS, CAN, ATAPI, USB, YUV, REMOTE, VIDEO_IN, DU, SRC, I2S, + DMAC, I2C, HUDI, SPDIF, IPI, HAC, TMU, GPIO } }, +}; + +static struct intc_prio_reg prio_registers[] __initdata = { + { 0xffe00000, 0, 32, 8, /* INT2PRI0 */ { GPIO, TMU0, 0, HAC } }, + { 0xffe00004, 0, 32, 8, /* INT2PRI1 */ { IPI, SPDIF, HUDI, I2C } }, + { 0xffe00008, 0, 32, 8, /* INT2PRI2 */ { DMAC, I2S, SRC, DU } }, + { 0xffe0000c, 0, 32, 8, /* INT2PRI3 */ { VIDEO_IN, REMOTE, YUV, USB } }, + { 0xffe00010, 0, 32, 8, /* INT2PRI4 */ { ATAPI, CAN, GPS, GFX2D } }, + { 0xffe00014, 0, 32, 8, /* INT2PRI5 */ { 0, GFX3D, EXBUS_ATA, SPI } }, + { 0xffe00018, 0, 32, 8, /* INT2PRI6 */ { SCIF1234, SCIF567, SCIF089 } }, + { 0xffe0001c, 0, 32, 8, /* INT2PRI7 */ { ADC, 0, 0, BBDMAC_0_3 } }, + { 0xffe00020, 0, 32, 8, /* INT2PRI8 */ + { BBDMAC_4_7, BBDMAC_8_10, BBDMAC_11_14, BBDMAC_15_18 } }, + { 0xffe00024, 0, 32, 8, /* INT2PRI9 */ + { BBDMAC_19_22, BBDMAC_23_26, BBDMAC_27, BBDMAC_28 } }, + { 0xffe00028, 0, 32, 8, /* INT2PRI10 */ + { BBDMAC_29, BBDMAC_30, BBDMAC_31 } }, + { 0xffe0002c, 0, 32, 8, /* INT2PRI11 */ + { TMU1, TMU2, TMU2_TICPI, TMU3 } }, + { 0xffe00030, 0, 32, 8, /* INT2PRI12 */ + { TMU4, TMU5, TMU5_TICPI, TMU6 } }, + { 0xffe00034, 0, 32, 8, /* INT2PRI13 */ + { TMU7, TMU8 } }, +}; + +static DECLARE_INTC_DESC(intc_desc, "sh7770", vectors, groups, + mask_registers, prio_registers, NULL); + +/* Support for external interrupt pins in IRQ mode */ +static struct intc_vect irq_vectors[] __initdata = { + INTC_VECT(IRQ0, 0x240), INTC_VECT(IRQ1, 0x280), + INTC_VECT(IRQ2, 0x2c0), INTC_VECT(IRQ3, 0x300), + INTC_VECT(IRQ4, 0x340), INTC_VECT(IRQ5, 0x380), +}; + +static struct intc_mask_reg irq_mask_registers[] __initdata = { + { 0xffd00044, 0xffd00064, 32, /* INTMSK0 / INTMSKCLR0 */ + { IRQ0, IRQ1, IRQ2, IRQ3, IRQ4, IRQ5, } }, +}; + +static struct intc_prio_reg irq_prio_registers[] __initdata = { + { 0xffd00010, 0, 32, 4, /* INTPRI */ { IRQ0, IRQ1, IRQ2, IRQ3, + IRQ4, IRQ5, } }, +}; + +static struct intc_sense_reg irq_sense_registers[] __initdata = { + { 0xffd0001c, 32, 2, /* ICR1 */ { IRQ0, IRQ1, IRQ2, IRQ3, + IRQ4, IRQ5, } }, +}; + +static DECLARE_INTC_DESC(intc_irq_desc, "sh7770-irq", irq_vectors, + NULL, irq_mask_registers, irq_prio_registers, + irq_sense_registers); + +/* External interrupt pins in IRL mode */ +static struct intc_vect irl_vectors[] __initdata = { + INTC_VECT(IRL_LLLL, 0x200), INTC_VECT(IRL_LLLH, 0x220), + INTC_VECT(IRL_LLHL, 0x240), INTC_VECT(IRL_LLHH, 0x260), + INTC_VECT(IRL_LHLL, 0x280), INTC_VECT(IRL_LHLH, 0x2a0), + INTC_VECT(IRL_LHHL, 0x2c0), INTC_VECT(IRL_LHHH, 0x2e0), + INTC_VECT(IRL_HLLL, 0x300), INTC_VECT(IRL_HLLH, 0x320), + INTC_VECT(IRL_HLHL, 0x340), INTC_VECT(IRL_HLHH, 0x360), + INTC_VECT(IRL_HHLL, 0x380), INTC_VECT(IRL_HHLH, 0x3a0), + INTC_VECT(IRL_HHHL, 0x3c0), +}; + +static struct intc_mask_reg irl3210_mask_registers[] __initdata = { + { 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */ + { IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH, + IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH, + IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH, + IRL_HHLL, IRL_HHLH, IRL_HHHL, } }, +}; + +static struct intc_mask_reg irl7654_mask_registers[] __initdata = { + { 0xffd40080, 0xffd40084, 32, /* INTMSK2 / INTMSKCLR2 */ + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + IRL_LLLL, IRL_LLLH, IRL_LLHL, IRL_LLHH, + IRL_LHLL, IRL_LHLH, IRL_LHHL, IRL_LHHH, + IRL_HLLL, IRL_HLLH, IRL_HLHL, IRL_HLHH, + IRL_HHLL, IRL_HHLH, IRL_HHHL, } }, +}; + +static DECLARE_INTC_DESC(intc_irl7654_desc, "sh7780-irl7654", irl_vectors, + NULL, irl7654_mask_registers, NULL, NULL); + +static DECLARE_INTC_DESC(intc_irl3210_desc, "sh7780-irl3210", irl_vectors, + NULL, irl3210_mask_registers, NULL, NULL); + +#define INTC_ICR0 0xffd00000 +#define INTC_INTMSK0 0xffd00044 +#define INTC_INTMSK1 0xffd00048 +#define INTC_INTMSK2 0xffd40080 +#define INTC_INTMSKCLR1 0xffd00068 +#define INTC_INTMSKCLR2 0xffd40084 + void __init plat_irq_setup(void) { + /* disable IRQ7-0 */ + ctrl_outl(0xff000000, INTC_INTMSK0); + + /* disable IRL3-0 + IRL7-4 */ + ctrl_outl(0xc0000000, INTC_INTMSK1); + ctrl_outl(0xfffefffe, INTC_INTMSK2); + + /* select IRL mode for IRL3-0 + IRL7-4 */ + ctrl_outl(ctrl_inl(INTC_ICR0) & ~0x00c00000, INTC_ICR0); + + /* disable holding function, ie enable "SH-4 Mode" */ + ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00200000, INTC_ICR0); + + register_intc_controller(&intc_desc); +} + +void __init plat_irq_setup_pins(int mode) +{ + switch (mode) { + case IRQ_MODE_IRQ: + /* select IRQ mode for IRL3-0 + IRL7-4 */ + ctrl_outl(ctrl_inl(INTC_ICR0) | 0x00c00000, INTC_ICR0); + register_intc_controller(&intc_irq_desc); + break; + case IRQ_MODE_IRL7654: + /* enable IRL7-4 but don't provide any masking */ + ctrl_outl(0x40000000, INTC_INTMSKCLR1); + ctrl_outl(0x0000fffe, INTC_INTMSKCLR2); + break; + case IRQ_MODE_IRL3210: + /* enable IRL0-3 but don't provide any masking */ + ctrl_outl(0x80000000, INTC_INTMSKCLR1); + ctrl_outl(0xfffe0000, INTC_INTMSKCLR2); + break; + case IRQ_MODE_IRL7654_MASK: + /* enable IRL7-4 and mask using cpu intc controller */ + ctrl_outl(0x40000000, INTC_INTMSKCLR1); + register_intc_controller(&intc_irl7654_desc); + break; + case IRQ_MODE_IRL3210_MASK: + /* enable IRL0-3 and mask using cpu intc controller */ + ctrl_outl(0x80000000, INTC_INTMSKCLR1); + register_intc_controller(&intc_irl3210_desc); + break; + default: + BUG(); + } } From 2fa3cdfb319055fd8b25abdafa413e16f00ad493 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 09:29:45 -0400 Subject: [PATCH 1067/2061] ext4: Merge ext4_da_get_block_write() into mpage_da_map_blocks() The static function ext4_da_get_block_write() was only used by mpage_da_map_blocks(). So to simplify the code, merge that function into mpage_da_map_blocks(). Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 110 +++++++++++++++++++----------------------------- 1 file changed, 43 insertions(+), 67 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index e6113c3a126..bfe50a22363 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2005,57 +2005,6 @@ static void ext4_print_free_blocks(struct inode *inode) return; } -/* - * This function is used by mpage_da_map_blocks(). We separate it out - * as a separate function just to make life easier, and because - * mpage_da_map_blocks() used to be a generic function that took a - * get_block_t. - */ -static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result) -{ - int ret; - unsigned max_blocks = bh_result->b_size >> inode->i_blkbits; - loff_t disksize = EXT4_I(inode)->i_disksize; - handle_t *handle = NULL; - - handle = ext4_journal_current_handle(); - BUG_ON(!handle); - ret = ext4_get_blocks(handle, inode, iblock, max_blocks, - bh_result, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); - if (ret <= 0) - return ret; - - bh_result->b_size = (ret << inode->i_blkbits); - - if (ext4_should_order_data(inode)) { - int retval; - retval = ext4_jbd2_file_inode(handle, inode); - if (retval) - /* - * Failed to add inode for ordered mode. Don't - * update file size - */ - return retval; - } - - /* - * Update on-disk size along with block allocation we don't - * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change - * within already allocated block -bzzz - */ - disksize = ((loff_t) iblock + ret) << inode->i_blkbits; - if (disksize > i_size_read(inode)) - disksize = i_size_read(inode); - if (disksize > EXT4_I(inode)->i_disksize) { - ext4_update_i_disksize(inode, disksize); - ret = ext4_mark_inode_dirty(handle, inode); - return ret; - } - return 0; -} - /* * mpage_da_map_blocks - go through given space * @@ -2066,9 +2015,12 @@ static int ext4_da_get_block_write(struct inode *inode, sector_t iblock, */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err = 0; + int err, blks; struct buffer_head new; - sector_t next; + sector_t next = mpd->b_blocknr; + unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; + loff_t disksize = EXT4_I(mpd->inode)->i_disksize; + handle_t *handle = NULL; /* * We consider only non-mapped and non-allocated blocks @@ -2077,6 +2029,16 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) !(mpd->b_state & (1 << BH_Delay)) && !(mpd->b_state & (1 << BH_Unwritten))) return 0; + + /* + * If we didn't accumulate anything to write simply return + */ + if (!mpd->b_size) + return 0; + + handle = ext4_journal_current_handle(); + BUG_ON(!handle); + /* * We need to make sure the BH_Delay flag is passed down to * ext4_da_get_block_write(), since it calls ext4_get_blocks() @@ -2092,18 +2054,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. */ new.b_state = mpd->b_state & (1 << BH_Delay); - new.b_blocknr = 0; - new.b_size = mpd->b_size; - next = mpd->b_blocknr; - /* - * If we didn't accumulate anything - * to write simply return - */ - if (!new.b_size) - return 0; - - err = ext4_da_get_block_write(mpd->inode, next, &new); - if (err) { + blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, + &new, EXT4_GET_BLOCKS_CREATE| + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (blks < 0) { + err = blks; /* * If get block returns with error we simply * return. Later writepage will redirty the page and @@ -2136,12 +2091,14 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) if (err == -ENOSPC) { ext4_print_free_blocks(mpd->inode); } - /* invlaidate all the pages */ + /* invalidate all the pages */ ext4_da_block_invalidatepages(mpd, next, mpd->b_size >> mpd->inode->i_blkbits); return err; } - BUG_ON(new.b_size == 0); + BUG_ON(blks == 0); + + new.b_size = (blks << mpd->inode->i_blkbits); if (buffer_new(&new)) __unmap_underlying_blocks(mpd->inode, &new); @@ -2154,6 +2111,25 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) (mpd->b_state & (1 << BH_Unwritten))) mpage_put_bnr_to_bhs(mpd, next, &new); + if (ext4_should_order_data(mpd->inode)) { + err = ext4_jbd2_file_inode(handle, mpd->inode); + if (err) + return err; + } + + /* + * Update on-disk size along with block allocation we don't + * use EXT4_GET_BLOCKS_EXTEND_DISKSIZE as size may change + * within already allocated block -bzzz + */ + disksize = ((loff_t) next + blks) << mpd->inode->i_blkbits; + if (disksize > i_size_read(mpd->inode)) + disksize = i_size_read(mpd->inode); + if (disksize > EXT4_I(mpd->inode)->i_disksize) { + ext4_update_i_disksize(mpd->inode, disksize); + return ext4_mark_inode_dirty(handle, mpd->inode); + } + return 0; } From 2ac3b6e00acb46406c993d57921f86a594aafe08 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 14 May 2009 13:57:08 -0400 Subject: [PATCH 1068/2061] ext4: Clean up ext4_get_blocks() so it does not depend on bh_result->b_state The ext4_get_blocks() function was depending on the value of bh_result->b_state as an input parameter to decide whether or not update the delalloc accounting statistics by calling ext4_da_update_reserve_space(). We now use a separate flag, EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE, to requests this update, so that all callers of ext4_get_blocks() can clear map_bh.b_state before calling ext4_get_blocks() without worrying about any consistency issues. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 15 ++++++++----- fs/ext4/inode.c | 56 +++++++++++++++++++++++++++---------------------- 2 files changed, 41 insertions(+), 30 deletions(-) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 17feb4ac633..d164f1294e5 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -318,16 +318,21 @@ struct ext4_new_group_data { */ /* Allocate any needed blocks and/or convert an unitialized extent to be an initialized ext4 */ -#define EXT4_GET_BLOCKS_CREATE 1 +#define EXT4_GET_BLOCKS_CREATE 0x0001 /* Request the creation of an unitialized extent */ -#define EXT4_GET_BLOCKS_UNINIT_EXT 2 +#define EXT4_GET_BLOCKS_UNINIT_EXT 0x0002 #define EXT4_GET_BLOCKS_CREATE_UNINIT_EXT (EXT4_GET_BLOCKS_UNINIT_EXT|\ EXT4_GET_BLOCKS_CREATE) /* Update the ext4_inode_info i_disksize field */ -#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 4 +#define EXT4_GET_BLOCKS_EXTEND_DISKSIZE 0x0004 /* Caller is from the delayed allocation writeout path, - so the filesystem blocks have already been accounted for */ -#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 8 + so set the magic i_delalloc_reserve_flag after taking the + inode allocation semaphore for */ +#define EXT4_GET_BLOCKS_DELALLOC_RESERVE 0x0008 + /* Call ext4_da_update_reserve_space() after successfully + allocating the blocks */ +#define EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE 0x0010 + /* * ioctl commands diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index bfe50a22363..d7b7480682b 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1234,16 +1234,15 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) { + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) EXT4_I(inode)->i_delalloc_reserved_flag = 0; - /* - * Update reserved blocks/metadata blocks - * after successful block allocation - * which were deferred till now - */ - if ((retval > 0) && buffer_delay(bh)) - ext4_da_update_reserve_space(inode, retval); - } + + /* + * Update reserved blocks/metadata blocks after successful + * block allocation which had been deferred till now. + */ + if ((retval > 0) && (flags & EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE)) + ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); return retval; @@ -2015,7 +2014,7 @@ static void ext4_print_free_blocks(struct inode *inode) */ static int mpage_da_map_blocks(struct mpage_da_data *mpd) { - int err, blks; + int err, blks, get_blocks_flags; struct buffer_head new; sector_t next = mpd->b_blocknr; unsigned max_blocks = mpd->b_size >> mpd->inode->i_blkbits; @@ -2040,23 +2039,30 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd) BUG_ON(!handle); /* - * We need to make sure the BH_Delay flag is passed down to - * ext4_da_get_block_write(), since it calls ext4_get_blocks() - * with the EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. This flag - * causes ext4_get_blocks() to call - * ext4_da_update_reserve_space() if the passed buffer head - * has the BH_Delay flag set. In the future, once we clean up - * the interfaces to ext4_get_blocks(), we should pass in a - * separate flag which requests that the delayed allocation - * statistics should be updated, instead of depending on the - * state information getting passed down via the map_bh's - * state bitmasks plus the magic - * EXT4_GET_BLOCKS_DELALLOC_RESERVE flag. + * Call ext4_get_blocks() to allocate any delayed allocation + * blocks, or to convert an uninitialized extent to be + * initialized (in the case where we have written into + * one or more preallocated blocks). + * + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE to + * indicate that we are on the delayed allocation path. This + * affects functions in many different parts of the allocation + * call path. This flag exists primarily because we don't + * want to change *many* call functions, so ext4_get_blocks() + * will set the magic i_delalloc_reserved_flag once the + * inode's allocation semaphore is taken. + * + * If the blocks in questions were delalloc blocks, set + * EXT4_GET_BLOCKS_DELALLOC_RESERVE so the delalloc accounting + * variables are updated after the blocks have been allocated. */ - new.b_state = mpd->b_state & (1 << BH_Delay); + new.b_state = 0; + get_blocks_flags = (EXT4_GET_BLOCKS_CREATE | + EXT4_GET_BLOCKS_DELALLOC_RESERVE); + if (mpd->b_state & (1 << BH_Delay)) + get_blocks_flags |= EXT4_GET_BLOCKS_UPDATE_RESERVE_SPACE; blks = ext4_get_blocks(handle, mpd->inode, next, max_blocks, - &new, EXT4_GET_BLOCKS_CREATE| - EXT4_GET_BLOCKS_DELALLOC_RESERVE); + &new, get_blocks_flags); if (blks < 0) { err = blks; /* From f850a7c040d9faafb41bceb0a05d6bb7432c8c7a Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 12 May 2009 15:13:55 -0400 Subject: [PATCH 1069/2061] IMA: remove read permissions on the ima policy file The IMA policy file does not implement read. Trying to just open/read/close the file will load a blank policy and you cannot then change the policy without a reboot. This removes the read permission from the file so one must at least be attempting to write... Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_fs.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index ffbe259700b..3305a961586 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -15,6 +15,7 @@ * implemenents security file system for reporting * current measurement list and IMA statistics */ +#include #include #include #include @@ -283,6 +284,9 @@ static atomic_t policy_opencount = ATOMIC_INIT(1); */ int ima_open_policy(struct inode * inode, struct file * filp) { + /* No point in being allowed to open it if you aren't going to write */ + if (!(filp->f_flags & O_WRONLY)) + return -EACCES; if (atomic_dec_and_test(&policy_opencount)) return 0; return -EBUSY; @@ -349,7 +353,7 @@ int ima_fs_init(void) goto out; ima_policy = securityfs_create_file("policy", - S_IRUSR | S_IRGRP | S_IWUSR, + S_IWUSR, ima_dir, NULL, &ima_measure_policy_ops); if (IS_ERR(ima_policy)) From c3d20103d08e5c0b6738fbd0acf3ca004e5356c5 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Tue, 12 May 2009 15:14:23 -0400 Subject: [PATCH 1070/2061] IMA: do not measure everything opened by root by default The IMA default policy measures every single file opened by root. This is terrible for most users. Consider a system (like mine) with virtual machine images. When those images are touched (which happens at boot for me) those images are measured. This is just way too much for the default case. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_policy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index b168c1d595c..dec6dcb1c8d 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -61,8 +61,6 @@ static struct ima_measure_rule_entry default_rules[] = { .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, - {.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0, - .flags = IMA_FUNC | IMA_MASK | IMA_UID} }; static LIST_HEAD(measure_default_rules); From bec36eca6f5d1d83a9c3733fc40ba173ad849df2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 15 May 2009 12:03:04 +0900 Subject: [PATCH 1071/2061] sh: hd64461: Fix up I/O base register offsets. hd64461 is mapped in a fixed location, so the I/O base itself is fairly meaningless as a configuration item. Additionally, this makes it impossible to share hd64461 code alongside generic drivers (in the case of sh_dac_audio), so simply make it commonly defined and permit the mach_is_foo() logic to work out the proper semantics. Signed-off-by: Paul Mundt --- arch/sh/cchips/Kconfig | 5 -- arch/sh/include/asm/hd64461.h | 146 +++++++++++++++++----------------- 2 files changed, 74 insertions(+), 77 deletions(-) diff --git a/arch/sh/cchips/Kconfig b/arch/sh/cchips/Kconfig index f43d18373f2..a5ab2eccdaa 100644 --- a/arch/sh/cchips/Kconfig +++ b/arch/sh/cchips/Kconfig @@ -34,11 +34,6 @@ config HD64461_IRQ Do not change this unless you know what you are doing. -config HD64461_IOBASE - hex "HD64461 start address" - depends on HD64461 - default "0xb0000000" - config HD64461_ENABLER bool "HD64461 PCMCIA enabler" depends on HD64461 diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h index 52b4b623827..c16d54563c9 100644 --- a/arch/sh/include/asm/hd64461.h +++ b/arch/sh/include/asm/hd64461.h @@ -13,13 +13,15 @@ #define HD64461_PCC_WINDOW 0x01000000 /* Area 6 - Slot 0 - memory and/or IO card */ -#define HD64461_PCC0_BASE (CONFIG_HD64461_IOBASE + 0x8000000) +#define HD64461_IOBASE 0xb0000000 +#define HD64461_IO_OFFSET(x) (HD64461_IOBASE + (x)) +#define HD64461_PCC0_BASE HD64461_IO_OFFSET(0x8000000) #define HD64461_PCC0_ATTR (HD64461_PCC0_BASE) /* 0xb80000000 */ #define HD64461_PCC0_COMM (HD64461_PCC0_BASE+HD64461_PCC_WINDOW) /* 0xb90000000 */ #define HD64461_PCC0_IO (HD64461_PCC0_BASE+2*HD64461_PCC_WINDOW) /* 0xba0000000 */ /* Area 5 - Slot 1 - memory card only */ -#define HD64461_PCC1_BASE (CONFIG_HD64461_IOBASE + 0x4000000) +#define HD64461_PCC1_BASE HD64461_IO_OFFSET(0x4000000) #define HD64461_PCC1_ATTR (HD64461_PCC1_BASE) /* 0xb4000000 */ #define HD64461_PCC1_COMM (HD64461_PCC1_BASE+HD64461_PCC_WINDOW) /* 0xb5000000 */ @@ -41,19 +43,19 @@ #define HD64461_STBCR_SURTST 0x0001 /* System Configuration Register */ -#define HD64461_SYSCR (CONFIG_HD64461_IOBASE + 0x02) +#define HD64461_SYSCR HD64461_IO_OFFSET(0x02) /* CPU Data Bus Control Register */ -#define HD64461_SCPUCR (CONFIG_HD64461_IOBASE + 0x04) +#define HD64461_SCPUCR HD64461_IO_OFFSET(0x04) /* Base Address Register */ -#define HD64461_LCDCBAR (CONFIG_HD64461_IOBASE + 0x1000) +#define HD64461_LCDCBAR HD64461_IO_OFFSET(0x1000) /* Line increment address */ -#define HD64461_LCDCLOR (CONFIG_HD64461_IOBASE + 0x1002) +#define HD64461_LCDCLOR HD64461_IO_OFFSET(0x1002) /* Controls LCD controller */ -#define HD64461_LCDCCR (CONFIG_HD64461_IOBASE + 0x1004) +#define HD64461_LCDCCR HD64461_IO_OFFSET(0x1004) /* LCCDR control bits */ #define HD64461_LCDCCR_STBACK 0x0400 /* Standby Back */ @@ -64,30 +66,30 @@ #define HD64461_LCDCCR_SPON 0x0010 /* Start Power On */ /* Controls LCD (1) */ -#define HD64461_LDR1 (CONFIG_HD64461_IOBASE + 0x1010) +#define HD64461_LDR1 HD64461_IO_OFFSET(0x1010) #define HD64461_LDR1_DON 0x01 /* Display On */ #define HD64461_LDR1_DINV 0x80 /* Display Invert */ /* Controls LCD (2) */ -#define HD64461_LDR2 (CONFIG_HD64461_IOBASE + 0x1012) -#define HD64461_LDHNCR (CONFIG_HD64461_IOBASE + 0x1014) /* Number of horizontal characters */ -#define HD64461_LDHNSR (CONFIG_HD64461_IOBASE + 0x1016) /* Specify output start position + width of CL1 */ -#define HD64461_LDVNTR (CONFIG_HD64461_IOBASE + 0x1018) /* Specify total vertical lines */ -#define HD64461_LDVNDR (CONFIG_HD64461_IOBASE + 0x101a) /* specify number of display vertical lines */ -#define HD64461_LDVSPR (CONFIG_HD64461_IOBASE + 0x101c) /* specify vertical synchronization pos and AC nr */ +#define HD64461_LDR2 HD64461_IO_OFFSET(0x1012) +#define HD64461_LDHNCR HD64461_IO_OFFSET(0x1014) /* Number of horizontal characters */ +#define HD64461_LDHNSR HD64461_IO_OFFSET(0x1016) /* Specify output start position + width of CL1 */ +#define HD64461_LDVNTR HD64461_IO_OFFSET(0x1018) /* Specify total vertical lines */ +#define HD64461_LDVNDR HD64461_IO_OFFSET(0x101a) /* specify number of display vertical lines */ +#define HD64461_LDVSPR HD64461_IO_OFFSET(0x101c) /* specify vertical synchronization pos and AC nr */ /* Controls LCD (3) */ -#define HD64461_LDR3 (CONFIG_HD64461_IOBASE + 0x101e) +#define HD64461_LDR3 HD64461_IO_OFFSET(0x101e) /* Palette Registers */ -#define HD64461_CPTWAR (CONFIG_HD64461_IOBASE + 0x1030) /* Color Palette Write Address Register */ -#define HD64461_CPTWDR (CONFIG_HD64461_IOBASE + 0x1032) /* Color Palette Write Data Register */ -#define HD64461_CPTRAR (CONFIG_HD64461_IOBASE + 0x1034) /* Color Palette Read Address Register */ -#define HD64461_CPTRDR (CONFIG_HD64461_IOBASE + 0x1036) /* Color Palette Read Data Register */ +#define HD64461_CPTWAR HD64461_IO_OFFSET(0x1030) /* Color Palette Write Address Register */ +#define HD64461_CPTWDR HD64461_IO_OFFSET(0x1032) /* Color Palette Write Data Register */ +#define HD64461_CPTRAR HD64461_IO_OFFSET(0x1034) /* Color Palette Read Address Register */ +#define HD64461_CPTRDR HD64461_IO_OFFSET(0x1036) /* Color Palette Read Data Register */ -#define HD64461_GRDOR (CONFIG_HD64461_IOBASE + 0x1040) /* Display Resolution Offset Register */ -#define HD64461_GRSCR (CONFIG_HD64461_IOBASE + 0x1042) /* Solid Color Register */ -#define HD64461_GRCFGR (CONFIG_HD64461_IOBASE + 0x1044) /* Accelerator Configuration Register */ +#define HD64461_GRDOR HD64461_IO_OFFSET(0x1040) /* Display Resolution Offset Register */ +#define HD64461_GRSCR HD64461_IO_OFFSET(0x1042) /* Solid Color Register */ +#define HD64461_GRCFGR HD64461_IO_OFFSET(0x1044) /* Accelerator Configuration Register */ #define HD64461_GRCFGR_ACCSTATUS 0x10 /* Accelerator Status */ #define HD64461_GRCFGR_ACCRESET 0x08 /* Accelerator Reset */ @@ -97,41 +99,41 @@ #define HD64461_GRCFGR_COLORDEPTH8 0x01 /* Sets Colordepth 8 for Accelerator */ /* Line Drawing Registers */ -#define HD64461_LNSARH (CONFIG_HD64461_IOBASE + 0x1046) /* Line Start Address Register (H) */ -#define HD64461_LNSARL (CONFIG_HD64461_IOBASE + 0x1048) /* Line Start Address Register (L) */ -#define HD64461_LNAXLR (CONFIG_HD64461_IOBASE + 0x104a) /* Axis Pixel Length Register */ -#define HD64461_LNDGR (CONFIG_HD64461_IOBASE + 0x104c) /* Diagonal Register */ -#define HD64461_LNAXR (CONFIG_HD64461_IOBASE + 0x104e) /* Axial Register */ -#define HD64461_LNERTR (CONFIG_HD64461_IOBASE + 0x1050) /* Start Error Term Register */ -#define HD64461_LNMDR (CONFIG_HD64461_IOBASE + 0x1052) /* Line Mode Register */ +#define HD64461_LNSARH HD64461_IO_OFFSET(0x1046) /* Line Start Address Register (H) */ +#define HD64461_LNSARL HD64461_IO_OFFSET(0x1048) /* Line Start Address Register (L) */ +#define HD64461_LNAXLR HD64461_IO_OFFSET(0x104a) /* Axis Pixel Length Register */ +#define HD64461_LNDGR HD64461_IO_OFFSET(0x104c) /* Diagonal Register */ +#define HD64461_LNAXR HD64461_IO_OFFSET(0x104e) /* Axial Register */ +#define HD64461_LNERTR HD64461_IO_OFFSET(0x1050) /* Start Error Term Register */ +#define HD64461_LNMDR HD64461_IO_OFFSET(0x1052) /* Line Mode Register */ /* BitBLT Registers */ -#define HD64461_BBTSSARH (CONFIG_HD64461_IOBASE + 0x1054) /* Source Start Address Register (H) */ -#define HD64461_BBTSSARL (CONFIG_HD64461_IOBASE + 0x1056) /* Source Start Address Register (L) */ -#define HD64461_BBTDSARH (CONFIG_HD64461_IOBASE + 0x1058) /* Destination Start Address Register (H) */ -#define HD64461_BBTDSARL (CONFIG_HD64461_IOBASE + 0x105a) /* Destination Start Address Register (L) */ -#define HD64461_BBTDWR (CONFIG_HD64461_IOBASE + 0x105c) /* Destination Block Width Register */ -#define HD64461_BBTDHR (CONFIG_HD64461_IOBASE + 0x105e) /* Destination Block Height Register */ -#define HD64461_BBTPARH (CONFIG_HD64461_IOBASE + 0x1060) /* Pattern Start Address Register (H) */ -#define HD64461_BBTPARL (CONFIG_HD64461_IOBASE + 0x1062) /* Pattern Start Address Register (L) */ -#define HD64461_BBTMARH (CONFIG_HD64461_IOBASE + 0x1064) /* Mask Start Address Register (H) */ -#define HD64461_BBTMARL (CONFIG_HD64461_IOBASE + 0x1066) /* Mask Start Address Register (L) */ -#define HD64461_BBTROPR (CONFIG_HD64461_IOBASE + 0x1068) /* ROP Register */ -#define HD64461_BBTMDR (CONFIG_HD64461_IOBASE + 0x106a) /* BitBLT Mode Register */ +#define HD64461_BBTSSARH HD64461_IO_OFFSET(0x1054) /* Source Start Address Register (H) */ +#define HD64461_BBTSSARL HD64461_IO_OFFSET(0x1056) /* Source Start Address Register (L) */ +#define HD64461_BBTDSARH HD64461_IO_OFFSET(0x1058) /* Destination Start Address Register (H) */ +#define HD64461_BBTDSARL HD64461_IO_OFFSET(0x105a) /* Destination Start Address Register (L) */ +#define HD64461_BBTDWR HD64461_IO_OFFSET(0x105c) /* Destination Block Width Register */ +#define HD64461_BBTDHR HD64461_IO_OFFSET(0x105e) /* Destination Block Height Register */ +#define HD64461_BBTPARH HD64461_IO_OFFSET(0x1060) /* Pattern Start Address Register (H) */ +#define HD64461_BBTPARL HD64461_IO_OFFSET(0x1062) /* Pattern Start Address Register (L) */ +#define HD64461_BBTMARH HD64461_IO_OFFSET(0x1064) /* Mask Start Address Register (H) */ +#define HD64461_BBTMARL HD64461_IO_OFFSET(0x1066) /* Mask Start Address Register (L) */ +#define HD64461_BBTROPR HD64461_IO_OFFSET(0x1068) /* ROP Register */ +#define HD64461_BBTMDR HD64461_IO_OFFSET(0x106a) /* BitBLT Mode Register */ /* PC Card Controller Registers */ /* Maps to Physical Area 6 */ -#define HD64461_PCC0ISR (CONFIG_HD64461_IOBASE + 0x2000) /* socket 0 interface status */ -#define HD64461_PCC0GCR (CONFIG_HD64461_IOBASE + 0x2002) /* socket 0 general control */ -#define HD64461_PCC0CSCR (CONFIG_HD64461_IOBASE + 0x2004) /* socket 0 card status change */ -#define HD64461_PCC0CSCIER (CONFIG_HD64461_IOBASE + 0x2006) /* socket 0 card status change interrupt enable */ -#define HD64461_PCC0SCR (CONFIG_HD64461_IOBASE + 0x2008) /* socket 0 software control */ +#define HD64461_PCC0ISR HD64461_IO_OFFSET(0x2000) /* socket 0 interface status */ +#define HD64461_PCC0GCR HD64461_IO_OFFSET(0x2002) /* socket 0 general control */ +#define HD64461_PCC0CSCR HD64461_IO_OFFSET(0x2004) /* socket 0 card status change */ +#define HD64461_PCC0CSCIER HD64461_IO_OFFSET(0x2006) /* socket 0 card status change interrupt enable */ +#define HD64461_PCC0SCR HD64461_IO_OFFSET(0x2008) /* socket 0 software control */ /* Maps to Physical Area 5 */ -#define HD64461_PCC1ISR (CONFIG_HD64461_IOBASE + 0x2010) /* socket 1 interface status */ -#define HD64461_PCC1GCR (CONFIG_HD64461_IOBASE + 0x2012) /* socket 1 general control */ -#define HD64461_PCC1CSCR (CONFIG_HD64461_IOBASE + 0x2014) /* socket 1 card status change */ -#define HD64461_PCC1CSCIER (CONFIG_HD64461_IOBASE + 0x2016) /* socket 1 card status change interrupt enable */ -#define HD64461_PCC1SCR (CONFIG_HD64461_IOBASE + 0x2018) /* socket 1 software control */ +#define HD64461_PCC1ISR HD64461_IO_OFFSET(0x2010) /* socket 1 interface status */ +#define HD64461_PCC1GCR HD64461_IO_OFFSET(0x2012) /* socket 1 general control */ +#define HD64461_PCC1CSCR HD64461_IO_OFFSET(0x2014) /* socket 1 card status change */ +#define HD64461_PCC1CSCIER HD64461_IO_OFFSET(0x2016) /* socket 1 card status change interrupt enable */ +#define HD64461_PCC1SCR HD64461_IO_OFFSET(0x2018) /* socket 1 software control */ /* PCC Interface Status Register */ #define HD64461_PCCISR_READY 0x80 /* card ready */ @@ -189,41 +191,41 @@ #define HD64461_PCCSCR_SWP 0x01 /* write protect */ /* PCC0 Output Pins Control Register */ -#define HD64461_P0OCR (CONFIG_HD64461_IOBASE + 0x202a) +#define HD64461_P0OCR HD64461_IO_OFFSET(0x202a) /* PCC1 Output Pins Control Register */ -#define HD64461_P1OCR (CONFIG_HD64461_IOBASE + 0x202c) +#define HD64461_P1OCR HD64461_IO_OFFSET(0x202c) /* PC Card General Control Register */ -#define HD64461_PGCR (CONFIG_HD64461_IOBASE + 0x202e) +#define HD64461_PGCR HD64461_IO_OFFSET(0x202e) /* Port Control Registers */ -#define HD64461_GPACR (CONFIG_HD64461_IOBASE + 0x4000) /* Port A - Handles IRDA/TIMER */ -#define HD64461_GPBCR (CONFIG_HD64461_IOBASE + 0x4002) /* Port B - Handles UART */ -#define HD64461_GPCCR (CONFIG_HD64461_IOBASE + 0x4004) /* Port C - Handles PCMCIA 1 */ -#define HD64461_GPDCR (CONFIG_HD64461_IOBASE + 0x4006) /* Port D - Handles PCMCIA 1 */ +#define HD64461_GPACR HD64461_IO_OFFSET(0x4000) /* Port A - Handles IRDA/TIMER */ +#define HD64461_GPBCR HD64461_IO_OFFSET(0x4002) /* Port B - Handles UART */ +#define HD64461_GPCCR HD64461_IO_OFFSET(0x4004) /* Port C - Handles PCMCIA 1 */ +#define HD64461_GPDCR HD64461_IO_OFFSET(0x4006) /* Port D - Handles PCMCIA 1 */ /* Port Control Data Registers */ -#define HD64461_GPADR (CONFIG_HD64461_IOBASE + 0x4010) /* A */ -#define HD64461_GPBDR (CONFIG_HD64461_IOBASE + 0x4012) /* B */ -#define HD64461_GPCDR (CONFIG_HD64461_IOBASE + 0x4014) /* C */ -#define HD64461_GPDDR (CONFIG_HD64461_IOBASE + 0x4016) /* D */ +#define HD64461_GPADR HD64461_IO_OFFSET(0x4010) /* A */ +#define HD64461_GPBDR HD64461_IO_OFFSET(0x4012) /* B */ +#define HD64461_GPCDR HD64461_IO_OFFSET(0x4014) /* C */ +#define HD64461_GPDDR HD64461_IO_OFFSET(0x4016) /* D */ /* Interrupt Control Registers */ -#define HD64461_GPAICR (CONFIG_HD64461_IOBASE + 0x4020) /* A */ -#define HD64461_GPBICR (CONFIG_HD64461_IOBASE + 0x4022) /* B */ -#define HD64461_GPCICR (CONFIG_HD64461_IOBASE + 0x4024) /* C */ -#define HD64461_GPDICR (CONFIG_HD64461_IOBASE + 0x4026) /* D */ +#define HD64461_GPAICR HD64461_IO_OFFSET(0x4020) /* A */ +#define HD64461_GPBICR HD64461_IO_OFFSET(0x4022) /* B */ +#define HD64461_GPCICR HD64461_IO_OFFSET(0x4024) /* C */ +#define HD64461_GPDICR HD64461_IO_OFFSET(0x4026) /* D */ /* Interrupt Status Registers */ -#define HD64461_GPAISR (CONFIG_HD64461_IOBASE + 0x4040) /* A */ -#define HD64461_GPBISR (CONFIG_HD64461_IOBASE + 0x4042) /* B */ -#define HD64461_GPCISR (CONFIG_HD64461_IOBASE + 0x4044) /* C */ -#define HD64461_GPDISR (CONFIG_HD64461_IOBASE + 0x4046) /* D */ +#define HD64461_GPAISR HD64461_IO_OFFSET(0x4040) /* A */ +#define HD64461_GPBISR HD64461_IO_OFFSET(0x4042) /* B */ +#define HD64461_GPCISR HD64461_IO_OFFSET(0x4044) /* C */ +#define HD64461_GPDISR HD64461_IO_OFFSET(0x4046) /* D */ /* Interrupt Request Register & Interrupt Mask Register */ -#define HD64461_NIRR (CONFIG_HD64461_IOBASE + 0x5000) -#define HD64461_NIMR (CONFIG_HD64461_IOBASE + 0x5002) +#define HD64461_NIRR HD64461_IO_OFFSET(0x5000) +#define HD64461_NIMR HD64461_IO_OFFSET(0x5002) #define HD64461_IRQBASE OFFCHIP_IRQ_BASE #define OFFCHIP_IRQ_BASE 64 From 639138a991aaf1a3f65cc66700d097edc5f602fe Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Fri, 15 May 2009 12:07:17 +0900 Subject: [PATCH 1072/2061] sound: oss: sh_dac_audio timer fixes. This patch modifies sh_dac_audio in the following ways: - use high resolution timer instead of TMU1 - fix cpu/dac.h include - add future rewrite comment Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- sound/oss/Kconfig | 2 +- sound/oss/sh_dac_audio.c | 85 +++++++++++++++------------------------- 2 files changed, 33 insertions(+), 54 deletions(-) diff --git a/sound/oss/Kconfig b/sound/oss/Kconfig index 1ca7427c4b6..bcf2a0698d5 100644 --- a/sound/oss/Kconfig +++ b/sound/oss/Kconfig @@ -561,7 +561,7 @@ endif # SOUND_OSS config SOUND_SH_DAC_AUDIO tristate "SuperH DAC audio support" - depends on CPU_SH3 + depends on CPU_SH3 && HIGH_RES_TIMERS config SOUND_SH_DAC_AUDIO_CHANNEL int "DAC channel" diff --git a/sound/oss/sh_dac_audio.c b/sound/oss/sh_dac_audio.c index 78cfb66e4c5..b2ed8757542 100644 --- a/sound/oss/sh_dac_audio.c +++ b/sound/oss/sh_dac_audio.c @@ -18,47 +18,36 @@ #include #include #include +#include #include #include #include #include #include -#include -#include +#include #include #include #include #define MODNAME "sh_dac_audio" -#define TMU_TOCR_INIT 0x00 - -#define TMU1_TCR_INIT 0x0020 /* Clock/4, rising edge; interrupt on */ -#define TMU1_TSTR_INIT 0x02 /* Bit to turn on TMU1 */ - #define BUFFER_SIZE 48000 static int rate; static int empty; static char *data_buffer, *buffer_begin, *buffer_end; static int in_use, device_major; +static struct hrtimer hrtimer; +static ktime_t wakeups_per_second; static void dac_audio_start_timer(void) { - u8 tstr; - - tstr = ctrl_inb(TMU_TSTR); - tstr |= TMU1_TSTR_INIT; - ctrl_outb(tstr, TMU_TSTR); + hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL); } static void dac_audio_stop_timer(void) { - u8 tstr; - - tstr = ctrl_inb(TMU_TSTR); - tstr &= ~TMU1_TSTR_INIT; - ctrl_outb(tstr, TMU_TSTR); + hrtimer_cancel(&hrtimer); } static void dac_audio_reset(void) @@ -77,38 +66,30 @@ static void dac_audio_sync(void) static void dac_audio_start(void) { if (mach_is_hp6xx()) { - u16 v = inw(HD64461_GPADR); + u16 v = __raw_readw(HD64461_GPADR); v &= ~HD64461_GPADR_SPEAKER; - outw(v, HD64461_GPADR); + __raw_writew(v, HD64461_GPADR); } sh_dac_enable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); - ctrl_outw(TMU1_TCR_INIT, TMU1_TCR); } static void dac_audio_stop(void) { dac_audio_stop_timer(); if (mach_is_hp6xx()) { - u16 v = inw(HD64461_GPADR); + u16 v = __raw_readw(HD64461_GPADR); v |= HD64461_GPADR_SPEAKER; - outw(v, HD64461_GPADR); + __raw_writew(v, HD64461_GPADR); } - sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); + sh_dac_output(0, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); sh_dac_disable(CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); } static void dac_audio_set_rate(void) { - unsigned long interval; - struct clk *clk; - - clk = clk_get(NULL, "module_clk"); - interval = (clk_get_rate(clk) / 4) / rate; - clk_put(clk); - ctrl_outl(interval, TMU1_TCOR); - ctrl_outl(interval, TMU1_TCNT); + wakeups_per_second = ktime_set(0, 1000000000 / rate); } static int dac_audio_ioctl(struct inode *inode, struct file *file, @@ -265,32 +246,26 @@ const struct file_operations dac_audio_fops = { .release = dac_audio_release, }; -static irqreturn_t timer1_interrupt(int irq, void *dev) +static enum hrtimer_restart sh_dac_audio_timer(struct hrtimer *handle) { - unsigned long timer_status; - - timer_status = ctrl_inw(TMU1_TCR); - timer_status &= ~0x100; - ctrl_outw(timer_status, TMU1_TCR); - if (!empty) { sh_dac_output(*buffer_begin, CONFIG_SOUND_SH_DAC_AUDIO_CHANNEL); buffer_begin++; if (buffer_begin == data_buffer + BUFFER_SIZE) buffer_begin = data_buffer; - if (buffer_begin == buffer_end) { + if (buffer_begin == buffer_end) empty = 1; - dac_audio_stop_timer(); - } } - return IRQ_HANDLED; + + if (!empty) + hrtimer_start(&hrtimer, wakeups_per_second, HRTIMER_MODE_REL); + + return HRTIMER_NORESTART; } static int __init dac_audio_init(void) { - int retval; - if ((device_major = register_sound_dsp(&dac_audio_fops, -1)) < 0) { printk(KERN_ERR "Cannot register dsp device"); return device_major; @@ -306,21 +281,25 @@ static int __init dac_audio_init(void) rate = 8000; dac_audio_set_rate(); - retval = - request_irq(TIMER1_IRQ, timer1_interrupt, IRQF_DISABLED, MODNAME, 0); - if (retval < 0) { - printk(KERN_ERR "sh_dac_audio: IRQ %d request failed\n", - TIMER1_IRQ); - return retval; - } + /* Today: High Resolution Timer driven DAC playback. + * The timer callback gets called once per sample. Ouch. + * + * Future: A much better approach would be to use the + * SH7720 CMT+DMAC+DAC hardware combination like this: + * - Program sample rate using CMT0 or CMT1 + * - Program DMAC to use CMT for timing and output to DAC + * - Play sound using DMAC, let CPU sleep. + * - While at it, rewrite this driver to use ALSA. + */ + + hrtimer_init(&hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hrtimer.function = sh_dac_audio_timer; return 0; } static void __exit dac_audio_exit(void) { - free_irq(TIMER1_IRQ, 0); - unregister_sound_dsp(device_major); kfree((void *)data_buffer); } From 29a679754b1a2581ee456eada6c2de7ce95068bb Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:19:09 -0400 Subject: [PATCH 1073/2061] x86/stacktrace: return 0 instead of -1 for stack ops If we return -1 in the ops->stack for the stacktrace saving, we end up breaking out of the loop if the stack we are tracing is in the exception stack. This causes traces like: -0 [002] 34263.745825: raise_softirq_irqoff <-__blk_complete_request -0 [002] 34263.745826: <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 <= 0 By returning "0" instead, the irq stack is saved as well, and we see: -0 [003] 883.280992: raise_softirq_irqoff <-__hrtimer_star t_range_ns -0 [003] 883.280992: <= hrtimer_start_range_ns <= tick_nohz_restart_sched_tick <= cpu_idle <= start_secondary <= <= 0 <= 0 [ Impact: record stacks from interrupts ] Signed-off-by: Steven Rostedt --- arch/x86/kernel/stacktrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/stacktrace.c b/arch/x86/kernel/stacktrace.c index f7bddc2e37d..4aaf7e48394 100644 --- a/arch/x86/kernel/stacktrace.c +++ b/arch/x86/kernel/stacktrace.c @@ -20,7 +20,7 @@ save_stack_warning_symbol(void *data, char *msg, unsigned long symbol) static int save_stack_stack(void *data, char *name) { - return -1; + return 0; } static void save_stack_address(void *data, unsigned long addr, int reliable) From 1ec7c4849c214fc78b023230264399836ea3b245 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Thu, 14 May 2009 23:40:06 -0400 Subject: [PATCH 1074/2061] tracing: stop stack trace on first empty entry The stack tracer stores eight entries in the ring buffer when an event traces the stack. The output outputs all eight entries regardless of how many entries were recorded. This patch breaks out of the loop when a null entry is discovered. [ Impact: only print the stack that is recorded ] Signed-off-by: Steven Rostedt --- kernel/trace/trace_output.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 8bd9a2c1a46..489c0e8ada0 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -898,6 +898,8 @@ static enum print_line_t trace_stack_print(struct trace_iterator *iter, trace_assign_type(field, iter->ent); for (i = 0; i < FTRACE_STACK_ENTRIES; i++) { + if (!field->caller[i]) + break; if (i) { if (!trace_seq_puts(s, " <= ")) goto partial; From 8cd995b6deedf98b7694ed32a786ee7f793d1eec Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:27 +0800 Subject: [PATCH 1075/2061] tracing/filters: add missing unlock in a failure path [ Impact: fix deadlock in a rare case we fail to allocate memory ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC6F.7070200@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 85ad6a8939a..22c29984fe0 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1079,9 +1079,10 @@ int apply_event_filter(struct ftrace_event_call *call, char *filter_string) return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_disable_preds(call); replace_filter_string(call->filter, filter_string); @@ -1101,7 +1102,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; @@ -1123,9 +1124,10 @@ int apply_subsystem_event_filter(struct event_subsystem *system, return 0; } + err = -ENOMEM; ps = kzalloc(sizeof(*ps), GFP_KERNEL); if (!ps) - return -ENOMEM; + goto out_unlock; filter_free_subsystem_preds(system); replace_filter_string(system->filter, filter_string); @@ -1145,7 +1147,7 @@ out: filter_opstack_clear(ps); postfix_clear(ps); kfree(ps); - +out_unlock: mutex_unlock(&filter_mutex); return err; From 5872144f64b34a5942f6b4acedc90b02de72c58b Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Fri, 15 May 2009 11:07:56 +0800 Subject: [PATCH 1076/2061] tracing/filters: fix off-by-one bug We should leave the last slot for the ending '\0'. [ Impact: fix possible crash when the length of an operand is 128 ] Signed-off-by: Li Zefan LKML-Reference: <4A0CDC8C.30602@cn.fujitsu.com> Signed-off-by: Steven Rostedt --- kernel/trace/trace_events_filter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 22c29984fe0..a7430b16d24 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -736,7 +736,7 @@ static inline void clear_operand_string(struct filter_parse_state *ps) static inline int append_operand_char(struct filter_parse_state *ps, char c) { - if (ps->operand.tail == MAX_FILTER_STR_VAL) + if (ps->operand.tail == MAX_FILTER_STR_VAL - 1) return -EINVAL; ps->operand.string[ps->operand.tail++] = c; From 1a853e36871b533ccc3f3c5bdd5cd0d867043a00 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 14 May 2009 22:50:46 -0300 Subject: [PATCH 1077/2061] perf record: Allow specifying a pid to record Allow specifying a pid instead of always fork+exec'ing a command. Because the PERF_EVENT_COMM and PERF_EVENT_MMAP events happened before we connected, we must synthesize them so that 'perf report' can get what it needs. [ Impact: add new command line option ] Signed-off-by: Arnaldo Carvalho de Melo Cc: Clark Williams Cc: John Kacur Cc: Peter Zijlstra LKML-Reference: <20090515015046.GA13664@ghostprotocols.net> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 163 ++++++++++++++++++-- 1 file changed, 146 insertions(+), 17 deletions(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index 5f5e6df0260..efb87595f3c 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -34,6 +34,9 @@ #include "perf.h" +#define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) +#define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) + static int nr_counters = 0; static __u64 event_id[MAX_COUNTERS] = { }; static int default_interval = 100000; @@ -47,6 +50,7 @@ static char *output_name = "output.perf"; static int group = 0; static unsigned int realtime_prio = 0; static int system_wide = 0; +static pid_t target_pid = -1; static int inherit = 1; static int nmi = 1; @@ -180,6 +184,7 @@ static void display_help(void) " -c CNT --count=CNT # event period to sample\n" " -m pages --mmap_pages= # number of mmap data pages\n" " -o file --output= # output file\n" + " -p pid --pid= # record events on existing pid\n" " -r prio --realtime= # use RT prio\n" " -s --system # system wide profiling\n" ); @@ -199,13 +204,14 @@ static void process_options(int argc, const char *argv[]) {"event", required_argument, NULL, 'e'}, {"mmap_pages", required_argument, NULL, 'm'}, {"output", required_argument, NULL, 'o'}, + {"pid", required_argument, NULL, 'p'}, {"realtime", required_argument, NULL, 'r'}, {"system", no_argument, NULL, 's'}, {"inherit", no_argument, NULL, 'i'}, {"nmi", no_argument, NULL, 'n'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:c:e:m:o:r:sin", + int c = getopt_long(argc, argv, "+:c:e:m:o:p:r:sin", long_options, &option_index); if (c == -1) break; @@ -215,6 +221,7 @@ static void process_options(int argc, const char *argv[]) case 'e': error = parse_events(optarg); break; case 'm': mmap_pages = atoi(optarg); break; case 'o': output_name = strdup(optarg); break; + case 'p': target_pid = atoi(optarg); break; case 'r': realtime_prio = atoi(optarg); break; case 's': system_wide ^= 1; break; case 'i': inherit ^= 1; break; @@ -223,7 +230,7 @@ static void process_options(int argc, const char *argv[]) } } - if (argc - optind == 0) + if (argc - optind == 0 && target_pid == -1) error = 1; if (error) @@ -350,15 +357,135 @@ static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; static int nr_poll; static int nr_cpu; -static void open_counters(int cpu) +struct mmap_event { + struct perf_event_header header; + __u32 pid, tid; + __u64 start; + __u64 len; + __u64 pgoff; + char filename[PATH_MAX]; +}; +struct comm_event { + struct perf_event_header header; + __u32 pid,tid; + char comm[16]; +}; + +static pid_t pid_synthesize_comm_event(pid_t pid) +{ + char filename[PATH_MAX]; + char bf[BUFSIZ]; + struct comm_event comm_ev; + size_t size; + int fd; + + snprintf(filename, sizeof(filename), "/proc/%d/stat", pid); + + fd = open(filename, O_RDONLY); + if (fd < 0) { + fprintf(stderr, "couldn't open %s\n", filename); + exit(EXIT_FAILURE); + } + if (read(fd, bf, sizeof(bf)) < 0) { + fprintf(stderr, "couldn't read %s\n", filename); + exit(EXIT_FAILURE); + } + close(fd); + + pid_t spid, ppid; + char state; + char comm[18]; + + memset(&comm_ev, 0, sizeof(comm_ev)); + int nr = sscanf(bf, "%d %s %c %d %d ", + &spid, comm, &state, &ppid, &comm_ev.pid); + if (nr != 5) { + fprintf(stderr, "couldn't get COMM and pgid, malformed %s\n", + filename); + exit(EXIT_FAILURE); + } + comm_ev.header.type = PERF_EVENT_COMM; + comm_ev.tid = pid; + size = strlen(comm); + comm[--size] = '\0'; /* Remove the ')' at the end */ + --size; /* Remove the '(' at the begin */ + memcpy(comm_ev.comm, comm + 1, size); + size = ALIGN(size, sizeof(uint64_t)); + comm_ev.header.size = sizeof(comm_ev) - (sizeof(comm_ev.comm) - size); + int ret = write(output, &comm_ev, comm_ev.header.size); + if (ret < 0) { + perror("failed to write"); + exit(-1); + } + return comm_ev.pid; +} + +static void pid_synthesize_mmap_events(pid_t pid, pid_t pgid) +{ + char filename[PATH_MAX]; + FILE *fp; + + snprintf(filename, sizeof(filename), "/proc/%d/maps", pid); + + fp = fopen(filename, "r"); + if (fp == NULL) { + fprintf(stderr, "couldn't open %s\n", filename); + exit(EXIT_FAILURE); + } + while (1) { + char bf[BUFSIZ]; + unsigned char vm_read, vm_write, vm_exec, vm_mayshare; + struct mmap_event mmap_ev = { + .header.type = PERF_EVENT_MMAP, + }; + unsigned long ino; + int major, minor; + size_t size; + if (fgets(bf, sizeof(bf), fp) == NULL) + break; + + /* 00400000-0040c000 r-xp 00000000 fd:01 41038 /bin/cat */ + sscanf(bf, "%llx-%llx %c%c%c%c %llx %x:%x %lu", + &mmap_ev.start, &mmap_ev.len, + &vm_read, &vm_write, &vm_exec, &vm_mayshare, + &mmap_ev.pgoff, &major, &minor, &ino); + if (vm_exec == 'x') { + char *execname = strrchr(bf, ' '); + + if (execname == NULL || execname[1] != '/') + continue; + + execname += 1; + size = strlen(execname); + execname[size - 1] = '\0'; /* Remove \n */ + memcpy(mmap_ev.filename, execname, size); + size = ALIGN(size, sizeof(uint64_t)); + mmap_ev.len -= mmap_ev.start; + mmap_ev.header.size = (sizeof(mmap_ev) - + (sizeof(mmap_ev.filename) - size)); + mmap_ev.pid = pgid; + mmap_ev.tid = pid; + + if (write(output, &mmap_ev, mmap_ev.header.size) < 0) { + perror("failed to write"); + exit(-1); + } + } + } + + fclose(fp); +} + +static void open_counters(int cpu, pid_t pid) { struct perf_counter_hw_event hw_event; int counter, group_fd; int track = 1; - pid_t pid = -1; - if (cpu < 0) - pid = 0; + if (pid > 0) { + pid_t pgid = pid_synthesize_comm_event(pid); + pid_synthesize_mmap_events(pid, pgid); + } group_fd = -1; for (counter = 0; counter < nr_counters; counter++) { @@ -435,22 +562,24 @@ int cmd_record(int argc, const char **argv) argc -= optind; argv += optind; - if (!system_wide) - open_counters(-1); - else for (i = 0; i < nr_cpus; i++) - open_counters(i); + if (!system_wide) { + open_counters(-1, target_pid != -1 ? target_pid : 0); + } else for (i = 0; i < nr_cpus; i++) + open_counters(i, target_pid); signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); - pid = fork(); - if (pid < 0) - perror("failed to fork"); + if (target_pid == -1) { + pid = fork(); + if (pid < 0) + perror("failed to fork"); - if (!pid) { - if (execvp(argv[0], argv)) { - perror(argv[0]); - exit(-1); + if (!pid) { + if (execvp(argv[0], argv)) { + perror(argv[0]); + exit(-1); + } } } From ec3232bdf8518bea8410f0027f870b24d3aa8753 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 09:45:19 +0200 Subject: [PATCH 1078/2061] perf_counter: x86: More accurate counter update Take the counter width into account instead of assuming 32 bits. In particular Nehalem has 44 bit wide counters, and all arithmetics should happen on a 44-bit signed integer basis. [ Impact: fix rare event imprecision, warning message on Nehalem ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index f7772ff7936..3a92a2b2a80 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -138,7 +138,9 @@ static u64 x86_perf_counter_update(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { - u64 prev_raw_count, new_raw_count, delta; + int shift = 64 - x86_pmu.counter_bits; + u64 prev_raw_count, new_raw_count; + s64 delta; /* * Careful: an NMI might modify the previous counter value. @@ -161,9 +163,10 @@ again: * (counter-)time and add that to the generic counter. * * Careful, not all hw sign-extends above the physical width - * of the count, so we do that by clipping the delta to 32 bits: + * of the count. */ - delta = (u64)(u32)((s32)new_raw_count - (s32)prev_raw_count); + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; atomic64_add(delta, &counter->count); atomic64_sub(delta, &hwc->period_left); From f5a5a2f6e69e88647ae12da39f0ff3a510bcf0a6 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 13 May 2009 12:54:01 +0200 Subject: [PATCH 1079/2061] perf_counter: x86: Fix throttling If counters are disabled globally when a perfcounter IRQ/NMI hits, and if we throttle in that case, we'll promote the '0' value to the next lapic IRQ and disable all perfcounters at that point, permanently ... Fix it. [ Impact: fix hung perfcounters under load ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 3a92a2b2a80..88ae8cebf3c 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -765,8 +765,13 @@ out: /* * Restore - do not reenable when global enable is off or throttled: */ - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) - intel_pmu_restore_all(cpuc->throttle_ctrl); + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { + intel_pmu_restore_all(cpuc->throttle_ctrl); + } else { + pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); + } + } return ret; } @@ -817,11 +822,16 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - if (printk_ratelimit()) - printk(KERN_WARNING "perfcounters: max interrupts exceeded!\n"); + pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); + + /* + * Clear them before re-enabling irqs/NMIs again: + */ + cpuc->interrupts = 0; hw_perf_restore(cpuc->throttle_ctrl); + } else { + cpuc->interrupts = 0; } - cpuc->interrupts = 0; } void smp_perf_counter_interrupt(struct pt_regs *regs) From a026dfecc035f213c1cfa0bf6407ce3155f6a9df Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 10:02:57 +0200 Subject: [PATCH 1080/2061] perf_counter: x86: Allow unpriviliged use of NMIs Apply sysctl_perf_counter_priv to NMIs. Also, fail the counter creation instead of silently down-grading to regular interrupts. [ Impact: allow wider perf-counter usage ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 88ae8cebf3c..c19e927b697 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -280,8 +280,11 @@ static int __hw_perf_counter_init(struct perf_counter *counter) * If privileged enough, allow NMI events: */ hwc->nmi = 0; - if (capable(CAP_SYS_ADMIN) && hw_event->nmi) + if (hw_event->nmi) { + if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) + return -EACCES; hwc->nmi = 1; + } hwc->irq_period = hw_event->irq_period; if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) From 53020fe81eecd0b7be295868ce5850ef8f41074e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 21:26:19 +0200 Subject: [PATCH 1081/2061] perf_counter: Fix perf_output_copy() WARN to account for overflow The simple reservation test in perf_output_copy() failed to take unsigned int overflow into account, fix this. [ Impact: fix false positive warning with more than 4GB of profiling data ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ff166c11b69..985be0b662a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1927,7 +1927,11 @@ static void perf_output_copy(struct perf_output_handle *handle, handle->offset = offset; - WARN_ON_ONCE(handle->offset > handle->head); + /* + * Check we didn't copy past our reservation window, taking the + * possible unsigned int wrap into account. + */ + WARN_ON_ONCE(((int)(handle->head - handle->offset)) < 0); } #define perf_output_put(handle, x) \ From 962bf7a66edca4d36a730a38ff8410a67f560e40 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 13:21:36 +0200 Subject: [PATCH 1082/2061] perf_counter: x86: Fix up the amd NMI/INT throttle perf_counter_unthrottle() restores throttle_ctrl, buts its never set. Also, we fail to disable all counters when throttling. [ Impact: fix rare stuck perf-counters when they are throttled ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 38 ++++++++++++++++++++---------- 1 file changed, 26 insertions(+), 12 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c19e927b697..7601c014f8f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -334,6 +334,8 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); + if (!enabled) + goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -347,6 +349,7 @@ static u64 amd_pmu_save_disable_all(void) wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } +out: return enabled; } @@ -787,32 +790,43 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) int handled = 0; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx; + int idx, throttle = 0; + + cpuc->throttle_ctrl = cpuc->enabled; + cpuc->enabled = 0; + barrier(); + + if (cpuc->throttle_ctrl) { + if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) + throttle = 1; + } - ++cpuc->interrupts; for (idx = 0; idx < x86_pmu.num_counters; idx++) { + int disable = 0; + if (!test_bit(idx, cpuc->active_mask)) continue; + counter = cpuc->counters[idx]; hwc = &counter->hw; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - continue; + goto next; + /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - if (perf_counter_overflow(counter, nmi, regs, 0)) + disable = perf_counter_overflow(counter, nmi, regs, 0); + +next: + if (disable || throttle) amd_pmu_disable_counter(hwc, idx); - else if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - /* - * do not reenable when throttled, but reload - * the register - */ - amd_pmu_disable_counter(hwc, idx); - else if (counter->state == PERF_COUNTER_STATE_ACTIVE) - amd_pmu_enable_counter(hwc, idx); } + + if (cpuc->throttle_ctrl && !throttle) + cpuc->enabled = 1; + return handled; } From 9e35ad388bea89f7d6f375af4c0ae98803688666 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 13 May 2009 16:21:38 +0200 Subject: [PATCH 1083/2061] perf_counter: Rework the perf counter disable/enable The current disable/enable mechanism is: token = hw_perf_save_disable(); ... /* do bits */ ... hw_perf_restore(token); This works well, provided that the use nests properly. Except we don't. x86 NMI/INT throttling has non-nested use of this, breaking things. Therefore provide a reference counter disable/enable interface, where the first disable disables the hardware, and the last enable enables the hardware again. [ Impact: refactor, simplify the PMU disable/enable logic ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 24 +++--- arch/x86/kernel/cpu/perf_counter.c | 113 +++++++++++------------------ drivers/acpi/processor_idle.c | 6 +- include/linux/perf_counter.h | 10 ++- kernel/perf_counter.c | 76 +++++++++++-------- 5 files changed, 109 insertions(+), 120 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 15cdc8e6722..bb1b463c136 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -386,7 +386,7 @@ static void write_mmcr0(struct cpu_hw_counters *cpuhw, unsigned long mmcr0) * Disable all counters to prevent PMU interrupts and to allow * counters to be added or removed. */ -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { struct cpu_hw_counters *cpuhw; unsigned long ret; @@ -428,7 +428,6 @@ u64 hw_perf_save_disable(void) mb(); } local_irq_restore(flags); - return ret; } /* @@ -436,7 +435,7 @@ u64 hw_perf_save_disable(void) * If we were previously disabled and counters were added, then * put the new config on the PMU. */ -void hw_perf_restore(u64 disable) +void hw_perf_enable(void) { struct perf_counter *counter; struct cpu_hw_counters *cpuhw; @@ -448,9 +447,12 @@ void hw_perf_restore(u64 disable) int n_lim; int idx; - if (disable) - return; local_irq_save(flags); + if (!cpuhw->disabled) { + local_irq_restore(flags); + return; + } + cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; @@ -649,19 +651,18 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, /* * Add a counter to the PMU. * If all counters are not already frozen, then we disable and - * re-enable the PMU in order to get hw_perf_restore to do the + * re-enable the PMU in order to get hw_perf_enable to do the * actual work of reconfiguring the PMU. */ static int power_pmu_enable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; unsigned long flags; - u64 pmudis; int n0; int ret = -EAGAIN; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); /* * Add the counter to the list (if there is room) @@ -685,7 +686,7 @@ static int power_pmu_enable(struct perf_counter *counter) ret = 0; out: - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); return ret; } @@ -697,11 +698,10 @@ static void power_pmu_disable(struct perf_counter *counter) { struct cpu_hw_counters *cpuhw; long i; - u64 pmudis; unsigned long flags; local_irq_save(flags); - pmudis = hw_perf_save_disable(); + perf_disable(); power_pmu_read(counter); @@ -735,7 +735,7 @@ static void power_pmu_disable(struct perf_counter *counter) cpuhw->mmcr[0] &= ~(MMCR0_PMXE | MMCR0_FCECE); } - hw_perf_restore(pmudis); + perf_enable(); local_irq_restore(flags); } diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 7601c014f8f..313638cecbb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -31,7 +31,6 @@ struct cpu_hw_counters { unsigned long used_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long active_mask[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; unsigned long interrupts; - u64 throttle_ctrl; int enabled; }; @@ -42,8 +41,8 @@ struct x86_pmu { const char *name; int version; int (*handle_irq)(struct pt_regs *, int); - u64 (*save_disable_all)(void); - void (*restore_all)(u64); + void (*disable_all)(void); + void (*enable_all)(void); void (*enable)(struct hw_perf_counter *, int); void (*disable)(struct hw_perf_counter *, int); unsigned eventsel; @@ -56,6 +55,7 @@ struct x86_pmu { int counter_bits; u64 counter_mask; u64 max_period; + u64 intel_ctrl; }; static struct x86_pmu x86_pmu __read_mostly; @@ -311,22 +311,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return 0; } -static u64 intel_pmu_save_disable_all(void) +static void intel_pmu_disable_all(void) { - u64 ctrl; - - rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0); - - return ctrl; } -static u64 amd_pmu_save_disable_all(void) +static void amd_pmu_disable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); - int enabled, idx; + int idx; + + if (!cpuc->enabled) + return; - enabled = cpuc->enabled; cpuc->enabled = 0; /* * ensure we write the disable before we start disabling the @@ -334,8 +331,6 @@ static u64 amd_pmu_save_disable_all(void) * right thing. */ barrier(); - if (!enabled) - goto out; for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -348,37 +343,31 @@ static u64 amd_pmu_save_disable_all(void) val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE; wrmsrl(MSR_K7_EVNTSEL0 + idx, val); } - -out: - return enabled; } -u64 hw_perf_save_disable(void) +void hw_perf_disable(void) { if (!x86_pmu_initialized()) - return 0; - return x86_pmu.save_disable_all(); + return; + return x86_pmu.disable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_save_disable); -static void intel_pmu_restore_all(u64 ctrl) +static void intel_pmu_enable_all(void) { - wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, ctrl); + wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); } -static void amd_pmu_restore_all(u64 ctrl) +static void amd_pmu_enable_all(void) { struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); int idx; - cpuc->enabled = ctrl; - barrier(); - if (!ctrl) + if (cpuc->enabled) return; + cpuc->enabled = 1; + barrier(); + for (idx = 0; idx < x86_pmu.num_counters; idx++) { u64 val; @@ -392,16 +381,12 @@ static void amd_pmu_restore_all(u64 ctrl) } } -void hw_perf_restore(u64 ctrl) +void hw_perf_enable(void) { if (!x86_pmu_initialized()) return; - x86_pmu.restore_all(ctrl); + x86_pmu.enable_all(); } -/* - * Exported because of ACPI idle - */ -EXPORT_SYMBOL_GPL(hw_perf_restore); static inline u64 intel_pmu_get_status(void) { @@ -735,15 +720,14 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) int bit, cpu = smp_processor_id(); u64 ack, status; struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - int ret = 0; - - cpuc->throttle_ctrl = intel_pmu_save_disable_all(); + perf_disable(); status = intel_pmu_get_status(); - if (!status) - goto out; + if (!status) { + perf_enable(); + return 0; + } - ret = 1; again: inc_irq_stat(apic_perf_irqs); ack = status; @@ -767,19 +751,11 @@ again: status = intel_pmu_get_status(); if (status) goto again; -out: - /* - * Restore - do not reenable when global enable is off or throttled: - */ - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts < PERFMON_MAX_INTERRUPTS) { - intel_pmu_restore_all(cpuc->throttle_ctrl); - } else { - pr_info("CPU#%d: perfcounters: max interrupt rate exceeded! Throttle on.\n", smp_processor_id()); - } - } - return ret; + if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) + perf_enable(); + + return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) @@ -792,13 +768,11 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) struct hw_perf_counter *hwc; int idx, throttle = 0; - cpuc->throttle_ctrl = cpuc->enabled; - cpuc->enabled = 0; - barrier(); - - if (cpuc->throttle_ctrl) { - if (++cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) - throttle = 1; + if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { + throttle = 1; + __perf_disable(); + cpuc->enabled = 0; + barrier(); } for (idx = 0; idx < x86_pmu.num_counters; idx++) { @@ -824,9 +798,6 @@ next: amd_pmu_disable_counter(hwc, idx); } - if (cpuc->throttle_ctrl && !throttle) - cpuc->enabled = 1; - return handled; } @@ -839,13 +810,11 @@ void perf_counter_unthrottle(void) cpuc = &__get_cpu_var(cpu_hw_counters); if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - pr_info("CPU#%d: perfcounters: throttle off.\n", smp_processor_id()); - /* * Clear them before re-enabling irqs/NMIs again: */ cpuc->interrupts = 0; - hw_perf_restore(cpuc->throttle_ctrl); + perf_enable(); } else { cpuc->interrupts = 0; } @@ -931,8 +900,8 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = { static struct x86_pmu intel_pmu = { .name = "Intel", .handle_irq = intel_pmu_handle_irq, - .save_disable_all = intel_pmu_save_disable_all, - .restore_all = intel_pmu_restore_all, + .disable_all = intel_pmu_disable_all, + .enable_all = intel_pmu_enable_all, .enable = intel_pmu_enable_counter, .disable = intel_pmu_disable_counter, .eventsel = MSR_ARCH_PERFMON_EVENTSEL0, @@ -951,8 +920,8 @@ static struct x86_pmu intel_pmu = { static struct x86_pmu amd_pmu = { .name = "AMD", .handle_irq = amd_pmu_handle_irq, - .save_disable_all = amd_pmu_save_disable_all, - .restore_all = amd_pmu_restore_all, + .disable_all = amd_pmu_disable_all, + .enable_all = amd_pmu_enable_all, .enable = amd_pmu_enable_counter, .disable = amd_pmu_disable_counter, .eventsel = MSR_K7_EVNTSEL0, @@ -1003,6 +972,8 @@ static int intel_pmu_init(void) x86_pmu.counter_bits = eax.split.bit_width; x86_pmu.counter_mask = (1ULL << eax.split.bit_width) - 1; + rdmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl); + return 0; } diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index d2830f39d46..9645758c047 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -763,11 +763,9 @@ static int acpi_idle_bm_check(void) */ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { - u64 perf_flags; - /* Don't trace irqs off for idle */ stop_critical_timings(); - perf_flags = hw_perf_save_disable(); + perf_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -782,7 +780,7 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - hw_perf_restore(perf_flags); + perf_enable(); start_critical_timings(); } diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 614f921d616..e543ecc129f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -544,8 +544,10 @@ extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); extern void perf_counter_unthrottle(void); -extern u64 hw_perf_save_disable(void); -extern void hw_perf_restore(u64 ctrl); +extern void __perf_disable(void); +extern bool __perf_enable(void); +extern void perf_disable(void); +extern void perf_enable(void); extern int perf_counter_task_disable(void); extern int perf_counter_task_enable(void); extern int hw_perf_group_sched_in(struct perf_counter *group_leader, @@ -600,8 +602,8 @@ static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } static inline void perf_counter_unthrottle(void) { } -static inline void hw_perf_restore(u64 ctrl) { } -static inline u64 hw_perf_save_disable(void) { return 0; } +static inline void perf_disable(void) { } +static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } static inline int perf_counter_task_enable(void) { return -EINVAL; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 985be0b662a..e814ff04d7c 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -60,8 +60,9 @@ extern __weak const struct pmu *hw_perf_counter_init(struct perf_counter *counte return NULL; } -u64 __weak hw_perf_save_disable(void) { return 0; } -void __weak hw_perf_restore(u64 ctrl) { barrier(); } +void __weak hw_perf_disable(void) { barrier(); } +void __weak hw_perf_enable(void) { barrier(); } + void __weak hw_perf_counter_setup(int cpu) { barrier(); } int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, struct perf_cpu_context *cpuctx, @@ -72,6 +73,32 @@ int __weak hw_perf_group_sched_in(struct perf_counter *group_leader, void __weak perf_counter_print_debug(void) { } +static DEFINE_PER_CPU(int, disable_count); + +void __perf_disable(void) +{ + __get_cpu_var(disable_count)++; +} + +bool __perf_enable(void) +{ + return !--__get_cpu_var(disable_count); +} + +void perf_disable(void) +{ + __perf_disable(); + hw_perf_disable(); +} +EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ + +void perf_enable(void) +{ + if (__perf_enable()) + hw_perf_enable(); +} +EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -170,7 +197,6 @@ static void __perf_counter_remove_from_context(void *info) struct perf_counter *counter = info; struct perf_counter_context *ctx = counter->ctx; unsigned long flags; - u64 perf_flags; /* * If this is a task context, we need to check whether it is @@ -191,9 +217,9 @@ static void __perf_counter_remove_from_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_del_counter(counter, ctx); - hw_perf_restore(perf_flags); + perf_enable(); if (!ctx->task) { /* @@ -538,7 +564,6 @@ static void __perf_install_in_context(void *info) struct perf_counter *leader = counter->group_leader; int cpu = smp_processor_id(); unsigned long flags; - u64 perf_flags; int err; /* @@ -556,7 +581,7 @@ static void __perf_install_in_context(void *info) * Protect the list operation against NMI by disabling the * counters on a global level. NOP for non NMI based counters. */ - perf_flags = hw_perf_save_disable(); + perf_disable(); add_counter_to_ctx(counter, ctx); @@ -596,7 +621,7 @@ static void __perf_install_in_context(void *info) cpuctx->max_pertask--; unlock: - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } @@ -663,7 +688,6 @@ static void __perf_counter_enable(void *info) struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); struct perf_counter_context *ctx = counter->ctx; struct perf_counter *leader = counter->group_leader; - unsigned long pmuflags; unsigned long flags; int err; @@ -693,14 +717,14 @@ static void __perf_counter_enable(void *info) if (!group_can_go_on(counter, cpuctx, 1)) { err = -EEXIST; } else { - pmuflags = hw_perf_save_disable(); + perf_disable(); if (counter == leader) err = group_sched_in(counter, cpuctx, ctx, smp_processor_id()); else err = counter_sched_in(counter, cpuctx, ctx, smp_processor_id()); - hw_perf_restore(pmuflags); + perf_enable(); } if (err) { @@ -795,7 +819,6 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx) { struct perf_counter *counter; - u64 flags; spin_lock(&ctx->lock); ctx->is_active = 0; @@ -803,12 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, goto out; update_context_time(ctx); - flags = hw_perf_save_disable(); + perf_disable(); if (ctx->nr_active) { list_for_each_entry(counter, &ctx->counter_list, list_entry) group_sched_out(counter, cpuctx, ctx); } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -860,7 +883,6 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, struct perf_cpu_context *cpuctx, int cpu) { struct perf_counter *counter; - u64 flags; int can_add_hw = 1; spin_lock(&ctx->lock); @@ -870,7 +892,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, ctx->timestamp = perf_clock(); - flags = hw_perf_save_disable(); + perf_disable(); /* * First go through the list and put on any pinned groups @@ -917,7 +939,7 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, can_add_hw = 0; } } - hw_perf_restore(flags); + perf_enable(); out: spin_unlock(&ctx->lock); } @@ -955,7 +977,6 @@ int perf_counter_task_disable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; if (likely(!ctx->nr_counters)) return 0; @@ -969,7 +990,7 @@ int perf_counter_task_disable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state != PERF_COUNTER_STATE_ERROR) { @@ -978,7 +999,7 @@ int perf_counter_task_disable(void) } } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); @@ -991,7 +1012,6 @@ int perf_counter_task_enable(void) struct perf_counter_context *ctx = &curr->perf_counter_ctx; struct perf_counter *counter; unsigned long flags; - u64 perf_flags; int cpu; if (likely(!ctx->nr_counters)) @@ -1007,7 +1027,7 @@ int perf_counter_task_enable(void) /* * Disable all the counters: */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { if (counter->state > PERF_COUNTER_STATE_OFF) @@ -1017,7 +1037,7 @@ int perf_counter_task_enable(void) ctx->time - counter->total_time_enabled; counter->hw_event.disabled = 0; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); @@ -1034,7 +1054,6 @@ int perf_counter_task_enable(void) static void rotate_ctx(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 perf_flags; if (!ctx->nr_counters) return; @@ -1043,12 +1062,12 @@ static void rotate_ctx(struct perf_counter_context *ctx) /* * Rotate the first entry last (works just fine for group counters too): */ - perf_flags = hw_perf_save_disable(); + perf_disable(); list_for_each_entry(counter, &ctx->counter_list, list_entry) { list_move_tail(&counter->list_entry, &ctx->counter_list); break; } - hw_perf_restore(perf_flags); + perf_enable(); spin_unlock(&ctx->lock); } @@ -3194,7 +3213,6 @@ __perf_counter_exit_task(struct task_struct *child, } else { struct perf_cpu_context *cpuctx; unsigned long flags; - u64 perf_flags; /* * Disable and unlink this counter. @@ -3203,7 +3221,7 @@ __perf_counter_exit_task(struct task_struct *child, * could still be processing it: */ local_irq_save(flags); - perf_flags = hw_perf_save_disable(); + perf_disable(); cpuctx = &__get_cpu_var(perf_cpu_context); @@ -3214,7 +3232,7 @@ __perf_counter_exit_task(struct task_struct *child, child_ctx->nr_counters--; - hw_perf_restore(perf_flags); + perf_enable(); local_irq_restore(flags); } From a4016a79fcbd139e7378944c0d86a39fdbc70ecc Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 14 May 2009 14:52:17 +0200 Subject: [PATCH 1084/2061] perf_counter: x86: Robustify interrupt handling Two consecutive NMIs could daze and confuse the machine when the first would handle the overflow of both counters. [ Impact: fix false-positive syslog messages under multi-session profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 313638cecbb..1dcf67057f1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -783,6 +783,10 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; + + if (counter->hw_event.nmi != nmi) + goto next; + val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) goto next; @@ -869,7 +873,6 @@ perf_counter_nmi_handler(struct notifier_block *self, { struct die_args *args = __args; struct pt_regs *regs; - int ret; if (!atomic_read(&active_counters)) return NOTIFY_DONE; @@ -886,9 +889,16 @@ perf_counter_nmi_handler(struct notifier_block *self, regs = args->regs; apic_write(APIC_LVTPC, APIC_DM_NMI); - ret = x86_pmu.handle_irq(regs, 1); + /* + * Can't rely on the handled return value to say it was our NMI, two + * counters could trigger 'simultaneously' raising two back-to-back NMIs. + * + * If the first NMI handles both, the latter will be empty and daze + * the CPU. + */ + x86_pmu.handle_irq(regs, 1); - return ret ? NOTIFY_STOP : NOTIFY_OK; + return NOTIFY_STOP; } static __read_mostly struct notifier_block perf_counter_nmi_notifier = { From 1c80f4b598d9b075a2a0be694e28be93a6702bcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:25:22 +0200 Subject: [PATCH 1085/2061] perf_counter: x86: Disallow interval of 1 On certain CPUs i have observed a stuck PMU if interval was set to 1 and NMIs were used. The PMU had PMC0 set in MSR_CORE_PERF_GLOBAL_STATUS, but it was not possible to ack it via MSR_CORE_PERF_GLOBAL_OVF_CTRL, and the NMI loop got stuck infinitely. [ Impact: fix rare hangs during high perfcounter load ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 1dcf67057f1..46a82d1e4cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -473,6 +473,11 @@ x86_perf_counter_set_period(struct perf_counter *counter, left += period; atomic64_set(&hwc->period_left, left); } + /* + * Quirk: certain CPUs dont like it if just 1 event is left: + */ + if (unlikely(left < 2)) + left = 2; per_cpu(prev_left[idx], smp_processor_id()) = left; From 9029a5e3801f1cc7cdaab80169d82427acf928d8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 08:26:20 +0200 Subject: [PATCH 1086/2061] perf_counter: x86: Protect against infinite loops in intel_pmu_handle_irq() intel_pmu_handle_irq() can lock up in an infinite loop if the hardware does not allow the acking of irqs. Alas, this happened in testing so make this robust and emit a warning if it happens in the future. Also, clean up the IRQ handlers a bit. [ Impact: improve perfcounter irq/nmi handling robustness ] Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 46a82d1e4cb..5a7f718eb1e 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -722,9 +722,13 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) */ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int bit, cpu = smp_processor_id(); + struct cpu_hw_counters *cpuc; + struct cpu_hw_counters; + int bit, cpu, loops; u64 ack, status; - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); perf_disable(); status = intel_pmu_get_status(); @@ -733,7 +737,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) return 0; } + loops = 0; again: + if (++loops > 100) { + WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + return 1; + } + inc_irq_stat(apic_perf_irqs); ack = status; for_each_bit(bit, (unsigned long *)&status, X86_PMC_IDX_MAX) { @@ -765,13 +775,14 @@ again: static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu = smp_processor_id(); - struct cpu_hw_counters *cpuc = &per_cpu(cpu_hw_counters, cpu); - u64 val; - int handled = 0; + int cpu, idx, throttle = 0, handled = 0; + struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; - int idx, throttle = 0; + u64 val; + + cpu = smp_processor_id(); + cpuc = &per_cpu(cpu_hw_counters, cpu); if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { throttle = 1; From 251e8e3c7235f5944805a64f24c79fc4696793f1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 14 May 2009 05:16:59 +0200 Subject: [PATCH 1087/2061] perf_counter: Remove ACPI quirk We had a disable/enable around acpi_idle_do_entry() due to an erratum in an early prototype CPU i had access to. That erratum has been fixed in the BIOS so remove the quirk. The quirk also kept us from profiling interrupts that hit the ACPI idle instruction - so this is an improvement as well, beyond a cleanup and a micro-optimization. [ Impact: improve profiling scope, cleanup, micro-optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford LKML-Reference: Signed-off-by: Ingo Molnar --- drivers/acpi/processor_idle.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/acpi/processor_idle.c b/drivers/acpi/processor_idle.c index 9645758c047..f7ca8c55956 100644 --- a/drivers/acpi/processor_idle.c +++ b/drivers/acpi/processor_idle.c @@ -765,7 +765,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) { /* Don't trace irqs off for idle */ stop_critical_timings(); - perf_disable(); if (cx->entry_method == ACPI_CSTATE_FFH) { /* Call into architectural FFH based C-state */ acpi_processor_ffh_cstate_enter(cx); @@ -780,7 +779,6 @@ static inline void acpi_idle_do_entry(struct acpi_processor_cx *cx) gets asserted in time to freeze execution properly. */ unused = inl(acpi_gbl_FADT.xpm_timer_block.address); } - perf_enable(); start_critical_timings(); } From 58d7e993b16b62d30f8ef27757614056fe4def11 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 15 May 2009 11:03:23 +0200 Subject: [PATCH 1088/2061] perf stat: handle Ctrl-C Before this change, if a long-running perf stat workload was Ctrl-C-ed, the utility exited without displaying statistics. After the change, the Ctrl-C gets propagated into the workload (and causes its early exit there), but perf stat itself will still continue to run and will display counter results. This is useful to run open-ended workloads, let them run for a while, then Ctrl-C them to get the stats. [ Impact: extend perf stat with new functionality ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-stat.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index cf575c305a6..03518d75d86 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -538,8 +538,14 @@ static void process_options(int argc, char **argv) } } +static void skip_signal(int signo) +{ +} + int cmd_stat(int argc, char **argv, const char *prefix) { + sigset_t blocked; + page_size = sysconf(_SC_PAGE_SIZE); process_options(argc, argv); @@ -548,5 +554,15 @@ int cmd_stat(int argc, char **argv, const char *prefix) assert(nr_cpus <= MAX_NR_CPUS); assert(nr_cpus >= 0); + /* + * We dont want to block the signals - that would cause + * child tasks to inherit that and Ctrl-C would not work. + * What we want is for Ctrl-C to work in the exec()-ed + * task, but being ignored by perf stat itself: + */ + signal(SIGINT, skip_signal); + signal(SIGALRM, skip_signal); + signal(SIGABRT, skip_signal); + return do_perfstat(argc, argv); } From f1a11e0576c7a73d759d05d776692b2b2d37172b Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 5 May 2009 19:21:40 +0200 Subject: [PATCH 1089/2061] futex: remove the wait queue The waitqueue which is used in struct futex_q is a leftover from the futexfd implementation. There is no need to use a waitqueue at all, as the waiting task is the only user of it. The waitqueue just adds additional locking and a loop in the wake up path which both can be avoided. We have already a task reference in struct futex_q which is used for PI futexes. Use it for normal futexes as well and just wake up the task directly. The logic of signalling the futex wakeup via setting q->lock_ptr to NULL is kept with the difference that we set it NULL before doing the wakeup. This opens an exit race window vs. a non futex wake up of the to be woken up task, which we prevent with get_task_struct / put_task_struct on the waiter. [ Impact: simplification ] Signed-off-by: Thomas Gleixner --- kernel/futex.c | 60 ++++++++++++++++++++++---------------------------- 1 file changed, 26 insertions(+), 34 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index aec8bf89bf4..157bfcd725b 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -100,8 +100,8 @@ struct futex_pi_state { */ struct futex_q { struct plist_node list; - /* There can only be a single waiter */ - wait_queue_head_t waiter; + /* Waiter reference */ + struct task_struct *task; /* Which hash list lock to use: */ spinlock_t *lock_ptr; @@ -111,7 +111,6 @@ struct futex_q { /* Optional priority inheritance state: */ struct futex_pi_state *pi_state; - struct task_struct *task; /* rt_waiter storage for requeue_pi: */ struct rt_mutex_waiter *rt_waiter; @@ -694,22 +693,29 @@ retry: */ static void wake_futex(struct futex_q *q) { + struct task_struct *p = q->task; + + /* + * We set q->lock_ptr = NULL _before_ we wake up the task. If + * a non futex wake up happens on another CPU then the task + * might exit and p would dereference a non existing task + * struct. Prevent this by holding a reference on p across the + * wake up. + */ + get_task_struct(p); + plist_del(&q->list, &q->list.plist); /* - * The lock in wake_up_all() is a crucial memory barrier after the - * plist_del() and also before assigning to q->lock_ptr. - */ - wake_up(&q->waiter); - /* - * The waiting task can free the futex_q as soon as this is written, - * without taking any locks. This must come last. - * - * A memory barrier is required here to prevent the following store to - * lock_ptr from getting ahead of the wakeup. Clearing the lock at the - * end of wake_up() does not prevent this store from moving. + * The waiting task can free the futex_q as soon as + * q->lock_ptr = NULL is written, without taking any locks. A + * memory barrier is required here to prevent the following + * store to lock_ptr from getting ahead of the plist_del. */ smp_wmb(); q->lock_ptr = NULL; + + wake_up_state(p, TASK_NORMAL); + put_task_struct(p); } static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_q *this) @@ -1003,7 +1009,7 @@ void requeue_pi_wake_futex(struct futex_q *q, union futex_key *key) WARN_ON(!q->rt_waiter); q->rt_waiter = NULL; - wake_up(&q->waiter); + wake_up_state(q->task, TASK_NORMAL); } /** @@ -1280,8 +1286,6 @@ static inline struct futex_hash_bucket *queue_lock(struct futex_q *q) { struct futex_hash_bucket *hb; - init_waitqueue_head(&q->waiter); - get_futex_key_refs(&q->key); hb = hash_futex(&q->key); q->lock_ptr = &hb->lock; @@ -1575,11 +1579,9 @@ out: * @hb: the futex hash bucket, must be locked by the caller * @q: the futex_q to queue up on * @timeout: the prepared hrtimer_sleeper, or null for no timeout - * @wait: the wait_queue to add to the futex_q after queueing in the hb */ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, - struct hrtimer_sleeper *timeout, - wait_queue_t *wait) + struct hrtimer_sleeper *timeout) { queue_me(q, hb); @@ -1587,19 +1589,11 @@ static void futex_wait_queue_me(struct futex_hash_bucket *hb, struct futex_q *q, * There might have been scheduling since the queue_me(), as we * cannot hold a spinlock across the get_user() in case it * faults, and we cannot just set TASK_INTERRUPTIBLE state when - * queueing ourselves into the futex hash. This code thus has to + * queueing ourselves into the futex hash. This code thus has to * rely on the futex_wake() code removing us from hash when it * wakes us up. */ - - /* add_wait_queue is the barrier after __set_current_state. */ - __set_current_state(TASK_INTERRUPTIBLE); - - /* - * Add current as the futex_q waiter. We don't remove ourselves from - * the wait_queue because we are the only user of it. - */ - add_wait_queue(&q->waiter, wait); + set_current_state(TASK_INTERRUPTIBLE); /* Arm the timer */ if (timeout) { @@ -1704,7 +1698,6 @@ static int futex_wait(u32 __user *uaddr, int fshared, u32 val, ktime_t *abs_time, u32 bitset, int clockrt) { struct hrtimer_sleeper timeout, *to = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; struct futex_q q; @@ -1733,7 +1726,7 @@ static int futex_wait(u32 __user *uaddr, int fshared, goto out; /* queue_me and wait for wakeup, timeout, or a signal. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); /* If we were woken (and unqueued), we succeeded, whatever. */ ret = 0; @@ -2147,7 +2140,6 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - DECLARE_WAITQUEUE(wait, current); struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; @@ -2191,7 +2183,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, } /* Queue the futex_q, drop the hb lock, wait for wakeup. */ - futex_wait_queue_me(hb, &q, to, &wait); + futex_wait_queue_me(hb, &q, to); spin_lock(&hb->lock); ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); From 548e1ddf255b4ebfb4ef20c08936fd8d4deb3bd9 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:26 +0200 Subject: [PATCH 1090/2061] perf_counter: remove perf_disable/enable exports Now that ACPI idle doesn't use it anymore, remove the exports. [ Impact: remove dead code/data ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.429826617@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index e814ff04d7c..0173738dd54 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -90,14 +90,12 @@ void perf_disable(void) __perf_disable(); hw_perf_disable(); } -EXPORT_SYMBOL_GPL(perf_disable); /* ACPI idle */ void perf_enable(void) { if (__perf_enable()) hw_perf_enable(); } -EXPORT_SYMBOL_GPL(perf_enable); /* ACPI idle */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) From 789f90fcf6b0b54e655740e9396c954378542c79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:27 +0200 Subject: [PATCH 1091/2061] perf_counter: per user mlock gift Instead of a per-process mlock gift for perf-counters, use a per-user gift so that there is less of a DoS potential. [ Impact: allow less worst-case unprivileged memory consumption ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.496182835@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 4 ++++ kernel/perf_counter.c | 22 +++++++++++++++------- 2 files changed, 19 insertions(+), 7 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index d1857580a13..ff59d123151 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -674,6 +674,10 @@ struct user_struct { struct work_struct work; #endif #endif + +#ifdef CONFIG_PERF_COUNTERS + atomic_long_t locked_vm; +#endif }; extern int uids_sysfs_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0173738dd54..93f4a0e4b87 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -45,7 +45,7 @@ static atomic_t nr_munmap_tracking __read_mostly; static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ -int sysctl_perf_counter_mlock __read_mostly = 128; /* 'free' kb per counter */ +int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1522,6 +1522,9 @@ static void perf_mmap_close(struct vm_area_struct *vma) if (atomic_dec_and_mutex_lock(&counter->mmap_count, &counter->mmap_mutex)) { + struct user_struct *user = current_user(); + + atomic_long_sub(counter->data->nr_pages + 1, &user->locked_vm); vma->vm_mm->locked_vm -= counter->data->nr_locked; perf_mmap_data_free(counter); mutex_unlock(&counter->mmap_mutex); @@ -1537,11 +1540,13 @@ static struct vm_operations_struct perf_mmap_vmops = { static int perf_mmap(struct file *file, struct vm_area_struct *vma) { struct perf_counter *counter = file->private_data; + struct user_struct *user = current_user(); unsigned long vma_size; unsigned long nr_pages; + unsigned long user_locked, user_lock_limit; unsigned long locked, lock_limit; + long user_extra, extra; int ret = 0; - long extra; if (!(vma->vm_flags & VM_SHARED) || (vma->vm_flags & VM_WRITE)) return -EINVAL; @@ -1569,15 +1574,17 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; } - extra = nr_pages /* + 1 only account the data pages */; - extra -= sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); - if (extra < 0) - extra = 0; + user_extra = nr_pages + 1; + user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; - locked = vma->vm_mm->locked_vm + extra; + extra = 0; + if (user_locked > user_lock_limit) + extra = user_locked - user_lock_limit; lock_limit = current->signal->rlim[RLIMIT_MEMLOCK].rlim_cur; lock_limit >>= PAGE_SHIFT; + locked = vma->vm_mm->locked_vm + extra; if ((locked > lock_limit) && !capable(CAP_IPC_LOCK)) { ret = -EPERM; @@ -1590,6 +1597,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) goto unlock; atomic_set(&counter->mmap_count, 1); + atomic_long_add(user_extra, &user->locked_vm); vma->vm_mm->locked_vm += extra; counter->data->nr_locked = extra; unlock: From 60db5e09c13109b13830cc9dcae688003fd39e79 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:28 +0200 Subject: [PATCH 1092/2061] perf_counter: frequency based adaptive irq_period Instead of specifying the irq_period for a counter, provide a target interrupt frequency and dynamically adapt the irq_period to match this frequency. [ Impact: new perf-counter attribute/feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.646195868@chello.nl> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 13 +++--- arch/x86/kernel/cpu/perf_counter.c | 9 ++--- include/linux/perf_counter.h | 10 ++++- kernel/perf_counter.c | 63 ++++++++++++++++++++++++------ 4 files changed, 68 insertions(+), 27 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index bb1b463c136..db8d5cafc15 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -534,7 +534,7 @@ void hw_perf_enable(void) continue; } val = 0; - if (counter->hw_event.irq_period) { + if (counter->hw.irq_period) { left = atomic64_read(&counter->hw.period_left); if (left < 0x80000000L) val = 0x80000000L - left; @@ -829,8 +829,6 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) if (!ppmu) return ERR_PTR(-ENXIO); - if ((s64)counter->hw_event.irq_period < 0) - return ERR_PTR(-EINVAL); if (!perf_event_raw(&counter->hw_event)) { ev = perf_event_id(&counter->hw_event); if (ev >= ppmu->n_generic || ppmu->generic_events[ev] == 0) @@ -901,7 +899,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) counter->hw.config = events[n]; counter->hw.counter_base = cflags[n]; - atomic64_set(&counter->hw.period_left, counter->hw_event.irq_period); + atomic64_set(&counter->hw.period_left, counter->hw.irq_period); /* * See if we need to reserve the PMU. @@ -934,6 +932,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) static void record_and_restart(struct perf_counter *counter, long val, struct pt_regs *regs, int nmi) { + u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; @@ -948,11 +947,11 @@ static void record_and_restart(struct perf_counter *counter, long val, */ val = 0; left = atomic64_read(&counter->hw.period_left) - delta; - if (counter->hw_event.irq_period) { + if (period) { if (left <= 0) { - left += counter->hw_event.irq_period; + left += period; if (left <= 0) - left = counter->hw_event.irq_period; + left = period; record = 1; } if (left < 0x80000000L) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5a7f718eb1e..886dcf334bc 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,11 +286,8 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } - hwc->irq_period = hw_event->irq_period; - if ((s64)hwc->irq_period <= 0 || hwc->irq_period > x86_pmu.max_period) - hwc->irq_period = x86_pmu.max_period; - - atomic64_set(&hwc->period_left, hwc->irq_period); + atomic64_set(&hwc->period_left, + min(x86_pmu.max_period, hwc->irq_period)); /* * Raw event type provide the config in the event structure @@ -458,7 +455,7 @@ x86_perf_counter_set_period(struct perf_counter *counter, struct hw_perf_counter *hwc, int idx) { s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->irq_period; + s64 period = min(x86_pmu.max_period, hwc->irq_period); int err; /* diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e543ecc129f..004b6e162b9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -130,7 +130,11 @@ struct perf_counter_hw_event { */ __u64 config; - __u64 irq_period; + union { + __u64 irq_period; + __u64 irq_freq; + }; + __u32 record_type; __u32 read_format; @@ -146,8 +150,9 @@ struct perf_counter_hw_event { mmap : 1, /* include mmap data */ munmap : 1, /* include munmap data */ comm : 1, /* include comm data */ + freq : 1, /* use freq, not period */ - __reserved_1 : 52; + __reserved_1 : 51; __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ @@ -337,6 +342,7 @@ struct hw_perf_counter { atomic64_t prev_count; u64 irq_period; atomic64_t period_left; + u64 interrupts; #endif }; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 93f4a0e4b87..0ad1db4f3d6 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,6 +1046,38 @@ int perf_counter_task_enable(void) return 0; } +void perf_adjust_freq(struct perf_counter_context *ctx) +{ + struct perf_counter *counter; + u64 irq_period; + u64 events, period; + s64 delta; + + spin_lock(&ctx->lock); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter->state != PERF_COUNTER_STATE_ACTIVE) + continue; + + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) + continue; + + events = HZ * counter->hw.interrupts * counter->hw.irq_period; + period = div64_u64(events, counter->hw_event.irq_freq); + + delta = (s64)(1 + period - counter->hw.irq_period); + delta >>= 1; + + irq_period = counter->hw.irq_period + delta; + + if (!irq_period) + irq_period = 1; + + counter->hw.irq_period = irq_period; + counter->hw.interrupts = 0; + } + spin_unlock(&ctx->lock); +} + /* * Round-robin a context's counters: */ @@ -1081,6 +1113,9 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) cpuctx = &per_cpu(perf_cpu_context, cpu); ctx = &curr->perf_counter_ctx; + perf_adjust_freq(&cpuctx->ctx); + perf_adjust_freq(ctx); + perf_counter_cpu_sched_out(cpuctx); __perf_counter_task_sched_out(ctx); @@ -2382,6 +2417,8 @@ int perf_counter_overflow(struct perf_counter *counter, int events = atomic_read(&counter->event_limit); int ret = 0; + counter->hw.interrupts++; + /* * XXX event_limit might not quite work as expected on inherited * counters @@ -2450,6 +2487,7 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) enum hrtimer_restart ret = HRTIMER_RESTART; struct perf_counter *counter; struct pt_regs *regs; + u64 period; counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); counter->pmu->read(counter); @@ -2468,7 +2506,8 @@ static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) ret = HRTIMER_NORESTART; } - hrtimer_forward_now(hrtimer, ns_to_ktime(counter->hw.irq_period)); + period = max_t(u64, 10000, counter->hw.irq_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); return ret; } @@ -2629,8 +2668,9 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2679,8 +2719,9 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) hrtimer_init(&hwc->hrtimer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); hwc->hrtimer.function = perf_swcounter_hrtimer; if (hwc->irq_period) { + u64 period = max_t(u64, 10000, hwc->irq_period); __hrtimer_start_range_ns(&hwc->hrtimer, - ns_to_ktime(hwc->irq_period), 0, + ns_to_ktime(period), 0, HRTIMER_MODE_REL, 0); } @@ -2811,9 +2852,7 @@ static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) { - struct perf_counter_hw_event *hw_event = &counter->hw_event; const struct pmu *pmu = NULL; - struct hw_perf_counter *hwc = &counter->hw; /* * Software counters (currently) can't in general distinguish @@ -2826,8 +2865,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) case PERF_COUNT_CPU_CLOCK: pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_TASK_CLOCK: /* @@ -2839,8 +2876,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) else pmu = &perf_ops_cpu_clock; - if (hw_event->irq_period && hw_event->irq_period < 10000) - hw_event->irq_period = 10000; break; case PERF_COUNT_PAGE_FAULTS: case PERF_COUNT_PAGE_FAULTS_MIN: @@ -2854,9 +2889,6 @@ static const struct pmu *sw_perf_counter_init(struct perf_counter *counter) break; } - if (pmu) - hwc->irq_period = hw_event->irq_period; - return pmu; } @@ -2872,6 +2904,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, { const struct pmu *pmu; struct perf_counter *counter; + struct hw_perf_counter *hwc; long err; counter = kzalloc(sizeof(*counter), gfpflags); @@ -2907,6 +2940,12 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, pmu = NULL; + hwc = &counter->hw; + if (hw_event->freq && hw_event->irq_freq) + hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + else + hwc->irq_period = hw_event->irq_period; + /* * we currently do not support PERF_RECORD_GROUP on inherited counters */ From f5456a6b056b709282e87a68b4c1b81ac2e866fa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:19:29 +0200 Subject: [PATCH 1093/2061] perf top: update to use the new freq interface Provide perf top -F as alternative to -c. [ Impact: new 'perf top' feature ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <20090515132018.707922166@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-top.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index b1549dd2772..814b2e4925e 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -98,6 +98,7 @@ static unsigned int page_size; static unsigned int mmap_pages = 16; static int use_mmap = 0; static int use_munmap = 0; +static int freq = 0; static char *vmlinux; @@ -846,9 +847,10 @@ static void process_options(int argc, char **argv) {"stat", no_argument, NULL, 'S'}, {"vmlinux", required_argument, NULL, 'x'}, {"zero", no_argument, NULL, 'z'}, + {"freq", required_argument, NULL, 'F'}, {NULL, 0, NULL, 0 } }; - int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMU", + int c = getopt_long(argc, argv, "+:ac:C:d:De:f:g:hln:m:p:r:s:Sx:zMUF:", long_options, &option_index); if (c == -1) break; @@ -889,6 +891,7 @@ static void process_options(int argc, char **argv) case 'm': mmap_pages = atoi(optarg); break; case 'M': use_mmap = 1; break; case 'U': use_munmap = 1; break; + case 'F': freq = 1; default_interval = atoi(optarg); break; default: error = 1; break; } } @@ -1075,6 +1078,7 @@ int cmd_top(int argc, char **argv, const char *prefix) hw_event.nmi = nmi; hw_event.mmap = use_mmap; hw_event.munmap = use_munmap; + hw_event.freq = freq; fd[i][counter] = sys_perf_counter_open(&hw_event, tid, cpu, group_fd, 0); if (fd[i][counter] < 0) { From dce48a84adf1806676319f6f480e30a6daa012f9 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 11 Apr 2009 10:43:41 +0200 Subject: [PATCH 1094/2061] sched, timers: move calc_load() to scheduler Dimitri Sivanich noticed that xtime_lock is held write locked across calc_load() which iterates over all online CPUs. That can cause long latencies for xtime_lock readers on large SMP systems. The load average calculation is an rough estimate anyway so there is no real need to protect the readers vs. the update. It's not a problem when the avenrun array is updated while a reader copies the values. Instead of iterating over all online CPUs let the scheduler_tick code update the number of active tasks shortly before the avenrun update happens. The avenrun update itself is handled by the CPU which calls do_timer(). [ Impact: reduce xtime_lock write locked section ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- include/linux/sched.h | 2 +- kernel/sched.c | 86 ++++++++++++++++++++++++++++++++++----- kernel/sched_idletask.c | 3 +- kernel/time/timekeeping.c | 2 +- kernel/timer.c | 54 +----------------------- 5 files changed, 81 insertions(+), 66 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index b4c38bc8049..6eb4892efe4 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -135,8 +135,8 @@ DECLARE_PER_CPU(unsigned long, process_counts); extern int nr_processes(void); extern unsigned long nr_running(void); extern unsigned long nr_uninterruptible(void); -extern unsigned long nr_active(void); extern unsigned long nr_iowait(void); +extern void calc_global_load(void); extern unsigned long get_parent_ip(unsigned long addr); diff --git a/kernel/sched.c b/kernel/sched.c index 8908d190a34..f4eb88153bd 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -630,6 +630,10 @@ struct rq { struct list_head migration_queue; #endif + /* calc_load related fields */ + unsigned long calc_load_update; + long calc_load_active; + #ifdef CONFIG_SCHED_HRTICK #ifdef CONFIG_SMP int hrtick_csd_pending; @@ -1728,6 +1732,8 @@ static void cfs_rq_set_shares(struct cfs_rq *cfs_rq, unsigned long shares) } #endif +static void calc_load_account_active(struct rq *this_rq); + #include "sched_stats.h" #include "sched_idletask.c" #include "sched_fair.c" @@ -2856,19 +2862,57 @@ unsigned long nr_iowait(void) return sum; } -unsigned long nr_active(void) +/* Variables and functions for calc_load */ +static atomic_long_t calc_load_tasks; +static unsigned long calc_load_update; +unsigned long avenrun[3]; +EXPORT_SYMBOL(avenrun); + +static unsigned long +calc_load(unsigned long load, unsigned long exp, unsigned long active) { - unsigned long i, running = 0, uninterruptible = 0; + load *= exp; + load += active * (FIXED_1 - exp); + return load >> FSHIFT; +} - for_each_online_cpu(i) { - running += cpu_rq(i)->nr_running; - uninterruptible += cpu_rq(i)->nr_uninterruptible; +/* + * calc_load - update the avenrun load estimates 10 ticks after the + * CPUs have updated calc_load_tasks. + */ +void calc_global_load(void) +{ + unsigned long upd = calc_load_update + 10; + long active; + + if (time_before(jiffies, upd)) + return; + + active = atomic_long_read(&calc_load_tasks); + active = active > 0 ? active * FIXED_1 : 0; + + avenrun[0] = calc_load(avenrun[0], EXP_1, active); + avenrun[1] = calc_load(avenrun[1], EXP_5, active); + avenrun[2] = calc_load(avenrun[2], EXP_15, active); + + calc_load_update += LOAD_FREQ; +} + +/* + * Either called from update_cpu_load() or from a cpu going idle + */ +static void calc_load_account_active(struct rq *this_rq) +{ + long nr_active, delta; + + nr_active = this_rq->nr_running; + nr_active += (long) this_rq->nr_uninterruptible; + + if (nr_active != this_rq->calc_load_active) { + delta = nr_active - this_rq->calc_load_active; + this_rq->calc_load_active = nr_active; + atomic_long_add(delta, &calc_load_tasks); } - - if (unlikely((long)uninterruptible < 0)) - uninterruptible = 0; - - return running + uninterruptible; } /* @@ -2899,6 +2943,11 @@ static void update_cpu_load(struct rq *this_rq) new_load += scale-1; this_rq->cpu_load[i] = (old_load*(scale-1) + new_load) >> i; } + + if (time_after_eq(jiffies, this_rq->calc_load_update)) { + this_rq->calc_load_update += LOAD_FREQ; + calc_load_account_active(this_rq); + } } #ifdef CONFIG_SMP @@ -7091,6 +7140,14 @@ static void migrate_dead_tasks(unsigned int dead_cpu) } } + +/* + * remove the tasks which were accounted by rq from calc_load_tasks. + */ +static void calc_global_load_remove(struct rq *rq) +{ + atomic_long_sub(rq->calc_load_active, &calc_load_tasks); +} #endif /* CONFIG_HOTPLUG_CPU */ #if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) @@ -7325,6 +7382,8 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) /* Update our root-domain */ rq = cpu_rq(cpu); spin_lock_irqsave(&rq->lock, flags); + rq->calc_load_update = calc_load_update; + rq->calc_load_active = 0; if (rq->rd) { BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); @@ -7364,7 +7423,7 @@ migration_call(struct notifier_block *nfb, unsigned long action, void *hcpu) cpuset_unlock(); migrate_nr_uninterruptible(rq); BUG_ON(rq->nr_running != 0); - + calc_global_load_remove(rq); /* * No need to migrate the tasks: it was best-effort if * they didn't take sched_hotcpu_mutex. Just wake up @@ -9059,6 +9118,8 @@ void __init sched_init(void) rq = cpu_rq(i); spin_lock_init(&rq->lock); rq->nr_running = 0; + rq->calc_load_active = 0; + rq->calc_load_update = jiffies + LOAD_FREQ; init_cfs_rq(&rq->cfs, rq); init_rt_rq(&rq->rt, rq); #ifdef CONFIG_FAIR_GROUP_SCHED @@ -9166,6 +9227,9 @@ void __init sched_init(void) * when this runqueue becomes "idle". */ init_idle(current, smp_processor_id()); + + calc_load_update = jiffies + LOAD_FREQ; + /* * During early bootup we pretend to be a normal task: */ diff --git a/kernel/sched_idletask.c b/kernel/sched_idletask.c index 8a21a2e28c1..499672c10cb 100644 --- a/kernel/sched_idletask.c +++ b/kernel/sched_idletask.c @@ -22,7 +22,8 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int sy static struct task_struct *pick_next_task_idle(struct rq *rq) { schedstat_inc(rq, sched_goidle); - + /* adjust the active tasks as we might go into a long sleep */ + calc_load_account_active(rq); return rq->idle; } diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index 687dff49f6e..52a8bf8931f 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -22,7 +22,7 @@ /* * This read-write spinlock protects us from races in SMP while - * playing with xtime and avenrun. + * playing with xtime. */ __cacheline_aligned_in_smp DEFINE_SEQLOCK(xtime_lock); diff --git a/kernel/timer.c b/kernel/timer.c index cffffad01c3..6a21d7af962 100644 --- a/kernel/timer.c +++ b/kernel/timer.c @@ -1122,47 +1122,6 @@ void update_process_times(int user_tick) run_posix_cpu_timers(p); } -/* - * Nr of active tasks - counted in fixed-point numbers - */ -static unsigned long count_active_tasks(void) -{ - return nr_active() * FIXED_1; -} - -/* - * Hmm.. Changed this, as the GNU make sources (load.c) seems to - * imply that avenrun[] is the standard name for this kind of thing. - * Nothing else seems to be standardized: the fractional size etc - * all seem to differ on different machines. - * - * Requires xtime_lock to access. - */ -unsigned long avenrun[3]; - -EXPORT_SYMBOL(avenrun); - -/* - * calc_load - given tick count, update the avenrun load estimates. - * This is called while holding a write_lock on xtime_lock. - */ -static inline void calc_load(unsigned long ticks) -{ - unsigned long active_tasks; /* fixed-point */ - static int count = LOAD_FREQ; - - count -= ticks; - if (unlikely(count < 0)) { - active_tasks = count_active_tasks(); - do { - CALC_LOAD(avenrun[0], EXP_1, active_tasks); - CALC_LOAD(avenrun[1], EXP_5, active_tasks); - CALC_LOAD(avenrun[2], EXP_15, active_tasks); - count += LOAD_FREQ; - } while (count < 0); - } -} - /* * This function runs timers and the timer-tq in bottom half context. */ @@ -1186,16 +1145,6 @@ void run_local_timers(void) softlockup_tick(); } -/* - * Called by the timer interrupt. xtime_lock must already be taken - * by the timer IRQ! - */ -static inline void update_times(unsigned long ticks) -{ - update_wall_time(); - calc_load(ticks); -} - /* * The 64-bit jiffies value is not atomic - you MUST NOT read it * without sampling the sequence number in xtime_lock. @@ -1205,7 +1154,8 @@ static inline void update_times(unsigned long ticks) void do_timer(unsigned long ticks) { jiffies_64 += ticks; - update_times(ticks); + update_wall_time(); + calc_global_load(); } #ifdef __ARCH_WANT_SYS_ALARM From 2d02494f5a90f2e4b3c4c6acc85ec94674cdc431 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 2 May 2009 20:08:52 +0200 Subject: [PATCH 1095/2061] sched, timers: cleanup avenrun users avenrun is an rough estimate so we don't have to worry about consistency of the three avenrun values. Remove the xtime lock dependency and provide a function to scale the values. Cleanup the users. [ Impact: cleanup ] Signed-off-by: Thomas Gleixner Acked-by: Peter Zijlstra --- fs/proc/loadavg.c | 18 ++++++------------ include/linux/sched.h | 1 + kernel/sched.c | 15 +++++++++++++++ kernel/timer.c | 32 ++++++-------------------------- 4 files changed, 28 insertions(+), 38 deletions(-) diff --git a/fs/proc/loadavg.c b/fs/proc/loadavg.c index 9bca39cf99e..1afa4dd4cae 100644 --- a/fs/proc/loadavg.c +++ b/fs/proc/loadavg.c @@ -12,20 +12,14 @@ static int loadavg_proc_show(struct seq_file *m, void *v) { - int a, b, c; - unsigned long seq; + unsigned long avnrun[3]; - do { - seq = read_seqbegin(&xtime_lock); - a = avenrun[0] + (FIXED_1/200); - b = avenrun[1] + (FIXED_1/200); - c = avenrun[2] + (FIXED_1/200); - } while (read_seqretry(&xtime_lock, seq)); + get_avenrun(avnrun, FIXED_1/200, 0); - seq_printf(m, "%d.%02d %d.%02d %d.%02d %ld/%d %d\n", - LOAD_INT(a), LOAD_FRAC(a), - LOAD_INT(b), LOAD_FRAC(b), - LOAD_INT(c), LOAD_FRAC(c), + seq_printf(m, "%lu.%02lu %lu.%02lu %lu.%02lu %ld/%d %d\n", + LOAD_INT(avnrun[0]), LOAD_FRAC(avnrun[0]), + LOAD_INT(avnrun[1]), LOAD_FRAC(avnrun[1]), + LOAD_INT(avnrun[2]), LOAD_FRAC(avnrun[2]), nr_running(), nr_threads, task_active_pid_ns(current)->last_pid); return 0; diff --git a/include/linux/sched.h b/include/linux/sched.h index 6eb4892efe4..de7b3b21777 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -116,6 +116,7 @@ struct fs_struct; * 11 bit fractions. */ extern unsigned long avenrun[]; /* Load averages */ +extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift); #define FSHIFT 11 /* nr of bits of precision */ #define FIXED_1 (1<uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - /* - * This is annoying. The below is the same thing - * posix_get_clock_monotonic() does, but it wants to - * take the lock which we want to cover the loads stuff - * too. - */ + get_avenrun(info->loads, 0, SI_LOAD_SHIFT - FSHIFT); - getnstimeofday(&tp); - tp.tv_sec += wall_to_monotonic.tv_sec; - tp.tv_nsec += wall_to_monotonic.tv_nsec; - monotonic_to_bootbased(&tp); - if (tp.tv_nsec - NSEC_PER_SEC >= 0) { - tp.tv_nsec = tp.tv_nsec - NSEC_PER_SEC; - tp.tv_sec++; - } - info->uptime = tp.tv_sec + (tp.tv_nsec ? 1 : 0); - - info->loads[0] = avenrun[0] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[1] = avenrun[1] << (SI_LOAD_SHIFT - FSHIFT); - info->loads[2] = avenrun[2] << (SI_LOAD_SHIFT - FSHIFT); - - info->procs = nr_threads; - } while (read_seqretry(&xtime_lock, seq)); + info->procs = nr_threads; si_meminfo(info); si_swapinfo(info); From 2e569d36729c8105ae066a9b105068305442cc77 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 15:37:47 +0200 Subject: [PATCH 1096/2061] perf_counter: frequency based adaptive irq_period, 32-bit fix fix: kernel/built-in.o: In function `perf_counter_alloc': perf_counter.c:(.text+0x7ddc7): undefined reference to `__udivdi3' [ Impact: build fix on 32-bit systems ] Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: <1242394667.6642.1887.camel@laptop> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0ad1db4f3d6..728a595399b 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2942,7 +2942,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, hwc = &counter->hw; if (hw_event->freq && hw_event->irq_freq) - hwc->irq_period = TICK_NSEC / hw_event->irq_freq; + hwc->irq_period = div64_u64(TICK_NSEC, hw_event->irq_freq); else hwc->irq_period = hw_event->irq_period; From ef923214a4816c289e4af2d67a9ebb1a31e4ac61 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 13:29:14 +1000 Subject: [PATCH 1097/2061] perf_counter: powerpc: use u64 for event codes internally Although the perf_counter API allows 63-bit raw event codes, internally in the powerpc back-end we had been using 32-bit event codes. This expands them to 64 bits so that we can add bits for specifying threshold start/stop events and instruction sampling modes later. This also corrects the return value of can_go_on_limited_pmc; we were returning an event code rather than just a 0/1 value in some circumstances. That didn't particularly matter while event codes were 32-bit, but now that event codes are 64-bit it might, so this fixes it. [ Impact: extend PowerPC perfcounter interfaces from u32 to u64 ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18955.36874.472452.353104@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 10 +++++----- arch/powerpc/kernel/perf_counter.c | 26 ++++++++++++------------- arch/powerpc/kernel/power4-pmu.c | 9 ++++----- arch/powerpc/kernel/power5+-pmu.c | 14 ++++++------- arch/powerpc/kernel/power5-pmu.c | 16 +++++++-------- arch/powerpc/kernel/power6-pmu.c | 16 +++++++-------- arch/powerpc/kernel/ppc970-pmu.c | 9 ++++----- 7 files changed, 48 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index 56d66c38143..ceea76a48e3 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -23,13 +23,13 @@ struct power_pmu { int max_alternatives; u64 add_fields; u64 test_adder; - int (*compute_mmcr)(unsigned int events[], int n_ev, + int (*compute_mmcr)(u64 events[], int n_ev, unsigned int hwc[], u64 mmcr[]); - int (*get_constraint)(unsigned int event, u64 *mskp, u64 *valp); - int (*get_alternatives)(unsigned int event, unsigned int flags, - unsigned int alt[]); + int (*get_constraint)(u64 event, u64 *mskp, u64 *valp); + int (*get_alternatives)(u64 event, unsigned int flags, + u64 alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); - int (*limited_pmc_event)(unsigned int event); + int (*limited_pmc_event)(u64 event); int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ int n_generic; int *generic_events; diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index db8d5cafc15..8d4cafc84b8 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -26,7 +26,7 @@ struct cpu_hw_counters { int n_limited; u8 pmcs_enabled; struct perf_counter *counter[MAX_HWCOUNTERS]; - unsigned int events[MAX_HWCOUNTERS]; + u64 events[MAX_HWCOUNTERS]; unsigned int flags[MAX_HWCOUNTERS]; u64 mmcr[3]; struct perf_counter *limited_counter[MAX_LIMITED_HWCOUNTERS]; @@ -131,11 +131,11 @@ static void write_pmc(int idx, unsigned long val) * and see if any combination of alternative codes is feasible. * The feasible set is returned in event[]. */ -static int power_check_constraints(unsigned int event[], unsigned int cflags[], +static int power_check_constraints(u64 event[], unsigned int cflags[], int n_ev) { u64 mask, value, nv; - unsigned int alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; + u64 alternatives[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 amasks[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 avalues[MAX_HWCOUNTERS][MAX_EVENT_ALTERNATIVES]; u64 smasks[MAX_HWCOUNTERS], svalues[MAX_HWCOUNTERS]; @@ -564,7 +564,7 @@ void hw_perf_enable(void) } static int collect_events(struct perf_counter *group, int max_count, - struct perf_counter *ctrs[], unsigned int *events, + struct perf_counter *ctrs[], u64 *events, unsigned int *flags) { int n = 0; @@ -752,11 +752,11 @@ struct pmu power_pmu = { * that a limited PMC can count, doesn't require interrupts, and * doesn't exclude any processor mode. */ -static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, +static int can_go_on_limited_pmc(struct perf_counter *counter, u64 ev, unsigned int flags) { int n; - unsigned int alt[MAX_EVENT_ALTERNATIVES]; + u64 alt[MAX_EVENT_ALTERNATIVES]; if (counter->hw_event.exclude_user || counter->hw_event.exclude_kernel @@ -776,10 +776,8 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, flags |= PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD; n = ppmu->get_alternatives(ev, flags, alt); - if (n) - return alt[0]; - return 0; + return n > 0; } /* @@ -787,10 +785,9 @@ static int can_go_on_limited_pmc(struct perf_counter *counter, unsigned int ev, * and return the event code, or 0 if there is no such alternative. * (Note: event code 0 is "don't count" on all machines.) */ -static unsigned long normal_pmc_alternative(unsigned long ev, - unsigned long flags) +static u64 normal_pmc_alternative(u64 ev, unsigned long flags) { - unsigned int alt[MAX_EVENT_ALTERNATIVES]; + u64 alt[MAX_EVENT_ALTERNATIVES]; int n; flags &= ~(PPMU_LIMITED_PMC_OK | PPMU_LIMITED_PMC_REQD); @@ -820,9 +817,10 @@ static void hw_perf_counter_destroy(struct perf_counter *counter) const struct pmu *hw_perf_counter_init(struct perf_counter *counter) { - unsigned long ev, flags; + u64 ev; + unsigned long flags; struct perf_counter *ctrs[MAX_HWCOUNTERS]; - unsigned int events[MAX_HWCOUNTERS]; + u64 events[MAX_HWCOUNTERS]; unsigned int cflags[MAX_HWCOUNTERS]; int n; int err; diff --git a/arch/powerpc/kernel/power4-pmu.c b/arch/powerpc/kernel/power4-pmu.c index 744a2756958..836fa118eb1 100644 --- a/arch/powerpc/kernel/power4-pmu.c +++ b/arch/powerpc/kernel/power4-pmu.c @@ -213,7 +213,7 @@ static unsigned char direct_marked_event[8] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int p4_marked_instr_event(unsigned int event) +static int p4_marked_instr_event(u64 event) { int pmc, psel, unit, byte, bit; unsigned int mask; @@ -249,7 +249,7 @@ static int p4_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int p4_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p4_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, lower, sh; u64 mask = 0, value = 0; @@ -320,8 +320,7 @@ static unsigned int ppc_inst_cmpl[] = { 0x1001, 0x4001, 0x6001, 0x7001, 0x8001 }; -static int p4_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p4_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, na; @@ -353,7 +352,7 @@ static int p4_get_alternatives(unsigned int event, unsigned int flags, return na; } -static int p4_compute_mmcr(unsigned int event[], int n_ev, +static int p4_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 8154eaa2404..3ac0654372a 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -135,7 +135,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_GRS] = { 0x0e00000000ull, 0x0c40000000ull }, }; -static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int power5p_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh; int bit, fmask; @@ -188,7 +188,7 @@ static int power5p_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int power5p_limited_pmc_event(unsigned int event) +static int power5p_limited_pmc_event(u64 event) { int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; @@ -273,11 +273,11 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5p_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int power5p_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { - int i, j, ae, nalt = 1; + int i, j, nalt = 1; int nlim; + u64 ae; alt[0] = event; nalt = 1; @@ -402,7 +402,7 @@ static unsigned char direct_event_is_marked[0x28] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int power5p_marked_instr_event(unsigned int event) +static int power5p_marked_instr_event(u64 event) { int pmc, psel; int bit, byte, unit; @@ -451,7 +451,7 @@ static int power5p_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int power5p_compute_mmcr(unsigned int event[], int n_ev, +static int power5p_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; diff --git a/arch/powerpc/kernel/power5-pmu.c b/arch/powerpc/kernel/power5-pmu.c index 6e667dc8647..d5344968ee9 100644 --- a/arch/powerpc/kernel/power5-pmu.c +++ b/arch/powerpc/kernel/power5-pmu.c @@ -139,7 +139,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_GRS] = { 0x30002000000000ull, 0x30000400000000ull }, }; -static int power5_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int power5_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh; int bit, fmask; @@ -224,7 +224,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { * Scan the alternatives table for a match and return the * index into the alternatives table if found, else -1. */ -static int find_alternative(unsigned int event) +static int find_alternative(u64 event) { int i, j; @@ -250,7 +250,7 @@ static const unsigned char bytedecode_alternatives[4][4] = { * PMCSEL values on other counters. This returns the alternative * event code for those that do, or -1 otherwise. */ -static int find_alternative_bdecode(unsigned int event) +static u64 find_alternative_bdecode(u64 event) { int pmc, altpmc, pp, j; @@ -269,10 +269,10 @@ static int find_alternative_bdecode(unsigned int event) return -1; } -static int power5_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int power5_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { - int i, j, ae, nalt = 1; + int i, j, nalt = 1; + u64 ae; alt[0] = event; nalt = 1; @@ -338,7 +338,7 @@ static unsigned char direct_event_is_marked[0x28] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int power5_marked_instr_event(unsigned int event) +static int power5_marked_instr_event(u64 event) { int pmc, psel; int bit, byte, unit; @@ -382,7 +382,7 @@ static int power5_marked_instr_event(unsigned int event) return (mask >> (byte * 8 + bit)) & 1; } -static int power5_compute_mmcr(unsigned int event[], int n_ev, +static int power5_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index d44049f0ae2..ab7c615c458 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -134,7 +134,7 @@ static u32 marked_bus_events[16] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int power6_marked_instr_event(unsigned int event) +static int power6_marked_instr_event(u64 event) { int pmc, psel, ptype; int bit, byte, unit; @@ -172,7 +172,7 @@ static int power6_marked_instr_event(unsigned int event) /* * Assign PMC numbers and compute MMCR1 value for a set of events */ -static int p6_compute_mmcr(unsigned int event[], int n_ev, +static int p6_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr1 = 0; @@ -265,7 +265,7 @@ static int p6_compute_mmcr(unsigned int event[], int n_ev, * 20-23, 24-27, 28-31 ditto for bytes 1, 2, 3 * 32-34 select field: nest (subunit) event selector */ -static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p6_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, sh, subunit; u64 mask = 0, value = 0; @@ -298,7 +298,7 @@ static int p6_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p6_limited_pmc_event(unsigned int event) +static int p6_limited_pmc_event(u64 event) { int pmc = (event >> PM_PMC_SH) & PM_PMC_MSK; @@ -337,7 +337,7 @@ static const unsigned int event_alternatives[][MAX_ALT] = { * This could be made more efficient with a binary search on * a presorted list, if necessary */ -static int find_alternatives_list(unsigned int event) +static int find_alternatives_list(u64 event) { int i, j; unsigned int alt; @@ -356,12 +356,12 @@ static int find_alternatives_list(unsigned int event) return -1; } -static int p6_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p6_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { int i, j, nlim; - unsigned int aevent, psel, pmc; + unsigned int psel, pmc; unsigned int nalt = 1; + u64 aevent; alt[0] = event; nlim = p6_limited_pmc_event(event); diff --git a/arch/powerpc/kernel/ppc970-pmu.c b/arch/powerpc/kernel/ppc970-pmu.c index af2d1884058..eed47c4523f 100644 --- a/arch/powerpc/kernel/ppc970-pmu.c +++ b/arch/powerpc/kernel/ppc970-pmu.c @@ -147,7 +147,7 @@ static unsigned char direct_marked_event[8] = { * Returns 1 if event counts things relating to marked instructions * and thus needs the MMCRA_SAMPLE_ENABLE bit set, or 0 if not. */ -static int p970_marked_instr_event(unsigned int event) +static int p970_marked_instr_event(u64 event) { int pmc, psel, unit, byte, bit; unsigned int mask; @@ -192,7 +192,7 @@ static u64 unit_cons[PM_LASTUNIT+1][2] = { [PM_STS] = { 0x380000000000ull, 0x310000000000ull }, }; -static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) +static int p970_get_constraint(u64 event, u64 *maskp, u64 *valp) { int pmc, byte, unit, sh, spcsel; u64 mask = 0, value = 0; @@ -243,8 +243,7 @@ static int p970_get_constraint(unsigned int event, u64 *maskp, u64 *valp) return 0; } -static int p970_get_alternatives(unsigned int event, unsigned int flags, - unsigned int alt[]) +static int p970_get_alternatives(u64 event, unsigned int flags, u64 alt[]) { alt[0] = event; @@ -257,7 +256,7 @@ static int p970_get_alternatives(unsigned int event, unsigned int flags, return 1; } -static int p970_compute_mmcr(unsigned int event[], int n_ev, +static int p970_compute_mmcr(u64 event[], int n_ev, unsigned int hwc[], u64 mmcr[]) { u64 mmcr0 = 0, mmcr1 = 0, mmcra = 0; From 9d23a90a67261e73b2fcac04d8ca963c6b496afb Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 21:48:08 +1000 Subject: [PATCH 1098/2061] perf_counter: allow arch to supply event misc flags and instruction pointer At present the values we put in overflow events for the misc flags indicating processor mode and the instruction pointer are obtained using the standard user_mode() and instruction_pointer() functions. Those functions tell you where the performance monitor interrupt was taken, which might not be exactly where the counter overflow occurred, for example because interrupts were disabled at the point where the overflow occurred, or because the processor had many instructions in flight and chose to complete some more instructions beyond the one that caused the counter overflow. Some architectures (e.g. powerpc) can supply more precise information about where the counter overflow occurred and the processor mode at that point. This introduces new functions, perf_misc_flags() and perf_instruction_pointer(), which arch code can override to provide more precise information if available. They have default implementations which are identical to the existing code. This also adds a new misc flag value, PERF_EVENT_MISC_HYPERVISOR, for the case where a counter overflow occurred in the hypervisor. We encode the processor mode in the 2 bits previously used to indicate user or kernel mode; the values for user and kernel mode are unchanged and hypervisor mode is indicated by both bits being set. [ Impact: generalize perfcounter core facilities ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18956.1272.818511.561835@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 11 ++++++++++- kernel/perf_counter.c | 5 ++--- 2 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 004b6e162b9..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -215,8 +215,11 @@ struct perf_counter_mmap_page { __u32 data_head; /* head in the data section */ }; +#define PERF_EVENT_MISC_CPUMODE_MASK (3 << 0) +#define PERF_EVENT_MISC_CPUMODE_UNKNOWN (0 << 0) #define PERF_EVENT_MISC_KERNEL (1 << 0) -#define PERF_EVENT_MISC_USER (1 << 1) +#define PERF_EVENT_MISC_USER (2 << 0) +#define PERF_EVENT_MISC_HYPERVISOR (3 << 0) #define PERF_EVENT_MISC_OVERFLOW (1 << 2) struct perf_event_header { @@ -596,6 +599,12 @@ extern int sysctl_perf_counter_mlock; extern void perf_counter_init(void); +#ifndef perf_misc_flags +#define perf_misc_flags(regs) (user_mode(regs) ? PERF_EVENT_MISC_USER : \ + PERF_EVENT_MISC_KERNEL) +#define perf_instruction_pointer(regs) instruction_pointer(regs) +#endif + #else static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 728a595399b..57840a94b16 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2042,11 +2042,10 @@ static void perf_counter_output(struct perf_counter *counter, header.size = sizeof(header); header.misc = PERF_EVENT_MISC_OVERFLOW; - header.misc |= user_mode(regs) ? - PERF_EVENT_MISC_USER : PERF_EVENT_MISC_KERNEL; + header.misc |= perf_misc_flags(regs); if (record_type & PERF_RECORD_IP) { - ip = instruction_pointer(regs); + ip = perf_instruction_pointer(regs); header.type |= PERF_RECORD_IP; header.size += sizeof(ip); } From 0bbd0d4be8d5d3676c126e06e3c75c16def00441 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Thu, 14 May 2009 13:31:48 +1000 Subject: [PATCH 1099/2061] perf_counter: powerpc: supply more precise information on counter overflow events This uses values from the MMCRA, SIAR and SDAR registers on powerpc to supply more precise information for overflow events, including a data address when PERF_RECORD_ADDR is specified. Since POWER6 uses different bit positions in MMCRA from earlier processors, this converts the struct power_pmu limited_pmc5_6 field, which only had 0/1 values, into a flags field and defines bit values for its previous use (PPMU_LIMITED_PMC5_6) and a new flag (PPMU_ALT_SIPR) to indicate that the processor uses the POWER6 bit positions rather than the earlier positions. It also adds definitions in reg.h for the new and old positions of the bit that indicates that the SIAR and SDAR values come from the same instruction. For the data address, the SDAR value is supplied if we are not doing instruction sampling. In that case there is no guarantee that the address given in the PERF_RECORD_ADDR subrecord will correspond to the instruction whose address is given in the PERF_RECORD_IP subrecord. If instruction sampling is enabled (e.g. because this counter is counting a marked instruction event), then we only supply the SDAR value for the PERF_RECORD_ADDR subrecord if it corresponds to the instruction whose address is in the PERF_RECORD_IP subrecord. Otherwise we supply 0. [ Impact: support more PMU hardware features on PowerPC ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo LKML-Reference: <18955.37028.48861.555309@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/include/asm/perf_counter.h | 14 ++++- arch/powerpc/include/asm/reg.h | 2 + arch/powerpc/kernel/perf_counter.c | 84 +++++++++++++++++++++++-- arch/powerpc/kernel/power5+-pmu.c | 2 +- arch/powerpc/kernel/power6-pmu.c | 2 +- 5 files changed, 97 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/perf_counter.h b/arch/powerpc/include/asm/perf_counter.h index ceea76a48e3..1c60f0ca792 100644 --- a/arch/powerpc/include/asm/perf_counter.h +++ b/arch/powerpc/include/asm/perf_counter.h @@ -30,13 +30,19 @@ struct power_pmu { u64 alt[]); void (*disable_pmc)(unsigned int pmc, u64 mmcr[]); int (*limited_pmc_event)(u64 event); - int limited_pmc5_6; /* PMC5 and PMC6 have limited function */ + u32 flags; int n_generic; int *generic_events; }; extern struct power_pmu *ppmu; +/* + * Values for power_pmu.flags + */ +#define PPMU_LIMITED_PMC5_6 1 /* PMC5/6 have limited function */ +#define PPMU_ALT_SIPR 2 /* uses alternate posn for SIPR/HV */ + /* * Values for flags to get_alternatives() */ @@ -44,6 +50,12 @@ extern struct power_pmu *ppmu; #define PPMU_LIMITED_PMC_REQD 2 /* have to put this on a limited PMC */ #define PPMU_ONLY_COUNT_RUN 4 /* only counting in run state */ +struct pt_regs; +extern unsigned long perf_misc_flags(struct pt_regs *regs); +#define perf_misc_flags(regs) perf_misc_flags(regs) + +extern unsigned long perf_instruction_pointer(struct pt_regs *regs); + /* * The power_pmu.get_constraint function returns a 64-bit value and * a 64-bit mask that express the constraints between this event and diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index e8018d540e8..fb359b0a693 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -492,11 +492,13 @@ #define MMCR0_FCHV 0x00000001UL /* freeze conditions in hypervisor mode */ #define SPRN_MMCR1 798 #define SPRN_MMCRA 0x312 +#define MMCRA_SDSYNC 0x80000000UL /* SDAR synced with SIAR */ #define MMCRA_SIHV 0x10000000UL /* state of MSR HV when SIAR set */ #define MMCRA_SIPR 0x08000000UL /* state of MSR PR when SIAR set */ #define MMCRA_SLOT 0x07000000UL /* SLOT bits (37-39) */ #define MMCRA_SLOT_SHIFT 24 #define MMCRA_SAMPLE_ENABLE 0x00000001UL /* enable sampling */ +#define POWER6_MMCRA_SDSYNC 0x0000080000000000ULL /* SDAR/SIAR synced */ #define POWER6_MMCRA_SIHV 0x0000040000000000ULL #define POWER6_MMCRA_SIPR 0x0000020000000000ULL #define POWER6_MMCRA_THRM 0x00000020UL diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 8d4cafc84b8..6baae5a5c33 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -17,6 +17,7 @@ #include #include #include +#include struct cpu_hw_counters { int n_counters; @@ -310,7 +311,8 @@ static void power_pmu_read(struct perf_counter *counter) */ static int is_limited_pmc(int pmcnum) { - return ppmu->limited_pmc5_6 && (pmcnum == 5 || pmcnum == 6); + return (ppmu->flags & PPMU_LIMITED_PMC5_6) + && (pmcnum == 5 || pmcnum == 6); } static void freeze_limited_counters(struct cpu_hw_counters *cpuhw, @@ -860,7 +862,7 @@ const struct pmu *hw_perf_counter_init(struct perf_counter *counter) * If this machine has limited counters, check whether this * event could go on a limited counter. */ - if (ppmu->limited_pmc5_6) { + if (ppmu->flags & PPMU_LIMITED_PMC5_6) { if (can_go_on_limited_pmc(counter, ev, flags)) { flags |= PPMU_LIMITED_PMC_OK; } else if (ppmu->limited_pmc_event(ev)) { @@ -933,6 +935,7 @@ static void record_and_restart(struct perf_counter *counter, long val, u64 period = counter->hw.irq_period; s64 prev, delta, left; int record = 0; + u64 addr, mmcra, sdsync; /* we don't have to worry about interrupts here */ prev = atomic64_read(&counter->hw.prev_count); @@ -963,8 +966,76 @@ static void record_and_restart(struct perf_counter *counter, long val, /* * Finally record data if requested. */ - if (record) - perf_counter_overflow(counter, nmi, regs, 0); + if (record) { + addr = 0; + if (counter->hw_event.record_type & PERF_RECORD_ADDR) { + /* + * The user wants a data address recorded. + * If we're not doing instruction sampling, + * give them the SDAR (sampled data address). + * If we are doing instruction sampling, then only + * give them the SDAR if it corresponds to the + * instruction pointed to by SIAR; this is indicated + * by the [POWER6_]MMCRA_SDSYNC bit in MMCRA. + */ + mmcra = regs->dsisr; + sdsync = (ppmu->flags & PPMU_ALT_SIPR) ? + POWER6_MMCRA_SDSYNC : MMCRA_SDSYNC; + if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) + addr = mfspr(SPRN_SDAR); + } + perf_counter_overflow(counter, nmi, regs, addr); + } +} + +/* + * Called from generic code to get the misc flags (i.e. processor mode) + * for an event. + */ +unsigned long perf_misc_flags(struct pt_regs *regs) +{ + unsigned long mmcra; + + if (TRAP(regs) != 0xf00) { + /* not a PMU interrupt */ + return user_mode(regs) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; + } + + mmcra = regs->dsisr; + if (ppmu->flags & PPMU_ALT_SIPR) { + if (mmcra & POWER6_MMCRA_SIHV) + return PERF_EVENT_MISC_HYPERVISOR; + return (mmcra & POWER6_MMCRA_SIPR) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; + } + if (mmcra & MMCRA_SIHV) + return PERF_EVENT_MISC_HYPERVISOR; + return (mmcra & MMCRA_SIPR) ? PERF_EVENT_MISC_USER : + PERF_EVENT_MISC_KERNEL; +} + +/* + * Called from generic code to get the instruction pointer + * for an event. + */ +unsigned long perf_instruction_pointer(struct pt_regs *regs) +{ + unsigned long mmcra; + unsigned long ip; + unsigned long slot; + + if (TRAP(regs) != 0xf00) + return regs->nip; /* not a PMU interrupt */ + + ip = mfspr(SPRN_SIAR); + mmcra = regs->dsisr; + if ((mmcra & MMCRA_SAMPLE_ENABLE) && !(ppmu->flags & PPMU_ALT_SIPR)) { + slot = (mmcra & MMCRA_SLOT) >> MMCRA_SLOT_SHIFT; + if (slot > 1) + ip += 4 * (slot - 1); + } + return ip; } /* @@ -983,6 +1054,11 @@ static void perf_counter_interrupt(struct pt_regs *regs) freeze_limited_counters(cpuhw, mfspr(SPRN_PMC5), mfspr(SPRN_PMC6)); + /* + * Overload regs->dsisr to store MMCRA so we only need to read it once. + */ + regs->dsisr = mfspr(SPRN_MMCRA); + /* * If interrupts were soft-disabled when this PMU interrupt * occurred, treat it as an NMI. diff --git a/arch/powerpc/kernel/power5+-pmu.c b/arch/powerpc/kernel/power5+-pmu.c index 3ac0654372a..c6cdfc165d6 100644 --- a/arch/powerpc/kernel/power5+-pmu.c +++ b/arch/powerpc/kernel/power5+-pmu.c @@ -625,6 +625,6 @@ struct power_pmu power5p_pmu = { .disable_pmc = power5p_disable_pmc, .n_generic = ARRAY_SIZE(power5p_generic_events), .generic_events = power5p_generic_events, - .limited_pmc5_6 = 1, + .flags = PPMU_LIMITED_PMC5_6, .limited_pmc_event = power5p_limited_pmc_event, }; diff --git a/arch/powerpc/kernel/power6-pmu.c b/arch/powerpc/kernel/power6-pmu.c index ab7c615c458..cd4fbe06c35 100644 --- a/arch/powerpc/kernel/power6-pmu.c +++ b/arch/powerpc/kernel/power6-pmu.c @@ -485,6 +485,6 @@ struct power_pmu power6_pmu = { .disable_pmc = p6_disable_pmc, .n_generic = ARRAY_SIZE(power6_generic_events), .generic_events = power6_generic_events, - .limited_pmc5_6 = 1, + .flags = PPMU_LIMITED_PMC5_6 | PPMU_ALT_SIPR, .limited_pmc_event = p6_limited_pmc_event, }; From d9bcc01d58d18ed287091707b0b45c6ac888a11a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:06:12 +0530 Subject: [PATCH 1100/2061] x86, mtrr: replace MTRRcap_MSR with msr-index's MSR_MTRRcap Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/main.c | 2 +- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 0b776c09aff..de9c20ffa0d 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -306,7 +306,7 @@ void __init get_mtrr_state(void) vrs = mtrr_state.var_ranges; - rdmsr(MTRRcap_MSR, lo, dummy); + rdmsr(MSR_MTRRcap, lo, dummy); mtrr_state.have_fixed = (lo >> 8) & 1; for (i = 0; i < num_var_ranges; i++) @@ -703,7 +703,7 @@ int generic_validate_add_page(unsigned long base, unsigned long size, unsigned i static int generic_have_wrcomb(void) { unsigned long config, dummy; - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); return (config & (1 << 10)); } diff --git a/arch/x86/kernel/cpu/mtrr/main.c b/arch/x86/kernel/cpu/mtrr/main.c index 03cda01f57c..8fc248b5aea 100644 --- a/arch/x86/kernel/cpu/mtrr/main.c +++ b/arch/x86/kernel/cpu/mtrr/main.c @@ -104,7 +104,7 @@ static void __init set_num_var_ranges(void) unsigned long config = 0, dummy; if (use_intel()) { - rdmsr(MTRRcap_MSR, config, dummy); + rdmsr(MSR_MTRRcap, config, dummy); } else if (is_cpu(AMD)) config = 2; else if (is_cpu(CYRIX) || is_cpu(CENTAUR)) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 77f67f7b347..5d37fb14523 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,7 +5,6 @@ #include #include -#define MTRRcap_MSR 0x0fe #define MTRRdefType_MSR 0x2ff #define MTRRfix64K_00000_MSR 0x250 From a036c7a358cc9d7ed28a188480b9a4d709e09b24 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:10:43 +0530 Subject: [PATCH 1101/2061] x86, mtrr: replace MTRRfix64K_00000_MSR with msr-index's MSR_MTRRfix64K_00000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index de9c20ffa0d..8b115c0e590 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -20,7 +20,7 @@ struct fixed_range_block { }; static struct fixed_range_block fixed_range_blocks[] = { - { MTRRfix64K_00000_MSR, 1 }, /* one 64k MTRR */ + { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} @@ -194,7 +194,7 @@ get_fixed_ranges(mtrr_type * frs) k8_check_syscfg_dram_mod_en(); - rdmsr(MTRRfix64K_00000_MSR, p[0], p[1]); + rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5d37fb14523..7f23caede71 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix64K_00000_MSR 0x250 #define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 From 7d9d55e449089df8463bca2045d702ae6cda64a2 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:15:32 +0530 Subject: [PATCH 1102/2061] x86, mtrr: replace MTRRfix16K_80000_MSR with msr-index's MSR_MTRRfix16K_80000 Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 8b115c0e590..00437c2e8dd 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -21,7 +21,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ - { MTRRfix16K_80000_MSR, 2 }, /* two 16k MTRRs */ + { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ {} }; @@ -197,7 +197,7 @@ get_fixed_ranges(mtrr_type * frs) rdmsr(MSR_MTRRfix64K_00000, p[0], p[1]); for (i = 0; i < 2; i++) - rdmsr(MTRRfix16K_80000_MSR + i, p[2 + i * 2], p[3 + i * 2]); + rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); } diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 7f23caede71..712b60524e6 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,7 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_80000_MSR 0x258 #define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 #define MTRRfix4K_C8000_MSR 0x269 From 654ac05801ae806661c8fdeb3b5c420a31cbc5b1 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:21:54 +0530 Subject: [PATCH 1103/2061] x86, mtrr: remove mtrr MSRs double declaration Removed MTRR MSR from mtrr/mtrr.h as these are already declared in msr-index.h and nobody is using them: MTRRfix16K_A0000_MSR MTRRfix4K_C8000_MSR MTRRfix4K_D0000_MSR MTRRfix4K_D8000_MSR MTRRfix4K_E0000_MSR MTRRfix4K_E8000_MSR MTRRfix4K_F0000_MSR MTRRfix4K_F8000_MSR Use standard msr-index.h's MSR declaration and no need to declare again [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/mtrr.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 712b60524e6..5053793f912 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,15 +7,7 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix16K_A0000_MSR 0x259 #define MTRRfix4K_C0000_MSR 0x268 -#define MTRRfix4K_C8000_MSR 0x269 -#define MTRRfix4K_D0000_MSR 0x26a -#define MTRRfix4K_D8000_MSR 0x26b -#define MTRRfix4K_E0000_MSR 0x26c -#define MTRRfix4K_E8000_MSR 0x26d -#define MTRRfix4K_F0000_MSR 0x26e -#define MTRRfix4K_F8000_MSR 0x26f #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 From ba5673ff1ff5f428256db4cedd4b05b7be008bb6 Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:29:25 +0530 Subject: [PATCH 1104/2061] x86, mtrr: replace MTRRfix4K_C0000_MSR with msr-index's MSR_MTRRfix4K_C0000 Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/generic.c | 4 ++-- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 00437c2e8dd..3cf58e26534 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -22,7 +22,7 @@ struct fixed_range_block { static struct fixed_range_block fixed_range_blocks[] = { { MSR_MTRRfix64K_00000, 1 }, /* one 64k MTRR */ { MSR_MTRRfix16K_80000, 2 }, /* two 16k MTRRs */ - { MTRRfix4K_C0000_MSR, 8 }, /* eight 4k MTRRs */ + { MSR_MTRRfix4K_C0000, 8 }, /* eight 4k MTRRs */ {} }; @@ -199,7 +199,7 @@ get_fixed_ranges(mtrr_type * frs) for (i = 0; i < 2; i++) rdmsr(MSR_MTRRfix16K_80000 + i, p[2 + i * 2], p[3 + i * 2]); for (i = 0; i < 8; i++) - rdmsr(MTRRfix4K_C0000_MSR + i, p[6 + i * 2], p[7 + i * 2]); + rdmsr(MSR_MTRRfix4K_C0000 + i, p[6 + i * 2], p[7 + i * 2]); } void mtrr_save_fixed_ranges(void *info) diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index 5053793f912..e5ee686d2c3 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -7,8 +7,6 @@ #define MTRRdefType_MSR 0x2ff -#define MTRRfix4K_C0000_MSR 0x268 - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 From 52650257ea06bb15c2e2bbe854cbdf463726141a Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Thu, 14 May 2009 12:35:46 +0530 Subject: [PATCH 1105/2061] x86, mtrr: replace MTRRdefType_MSR with msr-index's MSR_MTRRdefType Use standard msr-index.h's MSR declaration and no need to declare again. [ Impact: cleanup, no object code change ] Signed-off-by: Jaswinder Singh Rajput Signed-off-by: H. Peter Anvin --- arch/x86/kernel/cpu/mtrr/cleanup.c | 4 ++-- arch/x86/kernel/cpu/mtrr/generic.c | 8 ++++---- arch/x86/kernel/cpu/mtrr/mtrr.h | 2 -- arch/x86/kernel/cpu/mtrr/state.c | 6 +++--- 4 files changed, 9 insertions(+), 11 deletions(-) diff --git a/arch/x86/kernel/cpu/mtrr/cleanup.c b/arch/x86/kernel/cpu/mtrr/cleanup.c index ce0fe4b5c04..1d584a18a50 100644 --- a/arch/x86/kernel/cpu/mtrr/cleanup.c +++ b/arch/x86/kernel/cpu/mtrr/cleanup.c @@ -808,7 +808,7 @@ int __init mtrr_cleanup(unsigned address_bits) if (!is_cpu(INTEL) || enable_mtrr_cleanup < 1) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; @@ -1003,7 +1003,7 @@ int __init mtrr_trim_uncached_memory(unsigned long end_pfn) */ if (!is_cpu(INTEL) || disable_mtrr_trim) return 0; - rdmsr(MTRRdefType_MSR, def, dummy); + rdmsr(MSR_MTRRdefType, def, dummy); def &= 0xff; if (def != MTRR_TYPE_UNCACHABLE) return 0; diff --git a/arch/x86/kernel/cpu/mtrr/generic.c b/arch/x86/kernel/cpu/mtrr/generic.c index 3cf58e26534..e930a311770 100644 --- a/arch/x86/kernel/cpu/mtrr/generic.c +++ b/arch/x86/kernel/cpu/mtrr/generic.c @@ -314,7 +314,7 @@ void __init get_mtrr_state(void) if (mtrr_state.have_fixed) get_fixed_ranges(mtrr_state.fixed_ranges); - rdmsr(MTRRdefType_MSR, lo, dummy); + rdmsr(MSR_MTRRdefType, lo, dummy); mtrr_state.def_type = (lo & 0xff); mtrr_state.enabled = (lo & 0xc00) >> 10; @@ -579,10 +579,10 @@ static void prepare_set(void) __acquires(set_atomicity_lock) __flush_tlb(); /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + rdmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo & ~0xcff, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo & ~0xcff, deftype_hi); } static void post_set(void) __releases(set_atomicity_lock) @@ -591,7 +591,7 @@ static void post_set(void) __releases(set_atomicity_lock) __flush_tlb(); /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, deftype_lo, deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, deftype_lo, deftype_hi); /* Enable caches */ write_cr0(read_cr0() & 0xbfffffff); diff --git a/arch/x86/kernel/cpu/mtrr/mtrr.h b/arch/x86/kernel/cpu/mtrr/mtrr.h index e5ee686d2c3..7538b767f20 100644 --- a/arch/x86/kernel/cpu/mtrr/mtrr.h +++ b/arch/x86/kernel/cpu/mtrr/mtrr.h @@ -5,8 +5,6 @@ #include #include -#define MTRRdefType_MSR 0x2ff - #define MTRR_CHANGE_MASK_FIXED 0x01 #define MTRR_CHANGE_MASK_VARIABLE 0x02 #define MTRR_CHANGE_MASK_DEFTYPE 0x04 diff --git a/arch/x86/kernel/cpu/mtrr/state.c b/arch/x86/kernel/cpu/mtrr/state.c index 7f7e2753685..1f5fb1588d1 100644 --- a/arch/x86/kernel/cpu/mtrr/state.c +++ b/arch/x86/kernel/cpu/mtrr/state.c @@ -35,7 +35,7 @@ void set_mtrr_prepare_save(struct set_mtrr_context *ctxt) if (use_intel()) /* Save MTRR state */ - rdmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + rdmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else were excluded at the top */ ctxt->ccr3 = getCx86(CX86_CCR3); @@ -46,7 +46,7 @@ void set_mtrr_cache_disable(struct set_mtrr_context *ctxt) { if (use_intel()) /* Disable MTRRs, and set the default type to uncached */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo & 0xf300UL, + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo & 0xf300UL, ctxt->deftype_hi); else if (is_cpu(CYRIX)) /* Cyrix ARRs - everything else were excluded at the top */ @@ -64,7 +64,7 @@ void set_mtrr_done(struct set_mtrr_context *ctxt) /* Restore MTRRdefType */ if (use_intel()) /* Intel (P6) standard MTRRs */ - mtrr_wrmsr(MTRRdefType_MSR, ctxt->deftype_lo, ctxt->deftype_hi); + mtrr_wrmsr(MSR_MTRRdefType, ctxt->deftype_lo, ctxt->deftype_hi); else /* Cyrix ARRs - everything else was excluded at the top */ setCx86(CX86_CCR3, ctxt->ccr3); From 8bc2095951517e2c74b8aeeca4685ddd6b16ed4b Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: [PATCH 1106/2061] perf_counter: Fix inheritance cleanup code Clean up code that open-coded the list_{add,del}_counter() code in __perf_counter_exit_task() which consequently diverged. This could lead to software counter crashes. Also, fold the ctx->nr_counter inc/dec into those functions and clean up some of the related code. [ Impact: fix potential sw counter crash, cleanup ] Signed-off-by: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 57840a94b16..59a926d04ba 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -115,6 +115,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) } list_add_rcu(&counter->event_entry, &ctx->event_list); + ctx->nr_counters++; } static void @@ -122,6 +123,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + ctx->nr_counters--; + list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -209,7 +212,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - ctx->nr_counters--; /* * Protect the list operation against NMI by disabling the @@ -276,7 +278,6 @@ retry: * succeed. */ if (!list_empty(&counter->list_entry)) { - ctx->nr_counters--; list_del_counter(counter, ctx); counter->task = NULL; } @@ -544,7 +545,6 @@ static void add_counter_to_ctx(struct perf_counter *counter, struct perf_counter_context *ctx) { list_add_counter(counter, ctx); - ctx->nr_counters++; counter->prev_state = PERF_COUNTER_STATE_OFF; counter->tstamp_enabled = ctx->time; counter->tstamp_running = ctx->time; @@ -3206,9 +3206,8 @@ static int inherit_group(struct perf_counter *parent_counter, static void sync_child_counter(struct perf_counter *child_counter, struct perf_counter *parent_counter) { - u64 parent_val, child_val; + u64 child_val; - parent_val = atomic64_read(&parent_counter->count); child_val = atomic64_read(&child_counter->count); /* @@ -3240,7 +3239,6 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter_context *child_ctx) { struct perf_counter *parent_counter; - struct perf_counter *sub, *tmp; /* * If we do not self-reap then we have to wait for the @@ -3252,8 +3250,8 @@ __perf_counter_exit_task(struct task_struct *child, */ if (child != current) { wait_task_inactive(child, 0); - list_del_init(&child_counter->list_entry); update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); } else { struct perf_cpu_context *cpuctx; unsigned long flags; @@ -3272,9 +3270,7 @@ __perf_counter_exit_task(struct task_struct *child, group_sched_out(child_counter, cpuctx, child_ctx); update_counter_times(child_counter); - list_del_init(&child_counter->list_entry); - - child_ctx->nr_counters--; + list_del_counter(child_counter, child_ctx); perf_enable(); local_irq_restore(flags); @@ -3288,13 +3284,6 @@ __perf_counter_exit_task(struct task_struct *child, */ if (parent_counter) { sync_child_counter(child_counter, parent_counter); - list_for_each_entry_safe(sub, tmp, &child_counter->sibling_list, - list_entry) { - if (sub->parent) { - sync_child_counter(sub, sub->parent); - free_counter(sub); - } - } free_counter(child_counter); } } @@ -3315,9 +3304,18 @@ void perf_counter_exit_task(struct task_struct *child) if (likely(!child_ctx->nr_counters)) return; +again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) __perf_counter_exit_task(child, child_counter, child_ctx); + + /* + * If the last counter was a group counter, it will have appended all + * its siblings to the list, but we obtained 'tmp' before that which + * will still point to the list head terminating the iteration. + */ + if (!list_empty(&child_ctx->counter_list)) + goto again; } /* From 856d56b9e5de650a64a6c41c17aaed702b55d578 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 15 May 2009 20:45:59 +0200 Subject: [PATCH 1107/2061] perf_counter: Fix counter inheritance Srivatsa Vaddagiri reported that a Java workload triggers this warning in kernel/exit.c: WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); Add the inherited counter propagation on self-detach, this could cause counter leaks and incomplete stats in threaded code like the below: #include #include void *thread(void *arg) { sleep(5); return NULL; } void main(void) { pthread_t thr; pthread_create(&thr, NULL, thread, NULL); } Reported-by: Srivatsa Vaddagiri Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/exit.c b/kernel/exit.c index 4741376c8de..16d74f13a3e 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,6 +128,12 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + __unhash_process(tsk); /* From 0203026b58b4299ba7281c0b4b417207c1f05d0e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: [PATCH 1108/2061] perf_counter: fix threaded task exit Flushing counters in __exit_signal() with irqs disabled is not a good idea as perf_counter_exit_task() acquires mutexes. So flush it before acquiring the tasklist lock. (Note, we still need a fix for when the PID has been unhashed.) [ Impact: fix crash with inherited counters ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- kernel/exit.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 16d74f13a3e..73affd35e76 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -128,12 +128,6 @@ static void __exit_signal(struct task_struct *tsk) sig = NULL; /* Marker for below. */ } - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(tsk); - __unhash_process(tsk); /* @@ -183,6 +177,13 @@ repeat: atomic_dec(&__task_cred(p)->user->processes); proc_flush_task(p); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(p); + write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); From d2517a49d55536b38c7a87e5289550cfedaa4dcc Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 10:04:45 +0200 Subject: [PATCH 1109/2061] perf_counter, x86: fix zero irq_period counters The quirk to irq_period unearthed an unrobustness we had in the hw_counter initialization sequence: we left irq_period at 0, which was then quirked up to 2 ... which then generated a _lot_ of interrupts during 'perf stat' runs, slowed them down and skewed the counter results in general. Initialize irq_period to the maximum instead. [ Impact: fix perf stat results ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 886dcf334bc..5bfd30ab392 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -286,6 +286,9 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->nmi = 1; } + if (!hwc->irq_period) + hwc->irq_period = x86_pmu.max_period; + atomic64_set(&hwc->period_left, min(x86_pmu.max_period, hwc->irq_period)); From e8e7526c3c0863be25ab03a0871ee0978de5ba50 Mon Sep 17 00:00:00 2001 From: Mark de Wever Date: Sun, 17 May 2009 17:22:53 +0200 Subject: [PATCH 1110/2061] ide-tape: fix debug call This error only occurs when IDETAPE_DEBUG_LOG is enabled. Signed-off-by: Mark de Wever Signed-off-by: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- drivers/ide/ide-tape.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index 8dfc68892d6..203bbeac182 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -617,7 +617,7 @@ static ide_startstop_t idetape_do_request(ide_drive_t *drive, struct ide_cmd cmd; u8 stat; - debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n" + debug_log(DBG_SENSE, "sector: %llu, nr_sectors: %lu\n", (unsigned long long)rq->sector, rq->nr_sectors); if (!(blk_special_request(rq) || blk_sense_request(rq))) { From 6fd058f7791087648c683eb8572edf3be3c4c23c Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 15:38:01 -0400 Subject: [PATCH 1111/2061] ext4: Add a comprehensive block validity check to ext4_get_blocks() To catch filesystem bugs or corruption which could lead to the filesystem getting severly damaged, this patch adds a facility for tracking all of the filesystem metadata blocks by contiguous regions in a red-black tree. This allows quick searching of the tree to locate extents which might overlap with filesystem metadata blocks. This facility is also used by the multi-block allocator to assure that it is not allocating blocks out of the system zone, as well as by the routines used when reading indirect blocks and extents information from disk to make sure their contents are valid. Signed-off-by: "Theodore Ts'o" --- fs/ext4/Makefile | 4 +- fs/ext4/block_validity.c | 244 +++++++++++++++++++++++++++++++++++++++ fs/ext4/ext4.h | 11 ++ fs/ext4/extents.c | 22 +--- fs/ext4/inode.c | 47 ++++++-- fs/ext4/mballoc.c | 11 +- fs/ext4/super.c | 32 ++++- 7 files changed, 332 insertions(+), 39 deletions(-) create mode 100644 fs/ext4/block_validity.c diff --git a/fs/ext4/Makefile b/fs/ext4/Makefile index a8ff003a00f..8a34710ecf4 100644 --- a/fs/ext4/Makefile +++ b/fs/ext4/Makefile @@ -5,8 +5,8 @@ obj-$(CONFIG_EXT4_FS) += ext4.o ext4-y := balloc.o bitmap.o dir.o file.o fsync.o ialloc.o inode.o \ - ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ - ext4_jbd2.o migrate.o mballoc.o + ioctl.o namei.o super.o symlink.o hash.o resize.o extents.o \ + ext4_jbd2.o migrate.o mballoc.o block_validity.o ext4-$(CONFIG_EXT4_FS_XATTR) += xattr.o xattr_user.o xattr_trusted.o ext4-$(CONFIG_EXT4_FS_POSIX_ACL) += acl.o diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c new file mode 100644 index 00000000000..50784ef0756 --- /dev/null +++ b/fs/ext4/block_validity.c @@ -0,0 +1,244 @@ +/* + * linux/fs/ext4/block_validity.c + * + * Copyright (C) 2009 + * Theodore Ts'o (tytso@mit.edu) + * + * Track which blocks in the filesystem are metadata blocks that + * should never be used as data blocks by files or directories. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "ext4.h" + +struct ext4_system_zone { + struct rb_node node; + ext4_fsblk_t start_blk; + unsigned int count; +}; + +static struct kmem_cache *ext4_system_zone_cachep; + +int __init init_ext4_system_zone(void) +{ + ext4_system_zone_cachep = KMEM_CACHE(ext4_system_zone, + SLAB_RECLAIM_ACCOUNT); + if (ext4_system_zone_cachep == NULL) + return -ENOMEM; + return 0; +} + +void exit_ext4_system_zone(void) +{ + kmem_cache_destroy(ext4_system_zone_cachep); +} + +static inline int can_merge(struct ext4_system_zone *entry1, + struct ext4_system_zone *entry2) +{ + if ((entry1->start_blk + entry1->count) == entry2->start_blk) + return 1; + return 0; +} + +/* + * Mark a range of blocks as belonging to the "system zone" --- that + * is, filesystem metadata blocks which should never be used by + * inodes. + */ +static int add_system_zone(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *new_entry = NULL, *entry; + struct rb_node **n = &sbi->system_blks.rb_node, *node; + struct rb_node *parent = NULL, *new_node = NULL; + + while (*n) { + parent = *n; + entry = rb_entry(parent, struct ext4_system_zone, node); + if (start_blk < entry->start_blk) + n = &(*n)->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = &(*n)->rb_right; + else { + if (start_blk + count > (entry->start_blk + + entry->count)) + entry->count = (start_blk + count - + entry->start_blk); + new_node = *n; + new_entry = rb_entry(new_node, struct ext4_system_zone, + node); + break; + } + } + + if (!new_entry) { + new_entry = kmem_cache_alloc(ext4_system_zone_cachep, + GFP_KERNEL); + if (!new_entry) + return -ENOMEM; + new_entry->start_blk = start_blk; + new_entry->count = count; + new_node = &new_entry->node; + + rb_link_node(new_node, parent, n); + rb_insert_color(new_node, &sbi->system_blks); + } + + /* Can we merge to the left? */ + node = rb_prev(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(entry, new_entry)) { + new_entry->start_blk = entry->start_blk; + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + + /* Can we merge to the right? */ + node = rb_next(new_node); + if (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + if (can_merge(new_entry, entry)) { + new_entry->count += entry->count; + rb_erase(node, &sbi->system_blks); + kmem_cache_free(ext4_system_zone_cachep, entry); + } + } + return 0; +} + +static void debug_print_tree(struct ext4_sb_info *sbi) +{ + struct rb_node *node; + struct ext4_system_zone *entry; + int first = 1; + + printk(KERN_INFO "System zones: "); + node = rb_first(&sbi->system_blks); + while (node) { + entry = rb_entry(node, struct ext4_system_zone, node); + printk("%s%llu-%llu", first ? "" : ", ", + entry->start_blk, entry->start_blk + entry->count - 1); + first = 0; + node = rb_next(node); + } + printk("\n"); +} + +int ext4_setup_system_zone(struct super_block *sb) +{ + ext4_group_t ngroups = ext4_get_groups_count(sb); + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_group_desc *gdp; + ext4_group_t i; + int flex_size = ext4_flex_bg_size(sbi); + int ret; + + if (!test_opt(sb, BLOCK_VALIDITY)) { + if (EXT4_SB(sb)->system_blks.rb_node) + ext4_release_system_zone(sb); + return 0; + } + if (EXT4_SB(sb)->system_blks.rb_node) + return 0; + + for (i=0; i < ngroups; i++) { + if (ext4_bg_has_super(sb, i) && + ((i < 5) || ((i % flex_size) == 0))) + add_system_zone(sbi, ext4_group_first_block_no(sb, i), + sbi->s_gdb_count + 1); + gdp = ext4_get_group_desc(sb, i, NULL); + ret = add_system_zone(sbi, ext4_block_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_bitmap(sb, gdp), 1); + if (ret) + return ret; + ret = add_system_zone(sbi, ext4_inode_table(sb, gdp), + sbi->s_itb_per_group); + if (ret) + return ret; + } + + if (test_opt(sb, DEBUG)) + debug_print_tree(EXT4_SB(sb)); + return 0; +} + +/* Called when the filesystem is unmounted */ +void ext4_release_system_zone(struct super_block *sb) +{ + struct rb_node *n = EXT4_SB(sb)->system_blks.rb_node; + struct rb_node *parent; + struct ext4_system_zone *entry; + + while (n) { + /* Do the node's children first */ + if (n->rb_left) { + n = n->rb_left; + continue; + } + if (n->rb_right) { + n = n->rb_right; + continue; + } + /* + * The node has no children; free it, and then zero + * out parent's link to it. Finally go to the + * beginning of the loop and try to free the parent + * node. + */ + parent = rb_parent(n); + entry = rb_entry(n, struct ext4_system_zone, node); + kmem_cache_free(ext4_system_zone_cachep, entry); + if (!parent) + EXT4_SB(sb)->system_blks.rb_node = NULL; + else if (parent->rb_left == n) + parent->rb_left = NULL; + else if (parent->rb_right == n) + parent->rb_right = NULL; + n = parent; + } + EXT4_SB(sb)->system_blks.rb_node = NULL; +} + +/* + * Returns 1 if the passed-in block region (start_blk, + * start_blk+count) is valid; 0 if some part of the block region + * overlaps with filesystem metadata blocks. + */ +int ext4_data_block_valid(struct ext4_sb_info *sbi, ext4_fsblk_t start_blk, + unsigned int count) +{ + struct ext4_system_zone *entry; + struct rb_node *n = sbi->system_blks.rb_node; + + if ((start_blk <= le32_to_cpu(sbi->s_es->s_first_data_block)) || + (start_blk + count > ext4_blocks_count(sbi->s_es))) + return 0; + while (n) { + entry = rb_entry(n, struct ext4_system_zone, node); + if (start_blk + count - 1 < entry->start_blk) + n = n->rb_left; + else if (start_blk >= (entry->start_blk + entry->count)) + n = n->rb_right; + else + return 0; + } + return 1; +} + diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index d164f1294e5..4311cc85b53 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -696,6 +696,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_I_VERSION 0x2000000 /* i_version support */ #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ +#define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */ #ifndef _LINUX_EXT2_FS_H @@ -887,6 +888,7 @@ struct ext4_sb_info { int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ + struct rb_root system_blks; #ifdef EXTENTS_STATS /* ext4 extents stats */ @@ -1618,6 +1620,15 @@ extern struct dentry *ext4_get_parent(struct dentry *child); extern const struct inode_operations ext4_symlink_inode_operations; extern const struct inode_operations ext4_fast_symlink_inode_operations; +/* block_validity */ +extern void ext4_release_system_zone(struct super_block *sb); +extern int ext4_setup_system_zone(struct super_block *sb); +extern int __init init_ext4_system_zone(void); +extern void exit_ext4_system_zone(void); +extern int ext4_data_block_valid(struct ext4_sb_info *sbi, + ext4_fsblk_t start_blk, + unsigned int count); + /* extents.c */ extern int ext4_ext_tree_init(handle_t *handle, struct inode *); extern int ext4_ext_writepage_trans_blocks(struct inode *, int); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 27c383c7b43..d04b779b780 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -326,32 +326,18 @@ ext4_ext_max_entries(struct inode *inode, int depth) static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext) { - ext4_fsblk_t block = ext_pblock(ext), valid_block; + ext4_fsblk_t block = ext_pblock(ext); int len = ext4_ext_get_actual_len(ext); - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - ((block + len) > ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len); } static int ext4_valid_extent_idx(struct inode *inode, struct ext4_extent_idx *ext_idx) { - ext4_fsblk_t block = idx_pblock(ext_idx), valid_block; - struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; + ext4_fsblk_t block = idx_pblock(ext_idx); - valid_block = le32_to_cpu(es->s_first_data_block) + - EXT4_SB(inode->i_sb)->s_gdb_count; - if (unlikely(block <= valid_block || - (block >= ext4_blocks_count(es)))) - return 0; - else - return 1; + return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, 1); } static int ext4_valid_extent_entries(struct inode *inode, diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index d7b7480682b..dadd3f995db 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -372,20 +372,21 @@ static int ext4_block_to_path(struct inode *inode, } static int __ext4_check_blockref(const char *function, struct inode *inode, - __le32 *p, unsigned int max) { - - unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es); + __le32 *p, unsigned int max) +{ __le32 *bref = p; + unsigned int blk; + while (bref < p+max) { - if (unlikely(le32_to_cpu(*bref) >= maxblocks)) { + blk = le32_to_cpu(*bref++); + if (blk && + unlikely(!ext4_data_block_valid(EXT4_SB(inode->i_sb), + blk, 1))) { ext4_error(inode->i_sb, function, - "block reference %u >= max (%u) " - "in inode #%lu, offset=%d", - le32_to_cpu(*bref), maxblocks, - inode->i_ino, (int)(bref-p)); + "invalid block reference %u " + "in inode #%lu", blk, inode->i_ino); return -EIO; } - bref++; } return 0; } @@ -1125,6 +1126,21 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used) ext4_discard_preallocations(inode); } +static int check_block_validity(struct inode *inode, sector_t logical, + sector_t phys, int len) +{ + if (!ext4_data_block_valid(EXT4_SB(inode->i_sb), phys, len)) { + ext4_error(inode->i_sb, "check_block_validity", + "inode #%lu logical block %llu mapped to %llu " + "(size %d)", inode->i_ino, + (unsigned long long) logical, + (unsigned long long) phys, len); + WARN_ON(1); + return -EIO; + } + return 0; +} + /* * The ext4_get_blocks() function tries to look up the requested blocks, * and returns if the blocks are already mapped. @@ -1170,6 +1186,13 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, } up_read((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } + /* If it is only a block(s) look up */ if ((flags & EXT4_GET_BLOCKS_CREATE) == 0) return retval; @@ -1245,6 +1268,12 @@ int ext4_get_blocks(handle_t *handle, struct inode *inode, sector_t block, ext4_da_update_reserve_space(inode, retval); up_write((&EXT4_I(inode)->i_data_sem)); + if (retval > 0 && buffer_mapped(bh)) { + int ret = check_block_validity(inode, block, + bh->b_blocknr, retval); + if (ret != 0) + return ret; + } return retval; } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 541bd9adffa..ed8482e22c0 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -2961,15 +2961,10 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac, + le32_to_cpu(es->s_first_data_block); len = ac->ac_b_ex.fe_len; - if (in_range(ext4_block_bitmap(sb, gdp), block, len) || - in_range(ext4_inode_bitmap(sb, gdp), block, len) || - in_range(block, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group) || - in_range(block + len - 1, ext4_inode_table(sb, gdp), - EXT4_SB(sb)->s_itb_per_group)) { + if (!ext4_data_block_valid(sbi, block, len)) { ext4_error(sb, __func__, - "Allocating block %llu in system zone of %d group\n", - block, ac->ac_b_ex.fe_group); + "Allocating blocks %llu-%llu which overlap " + "fs metadata\n", block, block+len); /* File system mounted not to panic on error * Fix the bitmap and repeat the block allocation * We leak some of the blocks here. diff --git a/fs/ext4/super.c b/fs/ext4/super.c index dc34ed3d132..600b7ad699b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -568,6 +568,7 @@ static void ext4_put_super(struct super_block *sb) struct ext4_super_block *es = sbi->s_es; int i, err; + ext4_release_system_zone(sb); ext4_mb_release(sb); ext4_ext_release(sb); ext4_xattr_put_super(sb); @@ -1055,6 +1056,7 @@ enum { Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version, Opt_stripe, Opt_delalloc, Opt_nodelalloc, + Opt_block_validity, Opt_noblock_validity, Opt_inode_readahead_blks, Opt_journal_ioprio }; @@ -1114,6 +1116,8 @@ static const match_table_t tokens = { {Opt_resize, "resize"}, {Opt_delalloc, "delalloc"}, {Opt_nodelalloc, "nodelalloc"}, + {Opt_block_validity, "block_validity"}, + {Opt_noblock_validity, "noblock_validity"}, {Opt_inode_readahead_blks, "inode_readahead_blks=%u"}, {Opt_journal_ioprio, "journal_ioprio=%u"}, {Opt_auto_da_alloc, "auto_da_alloc=%u"}, @@ -1508,6 +1512,12 @@ set_qf_format: case Opt_delalloc: set_opt(sbi->s_mount_opt, DELALLOC); break; + case Opt_block_validity: + set_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; + case Opt_noblock_validity: + clear_opt(sbi->s_mount_opt, BLOCK_VALIDITY); + break; case Opt_inode_readahead_blks: if (match_int(&args[0], &option)) return 0; @@ -2826,6 +2836,13 @@ no_journal: } else if (test_opt(sb, DELALLOC)) printk(KERN_INFO "EXT4-fs: delayed allocation enabled\n"); + err = ext4_setup_system_zone(sb); + if (err) { + printk(KERN_ERR "EXT4-fs: failed to initialize system " + "zone (%d)\n", err); + goto failed_mount4; + } + ext4_ext_init(sb); err = ext4_mb_init(sb, needs_recovery); if (err) { @@ -2875,6 +2892,7 @@ cantfind_ext4: failed_mount4: printk(KERN_ERR "EXT4-fs (device %s): mount failed\n", sb->s_id); + ext4_release_system_zone(sb); if (sbi->s_journal) { jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; @@ -3515,6 +3533,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) sb->s_flags &= ~MS_RDONLY; } } + ext4_setup_system_zone(sb); if (sbi->s_journal == NULL) ext4_commit_super(sb, 1); @@ -3927,13 +3946,16 @@ static int __init init_ext4_fs(void) { int err; + err = init_ext4_system_zone(); + if (err) + return err; ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj); if (!ext4_kset) - return -ENOMEM; + goto out4; ext4_proc_root = proc_mkdir("fs/ext4", NULL); err = init_ext4_mballoc(); if (err) - return err; + goto out3; err = init_ext4_xattr(); if (err) @@ -3958,6 +3980,11 @@ out1: exit_ext4_xattr(); out2: exit_ext4_mballoc(); +out3: + remove_proc_entry("fs/ext4", NULL); + kset_unregister(ext4_kset); +out4: + exit_ext4_system_zone(); return err; } @@ -3972,6 +3999,7 @@ static void __exit exit_ext4_fs(void) exit_ext4_mballoc(); remove_proc_entry("fs/ext4", NULL); kset_unregister(ext4_kset); + exit_ext4_system_zone(); } MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others"); From 0568c518937ee3a9b6a94d18bae9c150fe5d6832 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sun, 17 May 2009 23:31:23 -0400 Subject: [PATCH 1112/2061] ext4: down i_data_sem only for read when walking tree for fiemap Not sure why I put this in as down_write originally; all we are doing is walking the tree, nothing will change under us and concurrent reads should be no problem. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index d04b779b780..d4e99e96fdd 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3312,10 +3312,10 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * Walk the extent tree gathering extent information. * ext4_ext_fiemap_cb will push extents back to user. */ - down_write(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); error = ext4_ext_walk_space(inode, start_blk, len_blks, ext4_ext_fiemap_cb, fieinfo); - up_write(&EXT4_I(inode)->i_data_sem); + up_read(&EXT4_I(inode)->i_data_sem); } return error; From f68301656b5f5d2de104f2687add6beeb8f3c3b9 Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:44 -0400 Subject: [PATCH 1113/2061] ext4: Fix memory leak in ext4_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 600b7ad699b..eca6c057b11 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2924,6 +2924,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; From de5ce037304f2c88a319b1c3b808ab0c4c618c1c Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:47 -0400 Subject: [PATCH 1114/2061] ext3: Fix memory leak in ext3_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext3/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c..d8b73d4abe3 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2021,6 +2021,7 @@ failed_mount: brelse(bh); out_fail: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); lock_kernel(); return ret; From 0f7ee7c17241915fdaff49d1a36f5aafd80a7dce Mon Sep 17 00:00:00 2001 From: Manish Katiyar Date: Sun, 17 May 2009 23:52:51 -0400 Subject: [PATCH 1115/2061] ext2: Fix memory leak in ext2_fill_super() in case of a failed mount Signed-off-by: Manish Katiyar Signed-off-by: "Theodore Ts'o" --- fs/ext2/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 5c4afe65224..e3c748faf2d 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1093,6 +1093,7 @@ failed_mount: brelse(bh); failed_sbi: sb->s_fs_info = NULL; + kfree(sbi->s_blockgroup_lock); kfree(sbi); return ret; } From c0daaf3f1f672defa3a45ca449b76d0e86c55892 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 18 May 2009 14:02:12 +1000 Subject: [PATCH 1116/2061] perf_counter: powerpc: initialize cpuhw pointer before use Commit 9e35ad38 ("perf_counter: Rework the perf counter disable/enable") added code to the powerpc hw_perf_enable (renamed from hw_perf_restore) to test cpuhw->disabled and return immediately if it is not set (i.e. if the PMU is already enabled). Unfortunately the test got added before cpuhw was initialized, resulting in an oops the first time hw_perf_enable got called. This fixes it by moving the initialization of cpuhw to before cpuhw->disabled is tested. [ Impact: fix oops-causing bug on powerpc ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18960.56772.869734.304631@drongo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 6baae5a5c33..fe21b2440f2 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -450,12 +450,11 @@ void hw_perf_enable(void) int idx; local_irq_save(flags); + cpuhw = &__get_cpu_var(cpu_hw_counters); if (!cpuhw->disabled) { local_irq_restore(flags); return; } - - cpuhw = &__get_cpu_var(cpu_hw_counters); cpuhw->disabled = 0; /* From 888a589f6be07d624e21e2174d98375e9f95911b Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 1117/2061] mm, x86: remove MEMORY_HOTPLUG_RESERVE related code after: | commit b263295dbffd33b0fbff670720fa178c30e3392a | Author: Christoph Lameter | Date: Wed Jan 30 13:30:47 2008 +0100 | | x86: 64-bit, make sparsemem vmemmap the only memory model we don't have MEMORY_HOTPLUG_RESERVE anymore. Historically, x86-64 had an architecture-specific method for memory hotplug whereby it scanned the SRAT for physical memory ranges that could be potentially used for memory hot-add later. By reserving those ranges without physical memory, the memmap would be allocated and left dormant until needed. This depended on the DISCONTIG memory model which has been removed so the code implementing HOTPLUG_RESERVE is now dead. This patch removes the dead code used by MEMORY_HOTPLUG_RESERVE. (Changelog authored by Mel.) v2: updated changelog, and remove hotadd= in doc [ Impact: remove dead code ] Signed-off-by: Yinghai Lu Reviewed-by: Christoph Lameter Reviewed-by: Mel Gorman Workflow-found-OK-by: Andrew Morton LKML-Reference: <4A0C4910.7090508@kernel.org> Signed-off-by: Ingo Molnar --- Documentation/x86/x86_64/boot-options.txt | 5 -- arch/x86/include/asm/numa_64.h | 3 - arch/x86/mm/numa_64.c | 5 -- arch/x86/mm/srat_64.c | 63 ++++----------------- include/linux/mm.h | 2 - mm/page_alloc.c | 69 ----------------------- 6 files changed, 12 insertions(+), 135 deletions(-) diff --git a/Documentation/x86/x86_64/boot-options.txt b/Documentation/x86/x86_64/boot-options.txt index 34c13040a71..2db5893d6c9 100644 --- a/Documentation/x86/x86_64/boot-options.txt +++ b/Documentation/x86/x86_64/boot-options.txt @@ -150,11 +150,6 @@ NUMA Otherwise, the remaining system RAM is allocated to an additional node. - numa=hotadd=percent - Only allow hotadd memory to preallocate page structures upto - percent of already available memory. - numa=hotadd=0 will disable hotadd memory. - ACPI acpi=off Don't enable ACPI diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 064ed6df4cb..7feff0648d7 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -17,9 +17,6 @@ extern int compute_hash_shift(struct bootnode *nodes, int numblks, extern void numa_init_array(void); extern int numa_off; -extern void srat_reserve_add_area(int nodeid); -extern int hotadd_percent; - extern s16 apicid_to_node[MAX_LOCAL_APIC]; extern unsigned long numa_free_all_bootmem(void); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index fb61d81a656..a6a93c39523 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -272,9 +272,6 @@ void __init setup_node_bootmem(int nodeid, unsigned long start, reserve_bootmem_node(NODE_DATA(nodeid), bootmap_start, bootmap_pages<> PAGE_SHIFT; unsigned long e_pfn = end >> PAGE_SHIFT; - int ret = 0, changed = 0; + int changed = 0; struct bootnode *nd = &nodes_add[node]; /* I had some trouble with strange memory hotadd regions breaking @@ -210,7 +201,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) mistakes */ if ((signed long)(end - start) < NODE_MIN_SIZE) { printk(KERN_ERR "SRAT: Hotplug area too small\n"); - return -1; + return; } /* This check might be a bit too strict, but I'm keeping it for now. */ @@ -218,12 +209,7 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug area %lu -> %lu has existing memory\n", s_pfn, e_pfn); - return -1; - } - - if (!hotadd_enough_memory(&nodes_add[node])) { - printk(KERN_ERR "SRAT: Hotplug area too large\n"); - return -1; + return; } /* Looks good */ @@ -245,11 +231,9 @@ reserve_hotadd(int node, unsigned long start, unsigned long end) printk(KERN_ERR "SRAT: Hotplug zone not continuous. Partly ignored\n"); } - ret = update_end_of_memory(nd->end); - if (changed) - printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", nd->start, nd->end); - return ret; + printk(KERN_INFO "SRAT: hot plug zone found %Lx - %Lx\n", + nd->start, nd->end); } /* Callback for parsing of the Proximity Domain <-> Memory Area mappings */ @@ -310,13 +294,10 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) start, end); e820_register_active_regions(node, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - push_node_boundaries(node, nd->start >> PAGE_SHIFT, - nd->end >> PAGE_SHIFT); - if ((ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) && - (reserve_hotadd(node, start, end) < 0)) { - /* Ignore hotadd region. Undo damage */ - printk(KERN_NOTICE "SRAT: Hotplug region ignored\n"); + if (ma->flags & ACPI_SRAT_MEM_HOT_PLUGGABLE) { + update_nodes_add(node, start, end); + /* restore nodes[node] */ *nd = oldnode; if ((nd->start | nd->end) == 0) node_clear(node, nodes_parsed); @@ -510,26 +491,6 @@ static int null_slit_node_compare(int a, int b) } #endif /* CONFIG_NUMA_EMU */ -void __init srat_reserve_add_area(int nodeid) -{ - if (found_add_area && nodes_add[nodeid].end) { - u64 total_mb; - - printk(KERN_INFO "SRAT: Reserving hot-add memory space " - "for node %d at %Lx-%Lx\n", - nodeid, nodes_add[nodeid].start, nodes_add[nodeid].end); - total_mb = (nodes_add[nodeid].end - nodes_add[nodeid].start) - >> PAGE_SHIFT; - total_mb *= sizeof(struct page); - total_mb >>= 20; - printk(KERN_INFO "SRAT: This will cost you %Lu MB of " - "pre-allocated memory.\n", (unsigned long long)total_mb); - reserve_bootmem_node(NODE_DATA(nodeid), nodes_add[nodeid].start, - nodes_add[nodeid].end - nodes_add[nodeid].start, - BOOTMEM_DEFAULT); - } -} - int __node_distance(int a, int b) { int index; diff --git a/include/linux/mm.h b/include/linux/mm.h index bff1f0d475c..511b0986709 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1031,8 +1031,6 @@ extern void add_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); extern void remove_active_range(unsigned int nid, unsigned long start_pfn, unsigned long end_pfn); -extern void push_node_boundaries(unsigned int nid, unsigned long start_pfn, - unsigned long end_pfn); extern void remove_all_active_ranges(void); extern unsigned long absent_pages_in_range(unsigned long start_pfn, unsigned long end_pfn); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fe753ecf2aa..474c7e9dd51 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -149,10 +149,6 @@ static unsigned long __meminitdata dma_reserve; static int __meminitdata nr_nodemap_entries; static unsigned long __meminitdata arch_zone_lowest_possible_pfn[MAX_NR_ZONES]; static unsigned long __meminitdata arch_zone_highest_possible_pfn[MAX_NR_ZONES]; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - static unsigned long __meminitdata node_boundary_start_pfn[MAX_NUMNODES]; - static unsigned long __meminitdata node_boundary_end_pfn[MAX_NUMNODES]; -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ static unsigned long __initdata required_kernelcore; static unsigned long __initdata required_movablecore; static unsigned long __meminitdata zone_movable_pfn[MAX_NUMNODES]; @@ -3102,64 +3098,6 @@ void __init sparse_memory_present_with_active_regions(int nid) early_node_map[i].end_pfn); } -/** - * push_node_boundaries - Push node boundaries to at least the requested boundary - * @nid: The nid of the node to push the boundary for - * @start_pfn: The start pfn of the node - * @end_pfn: The end pfn of the node - * - * In reserve-based hot-add, mem_map is allocated that is unused until hotadd - * time. Specifically, on x86_64, SRAT will report ranges that can potentially - * be hotplugged even though no physical memory exists. This function allows - * an arch to push out the node boundaries so mem_map is allocated that can - * be used later. - */ -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering push_node_boundaries(%u, %lu, %lu)\n", - nid, start_pfn, end_pfn); - - /* Initialise the boundary for this node if necessary */ - if (node_boundary_end_pfn[nid] == 0) - node_boundary_start_pfn[nid] = -1UL; - - /* Update the boundaries */ - if (node_boundary_start_pfn[nid] > start_pfn) - node_boundary_start_pfn[nid] = start_pfn; - if (node_boundary_end_pfn[nid] < end_pfn) - node_boundary_end_pfn[nid] = end_pfn; -} - -/* If necessary, push the node boundary out for reserve hotadd */ -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) -{ - mminit_dprintk(MMINIT_TRACE, "zoneboundary", - "Entering account_node_boundary(%u, %lu, %lu)\n", - nid, *start_pfn, *end_pfn); - - /* Return if boundary information has not been provided */ - if (node_boundary_end_pfn[nid] == 0) - return; - - /* Check the boundaries and update if necessary */ - if (node_boundary_start_pfn[nid] < *start_pfn) - *start_pfn = node_boundary_start_pfn[nid]; - if (node_boundary_end_pfn[nid] > *end_pfn) - *end_pfn = node_boundary_end_pfn[nid]; -} -#else -void __init push_node_boundaries(unsigned int nid, - unsigned long start_pfn, unsigned long end_pfn) {} - -static void __meminit account_node_boundary(unsigned int nid, - unsigned long *start_pfn, unsigned long *end_pfn) {} -#endif - - /** * get_pfn_range_for_nid - Return the start and end page frames for a node * @nid: The nid to return the range for. If MAX_NUMNODES, the min and max PFN are returned. @@ -3185,9 +3123,6 @@ void __meminit get_pfn_range_for_nid(unsigned int nid, if (*start_pfn == -1UL) *start_pfn = 0; - - /* Push the node boundaries out if requested */ - account_node_boundary(nid, start_pfn, end_pfn); } /* @@ -3793,10 +3728,6 @@ void __init remove_all_active_ranges(void) { memset(early_node_map, 0, sizeof(early_node_map)); nr_nodemap_entries = 0; -#ifdef CONFIG_MEMORY_HOTPLUG_RESERVE - memset(node_boundary_start_pfn, 0, sizeof(node_boundary_start_pfn)); - memset(node_boundary_end_pfn, 0, sizeof(node_boundary_end_pfn)); -#endif /* CONFIG_MEMORY_HOTPLUG_RESERVE */ } /* Compare two active node_active_regions */ From 7c43769a9776141ec23ca81a1bdd5a9c0512f165 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 1118/2061] x86, mm: Fix node_possible_map logic Recently there were some changes to the meaning of node_possible_map, and it is quite strange: - the node without memory would be set in node_possible_map - but some node with less NODE_MIN_SIZE will be kicked out of node_possible_map. fix it by adding strict_setup_node_bootmem(). Also, remove unparse_node(). so result will be: 1. cpu_to_node() will return online node only (nearest one) 2. apicid_to_node() still returns the node that could be not online but is set in node_possible_map. 3. node_possible_map will include nodes that mem on it are less NODE_MIN_SIZE v2: after move_cpus_to_node change. [ Impact: get node_possible_map right ] Signed-off-by: Yinghai Lu Tested-by: Jack Steiner LKML-Reference: <4A0C49BE.6080800@kernel.org> [ v3: various small cleanups and comment clarifications ] Signed-off-by: Ingo Molnar --- arch/x86/include/asm/numa_64.h | 7 +++++++ arch/x86/mm/numa_64.c | 13 ++++++++++--- arch/x86/mm/srat_64.c | 29 ++--------------------------- 3 files changed, 19 insertions(+), 30 deletions(-) diff --git a/arch/x86/include/asm/numa_64.h b/arch/x86/include/asm/numa_64.h index 7feff0648d7..c4ae822e415 100644 --- a/arch/x86/include/asm/numa_64.h +++ b/arch/x86/include/asm/numa_64.h @@ -24,6 +24,13 @@ extern void setup_node_bootmem(int nodeid, unsigned long start, unsigned long end); #ifdef CONFIG_NUMA +/* + * Too small node sizes may confuse the VM badly. Usually they + * result from BIOS bugs. So dont recognize nodes as standalone + * NUMA entities that have less than this amount of RAM listed: + */ +#define NODE_MIN_SIZE (4*1024*1024) + extern void __init init_cpu_to_node(void); extern void __cpuinit numa_set_node(int cpu, int node); extern void __cpuinit numa_clear_node(int cpu); diff --git a/arch/x86/mm/numa_64.c b/arch/x86/mm/numa_64.c index a6a93c39523..459913beac7 100644 --- a/arch/x86/mm/numa_64.c +++ b/arch/x86/mm/numa_64.c @@ -179,18 +179,25 @@ static void * __init early_node_mem(int nodeid, unsigned long start, } /* Initialize bootmem allocator for a node */ -void __init setup_node_bootmem(int nodeid, unsigned long start, - unsigned long end) +void __init +setup_node_bootmem(int nodeid, unsigned long start, unsigned long end) { unsigned long start_pfn, last_pfn, bootmap_pages, bootmap_size; + const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); unsigned long bootmap_start, nodedata_phys; void *bootmap; - const int pgdat_size = roundup(sizeof(pg_data_t), PAGE_SIZE); int nid; if (!end) return; + /* + * Don't confuse VM with a node that doesn't have the + * minimum amount of memory: + */ + if (end && (end - start) < NODE_MIN_SIZE) + return; + start = roundup(start, ZONE_ALIGN); printk(KERN_INFO "Bootmem setup node %d %016lx-%016lx\n", nodeid, diff --git a/arch/x86/mm/srat_64.c b/arch/x86/mm/srat_64.c index b0dbbd48e58..2dfcbf9df2a 100644 --- a/arch/x86/mm/srat_64.c +++ b/arch/x86/mm/srat_64.c @@ -36,10 +36,6 @@ static int num_node_memblks __initdata; static struct bootnode node_memblk_range[NR_NODE_MEMBLKS] __initdata; static int memblk_nodeid[NR_NODE_MEMBLKS] __initdata; -/* Too small nodes confuse the VM badly. Usually they result - from BIOS bugs. */ -#define NODE_MIN_SIZE (4*1024*1024) - static __init int setup_node(int pxm) { return acpi_map_pxm_to_node(pxm); @@ -338,17 +334,6 @@ static int __init nodes_cover_memory(const struct bootnode *nodes) return 1; } -static void __init unparse_node(int node) -{ - int i; - node_clear(node, nodes_parsed); - node_clear(node, cpu_nodes_parsed); - for (i = 0; i < MAX_LOCAL_APIC; i++) { - if (apicid_to_node[i] == node) - apicid_to_node[i] = NUMA_NO_NODE; - } -} - void __init acpi_numa_arch_fixup(void) {} /* Use the information discovered above to actually set up the nodes. */ @@ -360,18 +345,8 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) return -1; /* First clean up the node list */ - for (i = 0; i < MAX_NUMNODES; i++) { + for (i = 0; i < MAX_NUMNODES; i++) cutoff_node(i, start, end); - /* - * don't confuse VM with a node that doesn't have the - * minimum memory. - */ - if (nodes[i].end && - (nodes[i].end - nodes[i].start) < NODE_MIN_SIZE) { - unparse_node(i); - node_set_offline(i); - } - } if (!nodes_cover_memory(nodes)) { bad_srat(); @@ -404,7 +379,7 @@ int __init acpi_scan_nodes(unsigned long start, unsigned long end) if (node == NUMA_NO_NODE) continue; - if (!node_isset(node, node_possible_map)) + if (!node_online(node)) numa_clear_node(i); } numa_init_array(); From 35d5a9a61490bf39d2e48d7f499c8c801a39ebe9 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 15 May 2009 13:59:37 -0700 Subject: [PATCH 1119/2061] x86: fix system without memory on node0 Jack found a boot crash on a system which doesn't have memory on node0. It turns out with recent per_cpu changes, node_number for BSP will always be 0, and it is not consistent to cpu_to_node() that might set it to a different (nearer) node already. aka when numa_set_node() for node0 is called early before per_cpu area is setup: two places touched that per_cpu(node_number,): 1. in cpu/common.c::cpu_init() and it is not for BP | #ifdef CONFIG_NUMA | if (cpu != 0 && percpu_read(node_number) == 0 && | cpu_to_node(cpu) != NUMA_NO_NODE) | percpu_write(node_number, cpu_to_node(cpu)); | #endif for BP: traps_init ==> cpu_init for AP: start_secondary ==> cpu_init 2. cpu/intel.c or amd.c::srat_detect_node via numa_set_node() for BP: check_bugs ==> identify_boot_cpu ==> identify_cpu() that is rather later before numa_node_id() is used for BP... for AP: start_secondary => smp_callin => smp_store_cpu_info() => => identify_secondary_cpu => identify_cpu() so try to set that for BP earlier in setup_per_cpu_areas(), and don't bother to set that for APs there (it will be updated later and will be used later) (and don't mess the 0 before the copying BP per_cpu data to APs) [ Impact: fix boot crash on memoryless node-0 ] Reported-and-tested-by: Jack Steiner Cc: Tejun Heo Signed-off-by: Yinghai Lu LKML-Reference: <4A0C4A02.7050401@kernel.org> Signed-off-by: Ingo Molnar --- arch/x86/kernel/setup_percpu.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c index 3a97a4cf187..3b5f3271e73 100644 --- a/arch/x86/kernel/setup_percpu.c +++ b/arch/x86/kernel/setup_percpu.c @@ -423,6 +423,14 @@ void __init setup_per_cpu_areas(void) early_per_cpu_ptr(x86_cpu_to_node_map) = NULL; #endif +#if defined(CONFIG_X86_64) && defined(CONFIG_NUMA) + /* + * make sure boot cpu node_number is right, when boot cpu is on the + * node that doesn't have mem installed + */ + per_cpu(node_number, boot_cpu_id) = cpu_to_node(boot_cpu_id); +#endif + /* Setup node to cpumask map */ setup_node_to_cpumask_map(); From b68f1d2e7aa21029d73c7d453a8046e95d351740 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 19:37:25 +0200 Subject: [PATCH 1120/2061] perf_counter, x86: speed up the scheduling fast-path We have to set up the LVT entry only at counter init time, not at every switch-in time. There's friction between NMI and non-NMI use here - we'll probably remove the per counter configurability of it - but until then, dont slow down things ... [ Impact: micro-optimization ] Cc: Peter Zijlstra Cc: Srivatsa Vaddagiri Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 5bfd30ab392..c109819c2cb 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,6 +285,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } + perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -603,8 +604,6 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } - perf_counters_lapic_init(hwc->nmi); - x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1054,7 +1053,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(0); + perf_counters_lapic_init(1); register_die_notifier(&perf_counter_nmi_notifier); } From 24ed0c4bfc7d2d7507bb9d50f7f3bbdcd85d76dd Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Sun, 17 May 2009 15:31:38 +0800 Subject: [PATCH 1121/2061] tracing: fix check for return value of register_module_notifier return zero should be correct, so fix it. [ Impact: eliminate incorrect syslog message ] Signed-off-by: Ming Lei Acked-by: Frederic Weisbecker Acked-by: Li Zefan Cc: rostedt@goodmis.org LKML-Reference: <1242545498-7285-1-git-send-email-tom.leiming@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/ftrace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 5b606f45b6c..140699a9a8a 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2826,7 +2826,7 @@ void __init ftrace_init(void) __stop_mcount_loc); ret = register_module_notifier(&ftrace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace ftrace module notifier\n"); return; From 6c3b46f74587d46e71b8c2b78fdca626a3aca280 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Mon, 18 May 2009 14:38:28 +0200 Subject: [PATCH 1122/2061] virtio_blk: don't blindly derefence req->rq_disk request->rq_disk is only set for FS requests or BLOCK_PC requests originating from the generic block layer scsi ioctls. It's not set for requests origination from other soures or internal cache flush commands implemented by the patch I'll send after this. So instead of using it to get at the private data in do_virtblk_request setup queue->queuedata and use it. Signed-off-by: Christoph Hellwig Signed-off-by: Rusty Russell Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 29a9daf4862..dfe9ee5f169 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -124,12 +124,11 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, static void do_virtblk_request(struct request_queue *q) { - struct virtio_blk *vblk = NULL; + struct virtio_blk *vblk = q->queuedata; struct request *req; unsigned int issued = 0; while ((req = blk_peek_request(q)) != NULL) { - vblk = req->rq_disk->private_data; BUG_ON(req->nr_phys_segments + 2 > vblk->sg_elems); /* If this request fails, stop queue and wait for something to @@ -249,6 +248,7 @@ static int virtblk_probe(struct virtio_device *vdev) goto out_put_disk; } + vblk->disk->queue->queuedata = vblk; queue_flag_set_unlocked(QUEUE_FLAG_VIRT, vblk->disk->queue); if (index < 26) { From 1cde26f928863d90e9e7c1217880c8450464d305 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 18 May 2009 14:41:30 +0200 Subject: [PATCH 1123/2061] virtio_blk: SG_IO passthru support Add support for SG_IO passthru to virtio_blk. We add the scsi command block after the normal outhdr, and the scsi inhdr with full status information aswell as the sense buffer before the regular inhdr. [hch: forward ported, added the VIRTIO_BLK_F_SCSI flags, some comments and tested the whole beast] [axboe: updated to use ->resid and not dual-path the byte count] Signed-off-by: Hannes Reinecke Signed-off-by: Christoph Hellwig Signed-off-by: Rusty Russell (+ checkpatch.pl tweak) Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 64 +++++++++++++++++++++++++++++--------- include/linux/virtio_blk.h | 8 +++++ 2 files changed, 58 insertions(+), 14 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index dfe9ee5f169..62275dbdf2e 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -37,6 +37,7 @@ struct virtblk_req struct list_head list; struct request *req; struct virtio_blk_outhdr out_hdr; + struct virtio_scsi_inhdr in_hdr; u8 status; }; @@ -49,7 +50,9 @@ static void blk_done(struct virtqueue *vq) spin_lock_irqsave(&vblk->lock, flags); while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { + unsigned int nr_bytes; int error; + switch (vbr->status) { case VIRTIO_BLK_S_OK: error = 0; @@ -62,6 +65,12 @@ static void blk_done(struct virtqueue *vq) break; } + if (blk_pc_request(vbr->req)) { + vbr->req->resid_len = vbr->in_hdr.residual; + vbr->req->sense_len = vbr->in_hdr.sense_len; + vbr->req->errors = vbr->in_hdr.errors; + } + __blk_end_request_all(vbr->req, error); list_del(&vbr->list); mempool_free(vbr, vblk->pool); @@ -74,7 +83,7 @@ static void blk_done(struct virtqueue *vq) static bool do_req(struct request_queue *q, struct virtio_blk *vblk, struct request *req) { - unsigned long num, out, in; + unsigned long num, out = 0, in = 0; struct virtblk_req *vbr; vbr = mempool_alloc(vblk->pool, GFP_ATOMIC); @@ -99,18 +108,36 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, if (blk_barrier_rq(vbr->req)) vbr->out_hdr.type |= VIRTIO_BLK_T_BARRIER; - sg_set_buf(&vblk->sg[0], &vbr->out_hdr, sizeof(vbr->out_hdr)); - num = blk_rq_map_sg(q, vbr->req, vblk->sg+1); - sg_set_buf(&vblk->sg[num+1], &vbr->status, sizeof(vbr->status)); + sg_set_buf(&vblk->sg[out++], &vbr->out_hdr, sizeof(vbr->out_hdr)); - if (rq_data_dir(vbr->req) == WRITE) { - vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; - out = 1 + num; - in = 1; - } else { - vbr->out_hdr.type |= VIRTIO_BLK_T_IN; - out = 1; - in = 1 + num; + /* + * If this is a packet command we need a couple of additional headers. + * Behind the normal outhdr we put a segment with the scsi command + * block, and before the normal inhdr we put the sense data and the + * inhdr with additional status information before the normal inhdr. + */ + if (blk_pc_request(vbr->req)) + sg_set_buf(&vblk->sg[out++], vbr->req->cmd, vbr->req->cmd_len); + + num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); + + if (blk_pc_request(vbr->req)) { + sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); + sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, + sizeof(vbr->in_hdr)); + } + + sg_set_buf(&vblk->sg[num + out + in++], &vbr->status, + sizeof(vbr->status)); + + if (num) { + if (rq_data_dir(vbr->req) == WRITE) { + vbr->out_hdr.type |= VIRTIO_BLK_T_OUT; + out += num; + } else { + vbr->out_hdr.type |= VIRTIO_BLK_T_IN; + in += num; + } } if (vblk->vq->vq_ops->add_buf(vblk->vq, vblk->sg, out, in, vbr)) { @@ -148,8 +175,16 @@ static void do_virtblk_request(struct request_queue *q) static int virtblk_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, unsigned long data) { - return scsi_cmd_ioctl(bdev->bd_disk->queue, - bdev->bd_disk, mode, cmd, + struct gendisk *disk = bdev->bd_disk; + struct virtio_blk *vblk = disk->private_data; + + /* + * Only allow the generic SCSI ioctls if the host can support it. + */ + if (!virtio_has_feature(vblk->vdev, VIRTIO_BLK_F_SCSI)) + return -ENOIOCTLCMD; + + return scsi_cmd_ioctl(disk->queue, disk, mode, cmd, (void __user *)data); } @@ -356,6 +391,7 @@ static struct virtio_device_id id_table[] = { static unsigned int features[] = { VIRTIO_BLK_F_BARRIER, VIRTIO_BLK_F_SEG_MAX, VIRTIO_BLK_F_SIZE_MAX, VIRTIO_BLK_F_GEOMETRY, VIRTIO_BLK_F_RO, VIRTIO_BLK_F_BLK_SIZE, + VIRTIO_BLK_F_SCSI, }; static struct virtio_driver virtio_blk = { diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 94c56d29869..4dbcbc1c348 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -15,6 +15,7 @@ #define VIRTIO_BLK_F_GEOMETRY 4 /* Legacy geometry available */ #define VIRTIO_BLK_F_RO 5 /* Disk is read-only */ #define VIRTIO_BLK_F_BLK_SIZE 6 /* Block size of disk is available*/ +#define VIRTIO_BLK_F_SCSI 7 /* Supports scsi command passthru */ struct virtio_blk_config { @@ -55,6 +56,13 @@ struct virtio_blk_outhdr __u64 sector; }; +struct virtio_scsi_inhdr { + __u32 errors; + __u32 data_len; + __u32 sense_len; + __u32 residual; +}; + /* And this is the final byte of the write scatter-gather list. */ #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 From f831cc03490c78a76e2d35ad77ec2292d0323728 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Mon, 18 May 2009 14:44:45 +0200 Subject: [PATCH 1124/2061] virtio_blk: get rid of unused variable drivers/block/virtio_blk.c: In function 'blk_done': drivers/block/virtio_blk.c:53: warning: unused variable 'nr_bytes' Leftover from commit 1cde26f928863d90e9e7c1217880c8450464d305 Signed-off-by: Jens Axboe --- drivers/block/virtio_blk.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 62275dbdf2e..511d4ae2d17 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -50,7 +50,6 @@ static void blk_done(struct virtqueue *vq) spin_lock_irqsave(&vblk->lock, flags); while ((vbr = vblk->vq->vq_ops->get_buf(vblk->vq, &len)) != NULL) { - unsigned int nr_bytes; int error; switch (vbr->status) { From 75834fc3b6fcff00327f5d2a18760c1e8e0179c5 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Mon, 18 May 2009 10:26:10 -0400 Subject: [PATCH 1125/2061] SELinux: move SELINUX_MAGIC into magic.h The selinuxfs superblock magic is used inside the IMA code, but is being defined in two places and could someday get out of sync. This patch moves the declaration into magic.h so it is only done once. Signed-off-by: Eric Paris Signed-off-by: James Morris --- include/linux/magic.h | 1 + security/integrity/ima/ima_policy.c | 8 +++----- security/selinux/include/security.h | 3 +-- 3 files changed, 5 insertions(+), 7 deletions(-) diff --git a/include/linux/magic.h b/include/linux/magic.h index 5b4e28bcb78..927138cf305 100644 --- a/include/linux/magic.h +++ b/include/linux/magic.h @@ -9,6 +9,7 @@ #define DEBUGFS_MAGIC 0x64626720 #define SYSFS_MAGIC 0x62656572 #define SECURITYFS_MAGIC 0x73636673 +#define SELINUX_MAGIC 0xf97cff8c #define TMPFS_MAGIC 0x01021994 #define SQUASHFS_MAGIC 0x73717368 #define EFS_SUPER_MAGIC 0x414A53 diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index dec6dcb1c8d..31d677f7c65 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -49,14 +49,12 @@ struct ima_measure_rule_entry { * written in terms of .action, .func, .mask, .fsmagic, and .uid */ static struct ima_measure_rule_entry default_rules[] = { - {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC, - .flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = DEBUGFS_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = TMPFS_MAGIC,.flags = IMA_FSMAGIC}, - {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC, - .flags = IMA_FSMAGIC}, - {.action = DONT_MEASURE,.fsmagic = 0xF97CFF8C,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SECURITYFS_MAGIC,.flags = IMA_FSMAGIC}, + {.action = DONT_MEASURE,.fsmagic = SELINUX_MAGIC,.flags = IMA_FSMAGIC}, {.action = MEASURE,.func = FILE_MMAP,.mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC, diff --git a/security/selinux/include/security.h b/security/selinux/include/security.h index a7be3f01fb0..ca835795a8b 100644 --- a/security/selinux/include/security.h +++ b/security/selinux/include/security.h @@ -8,14 +8,13 @@ #ifndef _SELINUX_SECURITY_H_ #define _SELINUX_SECURITY_H_ +#include #include "flask.h" #define SECSID_NULL 0x00000000 /* unspecified SID */ #define SECSID_WILD 0xffffffff /* wildcard SID */ #define SECCLASS_NULL 0x0000 /* no class */ -#define SELINUX_MAGIC 0xf97cff8c - /* Identify specific policy version changes */ #define POLICYDB_VERSION_BASE 15 #define POLICYDB_VERSION_BOOL 16 From 4200efd9acda4accf24640f1e77d24fdcdb524df Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 09:22:19 +0200 Subject: [PATCH 1126/2061] sched: properly define the sched_group::cpumask and sched_domain::span fields Properly document the variable-size structure tricks we are doing wrt. struct sched_group and sched_domain, and use the field[0] GCC extension instead of defining a vla array. Dont use unions for this, as pointed out by Linus. [ Impact: cleanup, un-confuse Sparse and LLVM ] Reported-by: Jeff Garzik Acked-by: Linus Torvalds LKML-Reference: Signed-off-by: Ingo Molnar --- include/linux/sched.h | 25 ++++++++++++++++++++++--- kernel/sched.c | 5 +++-- 2 files changed, 25 insertions(+), 5 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index de7b3b21777..dbb1043e865 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -839,7 +839,17 @@ struct sched_group { */ u32 reciprocal_cpu_power; - unsigned long cpumask[]; + /* + * The CPUs this group covers. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_group' in kernel/sched.c) + */ + unsigned long cpumask[0]; }; static inline struct cpumask *sched_group_cpus(struct sched_group *sg) @@ -925,8 +935,17 @@ struct sched_domain { char *name; #endif - /* span of all CPUs in this domain */ - unsigned long span[]; + /* + * Span of all CPUs in this domain. + * + * NOTE: this field is variable length. (Allocated dynamically + * by attaching extra space to the end of the structure, + * depending on how many CPUs the kernel has booted up with) + * + * It is also be embedded into static data structures at build + * time. (See 'struct static_sched_domain' in kernel/sched.c) + */ + unsigned long span[0]; }; static inline struct cpumask *sched_domain_span(struct sched_domain *sd) diff --git a/kernel/sched.c b/kernel/sched.c index 497c09ba61e..228acae8821 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7948,8 +7948,9 @@ int sched_smt_power_savings = 0, sched_mc_power_savings = 0; /* * The cpus mask in sched_group and sched_domain hangs off the end. - * FIXME: use cpumask_var_t or dynamic percpu alloc to avoid wasting space - * for nr_cpu_ids < CONFIG_NR_CPUS. + * + * ( See the the comments in include/linux/sched.h:struct sched_group + * and struct sched_domain. ) */ struct static_sched_group { struct sched_group sg; From 143c145e3a475065a4be661468d0df1bd0b25f74 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Tue, 19 May 2009 14:43:15 +0800 Subject: [PATCH 1127/2061] tracing/events: Documentation updates - fix some typos - document the difference between '>' and '>>' - document the 'enable' toggle - remove section "Defining an event-enabled tracepoint", since it's out-dated and sample/trace_events/ already serves this purpose. v2: add "Updated by Li Zefan" [ Impact: make documentation up-to-date ] Signed-off-by: Li Zefan Cc: Steven Rostedt Cc: Frederic Weisbecker Cc: "Theodore Ts'o" LKML-Reference: <4A125503.5060406@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- Documentation/trace/events.txt | 129 +++++++++++---------------------- 1 file changed, 42 insertions(+), 87 deletions(-) diff --git a/Documentation/trace/events.txt b/Documentation/trace/events.txt index abdee664c0f..f157d7594ea 100644 --- a/Documentation/trace/events.txt +++ b/Documentation/trace/events.txt @@ -1,9 +1,10 @@ Event Tracing Documentation written by Theodore Ts'o + Updated by Li Zefan -Introduction -============ +1. Introduction +=============== Tracepoints (see Documentation/trace/tracepoints.txt) can be used without creating custom kernel modules to register probe functions @@ -12,30 +13,37 @@ using the event tracing infrastructure. Not all tracepoints can be traced using the event tracing system; the kernel developer must provide code snippets which define how the tracing information is saved into the tracing buffer, and how the -the tracing information should be printed. +tracing information should be printed. -Using Event Tracing -=================== +2. Using Event Tracing +====================== + +2.1 Via the 'set_event' interface +--------------------------------- The events which are available for tracing can be found in the file -/sys/kernel/debug/tracing/available_events. +/debug/tracing/available_events. To enable a particular event, such as 'sched_wakeup', simply echo it -to /sys/debug/tracing/set_event. For example: +to /debug/tracing/set_event. For example: - # echo sched_wakeup > /sys/kernel/debug/tracing/set_event + # echo sched_wakeup >> /debug/tracing/set_event -[ Note: events can also be enabled/disabled via the 'enabled' toggle - found in the /sys/kernel/tracing/events/ hierarchy of directories. ] +[ Note: '>>' is necessary, otherwise it will firstly disable + all the events. ] To disable an event, echo the event name to the set_event file prefixed with an exclamation point: - # echo '!sched_wakeup' >> /sys/kernel/debug/tracing/set_event + # echo '!sched_wakeup' >> /debug/tracing/set_event -To disable events, echo an empty line to the set_event file: +To disable all events, echo an empty line to the set_event file: - # echo > /sys/kernel/debug/tracing/set_event + # echo > /debug/tracing/set_event + +To enable all events, echo '*:*' or '*:' to the set_event file: + + # echo *:* > /debug/tracing/set_event The events are organized into subsystems, such as ext4, irq, sched, etc., and a full event name looks like this: :. The @@ -44,92 +52,39 @@ file. All of the events in a subsystem can be specified via the syntax ":*"; for example, to enable all irq events, you can use the command: - # echo 'irq:*' > /sys/kernel/debug/tracing/set_event + # echo 'irq:*' > /debug/tracing/set_event -Defining an event-enabled tracepoint ------------------------------------- +2.2 Via the 'enable' toggle +--------------------------- -A kernel developer which wishes to define an event-enabled tracepoint -must declare the tracepoint using TRACE_EVENT instead of DECLARE_TRACE. -This is done via two header files in include/trace. For example, to -event-enable the jbd2 subsystem, we must create two files, -include/trace/jbd2.h and include/trace/jbd2_event_types.h. The -include/trace/jbd2.h file should be included by kernel source files that -will have a tracepoint inserted, and might look like this: +The events available are also listed in /debug/tracing/events/ hierarchy +of directories. -#ifndef _TRACE_JBD2_H -#define _TRACE_JBD2_H +To enable event 'sched_wakeup': -#include -#include + # echo 1 > /debug/tracing/events/sched/sched_wakeup/enable -#include +To disable it: -#endif + # echo 0 > /debug/tracing/events/sched/sched_wakeup/enable -In a file that utilizes a jbd2 tracepoint, this header file would be -included. Note that you still have to use DEFINE_TRACE(). So for -example, if fs/jbd2/commit.c planned to use the jbd2_start_commit -tracepoint, it would have the following near the beginning of the file: +To enable all events in sched subsystem: -#include + # echo 1 > /debug/tracing/events/sched/enable -DEFINE_TRACE(jbd2_start_commit); +To eanble all events: -Then in the function that would call the tracepoint, it would call the -tracepoint function. (For more information, please see the tracepoint -documentation in Documentation/trace/tracepoints.txt): + # echo 1 > /debug/tracing/events/enable - trace_jbd2_start_commit(journal, commit_transaction); +When reading one of these enable files, there are four results: -The code snippets which allow jbd2_start_commit to be an event-enabled -tracepoint are placed in the file include/trace/jbd2_event_types.h: + 0 - all events this file affects are disabled + 1 - all events this file affects are enabled + X - there is a mixture of events enabled and disabled + ? - this file does not affect any event -/* use instead */ -#ifndef TRACE_EVENT -# error Do not include this file directly. -# error Unless you know what you are doing. -#endif +3. Defining an event-enabled tracepoint +======================================= -#undef TRACE_SYSTEM -#define TRACE_SYSTEM jbd2 +See The example provided in samples/trace_events -#include - -TRACE_EVENT(jbd2_start_commit, - TP_PROTO(journal_t *journal, transaction_t *commit_transaction), - TP_ARGS(journal, commit_transaction), - TP_STRUCT__entry( - __array( char, devname, BDEVNAME_SIZE+24 ) - __field( int, transaction ) - ), - TP_fast_assign( - memcpy(__entry->devname, journal->j_devname, BDEVNAME_SIZE+24); - __entry->transaction = commit_transaction->t_tid; - ), - TP_printk("dev %s transaction %d", - __entry->devname, __entry->transaction) -); - -The TP_PROTO and TP_ARGS are unchanged from DECLARE_TRACE. The new -arguments to TRACE_EVENT are TP_STRUCT__entry, TP_fast_assign, and -TP_printk. - -TP_STRUCT__entry defines the data structure which will be stored in the -trace buffer. Normally, fields in __entry will be arrays or simple -types. It is possible to place data structures in __entry --- however, -pointers in the data structure can not be trusted, since they will be -accessed sometime later by TP_printk, and if the data structure contains -fields that will not or cannot be used by TP_printk, this will waste -space in the trace buffer. In general, data structures should be -avoided, unless they do only contain non-pointer types and all of the -fields will be used by TP_printk. - -TP_fast_assign defines the code snippet which saves information into the -__entry data structure, using the passed-in arguments defined in -TP_PROTO and TP_ARGS. - -Finally, TP_printk will print the __entry data structure. At the time -when the code snippet defined by TP_printk is executed, it will not have -access to the TP_ARGS arguments; it can only use the information saved -in the __entry data structure. From fd51d251e4cdb21f68e9dbc4336514d64a105a79 Mon Sep 17 00:00:00 2001 From: Stefan Raspl Date: Tue, 19 May 2009 09:59:08 +0200 Subject: [PATCH 1128/2061] blktrace: remove debugfs entries on bad path debugfs directory entries for devices are not removed on some of the failure pathes in do_blk_trace_setup(). One way to reproduce is to start blktrace on multiple devices with insufficient Vmalloc space: Devices will fail with a message like this: BLKTRACESETUP(2) /dev/sdu failed: 5/Input/output error If so, the respective entries in debugfs (e.g. /sys/kernel/debug/block/sdu) will remain and subsequent attempts to start blktrace on the respective devices will not succeed due to existing directories. [ Impact: fix /debug/tracing file cleanup corner case ] Signed-off-by: Stefan Raspl Acked-by: Li Zefan Cc: Li Zefan Cc: schwidefsky@de.ibm.com Cc: heiko.carstens@de.ibm.com LKML-Reference: <4A1266CC.5040801@linux.vnet.ibm.com> Signed-off-by: Ingo Molnar --- kernel/trace/blktrace.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 05b4747fd87..e3abf55bc8e 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -262,6 +262,7 @@ static void blk_trace_free(struct blk_trace *bt) { debugfs_remove(bt->msg_file); debugfs_remove(bt->dropped_file); + debugfs_remove(bt->dir); relay_close(bt->rchan); free_percpu(bt->sequence); free_percpu(bt->msg_data); From fe64d517df0970a68417184a12fcd4ba0589cc28 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 10:01:18 +0100 Subject: [PATCH 1129/2061] GFS2: Umount recovery race fix This patch fixes a race condition where we can receive recovery requests part way through processing a umount. This was causing problems since the recovery thread had already gone away. Looking in more detail at the recovery code, it was really trying to implement a slight variation on a work queue, and that happens to align nicely with the recently introduced slow-work subsystem. As a result I've updated the code to use slow-work, rather than its own home grown variety of work queue. When using the wait_on_bit() function, I noticed that the wait function that was supplied as an argument was appearing in the WCHAN field, so I've updated the function names in order to produce more meaningful output. Signed-off-by: Steven Whitehouse --- fs/gfs2/Kconfig | 1 + fs/gfs2/glock.c | 21 ++++++-- fs/gfs2/incore.h | 14 ++---- fs/gfs2/main.c | 8 +++ fs/gfs2/ops_fstype.c | 20 +++----- fs/gfs2/ops_super.c | 25 +++++++++- fs/gfs2/recovery.c | 114 +++++++++++++++---------------------------- fs/gfs2/recovery.h | 2 +- fs/gfs2/sys.c | 53 ++++++++++---------- 9 files changed, 128 insertions(+), 130 deletions(-) diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index 3a981b7f64c..cad957cdb1e 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -7,6 +7,7 @@ config GFS2_FS select IP_SCTP if DLM_SCTP select FS_POSIX_ACL select CRC32 + select SLOW_WORK help A cluster filesystem. diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ff498109048..2bf62bcc518 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -796,22 +796,37 @@ void gfs2_holder_uninit(struct gfs2_holder *gh) gh->gh_ip = 0; } -static int just_schedule(void *word) +/** + * gfs2_glock_holder_wait + * @word: unused + * + * This function and gfs2_glock_demote_wait both show up in the WCHAN + * field. Thus I've separated these otherwise identical functions in + * order to be more informative to the user. + */ + +static int gfs2_glock_holder_wait(void *word) { schedule(); return 0; } +static int gfs2_glock_demote_wait(void *word) +{ + schedule(); + return 0; +} + static void wait_on_holder(struct gfs2_holder *gh) { might_sleep(); - wait_on_bit(&gh->gh_iflags, HIF_WAIT, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gh->gh_iflags, HIF_WAIT, gfs2_glock_holder_wait, TASK_UNINTERRUPTIBLE); } static void wait_on_demote(struct gfs2_glock *gl) { might_sleep(); - wait_on_bit(&gl->gl_flags, GLF_DEMOTE, just_schedule, TASK_UNINTERRUPTIBLE); + wait_on_bit(&gl->gl_flags, GLF_DEMOTE, gfs2_glock_demote_wait, TASK_UNINTERRUPTIBLE); } /** diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 65f438e9537..0060e9564bb 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -376,11 +377,11 @@ struct gfs2_journal_extent { struct gfs2_jdesc { struct list_head jd_list; struct list_head extent_list; - + struct slow_work jd_work; struct inode *jd_inode; + unsigned long jd_flags; +#define JDF_RECOVERY 1 unsigned int jd_jid; - int jd_dirty; - unsigned int jd_blocks; }; @@ -390,9 +391,6 @@ struct gfs2_statfs_change_host { s64 sc_dinodes; }; -#define GFS2_GLOCKD_DEFAULT 1 -#define GFS2_GLOCKD_MAX 16 - #define GFS2_QUOTA_DEFAULT GFS2_QUOTA_OFF #define GFS2_QUOTA_OFF 0 #define GFS2_QUOTA_ACCOUNT 1 @@ -427,7 +425,6 @@ struct gfs2_tune { unsigned int gt_incore_log_blocks; unsigned int gt_log_flush_secs; - unsigned int gt_recoverd_secs; unsigned int gt_logd_secs; unsigned int gt_quota_simul_sync; /* Max quotavals to sync at once */ @@ -448,6 +445,7 @@ enum { SDF_JOURNAL_LIVE = 1, SDF_SHUTDOWN = 2, SDF_NOBARRIERS = 3, + SDF_NORECOVERY = 4, }; #define GFS2_FSNAME_LEN 256 @@ -494,7 +492,6 @@ struct lm_lockstruct { unsigned long ls_flags; dlm_lockspace_t *ls_dlm; - int ls_recover_jid; int ls_recover_jid_done; int ls_recover_jid_status; }; @@ -583,7 +580,6 @@ struct gfs2_sbd { /* Daemon stuff */ - struct task_struct *sd_recoverd_process; struct task_struct *sd_logd_process; struct task_struct *sd_quotad_process; diff --git a/fs/gfs2/main.c b/fs/gfs2/main.c index a6892ed0840..eacd78a5d08 100644 --- a/fs/gfs2/main.c +++ b/fs/gfs2/main.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -113,12 +114,18 @@ static int __init init_gfs2_fs(void) if (error) goto fail_unregister; + error = slow_work_register_user(); + if (error) + goto fail_slow; + gfs2_register_debugfs(); printk("GFS2 (built %s %s) installed\n", __DATE__, __TIME__); return 0; +fail_slow: + unregister_filesystem(&gfs2meta_fs_type); fail_unregister: unregister_filesystem(&gfs2_fs_type); fail: @@ -156,6 +163,7 @@ static void __exit exit_gfs2_fs(void) gfs2_unregister_debugfs(); unregister_filesystem(&gfs2_fs_type); unregister_filesystem(&gfs2meta_fs_type); + slow_work_unregister_user(); kmem_cache_destroy(gfs2_quotad_cachep); kmem_cache_destroy(gfs2_rgrpd_cachep); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 7981fbc9fc3..2cd1164c88d 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -17,6 +17,7 @@ #include #include #include +#include #include "gfs2.h" #include "incore.h" @@ -55,7 +56,6 @@ static void gfs2_tune_init(struct gfs2_tune *gt) spin_lock_init(>->gt_spin); gt->gt_incore_log_blocks = 1024; - gt->gt_recoverd_secs = 60; gt->gt_logd_secs = 1; gt->gt_quota_simul_sync = 64; gt->gt_quota_warn_period = 10; @@ -675,6 +675,7 @@ static int gfs2_jindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ji_gh) break; INIT_LIST_HEAD(&jd->extent_list); + slow_work_init(&jd->jd_work, &gfs2_recover_ops); jd->jd_inode = gfs2_lookupi(sdp->sd_jindex, &name, 1); if (!jd->jd_inode || IS_ERR(jd->jd_inode)) { if (!jd->jd_inode) @@ -700,14 +701,13 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) { struct inode *master = sdp->sd_master_dir->d_inode; struct gfs2_holder ji_gh; - struct task_struct *p; struct gfs2_inode *ip; int jindex = 1; int error = 0; if (undo) { jindex = 0; - goto fail_recoverd; + goto fail_jinode_gh; } sdp->sd_jindex = gfs2_lookup_simple(master, "jindex"); @@ -800,18 +800,8 @@ static int init_journal(struct gfs2_sbd *sdp, int undo) gfs2_glock_dq_uninit(&ji_gh); jindex = 0; - p = kthread_run(gfs2_recoverd, sdp, "gfs2_recoverd"); - error = IS_ERR(p); - if (error) { - fs_err(sdp, "can't start recoverd thread: %d\n", error); - goto fail_jinode_gh; - } - sdp->sd_recoverd_process = p; - return 0; -fail_recoverd: - kthread_stop(sdp->sd_recoverd_process); fail_jinode_gh: if (!sdp->sd_args.ar_spectator) gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); @@ -1172,8 +1162,10 @@ static int fill_super(struct super_block *sb, void *data, int silent) goto fail; } - if (sdp->sd_args.ar_spectator) + if (sdp->sd_args.ar_spectator) { sb->s_flags |= MS_RDONLY; + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + } if (sdp->sd_args.ar_posix_acl) sb->s_flags |= MS_POSIXACL; diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index 0677a837856..a3c2272e7ca 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -121,6 +121,12 @@ static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) return error; } +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + /** * gfs2_put_super - Unmount the filesystem * @sb: The VFS superblock @@ -131,6 +137,7 @@ static void gfs2_put_super(struct super_block *sb) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; + struct gfs2_jdesc *jd; /* Unfreeze the filesystem, if we need to */ @@ -139,9 +146,25 @@ static void gfs2_put_super(struct super_block *sb) gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); mutex_unlock(&sdp->sd_freeze_lock); + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + kthread_stop(sdp->sd_quotad_process); kthread_stop(sdp->sd_logd_process); - kthread_stop(sdp->sd_recoverd_process); if (!(sb->s_flags & MS_RDONLY)) { error = gfs2_make_fs_ro(sdp); diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index 247e8f7d6b3..59d2695509d 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -13,8 +13,7 @@ #include #include #include -#include -#include +#include #include "gfs2.h" #include "incore.h" @@ -441,18 +440,25 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid, kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp); } -/** - * gfs2_recover_journal - recover a given journal - * @jd: the struct gfs2_jdesc describing the journal - * - * Acquire the journal's lock, check to see if the journal is clean, and - * do recovery if necessary. - * - * Returns: errno - */ - -int gfs2_recover_journal(struct gfs2_jdesc *jd) +static int gfs2_recover_get_ref(struct slow_work *work) { + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + if (test_and_set_bit(JDF_RECOVERY, &jd->jd_flags)) + return -EBUSY; + return 0; +} + +static void gfs2_recover_put_ref(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); + clear_bit(JDF_RECOVERY, &jd->jd_flags); + smp_mb__after_clear_bit(); + wake_up_bit(&jd->jd_flags, JDF_RECOVERY); +} + +static void gfs2_recover_work(struct slow_work *work) +{ + struct gfs2_jdesc *jd = container_of(work, struct gfs2_jdesc, jd_work); struct gfs2_inode *ip = GFS2_I(jd->jd_inode); struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host head; @@ -569,7 +575,7 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd) gfs2_glock_dq_uninit(&j_gh); fs_info(sdp, "jid=%u: Done\n", jd->jd_jid); - return 0; + return; fail_gunlock_tr: gfs2_glock_dq_uninit(&t_gh); @@ -584,70 +590,28 @@ fail_gunlock_j: fail: gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_GAVEUP); - return error; } -static struct gfs2_jdesc *gfs2_jdesc_find_dirty(struct gfs2_sbd *sdp) +struct slow_work_ops gfs2_recover_ops = { + .get_ref = gfs2_recover_get_ref, + .put_ref = gfs2_recover_put_ref, + .execute = gfs2_recover_work, +}; + + +static int gfs2_recovery_wait(void *word) { - struct gfs2_jdesc *jd; - int found = 0; - - spin_lock(&sdp->sd_jindex_spin); - - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (jd->jd_dirty) { - jd->jd_dirty = 0; - found = 1; - break; - } - } - spin_unlock(&sdp->sd_jindex_spin); - - if (!found) - jd = NULL; - - return jd; -} - -/** - * gfs2_check_journals - Recover any dirty journals - * @sdp: the filesystem - * - */ - -static void gfs2_check_journals(struct gfs2_sbd *sdp) -{ - struct gfs2_jdesc *jd; - - for (;;) { - jd = gfs2_jdesc_find_dirty(sdp); - if (!jd) - break; - - if (jd != sdp->sd_jdesc) - gfs2_recover_journal(jd); - } -} - -/** - * gfs2_recoverd - Recover dead machine's journals - * @sdp: Pointer to GFS2 superblock - * - */ - -int gfs2_recoverd(void *data) -{ - struct gfs2_sbd *sdp = data; - unsigned long t; - - while (!kthread_should_stop()) { - gfs2_check_journals(sdp); - t = gfs2_tune_get(sdp, gt_recoverd_secs) * HZ; - if (freezing(current)) - refrigerator(); - schedule_timeout_interruptible(t); - } - + schedule(); + return 0; +} + +int gfs2_recover_journal(struct gfs2_jdesc *jd) +{ + int rv; + rv = slow_work_enqueue(&jd->jd_work); + if (rv) + return rv; + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, gfs2_recovery_wait, TASK_UNINTERRUPTIBLE); return 0; } diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index a8218ea15b5..1616ac22569 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -28,7 +28,7 @@ extern void gfs2_revoke_clean(struct gfs2_sbd *sdp); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head); extern int gfs2_recover_journal(struct gfs2_jdesc *gfs2_jd); -extern int gfs2_recoverd(void *data); +extern struct slow_work_ops gfs2_recover_ops; #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index 894bf773ec9..9f6d48b75fd 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -356,34 +356,33 @@ static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf) return sprintf(buf, "%d\n", ls->ls_first_done); } -static ssize_t recover_show(struct gfs2_sbd *sdp, char *buf) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - return sprintf(buf, "%d\n", ls->ls_recover_jid); -} - -static void gfs2_jdesc_make_dirty(struct gfs2_sbd *sdp, unsigned int jid) +static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) { + unsigned jid; struct gfs2_jdesc *jd; + int rv; + rv = sscanf(buf, "%u", &jid); + if (rv != 1) + return -EINVAL; + + rv = -ESHUTDOWN; spin_lock(&sdp->sd_jindex_spin); + if (test_bit(SDF_NORECOVERY, &sdp->sd_flags)) + goto out; + rv = -EBUSY; + if (sdp->sd_jdesc->jd_jid == jid) + goto out; + rv = -ENOENT; list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { if (jd->jd_jid != jid) continue; - jd->jd_dirty = 1; + rv = slow_work_enqueue(&jd->jd_work); break; } +out: spin_unlock(&sdp->sd_jindex_spin); -} - -static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len) -{ - struct lm_lockstruct *ls = &sdp->sd_lockstruct; - ls->ls_recover_jid = simple_strtol(buf, NULL, 0); - gfs2_jdesc_make_dirty(sdp, ls->ls_recover_jid); - if (sdp->sd_recoverd_process) - wake_up_process(sdp->sd_recoverd_process); - return len; + return rv ? rv : len; } static ssize_t recover_done_show(struct gfs2_sbd *sdp, char *buf) @@ -401,15 +400,15 @@ static ssize_t recover_status_show(struct gfs2_sbd *sdp, char *buf) #define GDLM_ATTR(_name,_mode,_show,_store) \ static struct gfs2_attr gdlm_attr_##_name = __ATTR(_name,_mode,_show,_store) -GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); -GDLM_ATTR(block, 0644, block_show, block_store); -GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); -GDLM_ATTR(id, 0444, lkid_show, NULL); -GDLM_ATTR(first, 0444, lkfirst_show, NULL); -GDLM_ATTR(first_done, 0444, first_done_show, NULL); -GDLM_ATTR(recover, 0644, recover_show, recover_store); -GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); -GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); +GDLM_ATTR(proto_name, 0444, proto_name_show, NULL); +GDLM_ATTR(block, 0644, block_show, block_store); +GDLM_ATTR(withdraw, 0644, withdraw_show, withdraw_store); +GDLM_ATTR(id, 0444, lkid_show, NULL); +GDLM_ATTR(first, 0444, lkfirst_show, NULL); +GDLM_ATTR(first_done, 0444, first_done_show, NULL); +GDLM_ATTR(recover, 0200, NULL, recover_store); +GDLM_ATTR(recover_done, 0444, recover_done_show, NULL); +GDLM_ATTR(recover_status, 0444, recover_status_show, NULL); static struct attribute *lock_module_attrs[] = { &gdlm_attr_proto_name.attr, From e9ccb73ab57ada469602506496c42e5b4468ac3e Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 10:23:23 +0100 Subject: [PATCH 1130/2061] GFS2: Update docs Update a few things which were out of date, and fix a typo. Signed-off-by: Steven Whitehouse --- Documentation/filesystems/gfs2-glocks.txt | 2 +- Documentation/filesystems/gfs2.txt | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) diff --git a/Documentation/filesystems/gfs2-glocks.txt b/Documentation/filesystems/gfs2-glocks.txt index 4dae9a3840b..0494f78d87e 100644 --- a/Documentation/filesystems/gfs2-glocks.txt +++ b/Documentation/filesystems/gfs2-glocks.txt @@ -60,7 +60,7 @@ go_lock | Called for the first local holder of a lock go_unlock | Called on the final local unlock of a lock go_dump | Called to print content of object for debugfs file, or on | error to dump glock to the log. -go_type; | The type of the glock, LM_TYPE_..... +go_type | The type of the glock, LM_TYPE_..... go_min_hold_time | The minimum hold time The minimum hold time for each lock is the time after a remote lock diff --git a/Documentation/filesystems/gfs2.txt b/Documentation/filesystems/gfs2.txt index 593004b6bba..5e3ab8f3bef 100644 --- a/Documentation/filesystems/gfs2.txt +++ b/Documentation/filesystems/gfs2.txt @@ -11,18 +11,15 @@ their I/O so file system consistency is maintained. One of the nifty features of GFS is perfect consistency -- changes made to the file system on one machine show up immediately on all other machines in the cluster. -GFS uses interchangable inter-node locking mechanisms. Different lock -modules can plug into GFS and each file system selects the appropriate -lock module at mount time. Lock modules include: +GFS uses interchangable inter-node locking mechanisms, the currently +supported mechanisms are: lock_nolock -- allows gfs to be used as a local file system lock_dlm -- uses a distributed lock manager (dlm) for inter-node locking The dlm is found at linux/fs/dlm/ -In addition to interfacing with an external locking manager, a gfs lock -module is responsible for interacting with external cluster management -systems. Lock_dlm depends on user space cluster management systems found +Lock_dlm depends on user space cluster management systems found at the URL above. To use gfs as a local file system, no external clustering systems are @@ -31,13 +28,19 @@ needed, simply: $ mkfs -t gfs2 -p lock_nolock -j 1 /dev/block_device $ mount -t gfs2 /dev/block_device /dir -GFS2 is not on-disk compatible with previous versions of GFS. +If you are using Fedora, you need to install the gfs2-utils package +and, for lock_dlm, you will also need to install the cman package +and write a cluster.conf as per the documentation. + +GFS2 is not on-disk compatible with previous versions of GFS, but it +is pretty close. The following man pages can be found at the URL above: - gfs2_fsck to repair a filesystem + fsck.gfs2 to repair a filesystem gfs2_grow to expand a filesystem online gfs2_jadd to add journals to a filesystem online gfs2_tool to manipulate, examine and tune a filesystem gfs2_quota to examine and change quota values in a filesystem + gfs2_convert to convert a gfs filesystem to gfs2 in-place mount.gfs2 to help mount(8) mount a filesystem mkfs.gfs2 to make a filesystem From 3755100dd5f66761aaaa7ae44c70b319a7c78a56 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 May 2009 18:33:04 +0900 Subject: [PATCH 1131/2061] ub: use __blk_end_request_all() ub_end_rq() always tries to complete full request. The @cmd_len parameter was there because rq->data_len used to be overwritten with residue count. Drop @cmd_len and use __blk_end_request_all(). Signed-off-by: Tejun Heo Cc: Pete Zaitcev Signed-off-by: Jens Axboe --- drivers/block/ub.c | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/drivers/block/ub.c b/drivers/block/ub.c index 178f459a50e..f32781cff45 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -360,8 +360,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, static void ub_cmd_build_packet(struct ub_dev *sc, struct ub_lun *lun, struct ub_scsi_cmd *cmd, struct ub_request *urq); static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd); -static void ub_end_rq(struct request *rq, unsigned int status, - unsigned int cmd_len); +static void ub_end_rq(struct request *rq, unsigned int status); static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun, struct ub_request *urq, struct ub_scsi_cmd *cmd); static int ub_submit_scsi(struct ub_dev *sc, struct ub_scsi_cmd *cmd); @@ -644,13 +643,13 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) if (atomic_read(&sc->poison)) { blk_start_request(rq); - ub_end_rq(rq, DID_NO_CONNECT << 16, blk_rq_bytes(rq)); + ub_end_rq(rq, DID_NO_CONNECT << 16); return 0; } if (lun->changed && !blk_pc_request(rq)) { blk_start_request(rq); - ub_end_rq(rq, SAM_STAT_CHECK_CONDITION, blk_rq_bytes(rq)); + ub_end_rq(rq, SAM_STAT_CHECK_CONDITION); return 0; } @@ -702,7 +701,7 @@ static int ub_request_fn_1(struct ub_lun *lun, struct request *rq) drop: ub_put_cmd(lun, cmd); - ub_end_rq(rq, DID_ERROR << 16, blk_rq_bytes(rq)); + ub_end_rq(rq, DID_ERROR << 16); return 0; } @@ -777,7 +776,6 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) struct ub_request *urq = cmd->back; struct request *rq; unsigned int scsi_status; - unsigned int cmd_len; rq = urq->rq; @@ -816,17 +814,14 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) urq->rq = NULL; - cmd_len = cmd->len; ub_put_cmd(lun, cmd); - ub_end_rq(rq, scsi_status, cmd_len); + ub_end_rq(rq, scsi_status); blk_start_queue(lun->disk->queue); } -static void ub_end_rq(struct request *rq, unsigned int scsi_status, - unsigned int cmd_len) +static void ub_end_rq(struct request *rq, unsigned int scsi_status) { int error; - long rqlen; if (scsi_status == 0) { error = 0; @@ -834,12 +829,7 @@ static void ub_end_rq(struct request *rq, unsigned int scsi_status, error = -EIO; rq->errors = scsi_status; } - rqlen = blk_rq_bytes(rq); /* Oddly enough, this is the residue. */ - if (__blk_end_request(rq, error, cmd_len)) { - printk(KERN_WARNING DRV_NAME - ": __blk_end_request blew, %s-cmd total %u rqlen %ld\n", - blk_pc_request(rq)? "pc": "fs", cmd_len, rqlen); - } + __blk_end_request_all(rq, error); } static int ub_rw_cmd_retry(struct ub_dev *sc, struct ub_lun *lun, From 5f49f63178360b07a095bd33b0d850d60edf7590 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 May 2009 18:33:05 +0900 Subject: [PATCH 1132/2061] block: set rq->resid_len to blk_rq_bytes() on issue In commit c3a4d78c580de4edc9ef0f7c59812fb02ceb037f, while introducing rq->resid_len, the default value of residue count was changed from full count to zero. The conversion was done under the assumption that when a request fails residue count wasn't defined. However, Boaz and James pointed out that this wasn't true and the residue count should be preserved for failed requests too. This patchset restores the original behavior by setting rq->resid_len to blk_rq_bytes(rq) on request start and restoring explicit clearing in affected drivers. While at it, take advantage of the fact that rq->resid_len is set to full count where applicable. * ide-cd: rq->resid_len cleared on pc success * mptsas: req->resid_len cleared on success * sas_expander: rsp/req->resid_len cleared on success * mpt2sas_transport: req->resid_len cleared on success * ide-cd, ide-tape, mptsas, sas_host_smp, mpt2sas_transport, ub: take advantage of initial full count to simplify code Boaz Harrosh spotted bug in resid_len initialization. Fixed as suggested. Signed-off-by: Tejun Heo Acked-by: Borislav Petkov Cc: Boaz Harrosh Cc: James Bottomley Cc: Pete Zaitcev Cc: Bartlomiej Zolnierkiewicz Cc: Sergei Shtylyov Cc: Eric Moore Cc: Darrick J. Wong Signed-off-by: Jens Axboe --- block/blk-core.c | 5 +++-- drivers/block/ub.c | 6 ++++-- drivers/ide/ide-cd.c | 4 ++-- drivers/ide/ide-tape.c | 2 +- drivers/message/fusion/mptsas.c | 3 ++- drivers/scsi/libsas/sas_expander.c | 4 ++++ drivers/scsi/libsas/sas_host_smp.c | 3 --- drivers/scsi/mpt2sas/mpt2sas_transport.c | 4 ++-- 8 files changed, 18 insertions(+), 13 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index a2d97de1a12..e3f7e6a3a09 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1783,9 +1783,10 @@ void blk_start_request(struct request *req) blk_dequeue_request(req); /* - * We are now handing the request to the hardware, add the - * timeout handler. + * We are now handing the request to the hardware, initialize + * resid_len to full count and add the timeout handler. */ + req->resid_len = blk_rq_bytes(req); blk_add_timer(req); } EXPORT_SYMBOL(blk_start_request); diff --git a/drivers/block/ub.c b/drivers/block/ub.c index f32781cff45..e67bbae9547 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -781,8 +781,10 @@ static void ub_rw_cmd_done(struct ub_dev *sc, struct ub_scsi_cmd *cmd) if (cmd->error == 0) { if (blk_pc_request(rq)) { - if (cmd->act_len < blk_rq_bytes(rq)) - rq->resid_len = blk_rq_bytes(rq) - cmd->act_len; + if (cmd->act_len >= rq->resid_len) + rq->resid_len = 0; + else + rq->resid_len -= cmd->act_len; scsi_status = 0; } else { if (cmd->act_len != cmd->len) { diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 4c7792fd5f9..081aed6781c 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -699,6 +699,7 @@ static ide_startstop_t cdrom_newpc_intr(ide_drive_t *drive) out_end: if (blk_pc_request(rq) && rc == 0) { + rq->resid_len = 0; blk_end_request_all(rq, 0); hwif->rq = NULL; } else { @@ -718,8 +719,7 @@ out_end: /* make sure it's fully ended */ if (blk_fs_request(rq) == 0) { - rq->resid_len = blk_rq_bytes(rq) - - (cmd->nbytes - cmd->nleft); + rq->resid_len -= cmd->nbytes - cmd->nleft; if (uptodate == 0 && (cmd->tf_flags & IDE_TFLAG_WRITE)) rq->resid_len += cmd->last_xfer_len; } diff --git a/drivers/ide/ide-tape.c b/drivers/ide/ide-tape.c index e16604562f2..683ff37d407 100644 --- a/drivers/ide/ide-tape.c +++ b/drivers/ide/ide-tape.c @@ -380,7 +380,7 @@ static int ide_tape_callback(ide_drive_t *drive, int dsc) } tape->first_frame += blocks; - rq->resid_len = blk_rq_bytes(rq) - blocks * tape->blk_size; + rq->resid_len -= blocks * tape->blk_size; if (pc->error) { uptodate = 0; diff --git a/drivers/message/fusion/mptsas.c b/drivers/message/fusion/mptsas.c index 4e6fcf06a6f..79f5433359f 100644 --- a/drivers/message/fusion/mptsas.c +++ b/drivers/message/fusion/mptsas.c @@ -1357,7 +1357,8 @@ static int mptsas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, smprep = (SmpPassthroughReply_t *)ioc->sas_mgmt.reply; memcpy(req->sense, smprep, sizeof(*smprep)); req->sense_len = sizeof(*smprep); - rsp->resid_len = blk_rq_bytes(rsp) - smprep->ResponseDataLength; + req->resid_len = 0; + rsp->resid_len -= smprep->ResponseDataLength; } else { printk(MYIOC_s_ERR_FMT "%s: smp passthru reply failed to be returned\n", ioc->name, __func__); diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 531af9ed719..54fa1e42dc4 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -1937,7 +1937,11 @@ int sas_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, if (ret > 0) { /* positive number is the untransferred residual */ rsp->resid_len = ret; + req->resid_len = 0; ret = 0; + } else if (ret == 0) { + rsp->resid_len = 0; + req->resid_len = 0; } return ret; diff --git a/drivers/scsi/libsas/sas_host_smp.c b/drivers/scsi/libsas/sas_host_smp.c index be9a951b977..1bc3b756799 100644 --- a/drivers/scsi/libsas/sas_host_smp.c +++ b/drivers/scsi/libsas/sas_host_smp.c @@ -176,9 +176,6 @@ int sas_smp_host_handler(struct Scsi_Host *shost, struct request *req, resp_data[1] = req_data[1]; resp_data[2] = SMP_RESP_FUNC_UNK; - req->resid_len = blk_rq_bytes(req); - rsp->resid_len = blk_rq_bytes(rsp); - switch (req_data[1]) { case SMP_REPORT_GENERAL: req->resid_len -= 8; diff --git a/drivers/scsi/mpt2sas/mpt2sas_transport.c b/drivers/scsi/mpt2sas/mpt2sas_transport.c index af95a449930..5c65da519e3 100644 --- a/drivers/scsi/mpt2sas/mpt2sas_transport.c +++ b/drivers/scsi/mpt2sas/mpt2sas_transport.c @@ -1170,8 +1170,8 @@ transport_smp_handler(struct Scsi_Host *shost, struct sas_rphy *rphy, memcpy(req->sense, mpi_reply, sizeof(*mpi_reply)); req->sense_len = sizeof(*mpi_reply); - rsp->resid_len = blk_rq_bytes(rsp) - - mpi_reply->ResponseDataLength; + req->resid_len = 0; + rsp->resid_len -= mpi_reply->ResponseDataLength; } else { dtransportprintk(ioc, printk(MPT2SAS_DEBUG_FMT "%s - no reply\n", ioc->name, __func__)); From 4fc981ef9e7c0953d5c4896ce088b19c50cb018f Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 May 2009 18:33:06 +0900 Subject: [PATCH 1133/2061] bio: always copy back data for copied kernel requests When a read bio_copy_kern() request fails, the content of the bounce buffer is not copied back. However, as request failure doesn't necessarily mean complete failure, the buffer state can be useful. This behavior is also inconsistent with the user map counterpart and causes the subtle difference between bounced and unbounced IO causes confusion. This patch makes bio_copy_kern_endio() ignore @err and always copy back data on request completion. Signed-off-by: Tejun Heo Cc: Boaz Harrosh Cc: James Bottomley Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/bio.c b/fs/bio.c index 7bbc98f0eda..ee3bc67833d 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1198,7 +1198,7 @@ static void bio_copy_kern_endio(struct bio *bio, int err) char *addr = page_address(bvec->bv_page); int len = bmd->iovecs[i].bv_len; - if (read && !err) + if (read) memcpy(p, addr, len); __free_page(bvec->bv_page); From b2858d7d1639c04ca3c54988d76c5f7300b76f1c Mon Sep 17 00:00:00 2001 From: Miklos Szeredi Date: Tue, 19 May 2009 11:37:46 +0200 Subject: [PATCH 1134/2061] splice: fix kmaps in default_file_splice_write() Unfortunately multiple kmap() within a single thread are deadlockable, so writing out multiple buffers with writev() isn't possible. Change the implementation so that it does a separate write() for each buffer. This actually simplifies the code a lot since the splice_from_pipe() helper can be used. This limitation is caused by HIGHMEM pages, and so only affects a subset of architectures and configurations. In the future it may be worth to implement default_file_splice_write() in a more efficient way on configs that allow it. Signed-off-by: Miklos Szeredi Signed-off-by: Jens Axboe --- fs/splice.c | 130 +++++++++------------------------------------------- 1 file changed, 22 insertions(+), 108 deletions(-) diff --git a/fs/splice.c b/fs/splice.c index 41179c0a655..73766d24f97 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -535,8 +535,8 @@ static ssize_t kernel_readv(struct file *file, const struct iovec *vec, return res; } -static ssize_t kernel_writev(struct file *file, const struct iovec *vec, - unsigned long vlen, loff_t *ppos) +static ssize_t kernel_write(struct file *file, const char *buf, size_t count, + loff_t pos) { mm_segment_t old_fs; ssize_t res; @@ -544,7 +544,7 @@ static ssize_t kernel_writev(struct file *file, const struct iovec *vec, old_fs = get_fs(); set_fs(get_ds()); /* The cast to a user pointer is valid due to the set_fs() */ - res = vfs_writev(file, (const struct iovec __user *)vec, vlen, ppos); + res = vfs_write(file, (const char __user *)buf, count, &pos); set_fs(old_fs); return res; @@ -1003,120 +1003,34 @@ generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, EXPORT_SYMBOL(generic_file_splice_write); -static struct pipe_buffer *nth_pipe_buf(struct pipe_inode_info *pipe, int n) +static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf, + struct splice_desc *sd) { - return &pipe->bufs[(pipe->curbuf + n) % PIPE_BUFFERS]; + int ret; + void *data; + + ret = buf->ops->confirm(pipe, buf); + if (ret) + return ret; + + data = buf->ops->map(pipe, buf, 0); + ret = kernel_write(sd->u.file, data + buf->offset, sd->len, sd->pos); + buf->ops->unmap(pipe, buf, data); + + return ret; } static ssize_t default_file_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - ssize_t ret = 0; - ssize_t total_len = 0; - int do_wakeup = 0; + ssize_t ret; - pipe_lock(pipe); - while (len) { - struct pipe_buffer *buf; - void *data[PIPE_BUFFERS]; - struct iovec vec[PIPE_BUFFERS]; - unsigned int nr_pages = 0; - unsigned int write_len = 0; - unsigned int now_len = len; - unsigned int this_len; - int i; + ret = splice_from_pipe(pipe, out, ppos, len, flags, write_pipe_buf); + if (ret > 0) + *ppos += ret; - BUG_ON(pipe->nrbufs > PIPE_BUFFERS); - for (i = 0; i < pipe->nrbufs && now_len; i++) { - buf = nth_pipe_buf(pipe, i); - - ret = buf->ops->confirm(pipe, buf); - if (ret) - break; - - data[i] = buf->ops->map(pipe, buf, 0); - this_len = min(buf->len, now_len); - vec[i].iov_base = (void __user *) data[i] + buf->offset; - vec[i].iov_len = this_len; - now_len -= this_len; - write_len += this_len; - nr_pages++; - } - - if (nr_pages) { - ret = kernel_writev(out, vec, nr_pages, ppos); - if (ret == 0) - ret = -EIO; - if (ret > 0) { - len -= ret; - total_len += ret; - } - } - - for (i = 0; i < nr_pages; i++) { - buf = nth_pipe_buf(pipe, i); - buf->ops->unmap(pipe, buf, data[i]); - - if (ret > 0) { - this_len = min_t(unsigned, vec[i].iov_len, ret); - buf->offset += this_len; - buf->len -= this_len; - ret -= this_len; - } - } - - if (ret < 0) - break; - - while (pipe->nrbufs) { - const struct pipe_buf_operations *ops; - - buf = nth_pipe_buf(pipe, 0); - if (buf->len) - break; - - ops = buf->ops; - buf->ops = NULL; - ops->release(pipe, buf); - pipe->curbuf = (pipe->curbuf + 1) % PIPE_BUFFERS; - pipe->nrbufs--; - if (pipe->inode) - do_wakeup = 1; - } - - if (pipe->nrbufs) - continue; - if (!pipe->writers) - break; - if (!pipe->waiting_writers) { - if (total_len) - break; - } - - if (flags & SPLICE_F_NONBLOCK) { - ret = -EAGAIN; - break; - } - - if (signal_pending(current)) { - ret = -ERESTARTSYS; - break; - } - - if (do_wakeup) { - wakeup_pipe_writers(pipe); - do_wakeup = 0; - } - - pipe_wait(pipe); - } - pipe_unlock(pipe); - - if (do_wakeup) - wakeup_pipe_writers(pipe); - - return total_len ? total_len : ret; + return ret; } /** From 3a5a39276d2a32b05b1ee25b384516805b17cf87 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Sun, 17 May 2009 18:55:18 +0300 Subject: [PATCH 1135/2061] block: allow blk_rq_map_kern to append to requests Use blk_rq_append_bio() internally instead of blk_rq_bio_prep() so blk_rq_map_kern can be called multiple times, to map multiple buffers. This is in the effort to un-export blk_rq_append_bio() Signed-off-by: James Bottomley Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-map.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index 56082bea450..caa05a66774 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -282,7 +282,8 @@ EXPORT_SYMBOL(blk_rq_unmap_user); * * Description: * Data will be mapped directly if possible. Otherwise a bounce - * buffer is used. + * buffer is used. Can be called multple times to append multple + * buffers. */ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, unsigned int len, gfp_t gfp_mask) @@ -290,6 +291,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, int reading = rq_data_dir(rq) == READ; int do_copy = 0; struct bio *bio; + int ret; if (len > (q->max_hw_sectors << 9)) return -EINVAL; @@ -311,7 +313,13 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (do_copy) rq->cmd_flags |= REQ_COPY_USER; - blk_rq_bio_prep(q, rq, bio); + ret = blk_rq_append_bio(q, rq, bio); + if (unlikely(ret)) { + /* request is too big */ + bio_put(bio); + return ret; + } + blk_queue_bounce(q, &rq->bio); rq->buffer = NULL; return 0; From bc38bf106c967389a465d926be22c7371abba69d Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 17 May 2009 18:56:17 +0300 Subject: [PATCH 1136/2061] libosd: Use new blk_rq_map_kern Now that blk_rq_map_kern will append the buffer onto the request we can use it easily for adding extra segments (eg. attributes) This patch is dependent on a block layer patch titled: [BLOCK] allow blk_rq_map_kern to append to requests Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- drivers/scsi/osd/osd_initiator.c | 24 ++---------------------- 1 file changed, 2 insertions(+), 22 deletions(-) diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index d178a8b799e..bf66e301866 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -826,26 +826,6 @@ int osd_req_add_set_attr_list(struct osd_request *or, } EXPORT_SYMBOL(osd_req_add_set_attr_list); -static int _append_map_kern(struct request *req, - void *buff, unsigned len, gfp_t flags) -{ - struct bio *bio; - int ret; - - bio = bio_map_kern(req->q, buff, len, flags); - if (IS_ERR(bio)) { - OSD_ERR("Failed bio_map_kern(%p, %d) => %ld\n", buff, len, - PTR_ERR(bio)); - return PTR_ERR(bio); - } - ret = blk_rq_append_bio(req->q, req, bio); - if (ret) { - OSD_ERR("Failed blk_rq_append_bio(%p) => %d\n", bio, ret); - bio_put(bio); - } - return ret; -} - static int _req_append_segment(struct osd_request *or, unsigned padding, struct _osd_req_data_segment *seg, struct _osd_req_data_segment *last_seg, struct _osd_io_info *io) @@ -861,14 +841,14 @@ static int _req_append_segment(struct osd_request *or, else pad_buff = io->pad_buff; - ret = _append_map_kern(io->req, pad_buff, padding, + ret = blk_rq_map_kern(io->req->q, io->req, pad_buff, padding, or->alloc_flags); if (ret) return ret; io->total_bytes += padding; } - ret = _append_map_kern(io->req, seg->buff, seg->total_bytes, + ret = blk_rq_map_kern(io->req->q, io->req, seg->buff, seg->total_bytes, or->alloc_flags); if (ret) return ret; From 79eb63e9e5875b84341a3a05f8e6ae9cdb4bb6f6 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 17 May 2009 18:57:15 +0300 Subject: [PATCH 1137/2061] block: Add blk_make_request(), takes bio, returns a request New block API: given a struct bio allocates a new request. This is the parallel of generic_make_request for BLOCK_PC commands users. The passed bio may be a chained-bio. The bio is bounced if needed inside the call to this member. This is in the effort of un-exporting blk_rq_append_bio(). Signed-off-by: Boaz Harrosh CC: Jeff Garzik Signed-off-by: Jens Axboe --- block/blk-core.c | 45 ++++++++++++++++++++++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 47 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index e3f7e6a3a09..bec1d69952d 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -890,6 +890,51 @@ struct request *blk_get_request(struct request_queue *q, int rw, gfp_t gfp_mask) } EXPORT_SYMBOL(blk_get_request); +/** + * blk_make_request - given a bio, allocate a corresponding struct request. + * + * @bio: The bio describing the memory mappings that will be submitted for IO. + * It may be a chained-bio properly constructed by block/bio layer. + * + * blk_make_request is the parallel of generic_make_request for BLOCK_PC + * type commands. Where the struct request needs to be farther initialized by + * the caller. It is passed a &struct bio, which describes the memory info of + * the I/O transfer. + * + * The caller of blk_make_request must make sure that bi_io_vec + * are set to describe the memory buffers. That bio_data_dir() will return + * the needed direction of the request. (And all bio's in the passed bio-chain + * are properly set accordingly) + * + * If called under none-sleepable conditions, mapped bio buffers must not + * need bouncing, by calling the appropriate masked or flagged allocator, + * suitable for the target device. Otherwise the call to blk_queue_bounce will + * BUG. + */ +struct request *blk_make_request(struct request_queue *q, struct bio *bio, + gfp_t gfp_mask) +{ + struct request *rq = blk_get_request(q, bio_data_dir(bio), gfp_mask); + + if (unlikely(!rq)) + return ERR_PTR(-ENOMEM); + + for_each_bio(bio) { + struct bio *bounce_bio = bio; + int ret; + + blk_queue_bounce(q, &bounce_bio); + ret = blk_rq_append_bio(q, rq, bounce_bio); + if (unlikely(ret)) { + blk_put_request(rq); + return ERR_PTR(ret); + } + } + + return rq; +} +EXPORT_SYMBOL(blk_make_request); + /** * blk_requeue_request - put a request back on queue * @q: request queue where request should be inserted diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f9d60a78c08..88a83e112c9 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -740,6 +740,8 @@ extern void blk_rq_init(struct request_queue *q, struct request *rq); extern void blk_put_request(struct request *); extern void __blk_put_request(struct request_queue *, struct request *); extern struct request *blk_get_request(struct request_queue *, int, gfp_t); +extern struct request *blk_make_request(struct request_queue *, struct bio *, + gfp_t); extern void blk_insert_request(struct request_queue *, struct request *, int, void *); extern void blk_requeue_request(struct request_queue *, struct request *); extern int blk_rq_check_limits(struct request_queue *q, struct request *rq); From c29b70f6ee4f2fa3ef07f55bc9082945861e5391 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 17 May 2009 18:58:41 +0300 Subject: [PATCH 1138/2061] libosd: Use of new blk_make_request Use new blk_make_request() to allocate a request from bio and avoid using deprecated blk_rq_append_bio(). This patch is dependent on a block layer patch titled: [BLOCK] New blk_make_request() takes bio returns request This is the last usage of blk_rq_append_bio in osd, it can now be un-exported. Signed-off-by: Boaz Harrosh CC: Jeff Garzik CC: FUJITA Tomonori Signed-off-by: Jens Axboe --- drivers/scsi/osd/osd_initiator.c | 48 +++++++++++++++----------------- 1 file changed, 23 insertions(+), 25 deletions(-) diff --git a/drivers/scsi/osd/osd_initiator.c b/drivers/scsi/osd/osd_initiator.c index bf66e301866..865ec0f4aa8 100644 --- a/drivers/scsi/osd/osd_initiator.c +++ b/drivers/scsi/osd/osd_initiator.c @@ -1200,6 +1200,21 @@ static int _osd_req_finalize_data_integrity(struct osd_request *or, /* * osd_finalize_request and helpers */ +static struct request *_make_request(struct request_queue *q, bool has_write, + struct _osd_io_info *oii, gfp_t flags) +{ + if (oii->bio) + return blk_make_request(q, oii->bio, flags); + else { + struct request *req; + + req = blk_get_request(q, has_write ? WRITE : READ, flags); + if (unlikely(!req)) + return ERR_PTR(-ENOMEM); + + return req; + } +} static int _init_blk_request(struct osd_request *or, bool has_in, bool has_out) @@ -1208,11 +1223,13 @@ static int _init_blk_request(struct osd_request *or, struct scsi_device *scsi_device = or->osd_dev->scsi_device; struct request_queue *q = scsi_device->request_queue; struct request *req; - int ret = -ENOMEM; + int ret; - req = blk_get_request(q, has_out, flags); - if (!req) + req = _make_request(q, has_out, has_out ? &or->out : &or->in, flags); + if (IS_ERR(req)) { + ret = PTR_ERR(req); goto out; + } or->request = req; req->cmd_type = REQ_TYPE_BLOCK_PC; @@ -1225,9 +1242,10 @@ static int _init_blk_request(struct osd_request *or, or->out.req = req; if (has_in) { /* allocate bidi request */ - req = blk_get_request(q, READ, flags); - if (!req) { + req = _make_request(q, false, &or->in, flags); + if (IS_ERR(req)) { OSD_DEBUG("blk_get_request for bidi failed\n"); + ret = PTR_ERR(req); goto out; } req->cmd_type = REQ_TYPE_BLOCK_PC; @@ -1271,26 +1289,6 @@ int osd_finalize_request(struct osd_request *or, return ret; } - if (or->out.bio) { - ret = blk_rq_append_bio(or->request->q, or->out.req, - or->out.bio); - if (ret) { - OSD_DEBUG("blk_rq_append_bio out failed\n"); - return ret; - } - OSD_DEBUG("out bytes=%llu (bytes_req=%u)\n", - _LLU(or->out.total_bytes), blk_rq_bytes(or->out.req)); - } - if (or->in.bio) { - ret = blk_rq_append_bio(or->request->q, or->in.req, or->in.bio); - if (ret) { - OSD_DEBUG("blk_rq_append_bio in failed\n"); - return ret; - } - OSD_DEBUG("in bytes=%llu (bytes_req=%u)\n", - _LLU(or->in.total_bytes), blk_rq_bytes(or->in.req)); - } - or->out.pad_buff = sg_out_pad_buffer; or->in.pad_buff = sg_in_pad_buffer; From a411f4bbb89f1f08687b344064d6775bce1e4658 Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Sun, 17 May 2009 19:00:01 +0300 Subject: [PATCH 1139/2061] block: Un-export blk_rq_append_bio OSD was the last in-tree user of blk_rq_append_bio(). Now that it is fixed blk_rq_append_bio is un-exported and is only used internally by block layer. Signed-off-by: Boaz Harrosh Signed-off-by: Jens Axboe --- block/blk-map.c | 1 - block/blk.h | 2 ++ include/linux/blkdev.h | 6 ------ 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/block/blk-map.c b/block/blk-map.c index caa05a66774..ef2492adca7 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -24,7 +24,6 @@ int blk_rq_append_bio(struct request_queue *q, struct request *rq, } return 0; } -EXPORT_SYMBOL(blk_rq_append_bio); static int __blk_rq_unmap_user(struct bio *bio) { diff --git a/block/blk.h b/block/blk.h index 9e0042ca949..c863ec2281e 100644 --- a/block/blk.h +++ b/block/blk.h @@ -13,6 +13,8 @@ extern struct kobj_type blk_queue_ktype; void init_request_from_bio(struct request *req, struct bio *bio); void blk_rq_bio_prep(struct request_queue *q, struct request *rq, struct bio *bio); +int blk_rq_append_bio(struct request_queue *q, struct request *rq, + struct bio *bio); void blk_dequeue_request(struct request *rq); void __blk_queue_free_tags(struct request_queue *q); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 88a83e112c9..564445be7a6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -757,12 +757,6 @@ extern int scsi_cmd_ioctl(struct request_queue *, struct gendisk *, fmode_t, extern int sg_scsi_ioctl(struct request_queue *, struct gendisk *, fmode_t, struct scsi_ioctl_command __user *); -/* - * Temporary export, until SCSI gets fixed up. - */ -extern int blk_rq_append_bio(struct request_queue *q, struct request *rq, - struct bio *bio); - /* * A queue has just exitted congestion. Note this in the global counter of * congested queues, and wake up anyone who was waiting for requests to be From 4aee2ad461889132bfb5a1518a9580d00e17008c Mon Sep 17 00:00:00 2001 From: Jaswinder Singh Rajput Date: Tue, 19 May 2009 17:07:01 +0530 Subject: [PATCH 1140/2061] x86: asm/processor.h: remove double declaration Remove double declaration of: extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); extern unsigned short num_cache_leaves; they are already defined in the same file. [ Impact: cleanup ] Signed-off-by: Jaswinder Singh Rajput LKML-Reference: <1242733021.3377.1.camel@localhost.localdomain> Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 34c52370f2f..85628ea9d9b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -403,9 +403,6 @@ DECLARE_PER_CPU(unsigned long, stack_canary); extern unsigned int xstate_size; extern void free_thread_xstate(struct task_struct *); extern struct kmem_cache *task_xstate_cachep; -extern void init_scattered_cpuid_features(struct cpuinfo_x86 *c); -extern unsigned int init_intel_cacheinfo(struct cpuinfo_x86 *c); -extern unsigned short num_cache_leaves; struct thread_struct { /* Cached TLS descriptors: */ From ef9e8b14a5c1d0afbaf12b4c3b271188ddfc52a4 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Tue, 19 May 2009 14:25:16 +0100 Subject: [PATCH 1141/2061] GFS2: Don't warn when delete inode fails on ro filesystem If the filesystem is read-only, then we expect that delete inode will fail, so there is no need to warn about it. Signed-off-by: Steven Whitehouse --- fs/gfs2/ops_super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c index a3c2272e7ca..2fd1dcbcc5b 100644 --- a/fs/gfs2/ops_super.c +++ b/fs/gfs2/ops_super.c @@ -714,7 +714,7 @@ out_unlock: gfs2_glock_dq(&ip->i_iopen_gh); gfs2_holder_uninit(&ip->i_iopen_gh); gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED) + if (error && error != GLR_TRYFAILED && error != -EROFS) fs_warn(sdp, "gfs2_delete_inode: %d\n", error); out: truncate_inode_pages(&inode->i_data, 0); From c5642f4bbae30122beb696e723f6da273caa570e Mon Sep 17 00:00:00 2001 From: Stephen Smalley Date: Tue, 19 May 2009 09:02:23 -0400 Subject: [PATCH 1142/2061] selinux: remove obsolete read buffer limit from sel_read_bool On Tue, 2009-05-19 at 00:05 -0400, Eamon Walsh wrote: > Recent versions of coreutils have bumped the read buffer size from 4K to > 32K in several of the utilities. > > This means that "cat /selinux/booleans/xserver_object_manager" no longer > works, it returns "Invalid argument" on F11. getsebool works fine. > > sel_read_bool has a check for "count > PAGE_SIZE" that doesn't seem to > be present in the other read functions. Maybe it could be removed? Yes, that check is obsoleted by the conversion of those functions to using simple_read_from_buffer(), which will reduce count if necessary to what is available in the buffer. Signed-off-by: Stephen Smalley Signed-off-by: James Morris --- security/selinux/selinuxfs.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 8d4007fbe0e..b4fc506e7a8 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -803,10 +803,6 @@ static ssize_t sel_read_bool(struct file *filep, char __user *buf, goto out; } - if (count > PAGE_SIZE) { - ret = -EINVAL; - goto out; - } page = (char *)get_zeroed_page(GFP_KERNEL); if (!page) { ret = -ENOMEM; From 53674ac5a997a8eedbb2dfdc308b895170746c8b Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 19 May 2009 19:52:35 +0200 Subject: [PATCH 1143/2061] block: add warning to blk_make_request() Add a note about how one needs to be careful when setting up these bio chains. Extracted from Boaz's updated patch. Signed-off-by: Jens Axboe --- block/blk-core.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/block/blk-core.c b/block/blk-core.c index bec1d69952d..49065075d46 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -910,6 +910,15 @@ EXPORT_SYMBOL(blk_get_request); * need bouncing, by calling the appropriate masked or flagged allocator, * suitable for the target device. Otherwise the call to blk_queue_bounce will * BUG. + * + * WARNING: When allocating/cloning a bio-chain, careful consideration should be + * given to how you allocate bios. In particular, you cannot use __GFP_WAIT for + * anything but the first bio in the chain. Otherwise you risk waiting for IO + * completion of a bio that hasn't been submitted yet, thus resulting in a + * deadlock. Alternatively bios should be allocated using bio_kmalloc() instead + * of bio_alloc(), as that avoids the mempool deadlock. + * If possible a big IO should be split into smaller parts when allocation + * fails. Partial allocation should not be an error, or you risk a live-lock. */ struct request *blk_make_request(struct request_queue *q, struct bio *bio, gfp_t gfp_mask) From ac36552a52a6ec8563ac0a109e2a0935673f4abb Mon Sep 17 00:00:00 2001 From: Boaz Harrosh Date: Tue, 19 May 2009 19:54:09 +0200 Subject: [PATCH 1144/2061] scsi_lib: remove unused variable The last request completion cleanup in scsi_lib left an unused this_count variable in scsi_io_completion(). (It was used before in a code segment that now uses blk_end_request_all()) Signed-off-by: Boaz Harrosh Acked-by: Tejun Heo Acked-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- drivers/scsi/scsi_lib.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index e410d667910..d7c6c752e0a 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -706,7 +706,6 @@ EXPORT_SYMBOL(scsi_release_buffers); void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) { int result = cmd->result; - int this_count; struct request_queue *q = cmd->device->request_queue; struct request *req = cmd->request; int error = 0; @@ -789,7 +788,6 @@ void scsi_io_completion(struct scsi_cmnd *cmd, unsigned int good_bytes) */ if (scsi_end_request(cmd, error, good_bytes, result == 0) == NULL) return; - this_count = blk_rq_bytes(req); error = -EIO; From 33b2fb303fe7f6b08bbb32f708e67b96eaa94a7a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:08:41 +0200 Subject: [PATCH 1145/2061] perf_counter: fix counter freeing logic Fix counter lifetime bugs which explain the crashes reported by Marcelo Tosatti and Arnaldo Carvalho de Melo. The new rule is: flushing + freeing is only done for a task's own counters, never for other tasks. [ Impact: fix crashes/lockups with inherited counters ] Reported-by: Arnaldo Carvalho de Melo Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Signed-off-by: Ingo Molnar --- kernel/exit.c | 19 +++++++------------ kernel/perf_counter.c | 2 ++ 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/kernel/exit.c b/kernel/exit.c index 73affd35e76..f9dfedd94af 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -178,12 +178,6 @@ repeat: proc_flush_task(p); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - write_lock_irq(&tasklist_lock); tracehook_finish_release_task(p); __exit_signal(p); @@ -985,6 +979,13 @@ NORET_TYPE void do_exit(long code) module_put(tsk->binfmt->module); proc_exit_connector(tsk); + + /* + * Flush inherited counters to the parent - before the parent + * gets woken up by child-exit notifications. + */ + perf_counter_exit_task(tsk); + exit_notify(tsk, group_dead); #ifdef CONFIG_NUMA mpol_put(tsk->mempolicy); @@ -1257,12 +1258,6 @@ static int wait_task_zombie(struct task_struct *p, int options, */ read_unlock(&tasklist_lock); - /* - * Flush inherited counters to the parent - before the parent - * gets woken up by child-exit notifications. - */ - perf_counter_exit_task(p); - retval = ru ? getrusage(p, RUSAGE_BOTH, ru) : 0; status = (p->signal->flags & SIGNAL_GROUP_EXIT) ? p->signal->group_exit_code : p->exit_code; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 59a926d04ba..7af16d1c480 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3299,6 +3299,8 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + WARN_ON_ONCE(child != current); + child_ctx = &child->perf_counter_ctx; if (likely(!child_ctx->nr_counters)) From c44d70a340554a33071339064a303ac0f1a31623 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 17 May 2009 11:24:08 +0200 Subject: [PATCH 1146/2061] perf_counter: fix counter inheritance race Context rotation should not occur when we are in the middle of walking the counter list when inheriting counters ... [ Impact: fix occasionally incorrect perf stat results ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 10 +++++++++- 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..13cb2fbbf33 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,6 +508,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7af16d1c480..4d8f97375f3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,7 +1120,8 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx->rr_allowed) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3108,6 +3109,7 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); + ctx->rr_allowed = 1; ctx->task = task; } @@ -3348,6 +3350,9 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); + parent_ctx->rr_allowed = 0; + barrier(); /* irqs */ + /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: @@ -3361,6 +3366,9 @@ void perf_counter_init_task(struct task_struct *child) break; } + barrier(); /* irqs */ + parent_ctx->rr_allowed = 1; + mutex_unlock(&parent_ctx->mutex); } From 62669e61a5f559826b1d2e863649a6005eee629b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Wed, 20 May 2009 11:27:13 +0900 Subject: [PATCH 1147/2061] sh: mach-hp6xx: Fix up the hp6xx build for hd64461 changes. Fixes several compile errors due to the recent hd64461 I/O base changes. Signed-off-by: Paul Mundt --- arch/sh/cchips/hd6446x/hd64461.c | 2 +- arch/sh/include/asm/hd64461.h | 2 +- drivers/video/hitfb.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sh/cchips/hd6446x/hd64461.c b/arch/sh/cchips/hd6446x/hd64461.c index 25ef9106152..50aa0c1f76e 100644 --- a/arch/sh/cchips/hd6446x/hd64461.c +++ b/arch/sh/cchips/hd6446x/hd64461.c @@ -80,7 +80,7 @@ int __init setup_hd64461(void) printk(KERN_INFO "HD64461 configured at 0x%x on irq %d(mapped into %d to %d)\n", - CONFIG_HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE, + HD64461_IOBASE, CONFIG_HD64461_IRQ, HD64461_IRQBASE, HD64461_IRQBASE + 15); /* Should be at processor specific part.. */ diff --git a/arch/sh/include/asm/hd64461.h b/arch/sh/include/asm/hd64461.h index c16d54563c9..977355f0a48 100644 --- a/arch/sh/include/asm/hd64461.h +++ b/arch/sh/include/asm/hd64461.h @@ -26,7 +26,7 @@ #define HD64461_PCC1_COMM (HD64461_PCC1_BASE+HD64461_PCC_WINDOW) /* 0xb5000000 */ /* Standby Control Register for HD64461 */ -#define HD64461_STBCR CONFIG_HD64461_IOBASE +#define HD64461_STBCR HD64461_IO_OFFSET(0x00000000) #define HD64461_STBCR_CKIO_STBY 0x2000 #define HD64461_STBCR_SAFECKE_IST 0x1000 #define HD64461_STBCR_SLCKE_IST 0x0800 diff --git a/drivers/video/hitfb.c b/drivers/video/hitfb.c index e6467cf9f19..020db7fc915 100644 --- a/drivers/video/hitfb.c +++ b/drivers/video/hitfb.c @@ -335,9 +335,9 @@ static int __init hitfb_probe(struct platform_device *dev) if (fb_get_options("hitfb", NULL)) return -ENODEV; - hitfb_fix.mmio_start = CONFIG_HD64461_IOBASE+0x1000; + hitfb_fix.mmio_start = HD64461_IO_OFFSET(0x1000); hitfb_fix.mmio_len = 0x1000; - hitfb_fix.smem_start = CONFIG_HD64461_IOBASE + 0x02000000; + hitfb_fix.smem_start = HD64461_IO_OFFSET(0x02000000); hitfb_fix.smem_len = 512 * 1024; lcdclor = fb_readw(HD64461_LCDCLOR); From 0a7ae2ff0d29bb3b327edff4c8ab67b3834fa811 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 20 May 2009 08:54:31 +0200 Subject: [PATCH 1148/2061] block: change the tag sync vs async restriction logic Make them fully share the tag space, but disallow async requests using the last any two slots. Signed-off-by: Jens Axboe --- block/blk-barrier.c | 2 +- block/blk-core.c | 2 +- block/blk-tag.c | 15 +++++++++------ block/elevator.c | 8 ++++---- include/linux/blkdev.h | 7 ++++++- 5 files changed, 21 insertions(+), 13 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 0ab81a0a750..0d98054cdbd 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -218,7 +218,7 @@ static inline bool start_ordered(struct request_queue *q, struct request **rqp) } else skip |= QUEUE_ORDSEQ_PREFLUSH; - if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && q->in_flight) + if ((q->ordered & QUEUE_ORDERED_BY_DRAIN) && queue_in_flight(q)) rq = NULL; else skip |= QUEUE_ORDSEQ_DRAIN; diff --git a/block/blk-core.c b/block/blk-core.c index 49065075d46..1c748403882 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1815,7 +1815,7 @@ void blk_dequeue_request(struct request *rq) * the driver side. */ if (blk_account_rq(rq)) - q->in_flight++; + q->in_flight[rq_is_sync(rq)]++; } /** diff --git a/block/blk-tag.c b/block/blk-tag.c index c260f7c30dd..2e5cfeb5933 100644 --- a/block/blk-tag.c +++ b/block/blk-tag.c @@ -336,7 +336,7 @@ EXPORT_SYMBOL(blk_queue_end_tag); int blk_queue_start_tag(struct request_queue *q, struct request *rq) { struct blk_queue_tag *bqt = q->queue_tags; - unsigned max_depth, offset; + unsigned max_depth; int tag; if (unlikely((rq->cmd_flags & REQ_QUEUED))) { @@ -355,13 +355,16 @@ int blk_queue_start_tag(struct request_queue *q, struct request *rq) * to starve sync IO on behalf of flooding async IO. */ max_depth = bqt->max_depth; - if (rq_is_sync(rq)) - offset = 0; - else - offset = max_depth >> 2; + if (!rq_is_sync(rq) && max_depth > 1) { + max_depth -= 2; + if (!max_depth) + max_depth = 1; + if (q->in_flight[0] > max_depth) + return 1; + } do { - tag = find_next_zero_bit(bqt->tag_map, max_depth, offset); + tag = find_first_zero_bit(bqt->tag_map, max_depth); if (tag >= max_depth) return 1; diff --git a/block/elevator.c b/block/elevator.c index 918920056e4..ebee948293e 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -546,7 +546,7 @@ void elv_requeue_request(struct request_queue *q, struct request *rq) * in_flight count again */ if (blk_account_rq(rq)) { - q->in_flight--; + q->in_flight[rq_is_sync(rq)]--; if (blk_sorted_rq(rq)) elv_deactivate_rq(q, rq); } @@ -685,7 +685,7 @@ void elv_insert(struct request_queue *q, struct request *rq, int where) if (unplug_it && blk_queue_plugged(q)) { int nrq = q->rq.count[BLK_RW_SYNC] + q->rq.count[BLK_RW_ASYNC] - - q->in_flight; + - queue_in_flight(q); if (nrq >= q->unplug_thresh) __generic_unplug_device(q); @@ -823,7 +823,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) * request is released from the driver, io must be done */ if (blk_account_rq(rq)) { - q->in_flight--; + q->in_flight[rq_is_sync(rq)]--; if (blk_sorted_rq(rq) && e->ops->elevator_completed_req_fn) e->ops->elevator_completed_req_fn(q, rq); } @@ -838,7 +838,7 @@ void elv_completed_request(struct request_queue *q, struct request *rq) if (!list_empty(&q->queue_head)) next = list_entry_rq(q->queue_head.next); - if (!q->in_flight && + if (!queue_in_flight(q) && blk_ordered_cur_seq(q) == QUEUE_ORDSEQ_DRAIN && (!next || blk_ordered_req_seq(next) > QUEUE_ORDSEQ_DRAIN)) { blk_ordered_complete_seq(q, QUEUE_ORDSEQ_DRAIN, 0); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 564445be7a6..a967dd775db 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -404,7 +404,7 @@ struct request_queue struct list_head tag_busy_list; unsigned int nr_sorted; - unsigned int in_flight; + unsigned int in_flight[2]; unsigned int rq_timeout; struct timer_list timeout; @@ -511,6 +511,11 @@ static inline void queue_flag_clear_unlocked(unsigned int flag, __clear_bit(flag, &q->queue_flags); } +static inline int queue_in_flight(struct request_queue *q) +{ + return q->in_flight[0] + q->in_flight[1]; +} + static inline void queue_flag_clear(unsigned int flag, struct request_queue *q) { WARN_ON_ONCE(!queue_is_locked(q)); From c8b15a706d921baed3195407e4f55270112bb3c6 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:18:50 +0200 Subject: [PATCH 1149/2061] futex: cleanup error exit Reuse the put_key_ref(key2) call in the exit path. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 476603afd14..381125a9f1e 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2185,10 +2185,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, /* Prepare to wait on uaddr. */ ret = futex_wait_setup(uaddr, val, fshared, &q, &hb); - if (ret) { - put_futex_key(fshared, &key2); - goto out; - } + if (ret) + goto out_key2; /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); @@ -2282,6 +2280,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, out_put_keys: put_futex_key(fshared, &q.key); +out_key2: put_futex_key(fshared, &key2); out: From 1c840c14906d4ddf66c1f4f5daea059aad951c82 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 20 May 2009 09:22:40 +0200 Subject: [PATCH 1150/2061] futex: fix restart for early wakeup in futex_wait_requeue_pi() The futex_wait_requeue_pi op should restart unconditionally like futex_lock_pi. The user of that function e.g. pthread_cond_wait can not be interrupted so we do not care about the SA_RESTART flag of the signal. Clean up the FIXMEs. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 381125a9f1e..2aa216e5b59 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -2060,7 +2060,7 @@ pi_faulted: * * Returns * 0 - no early wakeup detected - * <0 - -ETIMEDOUT or -ERESTARTSYS (FIXME: or ERESTARTNOINTR?) + * <0 - -ETIMEDOUT or -ERESTARTNOINTR */ static inline int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, @@ -2087,15 +2087,8 @@ int handle_early_requeue_pi_wakeup(struct futex_hash_bucket *hb, if (timeout && !timeout->task) ret = -ETIMEDOUT; - else { - /* - * We expect signal_pending(current), but another - * thread may have handled it for us already. - */ - /* FIXME: ERESTARTSYS or ERESTARTNOINTR? Do we care if - * the user specified SA_RESTART or not? */ - ret = -ERESTARTSYS; - } + else + ret = -ERESTARTNOINTR; } return ret; } From 2070887fdeacd9c13f3e805e3f0086c9f22a4d93 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 19 May 2009 23:04:59 +0200 Subject: [PATCH 1151/2061] futex: fix restart in wait_requeue_pi If the waiter has been requeued to the outer PI futex and is interrupted by a signal and the thread handles the signal then ERESTART_RESTARTBLOCK is changed to EINTR and the restart block is discarded. That way we return an unexcpected EINTR to user space instead of ending up in futex_lock_pi_restart. But we do not need to restart the syscall because we know that the condition has changed since we have been requeued. If we would simply restart the syscall then we would drop out via the comparison of the user space value with EWOULDBLOCK. The user space side needs to handle EWOULDBLOCK anyway as the enqueueing on the inner futex can race with a requeue/wake. So we can simply return EWOULDBLOCK to user space which also signals that we did not take the outer futex and let user space handle it in the same way it has to handle the requeue/wake race. Signed-off-by: Thomas Gleixner --- kernel/futex.c | 49 +++++++++---------------------------------------- 1 file changed, 9 insertions(+), 40 deletions(-) diff --git a/kernel/futex.c b/kernel/futex.c index 2aa216e5b59..80b5ce71659 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1507,7 +1507,6 @@ handle_fault: #define FLAGS_HAS_TIMEOUT 0x04 static long futex_wait_restart(struct restart_block *restart); -static long futex_lock_pi_restart(struct restart_block *restart); /** * fixup_owner() - Post lock pi_state and corner case management @@ -1930,21 +1929,6 @@ uaddr_faulted: goto retry; } -static long futex_lock_pi_restart(struct restart_block *restart) -{ - u32 __user *uaddr = (u32 __user *)restart->futex.uaddr; - ktime_t t, *tp = NULL; - int fshared = restart->futex.flags & FLAGS_SHARED; - - if (restart->futex.flags & FLAGS_HAS_TIMEOUT) { - t.tv64 = restart->futex.time; - tp = &t; - } - restart->fn = do_no_restart_syscall; - - return (long)futex_lock_pi(uaddr, fshared, restart->futex.val, tp, 0); -} - /* * Userspace attempted a TID -> 0 atomic transition, and failed. * This is the in-kernel slowpath: we look up the PI state (if any), @@ -2141,12 +2125,10 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, struct hrtimer_sleeper timeout, *to = NULL; struct rt_mutex_waiter rt_waiter; struct rt_mutex *pi_mutex = NULL; - struct restart_block *restart; struct futex_hash_bucket *hb; union futex_key key2; struct futex_q q; int res, ret; - u32 uval; if (!bitset) return -EINVAL; @@ -2245,30 +2227,17 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, int fshared, if (rt_mutex_owner(pi_mutex) == current) rt_mutex_unlock(pi_mutex); } else if (ret == -EINTR) { - ret = -EFAULT; - if (get_user(uval, uaddr2)) - goto out_put_keys; - /* - * We've already been requeued, so restart by calling - * futex_lock_pi() directly, rather then returning to this - * function. + * We've already been requeued, but we have no way to + * restart by calling futex_lock_pi() directly. We + * could restart the syscall, but that will look at + * the user space value and return right away. So we + * drop back with EWOULDBLOCK to tell user space that + * "val" has been changed. That's the same what the + * restart of the syscall would do in + * futex_wait_setup(). */ - ret = -ERESTART_RESTARTBLOCK; - restart = ¤t_thread_info()->restart_block; - restart->fn = futex_lock_pi_restart; - restart->futex.uaddr = (u32 *)uaddr2; - restart->futex.val = uval; - restart->futex.flags = 0; - if (abs_time) { - restart->futex.flags |= FLAGS_HAS_TIMEOUT; - restart->futex.time = abs_time->tv64; - } - - if (fshared) - restart->futex.flags |= FLAGS_SHARED; - if (clockrt) - restart->futex.flags |= FLAGS_CLOCKRT; + ret = -EWOULDBLOCK; } out_put_keys: From 09010978345e8883003bf411bb99753710eb5a3a Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Wed, 20 May 2009 10:48:47 +0100 Subject: [PATCH 1152/2061] GFS2: Improve resource group error handling This patch improves the error handling in the case where we discover that the summary information in the resource group doesn't match the bitmap information while in the process of allocating blocks. Originally this resulted in a kernel bug, but this patch changes that so that we return -EIO and print some messages explaining what went wrong, and how to fix it. We also remember locally not to try and allocate from the same rgrp again, so that a subsequent allocation in a different rgrp should succeed. Signed-off-by: Steven Whitehouse --- fs/gfs2/bmap.c | 9 ++++++-- fs/gfs2/dir.c | 11 +++++++-- fs/gfs2/eattr.c | 14 ++++++++--- fs/gfs2/glops.c | 20 +--------------- fs/gfs2/incore.h | 7 +++--- fs/gfs2/rgrp.c | 60 +++++++++++++++++++++++++++++++++++------------- fs/gfs2/rgrp.h | 47 ++++++++++++++++++------------------- 7 files changed, 100 insertions(+), 68 deletions(-) diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 3a5d3f883e1..253e1a39f84 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -136,7 +136,9 @@ int gfs2_unstuff_dinode(struct gfs2_inode *ip, struct page *page) and write it out to disk */ unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + goto out_brelse; if (isdir) { gfs2_trans_add_unrevoke(GFS2_SB(&ip->i_inode), block, 1); error = gfs2_dir_get_new_buffer(ip, block, &bh); @@ -476,8 +478,11 @@ static int gfs2_bmap_alloc(struct inode *inode, const sector_t lblock, blks = dblks + iblks; i = sheight; do { + int error; n = blks - alloced; - bn = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return error; alloced += n; if (state != ALLOC_DATA || gfs2_is_jdata(ip)) gfs2_trans_add_unrevoke(sdp, bn, n); diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index aef4d0c0674..297d7e5ceba 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -803,13 +803,20 @@ static struct gfs2_leaf *new_leaf(struct inode *inode, struct buffer_head **pbh, { struct gfs2_inode *ip = GFS2_I(inode); unsigned int n = 1; - u64 bn = gfs2_alloc_block(ip, &n); - struct buffer_head *bh = gfs2_meta_new(ip->i_gl, bn); + u64 bn; + int error; + struct buffer_head *bh; struct gfs2_leaf *leaf; struct gfs2_dirent *dent; struct qstr name = { .name = "", .len = 0, .hash = 0 }; + + error = gfs2_alloc_block(ip, &bn, &n); + if (error) + return NULL; + bh = gfs2_meta_new(ip->i_gl, bn); if (!bh) return NULL; + gfs2_trans_add_unrevoke(GFS2_SB(inode), bn, 1); gfs2_trans_add_bh(ip->i_gl, bh, 1); gfs2_metatype_set(bh, GFS2_METATYPE_LF, GFS2_FORMAT_LF); diff --git a/fs/gfs2/eattr.c b/fs/gfs2/eattr.c index 899763aed21..07ea9529add 100644 --- a/fs/gfs2/eattr.c +++ b/fs/gfs2/eattr.c @@ -582,8 +582,11 @@ static int ea_alloc_blk(struct gfs2_inode *ip, struct buffer_head **bhp) struct gfs2_ea_header *ea; unsigned int n = 1; u64 block; + int error; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); *bhp = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, *bhp, 1); @@ -617,6 +620,7 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, struct gfs2_ea_request *er) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + int error; ea->ea_data_len = cpu_to_be32(er->er_data_len); ea->ea_name_len = er->er_name_len; @@ -642,7 +646,9 @@ static int ea_write(struct gfs2_inode *ip, struct gfs2_ea_header *ea, int mh_size = sizeof(struct gfs2_meta_header); unsigned int n = 1; - block = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &block, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, block, 1); bh = gfs2_meta_new(ip->i_gl, block); gfs2_trans_add_bh(ip->i_gl, bh, 1); @@ -963,7 +969,9 @@ static int ea_set_block(struct gfs2_inode *ip, struct gfs2_ea_request *er, } else { u64 blk; unsigned int n = 1; - blk = gfs2_alloc_block(ip, &n); + error = gfs2_alloc_block(ip, &blk, &n); + if (error) + return error; gfs2_trans_add_unrevoke(sdp, blk, 1); indbh = gfs2_meta_new(ip->i_gl, blk); gfs2_trans_add_bh(ip->i_gl, indbh, 1); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 70f87f43afa..d5e4ab155ca 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -309,24 +309,6 @@ static void rgrp_go_unlock(struct gfs2_holder *gh) gfs2_rgrp_bh_put(gh->gh_gl->gl_object); } -/** - * rgrp_go_dump - print out an rgrp - * @seq: The iterator - * @gl: The glock in question - * - */ - -static int rgrp_go_dump(struct seq_file *seq, const struct gfs2_glock *gl) -{ - const struct gfs2_rgrpd *rgd = gl->gl_object; - if (rgd == NULL) - return 0; - gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", - (unsigned long long)rgd->rd_addr, rgd->rd_flags, - rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); - return 0; -} - /** * trans_go_sync - promote/demote the transaction glock * @gl: the glock @@ -410,7 +392,7 @@ const struct gfs2_glock_operations gfs2_rgrp_glops = { .go_demote_ok = rgrp_go_demote_ok, .go_lock = rgrp_go_lock, .go_unlock = rgrp_go_unlock, - .go_dump = rgrp_go_dump, + .go_dump = gfs2_rgrp_dump, .go_type = LM_TYPE_RGRP, .go_min_hold_time = HZ / 5, }; diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 0060e9564bb..de50d86fec1 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -92,9 +92,10 @@ struct gfs2_rgrpd { unsigned int rd_bh_count; u32 rd_last_alloc; unsigned char rd_flags; -#define GFS2_RDF_CHECK 0x01 /* Need to check for unlinked inodes */ -#define GFS2_RDF_NOALLOC 0x02 /* rg prohibits allocation */ -#define GFS2_RDF_UPTODATE 0x04 /* rg is up to date */ +#define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ +#define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ +#define GFS2_RDF_ERROR 0x40000000 /* error in rg */ +#define GFS2_RDF_MASK 0xf0000000 /* mask for internal flags */ }; enum gfs2_state_bits { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa..fbacf09ee34 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -701,10 +701,7 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) u32 rg_flags; rg_flags = be32_to_cpu(str->rg_flags); - if (rg_flags & GFS2_RGF_NOALLOC) - rgd->rd_flags |= GFS2_RDF_NOALLOC; - else - rgd->rd_flags &= ~GFS2_RDF_NOALLOC; + rg_flags &= ~GFS2_RDF_MASK; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -713,11 +710,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) static void gfs2_rgrp_out(struct gfs2_rgrpd *rgd, void *buf) { struct gfs2_rgrp *str = buf; - u32 rg_flags = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) - rg_flags |= GFS2_RGF_NOALLOC; - str->rg_flags = cpu_to_be32(rg_flags); + str->rg_flags = cpu_to_be32(rgd->rd_flags & ~GFS2_RDF_MASK); str->rg_free = cpu_to_be32(rgd->rd_free); str->rg_dinodes = cpu_to_be32(rgd->rd_dinodes); str->__pad = cpu_to_be32(0); @@ -942,7 +936,7 @@ static int try_rgrp_fit(struct gfs2_rgrpd *rgd, struct gfs2_alloc *al) struct gfs2_sbd *sdp = rgd->rd_sbd; int ret = 0; - if (rgd->rd_flags & GFS2_RDF_NOALLOC) + if (rgd->rd_flags & (GFS2_RGF_NOALLOC | GFS2_RDF_ERROR)) return 0; spin_lock(&sdp->sd_rindex_spin); @@ -1435,13 +1429,33 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, } /** - * gfs2_alloc_block - Allocate a block - * @ip: the inode to allocate the block for + * gfs2_rgrp_dump - print out an rgrp + * @seq: The iterator + * @gl: The glock in question * - * Returns: the allocated block */ -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) +int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl) +{ + const struct gfs2_rgrpd *rgd = gl->gl_object; + if (rgd == NULL) + return 0; + gfs2_print_dbg(seq, " R: n:%llu f:%02x b:%u/%u i:%u\n", + (unsigned long long)rgd->rd_addr, rgd->rd_flags, + rgd->rd_free, rgd->rd_free_clone, rgd->rd_dinodes); + return 0; +} + +/** + * gfs2_alloc_block - Allocate one or more blocks + * @ip: the inode to allocate the block for + * @bn: Used to return the starting block number + * @n: requested number of blocks/extent length (value/result) + * + * Returns: 0 or error + */ + +int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct buffer_head *dibh; @@ -1457,7 +1471,10 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) goal = rgd->rd_last_alloc; blk = rgblk_search(rgd, goal, GFS2_BLKST_FREE, GFS2_BLKST_USED, n); - BUG_ON(blk == BFITNOENT); + + /* Since all blocks are reserved in advance, this shouldn't happen */ + if (blk == BFITNOENT) + goto rgrp_error; rgd->rd_last_alloc = blk; block = rgd->rd_data0 + blk; @@ -1469,7 +1486,9 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) di->di_goal_meta = di->di_goal_data = cpu_to_be64(ip->i_goal); brelse(dibh); } - gfs2_assert_withdraw(sdp, rgd->rd_free >= *n); + if (rgd->rd_free < *n) + goto rgrp_error; + rgd->rd_free -= *n; gfs2_trans_add_bh(rgd->rd_gl, rgd->rd_bits[0].bi_bh, 1); @@ -1484,7 +1503,16 @@ u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n) rgd->rd_free_clone -= *n; spin_unlock(&sdp->sd_rindex_spin); - return block; + *bn = block; + return 0; + +rgrp_error: + fs_warn(sdp, "rgrp %llu has an error, marking it readonly until umount\n", + (unsigned long long)rgd->rd_addr); + fs_warn(sdp, "umount on all nodes and run fsck.gfs2 to fix the error\n"); + gfs2_rgrp_dump(NULL, rgd->rd_gl); + rgd->rd_flags |= GFS2_RDF_ERROR; + return -EIO; } /** diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 3181c7e624b..1e76ff0f3e0 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -14,22 +14,22 @@ struct gfs2_rgrpd; struct gfs2_sbd; struct gfs2_holder; -void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_verify(struct gfs2_rgrpd *rgd); struct gfs2_rgrpd *gfs2_blk2rgrpd(struct gfs2_sbd *sdp, u64 blk); struct gfs2_rgrpd *gfs2_rgrpd_get_first(struct gfs2_sbd *sdp); struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd); -void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); -int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); +extern void gfs2_clear_rgrpd(struct gfs2_sbd *sdp); +extern int gfs2_rindex_hold(struct gfs2_sbd *sdp, struct gfs2_holder *ri_gh); -int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); +extern int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_hold(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_bh_put(struct gfs2_rgrpd *rgd); -void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); +extern void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd); -struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); +extern struct gfs2_alloc *gfs2_alloc_get(struct gfs2_inode *ip); static inline void gfs2_alloc_put(struct gfs2_inode *ip) { BUG_ON(ip->i_alloc == NULL); @@ -37,22 +37,22 @@ static inline void gfs2_alloc_put(struct gfs2_inode *ip) ip->i_alloc = NULL; } -int gfs2_inplace_reserve_i(struct gfs2_inode *ip, - char *file, unsigned int line); +extern int gfs2_inplace_reserve_i(struct gfs2_inode *ip, char *file, + unsigned int line); #define gfs2_inplace_reserve(ip) \ gfs2_inplace_reserve_i((ip), __FILE__, __LINE__) -void gfs2_inplace_release(struct gfs2_inode *ip); +extern void gfs2_inplace_release(struct gfs2_inode *ip); -unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); +extern unsigned char gfs2_get_block_type(struct gfs2_rgrpd *rgd, u64 block); -u64 gfs2_alloc_block(struct gfs2_inode *ip, unsigned int *n); -u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); +extern int gfs2_alloc_block(struct gfs2_inode *ip, u64 *bn, unsigned int *n); +extern u64 gfs2_alloc_di(struct gfs2_inode *ip, u64 *generation); -void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); -void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); -void gfs2_unlink_di(struct inode *inode); +extern void gfs2_free_data(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_meta(struct gfs2_inode *ip, u64 bstart, u32 blen); +extern void gfs2_free_di(struct gfs2_rgrpd *rgd, struct gfs2_inode *ip); +extern void gfs2_unlink_di(struct inode *inode); struct gfs2_rgrp_list { unsigned int rl_rgrps; @@ -61,10 +61,11 @@ struct gfs2_rgrp_list { struct gfs2_holder *rl_ghs; }; -void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, - u64 block); -void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); -void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); -u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern void gfs2_rlist_add(struct gfs2_sbd *sdp, struct gfs2_rgrp_list *rlist, + u64 block); +extern void gfs2_rlist_alloc(struct gfs2_rgrp_list *rlist, unsigned int state); +extern void gfs2_rlist_free(struct gfs2_rgrp_list *rlist); +extern u64 gfs2_ri_total(struct gfs2_sbd *sdp); +extern int gfs2_rgrp_dump(struct seq_file *seq, const struct gfs2_glock *gl); #endif /* __RGRP_DOT_H__ */ From d7b629a34fc4134a43c730b5f0197855dc4948d0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:19 +0200 Subject: [PATCH 1153/2061] perf_counter: Solve the rotate_ctx vs inherit race differently Instead of disabling RR scheduling of the counters, use a different list that does not get rotated to iterate the counters on inheritance. [ Impact: cleanup, optimization ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.237504544@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 15 +++++---------- 2 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 13cb2fbbf33..c8c1dfc22c9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -508,7 +508,6 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; - int rr_allowed; struct task_struct *task; /* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4d8f97375f3..64113e6d194 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1120,8 +1120,7 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - if (ctx->rr_allowed) - rotate_ctx(ctx); + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); perf_counter_task_sched_in(curr, cpu); @@ -3109,7 +3108,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx, mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->counter_list); INIT_LIST_HEAD(&ctx->event_list); - ctx->rr_allowed = 1; ctx->task = task; } @@ -3350,14 +3348,14 @@ void perf_counter_init_task(struct task_struct *child) */ mutex_lock(&parent_ctx->mutex); - parent_ctx->rr_allowed = 0; - barrier(); /* irqs */ - /* * We dont have to disable NMIs - we are only looking at * the list, not manipulating it: */ - list_for_each_entry(counter, &parent_ctx->counter_list, list_entry) { + list_for_each_entry_rcu(counter, &parent_ctx->event_list, event_entry) { + if (counter != counter->group_leader) + continue; + if (!counter->hw_event.inherit) continue; @@ -3366,9 +3364,6 @@ void perf_counter_init_task(struct task_struct *child) break; } - barrier(); /* irqs */ - parent_ctx->rr_allowed = 1; - mutex_unlock(&parent_ctx->mutex); } From 26b119bc811a73bac6ecf95bdf284bf31c7955f0 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:20 +0200 Subject: [PATCH 1154/2061] perf_counter: Log irq_period changes For the dynamic irq_period code, log whenever we change the period so that analyzing code can normalize the event flow. [ Impact: add new feature to allow more precise profiling ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.298769743@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 8 ++++++++ kernel/perf_counter.c | 40 +++++++++++++++++++++++++++++++++++- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index c8c1dfc22c9..f612941ef46 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -257,6 +257,14 @@ enum perf_event_type { */ PERF_EVENT_COMM = 3, + /* + * struct { + * struct perf_event_header header; + * u64 irq_period; + * }; + */ + PERF_EVENT_PERIOD = 4, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 64113e6d194..db02eb16c77 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1046,7 +1046,9 @@ int perf_counter_task_enable(void) return 0; } -void perf_adjust_freq(struct perf_counter_context *ctx) +static void perf_log_period(struct perf_counter *counter, u64 period); + +static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; u64 irq_period; @@ -1072,6 +1074,8 @@ void perf_adjust_freq(struct perf_counter_context *ctx) if (!irq_period) irq_period = 1; + perf_log_period(counter, irq_period); + counter->hw.irq_period = irq_period; counter->hw.interrupts = 0; } @@ -2406,6 +2410,40 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, perf_counter_mmap_event(&mmap_event); } +/* + * + */ + +static void perf_log_period(struct perf_counter *counter, u64 period) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + u64 period; + } freq_event = { + .header = { + .type = PERF_EVENT_PERIOD, + .misc = 0, + .size = sizeof(freq_event), + }, + .time = sched_clock(), + .period = period, + }; + + if (counter->hw.irq_period == period) + return; + + ret = perf_output_begin(&handle, counter, sizeof(freq_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, freq_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ From b986d7ec0f8b7ea3cc7366d80a137fbe839df227 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:21 +0200 Subject: [PATCH 1155/2061] perf_counter: Optimize disable of time based sw counters Currently we call hrtimer_cancel() unconditionally on disable of time based software counters. Avoid when possible. [ Impact: micro-optimize the code ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.388185031@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index db02eb16c77..473ed2cafbf 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2716,7 +2716,8 @@ static int cpu_clock_perf_counter_enable(struct perf_counter *counter) static void cpu_clock_perf_counter_disable(struct perf_counter *counter) { - hrtimer_cancel(&counter->hw.hrtimer); + if (counter->hw.irq_period) + hrtimer_cancel(&counter->hw.hrtimer); cpu_clock_perf_counter_update(counter); } @@ -2767,7 +2768,8 @@ static int task_clock_perf_counter_enable(struct perf_counter *counter) static void task_clock_perf_counter_disable(struct perf_counter *counter) { - hrtimer_cancel(&counter->hw.hrtimer); + if (counter->hw.irq_period) + hrtimer_cancel(&counter->hw.hrtimer); task_clock_perf_counter_update(counter, counter->ctx->time); } From afedadf23a2c90f3ba0d963282cbe6a6be129494 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 20 May 2009 12:21:22 +0200 Subject: [PATCH 1156/2061] perf_counter: Optimize sched in/out of counters Avoid a function call for !group counters by directly calling the counter function. [ Impact: micro-optimize the code ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Marcelo Tosatti Cc: John Kacur LKML-Reference: <20090520102553.511933670@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 473ed2cafbf..69d4de81596 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -826,8 +826,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, perf_disable(); if (ctx->nr_active) { - list_for_each_entry(counter, &ctx->counter_list, list_entry) - group_sched_out(counter, cpuctx, ctx); + list_for_each_entry(counter, &ctx->counter_list, list_entry) { + if (counter != counter->group_leader) + counter_sched_out(counter, cpuctx, ctx); + else + group_sched_out(counter, cpuctx, ctx); + } } perf_enable(); out: @@ -903,8 +907,12 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - if (group_can_go_on(counter, cpuctx, 1)) - group_sched_in(counter, cpuctx, ctx, cpu); + if (counter != counter->group_leader) + counter_sched_in(counter, cpuctx, ctx, cpu); + else { + if (group_can_go_on(counter, cpuctx, 1)) + group_sched_in(counter, cpuctx, ctx, cpu); + } /* * If this pinned group hasn't been scheduled, @@ -932,9 +940,14 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, if (counter->cpu != -1 && counter->cpu != cpu) continue; - if (group_can_go_on(counter, cpuctx, can_add_hw)) { - if (group_sched_in(counter, cpuctx, ctx, cpu)) + if (counter != counter->group_leader) { + if (counter_sched_in(counter, cpuctx, ctx, cpu)) can_add_hw = 0; + } else { + if (group_can_go_on(counter, cpuctx, can_add_hw)) { + if (group_sched_in(counter, cpuctx, ctx, cpu)) + can_add_hw = 0; + } } } perf_enable(); From 5537937696c55530447c20aa27daccb8d0d29b33 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 18 May 2009 23:04:46 +0800 Subject: [PATCH 1157/2061] ftrace: fix check for return value of register_module_notifier in event_trace_init register_module_notifier() returns zero in the success case. So fix the inverted fail case check in trace events modules handler. [ Impact: fix spurious warning on ftrace initialization] Reported-by: Li Zefan Signed-off-by: Ming Lei Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 0eec0c55dd8..9e91c4ad7c8 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -1174,7 +1174,7 @@ static __init int event_trace_init(void) } ret = register_module_notifier(&trace_module_nb); - if (!ret) + if (ret) pr_warning("Failed to register trace events module notifier\n"); return 0; From 34adc8062227f41b04ade0ff3fbd1dbe3002669e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 20 May 2009 20:13:28 +0200 Subject: [PATCH 1158/2061] perf_counter: Fix context removal deadlock Disable the PMU globally before removing a counter from a context. This fixes the following lockup: [22081.741922] ------------[ cut here ]------------ [22081.746668] WARNING: at arch/x86/kernel/cpu/perf_counter.c:803 intel_pmu_handle_irq+0x9b/0x24e() [22081.755624] Hardware name: X8DTN [22081.758903] perfcounters: irq loop stuck! [22081.762985] Modules linked in: [22081.766136] Pid: 11082, comm: perf Not tainted 2.6.30-rc6-tip #226 [22081.772432] Call Trace: [22081.774940] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.781993] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.788368] [] ? warn_slowpath_common+0x77/0xa3 [22081.794649] [] ? warn_slowpath_fmt+0x40/0x45 [22081.800696] [] ? intel_pmu_handle_irq+0x9b/0x24e [22081.807080] [] ? perf_counter_nmi_handler+0x3f/0x4a [22081.813751] [] ? notifier_call_chain+0x58/0x86 [22081.819951] [] ? notify_die+0x2d/0x32 [22081.825392] [] ? do_nmi+0x8e/0x242 [22081.830538] [] ? nmi+0x1a/0x20 [22081.835342] [] ? selinux_file_free_security+0x0/0x1a [22081.842105] [] ? x86_pmu_disable_counter+0x15/0x41 [22081.848673] <> [] ? x86_pmu_disable+0x86/0x103 [22081.855512] [] ? __perf_counter_remove_from_context+0x0/0xfe [22081.862926] [] ? counter_sched_out+0x30/0xce [22081.868909] [] ? __perf_counter_remove_from_context+0x59/0xfe [22081.876382] [] ? smp_call_function_single+0x6c/0xe6 [22081.882955] [] ? perf_release+0x86/0x14c [22081.888600] [] ? __fput+0xe7/0x195 [22081.893718] [] ? filp_close+0x5b/0x62 [22081.899107] [] ? put_files_struct+0x64/0xc2 [22081.905031] [] ? do_exit+0x1e2/0x6ef [22081.910360] [] ? _spin_lock_irqsave+0x9/0xe [22081.916292] [] ? do_group_exit+0x67/0x93 [22081.921953] [] ? sys_exit_group+0x12/0x16 [22081.927759] [] ? system_call_fastpath+0x16/0x1b [22081.934076] ---[ end trace 3a3936ce3e1b4505 ]--- And could potentially also fix the lockup reported by Marcelo Tosatti. Also, print more debug info in case of a detected lockup. [ Impact: fix lockup ] Reported-by: Marcelo Tosatti Acked-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 1 + kernel/perf_counter.c | 12 ++++++------ 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c109819c2cb..6cc1660db8d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -740,6 +740,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs, int nmi) again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); + perf_counter_print_debug(); return 1; } diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 69d4de81596..08584c16049 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -208,18 +208,17 @@ static void __perf_counter_remove_from_context(void *info) return; spin_lock_irqsave(&ctx->lock, flags); + /* + * Protect the list operation against NMI by disabling the + * counters on a global level. + */ + perf_disable(); counter_sched_out(counter, cpuctx, ctx); counter->task = NULL; - /* - * Protect the list operation against NMI by disabling the - * counters on a global level. NOP for non NMI based counters. - */ - perf_disable(); list_del_counter(counter, ctx); - perf_enable(); if (!ctx->task) { /* @@ -231,6 +230,7 @@ static void __perf_counter_remove_from_context(void *info) perf_max_counters - perf_reserved_percpu); } + perf_enable(); spin_unlock_irqrestore(&ctx->lock, flags); } From c6ac4c18fbc92a26df71ece609b082bc3099676b Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Wed, 20 May 2009 11:26:09 -0700 Subject: [PATCH 1159/2061] x86, boot: correct the calculation of ZO_INIT_SIZE Correct the calculation of ZO_INIT_SIZE (the amount of memory we need during decompression). One symbol (ZO_startup_32) was missing from zoffset.h, and another (ZO_z_extract_offset) was misspelled. [ Impact: build fix ] Signed-off-by: H. Peter Anvin --- arch/x86/boot/Makefile | 2 +- arch/x86/boot/header.S | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile index 75e0301fc69..619d297aa2b 100644 --- a/arch/x86/boot/Makefile +++ b/arch/x86/boot/Makefile @@ -95,7 +95,7 @@ targets += voffset.h $(obj)/voffset.h: vmlinux FORCE $(call if_changed,voffset) -sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' +sed-zoffset := -e 's/^\([0-9a-fA-F]*\) . \(startup_32\|input_data\|_end\|z_.*\)$$/\#define ZO_\2 0x\1/p' quiet_cmd_zoffset = ZOFFSET $@ cmd_zoffset = $(NM) $< | sed -n $(sed-zoffset) > $@ diff --git a/arch/x86/boot/header.S b/arch/x86/boot/header.S index 68c3bfbaff2..1040f6e8010 100644 --- a/arch/x86/boot/header.S +++ b/arch/x86/boot/header.S @@ -226,7 +226,7 @@ setup_data: .quad 0 # 64-bit physical pointer to pref_address: .quad LOAD_PHYSICAL_ADDR # preferred load addr -#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_extract_offset) +#define ZO_INIT_SIZE (ZO__end - ZO_startup_32 + ZO_z_extract_offset) #define VO_INIT_SIZE (VO__end - VO__text) #if ZO_INIT_SIZE > VO_INIT_SIZE #define INIT_SIZE ZO_INIT_SIZE From 60a0b8f93664621a07b93273fc8ebc29590c62f5 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 12:23:12 +0100 Subject: [PATCH 1160/2061] GFS2: Add a rgrp bitmap full flag During block allocation, it is useful to know if sections of disk are full on a finer grained basis than a single resource group. This can make a performance difference when resource groups have larger numbers of bitmap blocks, since we no longer have to search them all block by block in each individual bitmap. The full flag is set on a per-bitmap basis when it has been searched and found to have no free space. It is then skipped in subsequent searches until the flag is reset. The resetting occurs if we have to drop the glock on the resource group for any reason, or if we deallocate some blocks within that resource group and thus free up some space. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 3 ++ fs/gfs2/rgrp.c | 79 +++++++++++++++++++++++++++++------------------- 2 files changed, 51 insertions(+), 31 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index de50d86fec1..dd87379b61e 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -64,9 +64,12 @@ struct gfs2_log_element { const struct gfs2_log_operations *le_ops; }; +#define GBF_FULL 1 + struct gfs2_bitmap { struct buffer_head *bi_bh; char *bi_clone; + unsigned long bi_flags; u32 bi_offset; u32 bi_start; u32 bi_len; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index fbacf09ee34..23637b9d1c7 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -442,6 +442,7 @@ static int compute_bitstructs(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) { bi = rgd->rd_bits + x; + bi->bi_flags = 0; /* small rgrp; bitmap stored completely in header block */ if (length == 1) { bytes = bytes_left; @@ -769,6 +770,8 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) } if (!(rgd->rd_flags & GFS2_RDF_UPTODATE)) { + for (x = 0; x < length; x++) + clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); rgd->rd_flags |= GFS2_RDF_UPTODATE; } @@ -897,6 +900,7 @@ void gfs2_rgrp_repolish_clones(struct gfs2_rgrpd *rgd) continue; if (sdp->sd_args.ar_discard) gfs2_rgrp_send_discards(sdp, rgd->rd_data0, bi); + clear_bit(GBF_FULL, &bi->bi_flags); memcpy(bi->bi_clone + bi->bi_offset, bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); } @@ -1309,30 +1313,37 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, { struct gfs2_bitmap *bi = NULL; const u32 length = rgd->rd_length; - u32 blk = 0; + u32 blk = BFITNOENT; unsigned int buf, x; const unsigned int elen = *n; - const u8 *buffer; + const u8 *buffer = NULL; *n = 0; /* Find bitmap block that contains bits for goal block */ for (buf = 0; buf < length; buf++) { bi = rgd->rd_bits + buf; - if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) - break; + /* Convert scope of "goal" from rgrp-wide to within found bit block */ + if (goal < (bi->bi_start + bi->bi_len) * GFS2_NBBY) { + goal -= bi->bi_start * GFS2_NBBY; + goto do_search; + } } + buf = 0; + goal = 0; - gfs2_assert(rgd->rd_sbd, buf < length); - - /* Convert scope of "goal" from rgrp-wide to within found bit block */ - goal -= bi->bi_start * GFS2_NBBY; - +do_search: /* Search (up to entire) bitmap in this rgrp for allocatable block. "x <= length", instead of "x < length", because we typically start the search in the middle of a bit block, but if we can't find an allocatable block anywhere else, we want to be able wrap around and search in the first part of our first-searched bit block. */ for (x = 0; x <= length; x++) { + bi = rgd->rd_bits + buf; + + if (test_bit(GBF_FULL, &bi->bi_flags) && + (old_state == GFS2_BLKST_FREE)) + goto skip; + /* The GFS2_BLKST_UNLINKED state doesn't apply to the clone bitmaps, so we must search the originals for that. */ buffer = bi->bi_bh->b_data + bi->bi_offset; @@ -1343,33 +1354,39 @@ static u32 rgblk_search(struct gfs2_rgrpd *rgd, u32 goal, if (blk != BFITNOENT) break; + if ((goal == 0) && (old_state == GFS2_BLKST_FREE)) + set_bit(GBF_FULL, &bi->bi_flags); + /* Try next bitmap block (wrap back to rgrp header if at end) */ - buf = (buf + 1) % length; - bi = rgd->rd_bits + buf; +skip: + buf++; + buf %= length; goal = 0; } - if (blk != BFITNOENT && old_state != new_state) { - *n = 1; - gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, - bi->bi_len, blk, new_state); - goal = blk; - while (*n < elen) { - goal++; - if (goal >= (bi->bi_len * GFS2_NBBY)) - break; - if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != - GFS2_BLKST_FREE) - break; - gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, - bi->bi_offset, bi->bi_len, goal, - new_state); - (*n)++; - } - } + if (blk == BFITNOENT) + return blk; + *n = 1; + if (old_state == new_state) + goto out; - return (blk == BFITNOENT) ? blk : (bi->bi_start * GFS2_NBBY) + blk; + gfs2_trans_add_bh(rgd->rd_gl, bi->bi_bh, 1); + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi->bi_len, blk, new_state); + goal = blk; + while (*n < elen) { + goal++; + if (goal >= (bi->bi_len * GFS2_NBBY)) + break; + if (gfs2_testbit(rgd, buffer, bi->bi_len, goal) != + GFS2_BLKST_FREE) + break; + gfs2_setbit(rgd, bi->bi_bh->b_data, bi->bi_clone, bi->bi_offset, + bi->bi_len, goal, new_state); + (*n)++; + } +out: + return (bi->bi_start * GFS2_NBBY) + blk; } /** From 1ce97e564b628bee30b8dbb64e5e653a484308f6 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Thu, 21 May 2009 15:18:19 +0100 Subject: [PATCH 1161/2061] GFS2: Be more aggressive in reclaiming unlinked inodes This patch increases the frequency with which gfs2 looks for unlinked, but still allocated inodes. Its the equivalent operation to ext3's orphan list, but done with bitmaps in the resource groups. This also fixes a bug where a field in the rgrp was too small. Signed-off-by: Steven Whitehouse --- fs/gfs2/incore.h | 2 +- fs/gfs2/rgrp.c | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index dd87379b61e..225347fbff3 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -94,7 +94,7 @@ struct gfs2_rgrpd { struct gfs2_sbd *rd_sbd; unsigned int rd_bh_count; u32 rd_last_alloc; - unsigned char rd_flags; + u32 rd_flags; #define GFS2_RDF_CHECK 0x10000000 /* check for unlinked inodes */ #define GFS2_RDF_UPTODATE 0x20000000 /* rg is up to date */ #define GFS2_RDF_ERROR 0x40000000 /* error in rg */ diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 23637b9d1c7..ee3d5c1876a 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -581,7 +581,6 @@ static int read_rindex_entry(struct gfs2_inode *ip, rgd->rd_gl->gl_object = rgd; rgd->rd_flags &= ~GFS2_RDF_UPTODATE; - rgd->rd_flags |= GFS2_RDF_CHECK; return error; } @@ -703,6 +702,8 @@ static void gfs2_rgrp_in(struct gfs2_rgrpd *rgd, const void *buf) rg_flags = be32_to_cpu(str->rg_flags); rg_flags &= ~GFS2_RDF_MASK; + rgd->rd_flags &= GFS2_RDF_MASK; + rgd->rd_flags |= rg_flags; rgd->rd_free = be32_to_cpu(str->rg_free); rgd->rd_dinodes = be32_to_cpu(str->rg_dinodes); rgd->rd_igeneration = be64_to_cpu(str->rg_igeneration); @@ -773,7 +774,7 @@ int gfs2_rgrp_bh_get(struct gfs2_rgrpd *rgd) for (x = 0; x < length; x++) clear_bit(GBF_FULL, &rgd->rd_bits[x].bi_flags); gfs2_rgrp_in(rgd, (rgd->rd_bits[0].bi_bh)->b_data); - rgd->rd_flags |= GFS2_RDF_UPTODATE; + rgd->rd_flags |= (GFS2_RDF_UPTODATE | GFS2_RDF_CHECK); } spin_lock(&sdp->sd_rindex_spin); From 55620c86ebc29013c0d26c5b3bc423faea299fee Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 01:16:17 +0900 Subject: [PATCH 1162/2061] sh: irq: Rework the SR.IMASK bitmap handling. This tidies up how the SR.IMASK bitmap is managed, using the bitmap API directly instead. At the same time, tidy up the irq_chip conversion a bit. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/imask.c | 64 +++++++++------------------------- 1 file changed, 16 insertions(+), 48 deletions(-) diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c index f43753b54e1..3bef3c2629a 100644 --- a/arch/sh/kernel/cpu/irq/imask.c +++ b/arch/sh/kernel/cpu/irq/imask.c @@ -18,38 +18,17 @@ #include #include #include +#include #include #include /* Bitmap of IRQ masked */ -static unsigned long imask_mask = 0x7fff; -static int interrupt_priority = 0; - -static void enable_imask_irq(unsigned int irq); -static void disable_imask_irq(unsigned int irq); -static void shutdown_imask_irq(unsigned int irq); -static void mask_and_ack_imask(unsigned int); -static void end_imask_irq(unsigned int irq); - #define IMASK_PRIORITY 15 -static unsigned int startup_imask_irq(unsigned int irq) -{ - /* Nothing to do */ - return 0; /* never anything pending */ -} +static DECLARE_BITMAP(imask_mask, IMASK_PRIORITY); +static int interrupt_priority; -static struct irq_chip imask_irq_type = { - .typename = "SR.IMASK", - .startup = startup_imask_irq, - .shutdown = shutdown_imask_irq, - .enable = enable_imask_irq, - .disable = disable_imask_irq, - .ack = mask_and_ack_imask, - .end = end_imask_irq -}; - -void static inline set_interrupt_registers(int ip) +static inline void set_interrupt_registers(int ip) { unsigned long __dummy; @@ -72,42 +51,31 @@ void static inline set_interrupt_registers(int ip) : "t"); } -static void disable_imask_irq(unsigned int irq) +static void mask_imask_irq(unsigned int irq) { clear_bit(irq, &imask_mask); if (interrupt_priority < IMASK_PRIORITY - irq) interrupt_priority = IMASK_PRIORITY - irq; - set_interrupt_registers(interrupt_priority); } -static void enable_imask_irq(unsigned int irq) +static void unmask_imask_irq(unsigned int irq) { set_bit(irq, &imask_mask); - interrupt_priority = IMASK_PRIORITY - ffz(imask_mask); - + interrupt_priority = IMASK_PRIORITY - + find_first_zero_bit(imask_mask, IMASK_PRIORITY); set_interrupt_registers(interrupt_priority); } -static void mask_and_ack_imask(unsigned int irq) -{ - disable_imask_irq(irq); -} - -static void end_imask_irq(unsigned int irq) -{ - if (!(irq_desc[irq].status & (IRQ_DISABLED|IRQ_INPROGRESS))) - enable_imask_irq(irq); -} - -static void shutdown_imask_irq(unsigned int irq) -{ - /* Nothing to do */ -} +static struct irq_chip imask_irq_chip = { + .typename = "SR.IMASK", + .mask = mask_imask_irq, + .unmask = unmask_imask_irq, + .mask_ack = mask_imask_irq, +}; void make_imask_irq(unsigned int irq) { - disable_irq_nosync(irq); - irq_desc[irq].chip = &imask_irq_type; - enable_irq(irq); + set_irq_chip_and_handler_name(irq, &imask_irq_chip, + handle_level_irq, "level"); } From fa1d43ab451084785153d37ae559c4fdd1546a5b Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 01:26:16 +0900 Subject: [PATCH 1163/2061] sh: irq: Convert from irq_desc[] to irq_to_desc(). This converts a few places that were using the old irq_desc[] array over to the shiny new irq_to_desc() helper. Preperatory work for sparse irq support. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/intc-sh5.c | 10 +---- arch/sh/kernel/irq.c | 69 +++++++++++++++++++++---------- arch/sh/kernel/sh_ksyms_32.c | 8 ---- 3 files changed, 48 insertions(+), 39 deletions(-) diff --git a/arch/sh/kernel/cpu/irq/intc-sh5.c b/arch/sh/kernel/cpu/irq/intc-sh5.c index a619eaf6e6b..6c092f1f555 100644 --- a/arch/sh/kernel/cpu/irq/intc-sh5.c +++ b/arch/sh/kernel/cpu/irq/intc-sh5.c @@ -152,14 +152,6 @@ static void end_intc_irq(unsigned int irq) enable_intc_irq(irq); } -/* For future use, if we ever support IRLM=0) */ -void make_intc_irq(unsigned int irq) -{ - disable_irq_nosync(irq); - irq_desc[irq].chip = &intc_irq_type; - disable_intc_irq(irq); -} - void __init plat_irq_setup(void) { unsigned long long __dummy0, __dummy1=~0x00000000100000f0; @@ -174,7 +166,7 @@ void __init plat_irq_setup(void) /* Set default: per-line enable/disable, priority driven ack/eoi */ for (i = 0; i < NR_INTC_IRQS; i++) - irq_desc[i].chip = &intc_irq_type; + set_irq_chip_and_handler(i, &intc_irq_type, handle_level_irq); /* Disable all interrupts and set all priorities to 0 to avoid trouble */ diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 3f1372eb009..878532b6176 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -31,39 +31,64 @@ void ack_bad_irq(unsigned int irq) } #if defined(CONFIG_PROC_FS) +/* + * /proc/interrupts printing: + */ +static int show_other_interrupts(struct seq_file *p, int prec) +{ + seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); + return 0; +} + int show_interrupts(struct seq_file *p, void *v) { - int i = *(loff_t *) v, j; - struct irqaction * action; - unsigned long flags; + unsigned long flags, any_count = 0; + int i = *(loff_t *)v, j, prec; + struct irqaction *action; + struct irq_desc *desc; + + if (i > nr_irqs) + return 0; + + for (prec = 3, j = 1000; prec < 10 && j <= nr_irqs; ++prec) + j *= 10; + + if (i == nr_irqs) + return show_other_interrupts(p, prec); if (i == 0) { - seq_puts(p, " "); + seq_printf(p, "%*s", prec + 8, ""); for_each_online_cpu(j) - seq_printf(p, "CPU%d ",j); + seq_printf(p, "CPU%-8d", j); seq_putc(p, '\n'); } - if (i < sh_mv.mv_nr_irqs) { - spin_lock_irqsave(&irq_desc[i].lock, flags); - action = irq_desc[i].action; - if (!action) - goto unlock; - seq_printf(p, "%3d: ",i); - for_each_online_cpu(j) - seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); - seq_printf(p, " %14s", irq_desc[i].chip->name); - seq_printf(p, "-%-8s", irq_desc[i].name); + desc = irq_to_desc(i); + if (!desc) + return 0; + + spin_lock_irqsave(&desc->lock, flags); + for_each_online_cpu(j) + any_count |= kstat_irqs_cpu(i, j); + action = desc->action; + if (!action && !any_count) + goto out; + + seq_printf(p, "%*d: ", prec, i); + for_each_online_cpu(j) + seq_printf(p, "%10u ", kstat_irqs_cpu(i, j)); + seq_printf(p, " %14s", desc->chip->name); + seq_printf(p, "-%-8s", desc->name); + + if (action) { seq_printf(p, " %s", action->name); - - for (action=action->next; action; action = action->next) + while ((action = action->next) != NULL) seq_printf(p, ", %s", action->name); - seq_putc(p, '\n'); -unlock: - spin_unlock_irqrestore(&irq_desc[i].lock, flags); - } else if (i == sh_mv.mv_nr_irqs) - seq_printf(p, "Err: %10u\n", atomic_read(&irq_err_count)); + } + seq_putc(p, '\n'); +out: + spin_unlock_irqrestore(&desc->lock, flags); return 0; } #endif diff --git a/arch/sh/kernel/sh_ksyms_32.c b/arch/sh/kernel/sh_ksyms_32.c index 2d868c60aa6..fcc5de31f83 100644 --- a/arch/sh/kernel/sh_ksyms_32.c +++ b/arch/sh/kernel/sh_ksyms_32.c @@ -23,9 +23,6 @@ extern int dump_fpu(struct pt_regs *, elf_fpregset_t *); /* platform dependent support */ EXPORT_SYMBOL(dump_fpu); EXPORT_SYMBOL(kernel_thread); -EXPORT_SYMBOL(irq_desc); -EXPORT_SYMBOL(no_irq_chip); - EXPORT_SYMBOL(strlen); /* PCI exports */ @@ -40,11 +37,6 @@ EXPORT_SYMBOL(memcpy); EXPORT_SYMBOL(memset); EXPORT_SYMBOL(memmove); EXPORT_SYMBOL(__copy_user); - -#ifdef CONFIG_MMU -EXPORT_SYMBOL(get_vm_area); -#endif - EXPORT_SYMBOL(__udelay); EXPORT_SYMBOL(__ndelay); EXPORT_SYMBOL(__const_udelay); From 05ff3004d278b54760abd71530506d803182c71d Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 01:28:33 +0900 Subject: [PATCH 1164/2061] sh: irq: Teach ipr and intc about dynamically allocating irq_descs. This hooks in irq_to_desc_alloc_cpu() to the necessary code paths in the intc and ipr controller registration paths. As these are the primary call paths for all SH CPUs, this alone will make all CPUs sparse IRQ ready. There is the added benefit now that each CPU contains specific IPR and INTC tables, so only the vectors with interrupt sources backing them will ever see an irq_desc instantiation. This effectively packs irq_desc down to match the CPU, rather than padding NR_IRQS out to cover the valid vector range. Boards with extra sources will still have to fiddle with the nr_irqs setting, but they can continue doing so through the machvec as before. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/ipr.c | 8 ++++++++ drivers/sh/intc.c | 12 ++++++++++-- 2 files changed, 18 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c index 3eb17ee5540..fa0c8467a28 100644 --- a/arch/sh/kernel/cpu/irq/ipr.c +++ b/arch/sh/kernel/cpu/irq/ipr.c @@ -59,10 +59,18 @@ void register_ipr_controller(struct ipr_desc *desc) for (i = 0; i < desc->nr_irqs; i++) { struct ipr_data *p = desc->ipr_data + i; + struct irq_desc *irq_desc; BUG_ON(p->ipr_idx >= desc->nr_offsets); BUG_ON(!desc->ipr_offsets[p->ipr_idx]); + irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id()); + if (unlikely(!irq_desc)) { + printk(KERN_INFO "can not get irq_desc for %d\n", + p->irq); + continue; + } + disable_irq_nosync(p->irq); set_irq_chip_and_handler_name(p->irq, &desc->chip, handle_level_irq, "level"); diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c index 12d13d99b6f..098b767e9af 100644 --- a/drivers/sh/intc.c +++ b/drivers/sh/intc.c @@ -671,7 +671,7 @@ unsigned int intc_evt2irq(unsigned int vector) void __init register_intc_controller(struct intc_desc *desc) { - unsigned int i, k, smp; + unsigned int i, k, smp, cpu = smp_processor_id(); struct intc_desc_int *d; d = alloc_bootmem(sizeof(*d)); @@ -770,11 +770,19 @@ void __init register_intc_controller(struct intc_desc *desc) /* register the vectors one by one */ for (i = 0; i < desc->nr_vectors; i++) { struct intc_vect *vect = desc->vectors + i; + unsigned int irq = evt2irq(vect->vect); + struct irq_desc *irq_desc; if (!vect->enum_id) continue; - intc_register_irq(desc, d, vect->enum_id, evt2irq(vect->vect)); + irq_desc = irq_to_desc_alloc_cpu(irq, cpu); + if (unlikely(!irq_desc)) { + printk(KERN_INFO "can not get irq_desc for %d\n", irq); + continue; + } + + intc_register_irq(desc, d, vect->enum_id, irq); } } From d8586ba6e1415150e1bab89f0a05447bb6f2d6d5 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 01:36:13 +0900 Subject: [PATCH 1165/2061] sh: irq: Provide an arch_probe_nr_irqs() that wraps the machvec def. This is just a simple arch_probe_nr_irqs() stub that wraps to the platform defined number of IRQs. This can be made gradually more intelligent based on what we can infer from the INTC tables and so on. Signed-off-by: Paul Mundt --- arch/sh/kernel/irq.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 878532b6176..3d09062f468 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -279,3 +279,11 @@ void __init init_IRQ(void) irq_ctx_init(smp_processor_id()); } + +#ifdef CONFIG_SPARSE_IRQ +int __init arch_probe_nr_irqs(void) +{ + nr_irqs = sh_mv.mv_nr_irqs; + return 0; +} +#endif From 30cff215b54b20d201a2bd7d50361c4c14700281 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Wed, 20 May 2009 13:44:40 +0000 Subject: [PATCH 1166/2061] sh: clkfwk branch compile fix for clock-sh7722 Fix clkfwk branch compile error in clock-sh7722.c. Signed-off-by: Magnus Damm Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index a98d7da67b9..ae78efd0582 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -892,7 +892,7 @@ int __init arch_clk_init(void) struct clk *clk; int i; - clk_cpg_init(); + cpg_clk_init(); clk = clk_get(NULL, "master_clk"); for (i = 0; i < ARRAY_SIZE(sh7722_clocks); i++) { From 5789ba3bd0a3cd20df5980ebf03358f2eb44fd67 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 15:47:06 -0400 Subject: [PATCH 1167/2061] IMA: Minimal IMA policy and boot param for TCB IMA policy The IMA TCB policy is dangerous. A normal use can use all of a system's memory (which cannot be freed) simply by building and running lots of executables. The TCB policy is also nearly useless because logging in as root often causes a policy violation when dealing with utmp, thus rendering the measurements meaningless. There is no good fix for this in the kernel. A full TCB policy would need to be loaded in userspace using LSM rule matching to get both a protected and useful system. But, if too little is measured before userspace can load a real policy one again ends up with a meaningless set of measurements. One option would be to put the policy load inside the initrd in order to get it early enough in the boot sequence to be useful, but this runs into trouble with the LSM. For IMA to measure the LSM policy and the LSM policy loading mechanism it needs rules to do so, but we already talked about problems with defaulting to such broad rules.... IMA also depends on the files being measured to be on an FS which implements and supports i_version. Since the only FS with this support (ext4) doesn't even use it by default it seems silly to have any IMA rules by default. This should reduce the performance overhead of IMA to near 0 while still letting users who choose to configure their machine as such to inclue the ima_tcb kernel paramenter and get measurements during boot before they can load a customized, reasonable policy in userspace. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- Documentation/kernel-parameters.txt | 6 ++++++ security/integrity/ima/ima_policy.c | 30 ++++++++++++++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..d9a24a04cfb 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -914,6 +914,12 @@ and is between 256 and 4096 characters. It is defined in the file Formt: { "sha1" | "md5" } default: "sha1" + ima_tcb [IMA] + Load a policy which meets the needs of the Trusted + Computing Base. This means IMA will measure all + programs exec'd, files mmap'd for exec, and all files + opened for read by uid=0. + in2000= [HW,SCSI] See header of drivers/scsi/in2000.c. diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 31d677f7c65..4719bbf1641 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -45,9 +45,17 @@ struct ima_measure_rule_entry { } lsm[MAX_LSM_RULES]; }; -/* Without LSM specific knowledge, the default policy can only be +/* + * Without LSM specific knowledge, the default policy can only be * written in terms of .action, .func, .mask, .fsmagic, and .uid */ + +/* + * The minimum rule set to allow for full TCB coverage. Measures all files + * opened or mmap for exec and everything read by root. Dangerous because + * normal users can easily run the machine out of memory simply building + * and running executables. + */ static struct ima_measure_rule_entry default_rules[] = { {.action = DONT_MEASURE,.fsmagic = PROC_SUPER_MAGIC,.flags = IMA_FSMAGIC}, {.action = DONT_MEASURE,.fsmagic = SYSFS_MAGIC,.flags = IMA_FSMAGIC}, @@ -59,6 +67,8 @@ static struct ima_measure_rule_entry default_rules[] = { .flags = IMA_FUNC | IMA_MASK}, {.action = MEASURE,.func = BPRM_CHECK,.mask = MAY_EXEC, .flags = IMA_FUNC | IMA_MASK}, + {.action = MEASURE,.func = PATH_CHECK,.mask = MAY_READ,.uid = 0, + .flags = IMA_FUNC | IMA_MASK | IMA_UID}, }; static LIST_HEAD(measure_default_rules); @@ -67,6 +77,14 @@ static struct list_head *ima_measure; static DEFINE_MUTEX(ima_measure_mutex); +static bool ima_use_tcb __initdata; +static int __init default_policy_setup(char *str) +{ + ima_use_tcb = 1; + return 1; +} +__setup("ima_tcb", default_policy_setup); + /** * ima_match_rules - determine whether an inode matches the measure rule. * @rule: a pointer to a rule @@ -162,9 +180,15 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask) */ void ima_init_policy(void) { - int i; + int i, entries; - for (i = 0; i < ARRAY_SIZE(default_rules); i++) + /* if !ima_use_tcb set entries = 0 so we load NO default rules */ + if (ima_use_tcb) + entries = ARRAY_SIZE(default_rules); + else + entries = 0; + + for (i = 0; i < entries; i++) list_add_tail(&default_rules[i].list, &measure_default_rules); ima_measure = &measure_default_rules; } From 932995f0ce52525b32ff5127b522c2c164de3810 Mon Sep 17 00:00:00 2001 From: Eric Paris Date: Thu, 21 May 2009 15:43:32 -0400 Subject: [PATCH 1168/2061] IMA: Add __init notation to ima functions A number of IMA functions only used during init are not marked with __init. Add those notations so they are freed automatically. Signed-off-by: Eric Paris Acked-by: Mimi Zohar Signed-off-by: James Morris --- security/integrity/ima/ima_crypto.c | 4 ++-- security/integrity/ima/ima_fs.c | 2 +- security/integrity/ima/ima_iint.c | 2 +- security/integrity/ima/ima_init.c | 4 ++-- security/integrity/ima/ima_policy.c | 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/security/integrity/ima/ima_crypto.c b/security/integrity/ima/ima_crypto.c index 50d572b74ca..63003a63aae 100644 --- a/security/integrity/ima/ima_crypto.c +++ b/security/integrity/ima/ima_crypto.c @@ -103,7 +103,7 @@ int ima_calc_template_hash(int template_len, void *template, char *digest) return rc; } -static void ima_pcrread(int idx, u8 *pcr) +static void __init ima_pcrread(int idx, u8 *pcr) { if (!ima_used_chip) return; @@ -115,7 +115,7 @@ static void ima_pcrread(int idx, u8 *pcr) /* * Calculate the boot aggregate hash */ -int ima_calc_boot_aggregate(char *digest) +int __init ima_calc_boot_aggregate(char *digest) { struct hash_desc desc; struct scatterlist sg; diff --git a/security/integrity/ima/ima_fs.c b/security/integrity/ima/ima_fs.c index 3305a961586..7039b14e1f7 100644 --- a/security/integrity/ima/ima_fs.c +++ b/security/integrity/ima/ima_fs.c @@ -319,7 +319,7 @@ static struct file_operations ima_measure_policy_ops = { .release = ima_release_policy }; -int ima_fs_init(void) +int __init ima_fs_init(void) { ima_dir = securityfs_create_dir("ima", NULL); if (IS_ERR(ima_dir)) diff --git a/security/integrity/ima/ima_iint.c b/security/integrity/ima/ima_iint.c index ec79f1ee992..b8dd693f879 100644 --- a/security/integrity/ima/ima_iint.c +++ b/security/integrity/ima/ima_iint.c @@ -196,7 +196,7 @@ static void init_once(void *foo) kref_set(&iint->refcount, 1); } -void ima_iintcache_init(void) +void __init ima_iintcache_init(void) { iint_cache = kmem_cache_create("iint_cache", sizeof(struct ima_iint_cache), 0, diff --git a/security/integrity/ima/ima_init.c b/security/integrity/ima/ima_init.c index 0b0bb8c978c..a40da7ae590 100644 --- a/security/integrity/ima/ima_init.c +++ b/security/integrity/ima/ima_init.c @@ -38,7 +38,7 @@ int ima_used_chip; * a different value.) Violations add a zero entry to the measurement * list and extend the aggregate PCR value with ff...ff's. */ -static void ima_add_boot_aggregate(void) +static void __init ima_add_boot_aggregate(void) { struct ima_template_entry *entry; const char *op = "add_boot_aggregate"; @@ -71,7 +71,7 @@ err_out: audit_cause, result, 0); } -int ima_init(void) +int __init ima_init(void) { u8 pcr_i[IMA_DIGEST_SIZE]; int rc; diff --git a/security/integrity/ima/ima_policy.c b/security/integrity/ima/ima_policy.c index 4719bbf1641..e1278399b34 100644 --- a/security/integrity/ima/ima_policy.c +++ b/security/integrity/ima/ima_policy.c @@ -178,7 +178,7 @@ int ima_match_policy(struct inode *inode, enum ima_hooks func, int mask) * ima_measure points to either the measure_default_rules or the * the new measure_policy_rules. */ -void ima_init_policy(void) +void __init ima_init_policy(void) { int i, entries; From b9fc745db833bbf74b4988493b8cd902a84c9415 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:57 -0400 Subject: [PATCH 1169/2061] integrity: path_check update - Add support in ima_path_check() for integrity checking without incrementing the counts. (Required for nfsd.) - rename and export opencount_get to ima_counts_get - replace ima_shm_check calls with ima_counts_get - export ima_path_check Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/exec.c | 5 ++-- fs/namei.c | 6 ++-- include/linux/ima.h | 11 ++++--- ipc/shm.c | 4 +-- mm/shmem.c | 2 +- security/integrity/ima/ima_main.c | 48 +++++++++++++++++++------------ 6 files changed, 46 insertions(+), 30 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 998e856c307..618d6d1e2c5 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -130,7 +130,8 @@ SYSCALL_DEFINE1(uselib, const char __user *, library) MAY_READ | MAY_EXEC | MAY_OPEN); if (error) goto exit; - error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN); + error = ima_path_check(&nd.path, MAY_READ | MAY_EXEC | MAY_OPEN, + IMA_COUNT_UPDATE); if (error) goto exit; @@ -680,7 +681,7 @@ struct file *open_exec(const char *name) err = inode_permission(nd.path.dentry->d_inode, MAY_EXEC | MAY_OPEN); if (err) goto out_path_put; - err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN); + err = ima_path_check(&nd.path, MAY_EXEC | MAY_OPEN, IMA_COUNT_UPDATE); if (err) goto out_path_put; diff --git a/fs/namei.c b/fs/namei.c index 78f253cd2d4..b05a2b1dea6 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -853,7 +853,8 @@ static int __link_path_walk(const char *name, struct nameidata *nd) err = inode_permission(nd->path.dentry->d_inode, MAY_EXEC); if (!err) - err = ima_path_check(&nd->path, MAY_EXEC); + err = ima_path_check(&nd->path, MAY_EXEC, + IMA_COUNT_UPDATE); if (err) break; @@ -1515,7 +1516,8 @@ int may_open(struct path *path, int acc_mode, int flag) return error; error = ima_path_check(path, - acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC)); + acc_mode & (MAY_READ | MAY_WRITE | MAY_EXEC), + IMA_COUNT_UPDATE); if (error) return error; /* diff --git a/include/linux/ima.h b/include/linux/ima.h index 0e2aa45cb0c..b1b827d091a 100644 --- a/include/linux/ima.h +++ b/include/linux/ima.h @@ -13,14 +13,17 @@ #include struct linux_binprm; +#define IMA_COUNT_UPDATE 1 +#define IMA_COUNT_LEAVE 0 + #ifdef CONFIG_IMA extern int ima_bprm_check(struct linux_binprm *bprm); extern int ima_inode_alloc(struct inode *inode); extern void ima_inode_free(struct inode *inode); -extern int ima_path_check(struct path *path, int mask); +extern int ima_path_check(struct path *path, int mask, int update_counts); extern void ima_file_free(struct file *file); extern int ima_file_mmap(struct file *file, unsigned long prot); -extern void ima_shm_check(struct file *file); +extern void ima_counts_get(struct file *file); #else static inline int ima_bprm_check(struct linux_binprm *bprm) @@ -38,7 +41,7 @@ static inline void ima_inode_free(struct inode *inode) return; } -static inline int ima_path_check(struct path *path, int mask) +static inline int ima_path_check(struct path *path, int mask, int update_counts) { return 0; } @@ -53,7 +56,7 @@ static inline int ima_file_mmap(struct file *file, unsigned long prot) return 0; } -static inline void ima_shm_check(struct file *file) +static inline void ima_counts_get(struct file *file) { return; } diff --git a/ipc/shm.c b/ipc/shm.c index faa46da99eb..47b464229cd 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -384,7 +384,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; - ima_shm_check(file); + ima_counts_get(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { @@ -891,7 +891,7 @@ long do_shmat(int shmid, char __user *shmaddr, int shmflg, ulong *raddr) file = alloc_file(path.mnt, path.dentry, f_mode, &shm_file_operations); if (!file) goto out_free; - ima_shm_check(file); + ima_counts_get(file); file->private_data = sfd; file->f_mapping = shp->shm_file->f_mapping; diff --git a/mm/shmem.c b/mm/shmem.c index b25f95ce3db..a817f75f144 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2684,7 +2684,7 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - ima_shm_check(file); + ima_counts_get(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c4228c0eb2d..a2eb23310ea 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -125,6 +125,15 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, return rc; } +static void ima_update_counts(struct ima_iint_cache *iint, int mask) +{ + iint->opencount++; + if ((mask & MAY_WRITE) || (mask == 0)) + iint->writecount++; + else if (mask & (MAY_READ | MAY_EXEC)) + iint->readcount++; +} + /** * ima_path_check - based on policy, collect/store measurement. * @path: contains a pointer to the path to be measured @@ -143,7 +152,7 @@ static int get_path_measurement(struct ima_iint_cache *iint, struct file *file, * Return 0 on success, an error code on failure. * (Based on the results of appraise_measurement().) */ -int ima_path_check(struct path *path, int mask) +int ima_path_check(struct path *path, int mask, int update_counts) { struct inode *inode = path->dentry->d_inode; struct ima_iint_cache *iint; @@ -157,11 +166,8 @@ int ima_path_check(struct path *path, int mask) return 0; mutex_lock(&iint->mutex); - iint->opencount++; - if ((mask & MAY_WRITE) || (mask == 0)) - iint->writecount++; - else if (mask & (MAY_READ | MAY_EXEC)) - iint->readcount++; + if (update_counts) + ima_update_counts(iint, mask); rc = ima_must_measure(iint, inode, MAY_READ, PATH_CHECK); if (rc < 0) @@ -197,6 +203,7 @@ out: kref_put(&iint->refcount, iint_free); return 0; } +EXPORT_SYMBOL_GPL(ima_path_check); static int process_measurement(struct file *file, const unsigned char *filename, int mask, int function) @@ -225,7 +232,16 @@ out: return rc; } -static void opencount_get(struct file *file) +/* + * ima_opens_get - increment file counts + * + * - for IPC shm and shmat file. + * - for nfsd exported files. + * + * Increment the counts for these files to prevent unnecessary + * imbalance messages. + */ +void ima_counts_get(struct file *file) { struct inode *inode = file->f_dentry->d_inode; struct ima_iint_cache *iint; @@ -237,8 +253,14 @@ static void opencount_get(struct file *file) return; mutex_lock(&iint->mutex); iint->opencount++; + if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ) + iint->readcount++; + + if (file->f_mode & FMODE_WRITE) + iint->writecount++; mutex_unlock(&iint->mutex); } +EXPORT_SYMBOL_GPL(ima_counts_get); /** * ima_file_mmap - based on policy, collect/store measurement. @@ -263,18 +285,6 @@ int ima_file_mmap(struct file *file, unsigned long prot) return 0; } -/* - * ima_shm_check - IPC shm and shmat create/fput a file - * - * Maintain the opencount for these files to prevent unnecessary - * imbalance messages. - */ -void ima_shm_check(struct file *file) -{ - opencount_get(file); - return; -} - /** * ima_bprm_check - based on policy, collect/store measurement. * @bprm: contains the linux_binprm structure From c9d9ac525a0285a5b5ad9c3f9aa8b7c1753e6121 Mon Sep 17 00:00:00 2001 From: Mimi Zohar Date: Tue, 19 May 2009 13:25:58 -0400 Subject: [PATCH 1170/2061] integrity: move ima_counts_get Based on discussion on lkml (Andrew Morton and Eric Paris), move ima_counts_get down a layer into shmem/hugetlb__file_setup(). Resolves drm shmem_file_setup() usage case as well. HD comment: I still think you're doing this at the wrong level, but recognize that you probably won't be persuaded until a few more users of alloc_file() emerge, all wanting your ima_counts_get(). Resolving GEM's shmem_file_setup() is an improvement, so I'll say Acked-by: Hugh Dickins Signed-off-by: Mimi Zohar Signed-off-by: James Morris --- fs/hugetlbfs/inode.c | 2 ++ ipc/shm.c | 1 - mm/shmem.c | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 153d9681192..ccc62de96df 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -30,6 +30,7 @@ #include #include #include +#include #include @@ -997,6 +998,7 @@ struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) &hugetlbfs_file_operations); if (!file) goto out_dentry; /* inode is already attached */ + ima_counts_get(file); return file; diff --git a/ipc/shm.c b/ipc/shm.c index 47b464229cd..56081835359 100644 --- a/ipc/shm.c +++ b/ipc/shm.c @@ -384,7 +384,6 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) error = PTR_ERR(file); if (IS_ERR(file)) goto no_file; - ima_counts_get(file); id = ipc_addid(&shm_ids(ns), &shp->shm_perm, ns->shm_ctlmni); if (id < 0) { diff --git a/mm/shmem.c b/mm/shmem.c index a817f75f144..0132fbd45a2 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2659,6 +2659,7 @@ struct file *shmem_file_setup(char *name, loff_t size, unsigned long flags) if (error) goto close_file; #endif + ima_counts_get(file); return file; close_file: @@ -2684,7 +2685,6 @@ int shmem_zero_setup(struct vm_area_struct *vma) if (IS_ERR(file)) return PTR_ERR(file); - ima_counts_get(file); if (vma->vm_file) fput(vma->vm_file); vma->vm_file = file; From 6470c077cae12227318f40f3e6d756caadcce4b0 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 21 May 2009 18:42:54 +0200 Subject: [PATCH 1171/2061] smack: do not beyond ARRAY_SIZE of data Do not go beyond ARRAY_SIZE of data Signed-off-by: Roel Kluin Acked-by: Casey Schaufler Signed-off-by: James Morris --- security/smack/smackfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/smack/smackfs.c b/security/smack/smackfs.c index 904af348328..8d3c2a051c7 100644 --- a/security/smack/smackfs.c +++ b/security/smack/smackfs.c @@ -776,7 +776,7 @@ static ssize_t smk_write_netlbladdr(struct file *file, const char __user *buf, struct sockaddr_in newname; char smack[SMK_LABELLEN]; char *sp; - char data[SMK_NETLBLADDRMAX]; + char data[SMK_NETLBLADDRMAX + 1]; char *host = (char *)&newname.sin_addr.s_addr; int rc; struct netlbl_audit audit_info; From 7fc1e5c15fde0fa9d2c08441f6898a9e51593d47 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 21 May 2009 18:25:23 +0000 Subject: [PATCH 1172/2061] sh: clkfwk: beyond ARRAY_SIZE of onchip_ops for sh7722. Do not go beyond ARRAY_SIZE of onchip_ops Signed-off-by: Roel Kluin Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/sh4a/clock-sh7722.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c index ae78efd0582..c090c9a373f 100644 --- a/arch/sh/kernel/cpu/sh4a/clock-sh7722.c +++ b/arch/sh/kernel/cpu/sh4a/clock-sh7722.c @@ -883,7 +883,7 @@ struct clk_ops *onchip_ops[] = { void __init arch_init_clk_ops(struct clk_ops **ops, int type) { - BUG_ON(type < 0 || type > ARRAY_SIZE(onchip_ops)); + BUG_ON(type < 0 || type >= ARRAY_SIZE(onchip_ops)); *ops = onchip_ops[type]; } From 2f3ed17e010e8c0873094016f93c1afbb4adb666 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 13:47:52 +0900 Subject: [PATCH 1173/2061] sh: Wrap irq_to_desc_alloc_cpu() around CONFIG_SPARSE_IRQ temporarily. irq_to_desc_alloc_cpu() has been renamed to irq_to_desc_alloc_node() in -next, but as we can not presently enable SPARSE_IRQ without the early irq_desc alloc patch, protect it with an ifdef until the interface has settled and we are ready to enable it system-wide. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/ipr.c | 4 ++++ drivers/sh/intc.c | 5 ++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/arch/sh/kernel/cpu/irq/ipr.c b/arch/sh/kernel/cpu/irq/ipr.c index fa0c8467a28..6ad40dbad88 100644 --- a/arch/sh/kernel/cpu/irq/ipr.c +++ b/arch/sh/kernel/cpu/irq/ipr.c @@ -59,17 +59,21 @@ void register_ipr_controller(struct ipr_desc *desc) for (i = 0; i < desc->nr_irqs; i++) { struct ipr_data *p = desc->ipr_data + i; +#ifdef CONFIG_SPARSE_IRQ struct irq_desc *irq_desc; +#endif BUG_ON(p->ipr_idx >= desc->nr_offsets); BUG_ON(!desc->ipr_offsets[p->ipr_idx]); +#ifdef CONFIG_SPARSE_IRQ irq_desc = irq_to_desc_alloc_cpu(p->irq, smp_processor_id()); if (unlikely(!irq_desc)) { printk(KERN_INFO "can not get irq_desc for %d\n", p->irq); continue; } +#endif disable_irq_nosync(p->irq); set_irq_chip_and_handler_name(p->irq, &desc->chip, diff --git a/drivers/sh/intc.c b/drivers/sh/intc.c index 098b767e9af..caf06569404 100644 --- a/drivers/sh/intc.c +++ b/drivers/sh/intc.c @@ -771,16 +771,19 @@ void __init register_intc_controller(struct intc_desc *desc) for (i = 0; i < desc->nr_vectors; i++) { struct intc_vect *vect = desc->vectors + i; unsigned int irq = evt2irq(vect->vect); +#ifdef CONFIG_SPARSE_IRQ struct irq_desc *irq_desc; - +#endif if (!vect->enum_id) continue; +#ifdef CONFIG_SPARSE_IRQ irq_desc = irq_to_desc_alloc_cpu(irq, cpu); if (unlikely(!irq_desc)) { printk(KERN_INFO "can not get irq_desc for %d\n", irq); continue; } +#endif intc_register_irq(desc, d, vect->enum_id, irq); } From 62fad39be0662a924b60e4354b802525ceda0bb1 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 13:50:18 +0900 Subject: [PATCH 1174/2061] sh: Add a NR_IRQS_LEGACY for external IRQ0-7. This adds a NR_IRQS_LEGACY definition, which will be used by sparse irq. Signed-off-by: Paul Mundt --- arch/sh/include/asm/irq.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/sh/include/asm/irq.h b/arch/sh/include/asm/irq.h index d319baaf4fb..a2b8c99cc06 100644 --- a/arch/sh/include/asm/irq.h +++ b/arch/sh/include/asm/irq.h @@ -8,7 +8,8 @@ * advised to cap this at the hard limit that they're interested in * through the machvec. */ -#define NR_IRQS 256 +#define NR_IRQS 256 +#define NR_IRQS_LEGACY 8 /* Legacy external IRQ0-7 */ /* * Convert back and forth between INTEVT and IRQ values. From 36aa1e32f451b664adaf3fc9a77d8279b7a833b2 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 14:00:34 +0900 Subject: [PATCH 1175/2061] sh: clkfwk: Make clock-cpg usable for non-legacy platforms. This adds a new SH_CLK_CPG for parts that have CPG support. SH_CLK_CPG_LEGACY is made to depend on this, and still needs to be set for platforms that want clock-cpg to register the legacy clocks. With this new config item in place, it is now possible to start layering more generic CPG code in place while other platforms transition off of the legacy clocks. Signed-off-by: Paul Mundt --- arch/sh/Kconfig | 4 ++++ arch/sh/kernel/cpu/Makefile | 2 +- arch/sh/kernel/cpu/clock-cpg.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index df764c56b05..c815975b8d7 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -513,7 +513,11 @@ config SH_PCLK_FREQ This is necessary for determining the reference clock value on platforms lacking an RTC. +config SH_CLK_CPG + def_bool y + config SH_CLK_CPG_LEGACY + depends on SH_CLK_CPG def_bool y if !CPU_SUBTYPE_SH7785 config SH_CLK_MD diff --git a/arch/sh/kernel/cpu/Makefile b/arch/sh/kernel/cpu/Makefile index 05105b980c2..eecad7cbd61 100644 --- a/arch/sh/kernel/cpu/Makefile +++ b/arch/sh/kernel/cpu/Makefile @@ -17,6 +17,6 @@ obj-$(CONFIG_ARCH_SHMOBILE) += shmobile/ obj-$(CONFIG_UBC_WAKEUP) += ubc.o obj-$(CONFIG_SH_ADC) += adc.o -obj-$(CONFIG_SH_CLK_CPG_LEGACY) += clock-cpg.o +obj-$(CONFIG_SH_CLK_CPG) += clock-cpg.o obj-y += irq/ init.o clock.o diff --git a/arch/sh/kernel/cpu/clock-cpg.c b/arch/sh/kernel/cpu/clock-cpg.c index b4ca2004844..b78c237ab36 100644 --- a/arch/sh/kernel/cpu/clock-cpg.c +++ b/arch/sh/kernel/cpu/clock-cpg.c @@ -2,6 +2,7 @@ #include #include +#ifdef CONFIG_SH_CLK_CPG_LEGACY static struct clk master_clk = { .name = "master_clk", .flags = CLK_ENABLE_ON_INIT, @@ -58,3 +59,4 @@ int __init __weak arch_clk_init(void) { return cpg_clk_init(); } +#endif /* CONFIG_SH_CPG_CLK_LEGACY */ From 8fc40238b4ebf07cd11ca9707843338be22af72f Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Fri, 22 May 2009 14:21:03 +0900 Subject: [PATCH 1176/2061] sh: Prefer slab_is_available() over after_bootmem. This kills off after_bootmem and switches to using slab_is_available() instead. Presently the only place this is used is by the sh64 ioremap, and there's not much point in keeping the reference around otherwise. Signed-off-by: Paul Mundt --- arch/sh/mm/init.c | 3 --- arch/sh/mm/ioremap_64.c | 3 ++- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3edf297c829..ee8e6bbe882 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -184,7 +184,6 @@ void __init paging_init(void) } static struct kcore_list kcore_mem, kcore_vmalloc; -int after_bootmem = 0; void __init mem_init(void) { @@ -217,8 +216,6 @@ void __init mem_init(void) memset(empty_zero_page, 0, PAGE_SIZE); __flush_wback_region(empty_zero_page, PAGE_SIZE); - after_bootmem = 1; - codesize = (unsigned long) &_etext - (unsigned long) &_text; datasize = (unsigned long) &_edata - (unsigned long) &_etext; initsize = (unsigned long) &__init_end - (unsigned long) &__init_begin; diff --git a/arch/sh/mm/ioremap_64.c b/arch/sh/mm/ioremap_64.c index 2331229f812..828c8597219 100644 --- a/arch/sh/mm/ioremap_64.c +++ b/arch/sh/mm/ioremap_64.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include @@ -175,7 +176,7 @@ static __init_refok void *sh64_get_page(void) { void *page; - if (after_bootmem) + if (slab_is_available()) page = (void *)get_zeroed_page(GFP_KERNEL); else page = alloc_bootmem_pages(PAGE_SIZE); From b9ed7252d219c1c663944bf03846eabb515dbe75 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Fri, 22 May 2009 09:25:32 +0200 Subject: [PATCH 1177/2061] xen-blkfront: beyond ARRAY_SIZE of info->shadow Do not go beyond ARRAY_SIZE of info->shadow Signed-off-by: Roel Kluin Acked-by: Jeremy Fitzhardinge Signed-off-by: Jens Axboe --- drivers/block/xen-blkfront.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 6d4ac76c280..6d5950839bd 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -122,7 +122,7 @@ static DEFINE_SPINLOCK(blkif_io_lock); static int get_id_from_freelist(struct blkfront_info *info) { unsigned long free = info->shadow_free; - BUG_ON(free > BLK_RING_SIZE); + BUG_ON(free >= BLK_RING_SIZE); info->shadow_free = info->shadow[free].req.id; info->shadow[free].req.id = 0x0fffffee; /* debug */ return free; From b1e71b0622974953e46a284aa986504a90869a9b Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:01:55 +0100 Subject: [PATCH 1178/2061] GFS2: Clean up some file names This patch renames the ops_*.c files which have no counterpart without the ops_ prefix in order to shorten the name and make it more readable. In addition, ops_address.h (which was very small) is moved into inode.h and inode.h is cleaned up by adding extern where required. Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 2 +- fs/gfs2/{ops_address.c => aops.c} | 1 - fs/gfs2/bmap.c | 1 - fs/gfs2/{ops_dentry.c => dentry.c} | 0 fs/gfs2/{ops_export.c => export.c} | 0 fs/gfs2/{ops_file.c => file.c} | 1 - fs/gfs2/inode.c | 1 - fs/gfs2/inode.h | 53 +++++++++++++++++------------- fs/gfs2/meta_io.c | 1 - fs/gfs2/ops_address.h | 23 ------------- fs/gfs2/quota.c | 1 - fs/gfs2/rgrp.c | 1 - 12 files changed, 32 insertions(+), 53 deletions(-) rename fs/gfs2/{ops_address.c => aops.c} (99%) rename fs/gfs2/{ops_dentry.c => dentry.c} (100%) rename fs/gfs2/{ops_export.c => export.c} (100%) rename fs/gfs2/{ops_file.c => file.c} (99%) delete mode 100644 fs/gfs2/ops_address.h diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index a851ea4bdf7..4f7332c7682 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,7 +1,7 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o ops_address.o ops_dentry.o ops_export.o ops_file.o \ + mount.o aops.o dentry.o export.o file.o \ ops_fstype.o ops_inode.o ops_super.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o diff --git a/fs/gfs2/ops_address.c b/fs/gfs2/aops.c similarity index 99% rename from fs/gfs2/ops_address.c rename to fs/gfs2/aops.c index e5664210f0d..03ebb439ace 100644 --- a/fs/gfs2/ops_address.c +++ b/fs/gfs2/aops.c @@ -28,7 +28,6 @@ #include "inode.h" #include "log.h" #include "meta_io.h" -#include "ops_address.h" #include "quota.h" #include "trans.h" #include "rgrp.h" diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 253e1a39f84..1153a078920 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -25,7 +25,6 @@ #include "trans.h" #include "dir.h" #include "util.h" -#include "ops_address.h" /* This doesn't need to be that large as max 64 bit pointers in a 4k * block is 512, so __u16 is fine for that. It saves stack space to diff --git a/fs/gfs2/ops_dentry.c b/fs/gfs2/dentry.c similarity index 100% rename from fs/gfs2/ops_dentry.c rename to fs/gfs2/dentry.c diff --git a/fs/gfs2/ops_export.c b/fs/gfs2/export.c similarity index 100% rename from fs/gfs2/ops_export.c rename to fs/gfs2/export.c diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/file.c similarity index 99% rename from fs/gfs2/ops_file.c rename to fs/gfs2/file.c index 0ee7bd287c5..73b6f552f06 100644 --- a/fs/gfs2/ops_file.c +++ b/fs/gfs2/file.c @@ -39,7 +39,6 @@ #include "trans.h" #include "util.h" #include "eaops.h" -#include "ops_address.h" /** * gfs2_llseek - seek to a location in a file diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 5a31d426116..c03a1a384e7 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -30,7 +30,6 @@ #include "inode.h" #include "log.h" #include "meta_io.h" -#include "ops_address.h" #include "quota.h" #include "rgrp.h" #include "trans.h" diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index c30be2b6658..2c3ec072d60 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -11,8 +11,16 @@ #define __INODE_DOT_H__ #include +#include +#include #include "util.h" +extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); +extern int gfs2_internal_read(struct gfs2_inode *ip, + struct file_ra_state *ra_state, + char *buf, loff_t *pos, unsigned size); +extern void gfs2_set_aops(struct inode *inode); + static inline int gfs2_is_stuffed(const struct gfs2_inode *ip) { return !ip->i_height; @@ -73,30 +81,31 @@ static inline void gfs2_inum_out(const struct gfs2_inode *ip, } -void gfs2_set_iop(struct inode *inode); -struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, - u64 no_addr, u64 no_formal_ino, - int skip_freeing); -struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); +extern void gfs2_set_iop(struct inode *inode); +extern struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, + u64 no_addr, u64 no_formal_ino, + int skip_freeing); +extern struct inode *gfs2_ilookup(struct super_block *sb, u64 no_addr); -int gfs2_inode_refresh(struct gfs2_inode *ip); +extern int gfs2_inode_refresh(struct gfs2_inode *ip); -int gfs2_dinode_dealloc(struct gfs2_inode *inode); -int gfs2_change_nlink(struct gfs2_inode *ip, int diff); -struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, - int is_root); -struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, - unsigned int mode, dev_t dev); -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); -int gfs2_permission(struct inode *inode, int mask); -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); -int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); -struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); -void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); -void gfs2_dinode_print(const struct gfs2_inode *ip); +extern int gfs2_dinode_dealloc(struct gfs2_inode *inode); +extern int gfs2_change_nlink(struct gfs2_inode *ip, int diff); +extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, + int is_root); +extern struct inode *gfs2_createi(struct gfs2_holder *ghs, + const struct qstr *name, + unsigned int mode, dev_t dev); +extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip); +extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip); +extern int gfs2_permission(struct inode *inode, int mask); +extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); +extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); +extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); +extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); +extern void gfs2_dinode_print(const struct gfs2_inode *ip); extern const struct inode_operations gfs2_file_iops; extern const struct inode_operations gfs2_dir_iops; diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index 78a5f431266..cb8d7a93d5e 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -31,7 +31,6 @@ #include "rgrp.h" #include "trans.h" #include "util.h" -#include "ops_address.h" static int gfs2_aspace_writepage(struct page *page, struct writeback_control *wbc) { diff --git a/fs/gfs2/ops_address.h b/fs/gfs2/ops_address.h deleted file mode 100644 index 5da21285bba..00000000000 --- a/fs/gfs2/ops_address.h +++ /dev/null @@ -1,23 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2007 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#ifndef __OPS_ADDRESS_DOT_H__ -#define __OPS_ADDRESS_DOT_H__ - -#include -#include -#include - -extern int gfs2_releasepage(struct page *page, gfp_t gfp_mask); -extern int gfs2_internal_read(struct gfs2_inode *ip, - struct file_ra_state *ra_state, - char *buf, loff_t *pos, unsigned size); -extern void gfs2_set_aops(struct inode *inode); - -#endif /* __OPS_ADDRESS_DOT_H__ */ diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index 152e6c4a0dc..2e9b9326bfc 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -60,7 +60,6 @@ #include "super.h" #include "trans.h" #include "inode.h" -#include "ops_address.h" #include "util.h" #define QUOTA_USER 1 diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index ee3d5c1876a..6122c7ee364 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -29,7 +29,6 @@ #include "util.h" #include "log.h" #include "inode.h" -#include "ops_address.h" #define BFITNOENT ((u32)~0) #define NO_BLOCK ((u64)~0) From 9e6e0a128bca0a151d8d3fbd9459b22fc21cfebb Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:36:01 +0100 Subject: [PATCH 1179/2061] GFS2: Merge mount.c and ops_super.c into super.c mount.c only contained a single function, so is not really worth retaining on its own. All of the super related code is now either in super.c or ops_fstype.c Signed-off-by: Steven Whitehouse --- fs/gfs2/Makefile | 4 +- fs/gfs2/mount.c | 195 ---------- fs/gfs2/ops_super.c | 757 ------------------------------------- fs/gfs2/super.c | 903 +++++++++++++++++++++++++++++++++++++++++++- 4 files changed, 903 insertions(+), 956 deletions(-) delete mode 100644 fs/gfs2/mount.c delete mode 100644 fs/gfs2/ops_super.c diff --git a/fs/gfs2/Makefile b/fs/gfs2/Makefile index 4f7332c7682..d53a9bea1c2 100644 --- a/fs/gfs2/Makefile +++ b/fs/gfs2/Makefile @@ -1,8 +1,8 @@ obj-$(CONFIG_GFS2_FS) += gfs2.o gfs2-y := acl.o bmap.o dir.o eaops.o eattr.o glock.o \ glops.o inode.o log.o lops.o main.o meta_io.o \ - mount.o aops.o dentry.o export.o file.o \ - ops_fstype.o ops_inode.o ops_super.o quota.o \ + aops.o dentry.o export.o file.o \ + ops_fstype.o ops_inode.o quota.o \ recovery.o rgrp.o super.o sys.o trans.o util.o gfs2-$(CONFIG_GFS2_FS_LOCKING_DLM) += lock_dlm.o diff --git a/fs/gfs2/mount.c b/fs/gfs2/mount.c deleted file mode 100644 index 947af151fa2..00000000000 --- a/fs/gfs2/mount.c +++ /dev/null @@ -1,195 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2006 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "super.h" -#include "sys.h" -#include "util.h" - -enum { - Opt_lockproto, - Opt_locktable, - Opt_hostdata, - Opt_spectator, - Opt_ignore_local_fs, - Opt_localflocks, - Opt_localcaching, - Opt_debug, - Opt_nodebug, - Opt_upgrade, - Opt_acl, - Opt_noacl, - Opt_quota_off, - Opt_quota_account, - Opt_quota_on, - Opt_quota, - Opt_noquota, - Opt_suiddir, - Opt_nosuiddir, - Opt_data_writeback, - Opt_data_ordered, - Opt_meta, - Opt_discard, - Opt_nodiscard, - Opt_commit, - Opt_err, -}; - -static const match_table_t tokens = { - {Opt_lockproto, "lockproto=%s"}, - {Opt_locktable, "locktable=%s"}, - {Opt_hostdata, "hostdata=%s"}, - {Opt_spectator, "spectator"}, - {Opt_ignore_local_fs, "ignore_local_fs"}, - {Opt_localflocks, "localflocks"}, - {Opt_localcaching, "localcaching"}, - {Opt_debug, "debug"}, - {Opt_nodebug, "nodebug"}, - {Opt_upgrade, "upgrade"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_quota_off, "quota=off"}, - {Opt_quota_account, "quota=account"}, - {Opt_quota_on, "quota=on"}, - {Opt_quota, "quota"}, - {Opt_noquota, "noquota"}, - {Opt_suiddir, "suiddir"}, - {Opt_nosuiddir, "nosuiddir"}, - {Opt_data_writeback, "data=writeback"}, - {Opt_data_ordered, "data=ordered"}, - {Opt_meta, "meta"}, - {Opt_discard, "discard"}, - {Opt_nodiscard, "nodiscard"}, - {Opt_commit, "commit=%d"}, - {Opt_err, NULL} -}; - -/** - * gfs2_mount_args - Parse mount options - * @sdp: - * @data: - * - * Return: errno - */ - -int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) -{ - char *o; - int token; - substring_t tmp[MAX_OPT_ARGS]; - int rv; - - /* Split the options into tokens with the "," character and - process them */ - - while (1) { - o = strsep(&options, ","); - if (o == NULL) - break; - if (*o == '\0') - continue; - - token = match_token(o, tokens, tmp); - switch (token) { - case Opt_lockproto: - match_strlcpy(args->ar_lockproto, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_locktable: - match_strlcpy(args->ar_locktable, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_hostdata: - match_strlcpy(args->ar_hostdata, &tmp[0], - GFS2_LOCKNAME_LEN); - break; - case Opt_spectator: - args->ar_spectator = 1; - break; - case Opt_ignore_local_fs: - args->ar_ignore_local_fs = 1; - break; - case Opt_localflocks: - args->ar_localflocks = 1; - break; - case Opt_localcaching: - args->ar_localcaching = 1; - break; - case Opt_debug: - args->ar_debug = 1; - break; - case Opt_nodebug: - args->ar_debug = 0; - break; - case Opt_upgrade: - args->ar_upgrade = 1; - break; - case Opt_acl: - args->ar_posix_acl = 1; - break; - case Opt_noacl: - args->ar_posix_acl = 0; - break; - case Opt_quota_off: - case Opt_noquota: - args->ar_quota = GFS2_QUOTA_OFF; - break; - case Opt_quota_account: - args->ar_quota = GFS2_QUOTA_ACCOUNT; - break; - case Opt_quota_on: - case Opt_quota: - args->ar_quota = GFS2_QUOTA_ON; - break; - case Opt_suiddir: - args->ar_suiddir = 1; - break; - case Opt_nosuiddir: - args->ar_suiddir = 0; - break; - case Opt_data_writeback: - args->ar_data = GFS2_DATA_WRITEBACK; - break; - case Opt_data_ordered: - args->ar_data = GFS2_DATA_ORDERED; - break; - case Opt_meta: - args->ar_meta = 1; - break; - case Opt_discard: - args->ar_discard = 1; - break; - case Opt_nodiscard: - args->ar_discard = 0; - break; - case Opt_commit: - rv = match_int(&tmp[0], &args->ar_commit); - if (rv || args->ar_commit <= 0) { - fs_info(sdp, "commit mount option requires a positive numeric argument\n"); - return rv ? rv : -EINVAL; - } - break; - case Opt_err: - default: - fs_info(sdp, "invalid mount option: %s\n", o); - return -EINVAL; - } - } - - return 0; -} - diff --git a/fs/gfs2/ops_super.c b/fs/gfs2/ops_super.c deleted file mode 100644 index 2fd1dcbcc5b..00000000000 --- a/fs/gfs2/ops_super.c +++ /dev/null @@ -1,757 +0,0 @@ -/* - * Copyright (C) Sistina Software, Inc. 1997-2003 All rights reserved. - * Copyright (C) 2004-2008 Red Hat, Inc. All rights reserved. - * - * This copyrighted material is made available to anyone wishing to use, - * modify, copy, or redistribute it subject to the terms and conditions - * of the GNU General Public License version 2. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "gfs2.h" -#include "incore.h" -#include "glock.h" -#include "inode.h" -#include "log.h" -#include "quota.h" -#include "recovery.h" -#include "rgrp.h" -#include "super.h" -#include "sys.h" -#include "util.h" -#include "trans.h" -#include "dir.h" -#include "eattr.h" -#include "bmap.h" -#include "meta_io.h" - -#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) - -/** - * gfs2_write_inode - Make sure the inode is stable on the disk - * @inode: The inode - * @sync: synchronous write flag - * - * Returns: errno - */ - -static int gfs2_write_inode(struct inode *inode, int sync) -{ - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_sbd *sdp = GFS2_SB(inode); - struct gfs2_holder gh; - struct buffer_head *bh; - struct timespec atime; - struct gfs2_dinode *di; - int ret = 0; - - /* Check this is a "normal" inode, etc */ - if (!test_bit(GIF_USER, &ip->i_flags) || - (current->flags & PF_MEMALLOC)) - return 0; - ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (ret) - goto do_flush; - ret = gfs2_trans_begin(sdp, RES_DINODE, 0); - if (ret) - goto do_unlock; - ret = gfs2_meta_inode_buffer(ip, &bh); - if (ret == 0) { - di = (struct gfs2_dinode *)bh->b_data; - atime.tv_sec = be64_to_cpu(di->di_atime); - atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); - if (timespec_compare(&inode->i_atime, &atime) > 0) { - gfs2_trans_add_bh(ip->i_gl, bh, 1); - gfs2_dinode_out(ip, bh->b_data); - } - brelse(bh); - } - gfs2_trans_end(sdp); -do_unlock: - gfs2_glock_dq_uninit(&gh); -do_flush: - if (sync != 0) - gfs2_log_flush(GFS2_SB(inode), ip->i_gl); - return ret; -} - -/** - * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one - * @sdp: the filesystem - * - * Returns: errno - */ - -static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) -{ - struct gfs2_holder t_gh; - int error; - - gfs2_quota_sync(sdp); - gfs2_statfs_sync(sdp); - - error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, - &t_gh); - if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return error; - - gfs2_meta_syncfs(sdp); - gfs2_log_shutdown(sdp); - - clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); - - if (t_gh.gh_gl) - gfs2_glock_dq_uninit(&t_gh); - - gfs2_quota_cleanup(sdp); - - return error; -} - -static int gfs2_umount_recovery_wait(void *word) -{ - schedule(); - return 0; -} - -/** - * gfs2_put_super - Unmount the filesystem - * @sb: The VFS superblock - * - */ - -static void gfs2_put_super(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - struct gfs2_jdesc *jd; - - /* Unfreeze the filesystem, if we need to */ - - mutex_lock(&sdp->sd_freeze_lock); - if (sdp->sd_freeze_count) - gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); - mutex_unlock(&sdp->sd_freeze_lock); - - /* No more recovery requests */ - set_bit(SDF_NORECOVERY, &sdp->sd_flags); - smp_mb(); - - /* Wait on outstanding recovery */ -restart: - spin_lock(&sdp->sd_jindex_spin); - list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { - if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) - continue; - spin_unlock(&sdp->sd_jindex_spin); - wait_on_bit(&jd->jd_flags, JDF_RECOVERY, - gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); - goto restart; - } - spin_unlock(&sdp->sd_jindex_spin); - - kthread_stop(sdp->sd_quotad_process); - kthread_stop(sdp->sd_logd_process); - - if (!(sb->s_flags & MS_RDONLY)) { - error = gfs2_make_fs_ro(sdp); - if (error) - gfs2_io_error(sdp); - } - /* At this point, we're through modifying the disk */ - - /* Release stuff */ - - iput(sdp->sd_jindex); - iput(sdp->sd_inum_inode); - iput(sdp->sd_statfs_inode); - iput(sdp->sd_rindex); - iput(sdp->sd_quota_inode); - - gfs2_glock_put(sdp->sd_rename_gl); - gfs2_glock_put(sdp->sd_trans_gl); - - if (!sdp->sd_args.ar_spectator) { - gfs2_glock_dq_uninit(&sdp->sd_journal_gh); - gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); - gfs2_glock_dq_uninit(&sdp->sd_ir_gh); - gfs2_glock_dq_uninit(&sdp->sd_sc_gh); - gfs2_glock_dq_uninit(&sdp->sd_qc_gh); - iput(sdp->sd_ir_inode); - iput(sdp->sd_sc_inode); - iput(sdp->sd_qc_inode); - } - - gfs2_glock_dq_uninit(&sdp->sd_live_gh); - gfs2_clear_rgrpd(sdp); - gfs2_jindex_free(sdp); - /* Take apart glock structures and buffer lists */ - gfs2_gl_hash_clear(sdp); - /* Unmount the locking protocol */ - gfs2_lm_unmount(sdp); - - /* At this point, we're through participating in the lockspace */ - gfs2_sys_fs_del(sdp); -} - -/** - * gfs2_write_super - * @sb: the superblock - * - */ - -static void gfs2_write_super(struct super_block *sb) -{ - sb->s_dirt = 0; -} - -/** - * gfs2_sync_fs - sync the filesystem - * @sb: the superblock - * - * Flushes the log to disk. - */ - -static int gfs2_sync_fs(struct super_block *sb, int wait) -{ - sb->s_dirt = 0; - if (wait && sb->s_fs_info) - gfs2_log_flush(sb->s_fs_info, NULL); - return 0; -} - -/** - * gfs2_freeze - prevent further writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_freeze(struct super_block *sb) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - int error; - - if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) - return -EINVAL; - - for (;;) { - error = gfs2_freeze_fs(sdp); - if (!error) - break; - - switch (error) { - case -EBUSY: - fs_err(sdp, "waiting for recovery before freeze\n"); - break; - - default: - fs_err(sdp, "error freezing FS: %d\n", error); - break; - } - - fs_err(sdp, "retrying...\n"); - msleep(1000); - } - return 0; -} - -/** - * gfs2_unfreeze - reallow writes to the filesystem - * @sb: the VFS structure for the filesystem - * - */ - -static int gfs2_unfreeze(struct super_block *sb) -{ - gfs2_unfreeze_fs(sb->s_fs_info); - return 0; -} - -/** - * statfs_fill - fill in the sg for a given RG - * @rgd: the RG - * @sc: the sc structure - * - * Returns: 0 on success, -ESTALE if the LVB is invalid - */ - -static int statfs_slow_fill(struct gfs2_rgrpd *rgd, - struct gfs2_statfs_change_host *sc) -{ - gfs2_rgrp_verify(rgd); - sc->sc_total += rgd->rd_data; - sc->sc_free += rgd->rd_free; - sc->sc_dinodes += rgd->rd_dinodes; - return 0; -} - -/** - * gfs2_statfs_slow - Stat a filesystem using asynchronous locking - * @sdp: the filesystem - * @sc: the sc info that will be returned - * - * Any error (other than a signal) will cause this routine to fall back - * to the synchronous version. - * - * FIXME: This really shouldn't busy wait like this. - * - * Returns: errno - */ - -static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_holder ri_gh; - struct gfs2_rgrpd *rgd_next; - struct gfs2_holder *gha, *gh; - unsigned int slots = 64; - unsigned int x; - int done; - int error = 0, err; - - memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); - gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); - if (!gha) - return -ENOMEM; - - error = gfs2_rindex_hold(sdp, &ri_gh); - if (error) - goto out; - - rgd_next = gfs2_rgrpd_get_first(sdp); - - for (;;) { - done = 1; - - for (x = 0; x < slots; x++) { - gh = gha + x; - - if (gh->gh_gl && gfs2_glock_poll(gh)) { - err = gfs2_glock_wait(gh); - if (err) { - gfs2_holder_uninit(gh); - error = err; - } else { - if (!error) - error = statfs_slow_fill( - gh->gh_gl->gl_object, sc); - gfs2_glock_dq_uninit(gh); - } - } - - if (gh->gh_gl) - done = 0; - else if (rgd_next && !error) { - error = gfs2_glock_nq_init(rgd_next->rd_gl, - LM_ST_SHARED, - GL_ASYNC, - gh); - rgd_next = gfs2_rgrpd_get_next(rgd_next); - done = 0; - } - - if (signal_pending(current)) - error = -ERESTARTSYS; - } - - if (done) - break; - - yield(); - } - - gfs2_glock_dq_uninit(&ri_gh); - -out: - kfree(gha); - return error; -} - -/** - * gfs2_statfs_i - Do a statfs - * @sdp: the filesystem - * @sg: the sg structure - * - * Returns: errno - */ - -static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) -{ - struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; - struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; - - spin_lock(&sdp->sd_statfs_spin); - - *sc = *m_sc; - sc->sc_total += l_sc->sc_total; - sc->sc_free += l_sc->sc_free; - sc->sc_dinodes += l_sc->sc_dinodes; - - spin_unlock(&sdp->sd_statfs_spin); - - if (sc->sc_free < 0) - sc->sc_free = 0; - if (sc->sc_free > sc->sc_total) - sc->sc_free = sc->sc_total; - if (sc->sc_dinodes < 0) - sc->sc_dinodes = 0; - - return 0; -} - -/** - * gfs2_statfs - Gather and return stats about the filesystem - * @sb: The superblock - * @statfsbuf: The buffer - * - * Returns: 0 on success or error code - */ - -static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_inode->i_sb; - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_statfs_change_host sc; - int error; - - if (gfs2_tune_get(sdp, gt_statfs_slow)) - error = gfs2_statfs_slow(sdp, &sc); - else - error = gfs2_statfs_i(sdp, &sc); - - if (error) - return error; - - buf->f_type = GFS2_MAGIC; - buf->f_bsize = sdp->sd_sb.sb_bsize; - buf->f_blocks = sc.sc_total; - buf->f_bfree = sc.sc_free; - buf->f_bavail = sc.sc_free; - buf->f_files = sc.sc_dinodes + sc.sc_free; - buf->f_ffree = sc.sc_free; - buf->f_namelen = GFS2_FNAMESIZE; - - return 0; -} - -/** - * gfs2_remount_fs - called when the FS is remounted - * @sb: the filesystem - * @flags: the remount flags - * @data: extra data passed in (not used right now) - * - * Returns: errno - */ - -static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) -{ - struct gfs2_sbd *sdp = sb->s_fs_info; - struct gfs2_args args = sdp->sd_args; /* Default to current settings */ - struct gfs2_tune *gt = &sdp->sd_tune; - int error; - - spin_lock(>->gt_spin); - args.ar_commit = gt->gt_log_flush_secs; - spin_unlock(>->gt_spin); - error = gfs2_mount_args(sdp, &args, data); - if (error) - return error; - - /* Not allowed to change locking details */ - if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || - strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || - strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) - return -EINVAL; - - /* Some flags must not be changed */ - if (args_neq(&args, &sdp->sd_args, spectator) || - args_neq(&args, &sdp->sd_args, ignore_local_fs) || - args_neq(&args, &sdp->sd_args, localflocks) || - args_neq(&args, &sdp->sd_args, localcaching) || - args_neq(&args, &sdp->sd_args, meta)) - return -EINVAL; - - if (sdp->sd_args.ar_spectator) - *flags |= MS_RDONLY; - - if ((sb->s_flags ^ *flags) & MS_RDONLY) { - if (*flags & MS_RDONLY) - error = gfs2_make_fs_ro(sdp); - else - error = gfs2_make_fs_rw(sdp); - if (error) - return error; - } - - sdp->sd_args = args; - if (sdp->sd_args.ar_posix_acl) - sb->s_flags |= MS_POSIXACL; - else - sb->s_flags &= ~MS_POSIXACL; - spin_lock(>->gt_spin); - gt->gt_log_flush_secs = args.ar_commit; - spin_unlock(>->gt_spin); - - return 0; -} - -/** - * gfs2_drop_inode - Drop an inode (test for remote unlink) - * @inode: The inode to drop - * - * If we've received a callback on an iopen lock then its because a - * remote node tried to deallocate the inode but failed due to this node - * still having the inode open. Here we mark the link count zero - * since we know that it must have reached zero if the GLF_DEMOTE flag - * is set on the iopen glock. If we didn't do a disk read since the - * remote node removed the final link then we might otherwise miss - * this event. This check ensures that this node will deallocate the - * inode's blocks, or alternatively pass the baton on to another - * node for later deallocation. - */ - -static void gfs2_drop_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { - struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; - if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) - clear_nlink(inode); - } - generic_drop_inode(inode); -} - -/** - * gfs2_clear_inode - Deallocate an inode when VFS is done with it - * @inode: The VFS inode - * - */ - -static void gfs2_clear_inode(struct inode *inode) -{ - struct gfs2_inode *ip = GFS2_I(inode); - - /* This tells us its a "real" inode and not one which only - * serves to contain an address space (see rgrp.c, meta_io.c) - * which therefore doesn't have its own glocks. - */ - if (test_bit(GIF_USER, &ip->i_flags)) { - ip->i_gl->gl_object = NULL; - gfs2_glock_put(ip->i_gl); - ip->i_gl = NULL; - if (ip->i_iopen_gh.gh_gl) { - ip->i_iopen_gh.gh_gl->gl_object = NULL; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - } - } -} - -static int is_ancestor(const struct dentry *d1, const struct dentry *d2) -{ - do { - if (d1 == d2) - return 1; - d1 = d1->d_parent; - } while (!IS_ROOT(d1)); - return 0; -} - -/** - * gfs2_show_options - Show mount options for /proc/mounts - * @s: seq_file structure - * @mnt: vfsmount - * - * Returns: 0 on success or error code - */ - -static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) -{ - struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; - struct gfs2_args *args = &sdp->sd_args; - int lfsecs; - - if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) - seq_printf(s, ",meta"); - if (args->ar_lockproto[0]) - seq_printf(s, ",lockproto=%s", args->ar_lockproto); - if (args->ar_locktable[0]) - seq_printf(s, ",locktable=%s", args->ar_locktable); - if (args->ar_hostdata[0]) - seq_printf(s, ",hostdata=%s", args->ar_hostdata); - if (args->ar_spectator) - seq_printf(s, ",spectator"); - if (args->ar_ignore_local_fs) - seq_printf(s, ",ignore_local_fs"); - if (args->ar_localflocks) - seq_printf(s, ",localflocks"); - if (args->ar_localcaching) - seq_printf(s, ",localcaching"); - if (args->ar_debug) - seq_printf(s, ",debug"); - if (args->ar_upgrade) - seq_printf(s, ",upgrade"); - if (args->ar_posix_acl) - seq_printf(s, ",acl"); - if (args->ar_quota != GFS2_QUOTA_DEFAULT) { - char *state; - switch (args->ar_quota) { - case GFS2_QUOTA_OFF: - state = "off"; - break; - case GFS2_QUOTA_ACCOUNT: - state = "account"; - break; - case GFS2_QUOTA_ON: - state = "on"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",quota=%s", state); - } - if (args->ar_suiddir) - seq_printf(s, ",suiddir"); - if (args->ar_data != GFS2_DATA_DEFAULT) { - char *state; - switch (args->ar_data) { - case GFS2_DATA_WRITEBACK: - state = "writeback"; - break; - case GFS2_DATA_ORDERED: - state = "ordered"; - break; - default: - state = "unknown"; - break; - } - seq_printf(s, ",data=%s", state); - } - if (args->ar_discard) - seq_printf(s, ",discard"); - lfsecs = sdp->sd_tune.gt_log_flush_secs; - if (lfsecs != 60) - seq_printf(s, ",commit=%d", lfsecs); - return 0; -} - -/* - * We have to (at the moment) hold the inodes main lock to cover - * the gap between unlocking the shared lock on the iopen lock and - * taking the exclusive lock. I'd rather do a shared -> exclusive - * conversion on the iopen lock, but we can change that later. This - * is safe, just less efficient. - */ - -static void gfs2_delete_inode(struct inode *inode) -{ - struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; - struct gfs2_inode *ip = GFS2_I(inode); - struct gfs2_holder gh; - int error; - - if (!test_bit(GIF_USER, &ip->i_flags)) - goto out; - - error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); - if (unlikely(error)) { - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - goto out; - } - - gfs2_glock_dq_wait(&ip->i_iopen_gh); - gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); - error = gfs2_glock_nq(&ip->i_iopen_gh); - if (error) - goto out_truncate; - - if (S_ISDIR(inode->i_mode) && - (ip->i_diskflags & GFS2_DIF_EXHASH)) { - error = gfs2_dir_exhash_dealloc(ip); - if (error) - goto out_unlock; - } - - if (ip->i_eattr) { - error = gfs2_ea_dealloc(ip); - if (error) - goto out_unlock; - } - - if (!gfs2_is_stuffed(ip)) { - error = gfs2_file_dealloc(ip); - if (error) - goto out_unlock; - } - - error = gfs2_dinode_dealloc(ip); - if (error) - goto out_unlock; - -out_truncate: - error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); - if (error) - goto out_unlock; - /* Needs to be done before glock release & also in a transaction */ - truncate_inode_pages(&inode->i_data, 0); - gfs2_trans_end(sdp); - -out_unlock: - if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) - gfs2_glock_dq(&ip->i_iopen_gh); - gfs2_holder_uninit(&ip->i_iopen_gh); - gfs2_glock_dq_uninit(&gh); - if (error && error != GLR_TRYFAILED && error != -EROFS) - fs_warn(sdp, "gfs2_delete_inode: %d\n", error); -out: - truncate_inode_pages(&inode->i_data, 0); - clear_inode(inode); -} - -static struct inode *gfs2_alloc_inode(struct super_block *sb) -{ - struct gfs2_inode *ip; - - ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); - if (ip) { - ip->i_flags = 0; - ip->i_gl = NULL; - } - return &ip->i_inode; -} - -static void gfs2_destroy_inode(struct inode *inode) -{ - kmem_cache_free(gfs2_inode_cachep, inode); -} - -const struct super_operations gfs2_super_ops = { - .alloc_inode = gfs2_alloc_inode, - .destroy_inode = gfs2_destroy_inode, - .write_inode = gfs2_write_inode, - .delete_inode = gfs2_delete_inode, - .put_super = gfs2_put_super, - .write_super = gfs2_write_super, - .sync_fs = gfs2_sync_fs, - .freeze_fs = gfs2_freeze, - .unfreeze_fs = gfs2_unfreeze, - .statfs = gfs2_statfs, - .remount_fs = gfs2_remount_fs, - .clear_inode = gfs2_clear_inode, - .drop_inode = gfs2_drop_inode, - .show_options = gfs2_show_options, -}; - diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 601913e0a48..40bcc37e5a7 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -7,14 +7,20 @@ * of the GNU General Public License version 2. */ +#include #include #include #include #include #include -#include +#include +#include +#include +#include +#include #include -#include +#include +#include #include "gfs2.h" #include "incore.h" @@ -31,6 +37,183 @@ #include "super.h" #include "trans.h" #include "util.h" +#include "sys.h" +#include "eattr.h" + +#define args_neq(a1, a2, x) ((a1)->ar_##x != (a2)->ar_##x) + +enum { + Opt_lockproto, + Opt_locktable, + Opt_hostdata, + Opt_spectator, + Opt_ignore_local_fs, + Opt_localflocks, + Opt_localcaching, + Opt_debug, + Opt_nodebug, + Opt_upgrade, + Opt_acl, + Opt_noacl, + Opt_quota_off, + Opt_quota_account, + Opt_quota_on, + Opt_quota, + Opt_noquota, + Opt_suiddir, + Opt_nosuiddir, + Opt_data_writeback, + Opt_data_ordered, + Opt_meta, + Opt_discard, + Opt_nodiscard, + Opt_commit, + Opt_error, +}; + +static const match_table_t tokens = { + {Opt_lockproto, "lockproto=%s"}, + {Opt_locktable, "locktable=%s"}, + {Opt_hostdata, "hostdata=%s"}, + {Opt_spectator, "spectator"}, + {Opt_ignore_local_fs, "ignore_local_fs"}, + {Opt_localflocks, "localflocks"}, + {Opt_localcaching, "localcaching"}, + {Opt_debug, "debug"}, + {Opt_nodebug, "nodebug"}, + {Opt_upgrade, "upgrade"}, + {Opt_acl, "acl"}, + {Opt_noacl, "noacl"}, + {Opt_quota_off, "quota=off"}, + {Opt_quota_account, "quota=account"}, + {Opt_quota_on, "quota=on"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_suiddir, "suiddir"}, + {Opt_nosuiddir, "nosuiddir"}, + {Opt_data_writeback, "data=writeback"}, + {Opt_data_ordered, "data=ordered"}, + {Opt_meta, "meta"}, + {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, + {Opt_commit, "commit=%d"}, + {Opt_error, NULL} +}; + +/** + * gfs2_mount_args - Parse mount options + * @sdp: + * @data: + * + * Return: errno + */ + +int gfs2_mount_args(struct gfs2_sbd *sdp, struct gfs2_args *args, char *options) +{ + char *o; + int token; + substring_t tmp[MAX_OPT_ARGS]; + int rv; + + /* Split the options into tokens with the "," character and + process them */ + + while (1) { + o = strsep(&options, ","); + if (o == NULL) + break; + if (*o == '\0') + continue; + + token = match_token(o, tokens, tmp); + switch (token) { + case Opt_lockproto: + match_strlcpy(args->ar_lockproto, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_locktable: + match_strlcpy(args->ar_locktable, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_hostdata: + match_strlcpy(args->ar_hostdata, &tmp[0], + GFS2_LOCKNAME_LEN); + break; + case Opt_spectator: + args->ar_spectator = 1; + break; + case Opt_ignore_local_fs: + args->ar_ignore_local_fs = 1; + break; + case Opt_localflocks: + args->ar_localflocks = 1; + break; + case Opt_localcaching: + args->ar_localcaching = 1; + break; + case Opt_debug: + args->ar_debug = 1; + break; + case Opt_nodebug: + args->ar_debug = 0; + break; + case Opt_upgrade: + args->ar_upgrade = 1; + break; + case Opt_acl: + args->ar_posix_acl = 1; + break; + case Opt_noacl: + args->ar_posix_acl = 0; + break; + case Opt_quota_off: + case Opt_noquota: + args->ar_quota = GFS2_QUOTA_OFF; + break; + case Opt_quota_account: + args->ar_quota = GFS2_QUOTA_ACCOUNT; + break; + case Opt_quota_on: + case Opt_quota: + args->ar_quota = GFS2_QUOTA_ON; + break; + case Opt_suiddir: + args->ar_suiddir = 1; + break; + case Opt_nosuiddir: + args->ar_suiddir = 0; + break; + case Opt_data_writeback: + args->ar_data = GFS2_DATA_WRITEBACK; + break; + case Opt_data_ordered: + args->ar_data = GFS2_DATA_ORDERED; + break; + case Opt_meta: + args->ar_meta = 1; + break; + case Opt_discard: + args->ar_discard = 1; + break; + case Opt_nodiscard: + args->ar_discard = 0; + break; + case Opt_commit: + rv = match_int(&tmp[0], &args->ar_commit); + if (rv || args->ar_commit <= 0) { + fs_info(sdp, "commit mount option requires a positive numeric argument\n"); + return rv ? rv : -EINVAL; + } + break; + case Opt_error: + default: + fs_info(sdp, "invalid mount option: %s\n", o); + return -EINVAL; + } + } + + return 0; +} /** * gfs2_jindex_free - Clear all the journal index information @@ -436,3 +619,719 @@ void gfs2_unfreeze_fs(struct gfs2_sbd *sdp) mutex_unlock(&sdp->sd_freeze_lock); } + +/** + * gfs2_write_inode - Make sure the inode is stable on the disk + * @inode: The inode + * @sync: synchronous write flag + * + * Returns: errno + */ + +static int gfs2_write_inode(struct inode *inode, int sync) +{ + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(inode); + struct gfs2_holder gh; + struct buffer_head *bh; + struct timespec atime; + struct gfs2_dinode *di; + int ret = 0; + + /* Check this is a "normal" inode, etc */ + if (!test_bit(GIF_USER, &ip->i_flags) || + (current->flags & PF_MEMALLOC)) + return 0; + ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (ret) + goto do_flush; + ret = gfs2_trans_begin(sdp, RES_DINODE, 0); + if (ret) + goto do_unlock; + ret = gfs2_meta_inode_buffer(ip, &bh); + if (ret == 0) { + di = (struct gfs2_dinode *)bh->b_data; + atime.tv_sec = be64_to_cpu(di->di_atime); + atime.tv_nsec = be32_to_cpu(di->di_atime_nsec); + if (timespec_compare(&inode->i_atime, &atime) > 0) { + gfs2_trans_add_bh(ip->i_gl, bh, 1); + gfs2_dinode_out(ip, bh->b_data); + } + brelse(bh); + } + gfs2_trans_end(sdp); +do_unlock: + gfs2_glock_dq_uninit(&gh); +do_flush: + if (sync != 0) + gfs2_log_flush(GFS2_SB(inode), ip->i_gl); + return ret; +} + +/** + * gfs2_make_fs_ro - Turn a Read-Write FS into a Read-Only one + * @sdp: the filesystem + * + * Returns: errno + */ + +static int gfs2_make_fs_ro(struct gfs2_sbd *sdp) +{ + struct gfs2_holder t_gh; + int error; + + gfs2_quota_sync(sdp); + gfs2_statfs_sync(sdp); + + error = gfs2_glock_nq_init(sdp->sd_trans_gl, LM_ST_SHARED, GL_NOCACHE, + &t_gh); + if (error && !test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return error; + + gfs2_meta_syncfs(sdp); + gfs2_log_shutdown(sdp); + + clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); + + if (t_gh.gh_gl) + gfs2_glock_dq_uninit(&t_gh); + + gfs2_quota_cleanup(sdp); + + return error; +} + +static int gfs2_umount_recovery_wait(void *word) +{ + schedule(); + return 0; +} + +/** + * gfs2_put_super - Unmount the filesystem + * @sb: The VFS superblock + * + */ + +static void gfs2_put_super(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + struct gfs2_jdesc *jd; + + /* Unfreeze the filesystem, if we need to */ + + mutex_lock(&sdp->sd_freeze_lock); + if (sdp->sd_freeze_count) + gfs2_glock_dq_uninit(&sdp->sd_freeze_gh); + mutex_unlock(&sdp->sd_freeze_lock); + + /* No more recovery requests */ + set_bit(SDF_NORECOVERY, &sdp->sd_flags); + smp_mb(); + + /* Wait on outstanding recovery */ +restart: + spin_lock(&sdp->sd_jindex_spin); + list_for_each_entry(jd, &sdp->sd_jindex_list, jd_list) { + if (!test_bit(JDF_RECOVERY, &jd->jd_flags)) + continue; + spin_unlock(&sdp->sd_jindex_spin); + wait_on_bit(&jd->jd_flags, JDF_RECOVERY, + gfs2_umount_recovery_wait, TASK_UNINTERRUPTIBLE); + goto restart; + } + spin_unlock(&sdp->sd_jindex_spin); + + kthread_stop(sdp->sd_quotad_process); + kthread_stop(sdp->sd_logd_process); + + if (!(sb->s_flags & MS_RDONLY)) { + error = gfs2_make_fs_ro(sdp); + if (error) + gfs2_io_error(sdp); + } + /* At this point, we're through modifying the disk */ + + /* Release stuff */ + + iput(sdp->sd_jindex); + iput(sdp->sd_inum_inode); + iput(sdp->sd_statfs_inode); + iput(sdp->sd_rindex); + iput(sdp->sd_quota_inode); + + gfs2_glock_put(sdp->sd_rename_gl); + gfs2_glock_put(sdp->sd_trans_gl); + + if (!sdp->sd_args.ar_spectator) { + gfs2_glock_dq_uninit(&sdp->sd_journal_gh); + gfs2_glock_dq_uninit(&sdp->sd_jinode_gh); + gfs2_glock_dq_uninit(&sdp->sd_ir_gh); + gfs2_glock_dq_uninit(&sdp->sd_sc_gh); + gfs2_glock_dq_uninit(&sdp->sd_qc_gh); + iput(sdp->sd_ir_inode); + iput(sdp->sd_sc_inode); + iput(sdp->sd_qc_inode); + } + + gfs2_glock_dq_uninit(&sdp->sd_live_gh); + gfs2_clear_rgrpd(sdp); + gfs2_jindex_free(sdp); + /* Take apart glock structures and buffer lists */ + gfs2_gl_hash_clear(sdp); + /* Unmount the locking protocol */ + gfs2_lm_unmount(sdp); + + /* At this point, we're through participating in the lockspace */ + gfs2_sys_fs_del(sdp); +} + +/** + * gfs2_write_super + * @sb: the superblock + * + */ + +static void gfs2_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +/** + * gfs2_sync_fs - sync the filesystem + * @sb: the superblock + * + * Flushes the log to disk. + */ + +static int gfs2_sync_fs(struct super_block *sb, int wait) +{ + sb->s_dirt = 0; + if (wait && sb->s_fs_info) + gfs2_log_flush(sb->s_fs_info, NULL); + return 0; +} + +/** + * gfs2_freeze - prevent further writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_freeze(struct super_block *sb) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + int error; + + if (test_bit(SDF_SHUTDOWN, &sdp->sd_flags)) + return -EINVAL; + + for (;;) { + error = gfs2_freeze_fs(sdp); + if (!error) + break; + + switch (error) { + case -EBUSY: + fs_err(sdp, "waiting for recovery before freeze\n"); + break; + + default: + fs_err(sdp, "error freezing FS: %d\n", error); + break; + } + + fs_err(sdp, "retrying...\n"); + msleep(1000); + } + return 0; +} + +/** + * gfs2_unfreeze - reallow writes to the filesystem + * @sb: the VFS structure for the filesystem + * + */ + +static int gfs2_unfreeze(struct super_block *sb) +{ + gfs2_unfreeze_fs(sb->s_fs_info); + return 0; +} + +/** + * statfs_fill - fill in the sg for a given RG + * @rgd: the RG + * @sc: the sc structure + * + * Returns: 0 on success, -ESTALE if the LVB is invalid + */ + +static int statfs_slow_fill(struct gfs2_rgrpd *rgd, + struct gfs2_statfs_change_host *sc) +{ + gfs2_rgrp_verify(rgd); + sc->sc_total += rgd->rd_data; + sc->sc_free += rgd->rd_free; + sc->sc_dinodes += rgd->rd_dinodes; + return 0; +} + +/** + * gfs2_statfs_slow - Stat a filesystem using asynchronous locking + * @sdp: the filesystem + * @sc: the sc info that will be returned + * + * Any error (other than a signal) will cause this routine to fall back + * to the synchronous version. + * + * FIXME: This really shouldn't busy wait like this. + * + * Returns: errno + */ + +static int gfs2_statfs_slow(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_holder ri_gh; + struct gfs2_rgrpd *rgd_next; + struct gfs2_holder *gha, *gh; + unsigned int slots = 64; + unsigned int x; + int done; + int error = 0, err; + + memset(sc, 0, sizeof(struct gfs2_statfs_change_host)); + gha = kcalloc(slots, sizeof(struct gfs2_holder), GFP_KERNEL); + if (!gha) + return -ENOMEM; + + error = gfs2_rindex_hold(sdp, &ri_gh); + if (error) + goto out; + + rgd_next = gfs2_rgrpd_get_first(sdp); + + for (;;) { + done = 1; + + for (x = 0; x < slots; x++) { + gh = gha + x; + + if (gh->gh_gl && gfs2_glock_poll(gh)) { + err = gfs2_glock_wait(gh); + if (err) { + gfs2_holder_uninit(gh); + error = err; + } else { + if (!error) + error = statfs_slow_fill( + gh->gh_gl->gl_object, sc); + gfs2_glock_dq_uninit(gh); + } + } + + if (gh->gh_gl) + done = 0; + else if (rgd_next && !error) { + error = gfs2_glock_nq_init(rgd_next->rd_gl, + LM_ST_SHARED, + GL_ASYNC, + gh); + rgd_next = gfs2_rgrpd_get_next(rgd_next); + done = 0; + } + + if (signal_pending(current)) + error = -ERESTARTSYS; + } + + if (done) + break; + + yield(); + } + + gfs2_glock_dq_uninit(&ri_gh); + +out: + kfree(gha); + return error; +} + +/** + * gfs2_statfs_i - Do a statfs + * @sdp: the filesystem + * @sg: the sg structure + * + * Returns: errno + */ + +static int gfs2_statfs_i(struct gfs2_sbd *sdp, struct gfs2_statfs_change_host *sc) +{ + struct gfs2_statfs_change_host *m_sc = &sdp->sd_statfs_master; + struct gfs2_statfs_change_host *l_sc = &sdp->sd_statfs_local; + + spin_lock(&sdp->sd_statfs_spin); + + *sc = *m_sc; + sc->sc_total += l_sc->sc_total; + sc->sc_free += l_sc->sc_free; + sc->sc_dinodes += l_sc->sc_dinodes; + + spin_unlock(&sdp->sd_statfs_spin); + + if (sc->sc_free < 0) + sc->sc_free = 0; + if (sc->sc_free > sc->sc_total) + sc->sc_free = sc->sc_total; + if (sc->sc_dinodes < 0) + sc->sc_dinodes = 0; + + return 0; +} + +/** + * gfs2_statfs - Gather and return stats about the filesystem + * @sb: The superblock + * @statfsbuf: The buffer + * + * Returns: 0 on success or error code + */ + +static int gfs2_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct super_block *sb = dentry->d_inode->i_sb; + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_statfs_change_host sc; + int error; + + if (gfs2_tune_get(sdp, gt_statfs_slow)) + error = gfs2_statfs_slow(sdp, &sc); + else + error = gfs2_statfs_i(sdp, &sc); + + if (error) + return error; + + buf->f_type = GFS2_MAGIC; + buf->f_bsize = sdp->sd_sb.sb_bsize; + buf->f_blocks = sc.sc_total; + buf->f_bfree = sc.sc_free; + buf->f_bavail = sc.sc_free; + buf->f_files = sc.sc_dinodes + sc.sc_free; + buf->f_ffree = sc.sc_free; + buf->f_namelen = GFS2_FNAMESIZE; + + return 0; +} + +/** + * gfs2_remount_fs - called when the FS is remounted + * @sb: the filesystem + * @flags: the remount flags + * @data: extra data passed in (not used right now) + * + * Returns: errno + */ + +static int gfs2_remount_fs(struct super_block *sb, int *flags, char *data) +{ + struct gfs2_sbd *sdp = sb->s_fs_info; + struct gfs2_args args = sdp->sd_args; /* Default to current settings */ + struct gfs2_tune *gt = &sdp->sd_tune; + int error; + + spin_lock(>->gt_spin); + args.ar_commit = gt->gt_log_flush_secs; + spin_unlock(>->gt_spin); + error = gfs2_mount_args(sdp, &args, data); + if (error) + return error; + + /* Not allowed to change locking details */ + if (strcmp(args.ar_lockproto, sdp->sd_args.ar_lockproto) || + strcmp(args.ar_locktable, sdp->sd_args.ar_locktable) || + strcmp(args.ar_hostdata, sdp->sd_args.ar_hostdata)) + return -EINVAL; + + /* Some flags must not be changed */ + if (args_neq(&args, &sdp->sd_args, spectator) || + args_neq(&args, &sdp->sd_args, ignore_local_fs) || + args_neq(&args, &sdp->sd_args, localflocks) || + args_neq(&args, &sdp->sd_args, localcaching) || + args_neq(&args, &sdp->sd_args, meta)) + return -EINVAL; + + if (sdp->sd_args.ar_spectator) + *flags |= MS_RDONLY; + + if ((sb->s_flags ^ *flags) & MS_RDONLY) { + if (*flags & MS_RDONLY) + error = gfs2_make_fs_ro(sdp); + else + error = gfs2_make_fs_rw(sdp); + if (error) + return error; + } + + sdp->sd_args = args; + if (sdp->sd_args.ar_posix_acl) + sb->s_flags |= MS_POSIXACL; + else + sb->s_flags &= ~MS_POSIXACL; + spin_lock(>->gt_spin); + gt->gt_log_flush_secs = args.ar_commit; + spin_unlock(>->gt_spin); + + return 0; +} + +/** + * gfs2_drop_inode - Drop an inode (test for remote unlink) + * @inode: The inode to drop + * + * If we've received a callback on an iopen lock then its because a + * remote node tried to deallocate the inode but failed due to this node + * still having the inode open. Here we mark the link count zero + * since we know that it must have reached zero if the GLF_DEMOTE flag + * is set on the iopen glock. If we didn't do a disk read since the + * remote node removed the final link then we might otherwise miss + * this event. This check ensures that this node will deallocate the + * inode's blocks, or alternatively pass the baton on to another + * node for later deallocation. + */ + +static void gfs2_drop_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + if (test_bit(GIF_USER, &ip->i_flags) && inode->i_nlink) { + struct gfs2_glock *gl = ip->i_iopen_gh.gh_gl; + if (gl && test_bit(GLF_DEMOTE, &gl->gl_flags)) + clear_nlink(inode); + } + generic_drop_inode(inode); +} + +/** + * gfs2_clear_inode - Deallocate an inode when VFS is done with it + * @inode: The VFS inode + * + */ + +static void gfs2_clear_inode(struct inode *inode) +{ + struct gfs2_inode *ip = GFS2_I(inode); + + /* This tells us its a "real" inode and not one which only + * serves to contain an address space (see rgrp.c, meta_io.c) + * which therefore doesn't have its own glocks. + */ + if (test_bit(GIF_USER, &ip->i_flags)) { + ip->i_gl->gl_object = NULL; + gfs2_glock_put(ip->i_gl); + ip->i_gl = NULL; + if (ip->i_iopen_gh.gh_gl) { + ip->i_iopen_gh.gh_gl->gl_object = NULL; + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + } + } +} + +static int is_ancestor(const struct dentry *d1, const struct dentry *d2) +{ + do { + if (d1 == d2) + return 1; + d1 = d1->d_parent; + } while (!IS_ROOT(d1)); + return 0; +} + +/** + * gfs2_show_options - Show mount options for /proc/mounts + * @s: seq_file structure + * @mnt: vfsmount + * + * Returns: 0 on success or error code + */ + +static int gfs2_show_options(struct seq_file *s, struct vfsmount *mnt) +{ + struct gfs2_sbd *sdp = mnt->mnt_sb->s_fs_info; + struct gfs2_args *args = &sdp->sd_args; + int lfsecs; + + if (is_ancestor(mnt->mnt_root, sdp->sd_master_dir)) + seq_printf(s, ",meta"); + if (args->ar_lockproto[0]) + seq_printf(s, ",lockproto=%s", args->ar_lockproto); + if (args->ar_locktable[0]) + seq_printf(s, ",locktable=%s", args->ar_locktable); + if (args->ar_hostdata[0]) + seq_printf(s, ",hostdata=%s", args->ar_hostdata); + if (args->ar_spectator) + seq_printf(s, ",spectator"); + if (args->ar_ignore_local_fs) + seq_printf(s, ",ignore_local_fs"); + if (args->ar_localflocks) + seq_printf(s, ",localflocks"); + if (args->ar_localcaching) + seq_printf(s, ",localcaching"); + if (args->ar_debug) + seq_printf(s, ",debug"); + if (args->ar_upgrade) + seq_printf(s, ",upgrade"); + if (args->ar_posix_acl) + seq_printf(s, ",acl"); + if (args->ar_quota != GFS2_QUOTA_DEFAULT) { + char *state; + switch (args->ar_quota) { + case GFS2_QUOTA_OFF: + state = "off"; + break; + case GFS2_QUOTA_ACCOUNT: + state = "account"; + break; + case GFS2_QUOTA_ON: + state = "on"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",quota=%s", state); + } + if (args->ar_suiddir) + seq_printf(s, ",suiddir"); + if (args->ar_data != GFS2_DATA_DEFAULT) { + char *state; + switch (args->ar_data) { + case GFS2_DATA_WRITEBACK: + state = "writeback"; + break; + case GFS2_DATA_ORDERED: + state = "ordered"; + break; + default: + state = "unknown"; + break; + } + seq_printf(s, ",data=%s", state); + } + if (args->ar_discard) + seq_printf(s, ",discard"); + lfsecs = sdp->sd_tune.gt_log_flush_secs; + if (lfsecs != 60) + seq_printf(s, ",commit=%d", lfsecs); + return 0; +} + +/* + * We have to (at the moment) hold the inodes main lock to cover + * the gap between unlocking the shared lock on the iopen lock and + * taking the exclusive lock. I'd rather do a shared -> exclusive + * conversion on the iopen lock, but we can change that later. This + * is safe, just less efficient. + */ + +static void gfs2_delete_inode(struct inode *inode) +{ + struct gfs2_sbd *sdp = inode->i_sb->s_fs_info; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_holder gh; + int error; + + if (!test_bit(GIF_USER, &ip->i_flags)) + goto out; + + error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); + if (unlikely(error)) { + gfs2_glock_dq_uninit(&ip->i_iopen_gh); + goto out; + } + + gfs2_glock_dq_wait(&ip->i_iopen_gh); + gfs2_holder_reinit(LM_ST_EXCLUSIVE, LM_FLAG_TRY_1CB | GL_NOCACHE, &ip->i_iopen_gh); + error = gfs2_glock_nq(&ip->i_iopen_gh); + if (error) + goto out_truncate; + + if (S_ISDIR(inode->i_mode) && + (ip->i_diskflags & GFS2_DIF_EXHASH)) { + error = gfs2_dir_exhash_dealloc(ip); + if (error) + goto out_unlock; + } + + if (ip->i_eattr) { + error = gfs2_ea_dealloc(ip); + if (error) + goto out_unlock; + } + + if (!gfs2_is_stuffed(ip)) { + error = gfs2_file_dealloc(ip); + if (error) + goto out_unlock; + } + + error = gfs2_dinode_dealloc(ip); + if (error) + goto out_unlock; + +out_truncate: + error = gfs2_trans_begin(sdp, 0, sdp->sd_jdesc->jd_blocks); + if (error) + goto out_unlock; + /* Needs to be done before glock release & also in a transaction */ + truncate_inode_pages(&inode->i_data, 0); + gfs2_trans_end(sdp); + +out_unlock: + if (test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + gfs2_glock_dq(&ip->i_iopen_gh); + gfs2_holder_uninit(&ip->i_iopen_gh); + gfs2_glock_dq_uninit(&gh); + if (error && error != GLR_TRYFAILED && error != -EROFS) + fs_warn(sdp, "gfs2_delete_inode: %d\n", error); +out: + truncate_inode_pages(&inode->i_data, 0); + clear_inode(inode); +} + +static struct inode *gfs2_alloc_inode(struct super_block *sb) +{ + struct gfs2_inode *ip; + + ip = kmem_cache_alloc(gfs2_inode_cachep, GFP_KERNEL); + if (ip) { + ip->i_flags = 0; + ip->i_gl = NULL; + } + return &ip->i_inode; +} + +static void gfs2_destroy_inode(struct inode *inode) +{ + kmem_cache_free(gfs2_inode_cachep, inode); +} + +const struct super_operations gfs2_super_ops = { + .alloc_inode = gfs2_alloc_inode, + .destroy_inode = gfs2_destroy_inode, + .write_inode = gfs2_write_inode, + .delete_inode = gfs2_delete_inode, + .put_super = gfs2_put_super, + .write_super = gfs2_write_super, + .sync_fs = gfs2_sync_fs, + .freeze_fs = gfs2_freeze, + .unfreeze_fs = gfs2_unfreeze, + .statfs = gfs2_statfs, + .remount_fs = gfs2_remount_fs, + .clear_inode = gfs2_clear_inode, + .drop_inode = gfs2_drop_inode, + .show_options = gfs2_show_options, +}; + From 2286dbfad1fb622ee2691537e5caaedee4618860 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:45:09 +0100 Subject: [PATCH 1180/2061] GFS2: Move gfs2_rmdiri into ops_inode.c Move gfs2_rmdiri() into ops_inode.c and make it static. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 52 -------------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 53 insertions(+), 54 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index c03a1a384e7..9b17447a0f9 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,58 +1046,6 @@ fail: return ERR_PTR(error); } -/** - * gfs2_rmdiri - Remove a directory - * @dip: The parent directory of the directory to be removed - * @name: The name of the directory to be removed - * @ip: The GFS2 inode of the directory to be removed - * - * Assumes Glocks on dip and ip are held - * - * Returns: errno - */ - -int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip) -{ - struct qstr dotname; - int error; - - if (ip->i_entries != 2) { - if (gfs2_consist_inode(ip)) - gfs2_dinode_print(ip); - return -EIO; - } - - error = gfs2_dir_del(dip, name); - if (error) - return error; - - error = gfs2_change_nlink(dip, -1); - if (error) - return error; - - gfs2_str2qstr(&dotname, "."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - gfs2_str2qstr(&dotname, ".."); - error = gfs2_dir_del(ip, &dotname); - if (error) - return error; - - /* It looks odd, but it really should be done twice */ - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - error = gfs2_change_nlink(ip, -1); - if (error) - return error; - - return error; -} /* * gfs2_unlink_ok - check to see that a inode is still in a directory diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 2c3ec072d60..6cd39284eb0 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, - struct gfs2_inode *ip); extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 1c70fa5168d..5dacd647ff0 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -472,6 +472,59 @@ static int gfs2_mkdir(struct inode *dir, struct dentry *dentry, int mode) return 0; } +/** + * gfs2_rmdiri - Remove a directory + * @dip: The parent directory of the directory to be removed + * @name: The name of the directory to be removed + * @ip: The GFS2 inode of the directory to be removed + * + * Assumes Glocks on dip and ip are held + * + * Returns: errno + */ + +static int gfs2_rmdiri(struct gfs2_inode *dip, const struct qstr *name, + struct gfs2_inode *ip) +{ + struct qstr dotname; + int error; + + if (ip->i_entries != 2) { + if (gfs2_consist_inode(ip)) + gfs2_dinode_print(ip); + return -EIO; + } + + error = gfs2_dir_del(dip, name); + if (error) + return error; + + error = gfs2_change_nlink(dip, -1); + if (error) + return error; + + gfs2_str2qstr(&dotname, "."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + gfs2_str2qstr(&dotname, ".."); + error = gfs2_dir_del(ip, &dotname); + if (error) + return error; + + /* It looks odd, but it really should be done twice */ + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + error = gfs2_change_nlink(ip, -1); + if (error) + return error; + + return error; +} + /** * gfs2_rmdir - Remove a directory * @dir: The parent directory of the directory to be removed From 536baf02f650f4547f105386878b4736fbc181e8 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:48:59 +0100 Subject: [PATCH 1181/2061] GFS2: Move gfs2_readlinki into ops_inode.c Move gfs2_readlinki into ops_inode.c and make it static Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 58 +-------------------------------------------- fs/gfs2/inode.h | 1 - fs/gfs2/ops_inode.c | 55 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 56 insertions(+), 58 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 9b17447a0f9..676e750fc84 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1085,63 +1085,7 @@ int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, return 0; } -/** - * gfs2_readlinki - return the contents of a symlink - * @ip: the symlink's inode - * @buf: a pointer to the buffer to be filled - * @len: a pointer to the length of @buf - * - * If @buf is too small, a piece of memory is kmalloc()ed and needs - * to be freed by the caller. - * - * Returns: errno - */ - -int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) -{ - struct gfs2_holder i_gh; - struct buffer_head *dibh; - unsigned int x; - int error; - - gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); - error = gfs2_glock_nq(&i_gh); - if (error) { - gfs2_holder_uninit(&i_gh); - return error; - } - - if (!ip->i_disksize) { - gfs2_consist_inode(ip); - error = -EIO; - goto out; - } - - error = gfs2_meta_inode_buffer(ip, &dibh); - if (error) - goto out; - - x = ip->i_disksize + 1; - if (x > *len) { - *buf = kmalloc(x, GFP_NOFS); - if (!*buf) { - error = -ENOMEM; - goto out_brelse; - } - } - - memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); - *len = x; - -out_brelse: - brelse(dibh); -out: - gfs2_glock_dq_uninit(&i_gh); - return error; -} - -static int -__gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) +static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; int error; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 6cd39284eb0..fc9a08f45be 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -99,7 +99,6 @@ extern struct inode *gfs2_createi(struct gfs2_holder *ghs, extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); -extern int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); extern void gfs2_dinode_out(const struct gfs2_inode *ip, void *buf); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index 5dacd647ff0..f607f0908cf 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -937,6 +937,61 @@ out: return error; } +/** + * gfs2_readlinki - return the contents of a symlink + * @ip: the symlink's inode + * @buf: a pointer to the buffer to be filled + * @len: a pointer to the length of @buf + * + * If @buf is too small, a piece of memory is kmalloc()ed and needs + * to be freed by the caller. + * + * Returns: errno + */ + +static int gfs2_readlinki(struct gfs2_inode *ip, char **buf, unsigned int *len) +{ + struct gfs2_holder i_gh; + struct buffer_head *dibh; + unsigned int x; + int error; + + gfs2_holder_init(ip->i_gl, LM_ST_SHARED, 0, &i_gh); + error = gfs2_glock_nq(&i_gh); + if (error) { + gfs2_holder_uninit(&i_gh); + return error; + } + + if (!ip->i_disksize) { + gfs2_consist_inode(ip); + error = -EIO; + goto out; + } + + error = gfs2_meta_inode_buffer(ip, &dibh); + if (error) + goto out; + + x = ip->i_disksize + 1; + if (x > *len) { + *buf = kmalloc(x, GFP_NOFS); + if (!*buf) { + error = -ENOMEM; + goto out_brelse; + } + } + + memcpy(*buf, dibh->b_data + sizeof(struct gfs2_dinode), x); + *len = x; + +out_brelse: + brelse(dibh); +out: + gfs2_glock_dq_uninit(&i_gh); + return error; +} + /** * gfs2_readlink - Read the value of a symlink * @dentry: the symlink From 87ec21741138bb42e7f943bb142b1d8567c10925 Mon Sep 17 00:00:00 2001 From: Steven Whitehouse Date: Fri, 22 May 2009 10:54:50 +0100 Subject: [PATCH 1182/2061] GFS2: Move gfs2_unlink_ok into ops_inode.c Another function which is only called from one ops_inode.c so we move it and make it static. Signed-off-by: Steven Whitehouse --- fs/gfs2/inode.c | 39 --------------------------------------- fs/gfs2/inode.h | 2 -- fs/gfs2/ops_inode.c | 38 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 41 deletions(-) diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 676e750fc84..2f94bd72369 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1046,45 +1046,6 @@ fail: return ERR_PTR(error); } - -/* - * gfs2_unlink_ok - check to see that a inode is still in a directory - * @dip: the directory - * @name: the name of the file - * @ip: the inode - * - * Assumes that the lock on (at least) @dip is held. - * - * Returns: 0 if the parent/child relationship is correct, errno if it isn't - */ - -int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip) -{ - int error; - - if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) - return -EPERM; - - if ((dip->i_inode.i_mode & S_ISVTX) && - dip->i_inode.i_uid != current_fsuid() && - ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) - return -EPERM; - - if (IS_APPEND(&dip->i_inode)) - return -EPERM; - - error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); - if (error) - return error; - - error = gfs2_dir_check(&dip->i_inode, name, ip); - if (error) - return error; - - return 0; -} - static int __gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr) { struct buffer_head *dibh; diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index fc9a08f45be..c341aaf67ad 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -96,8 +96,6 @@ extern struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, extern struct inode *gfs2_createi(struct gfs2_holder *ghs, const struct qstr *name, unsigned int mode, dev_t dev); -extern int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, - const struct gfs2_inode *ip); extern int gfs2_permission(struct inode *inode, int mask); extern int gfs2_setattr_simple(struct gfs2_inode *ip, struct iattr *attr); extern struct inode *gfs2_lookup_simple(struct inode *dip, const char *name); diff --git a/fs/gfs2/ops_inode.c b/fs/gfs2/ops_inode.c index f607f0908cf..f8bd20baf99 100644 --- a/fs/gfs2/ops_inode.c +++ b/fs/gfs2/ops_inode.c @@ -262,6 +262,44 @@ out_parent: return error; } +/* + * gfs2_unlink_ok - check to see that a inode is still in a directory + * @dip: the directory + * @name: the name of the file + * @ip: the inode + * + * Assumes that the lock on (at least) @dip is held. + * + * Returns: 0 if the parent/child relationship is correct, errno if it isn't + */ + +static int gfs2_unlink_ok(struct gfs2_inode *dip, const struct qstr *name, + const struct gfs2_inode *ip) +{ + int error; + + if (IS_IMMUTABLE(&ip->i_inode) || IS_APPEND(&ip->i_inode)) + return -EPERM; + + if ((dip->i_inode.i_mode & S_ISVTX) && + dip->i_inode.i_uid != current_fsuid() && + ip->i_inode.i_uid != current_fsuid() && !capable(CAP_FOWNER)) + return -EPERM; + + if (IS_APPEND(&dip->i_inode)) + return -EPERM; + + error = gfs2_permission(&dip->i_inode, MAY_WRITE | MAY_EXEC); + if (error) + return error; + + error = gfs2_dir_check(&dip->i_inode, name, ip); + if (error) + return error; + + return 0; +} + /** * gfs2_unlink - Unlink a file * @dir: The inode of the directory containing the file to unlink From a63eaf34ae60bdb067a354cc8def2e8f4a01f5f4 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:17:31 +1000 Subject: [PATCH 1183/2061] perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 1 + include/linux/init_task.h | 13 --- include/linux/perf_counter.h | 4 +- include/linux/sched.h | 6 +- kernel/exit.c | 3 +- kernel/fork.c | 1 + kernel/perf_counter.c | 218 ++++++++++++++++++++++------------- 7 files changed, 145 insertions(+), 101 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index e9021a90802..b4f64402a82 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -14,6 +14,7 @@ * Mikael Pettersson : PM converted to driver model. */ +#include #include #include #include diff --git a/include/linux/init_task.h b/include/linux/init_task.h index 503afaa0afa..d87247d2641 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,18 +108,6 @@ extern struct group_info init_groups; extern struct cred init_cred; -#ifdef CONFIG_PERF_COUNTERS -# define INIT_PERF_COUNTERS(tsk) \ - .perf_counter_ctx.counter_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.counter_list), \ - .perf_counter_ctx.event_list = \ - LIST_HEAD_INIT(tsk.perf_counter_ctx.event_list), \ - .perf_counter_ctx.lock = \ - __SPIN_LOCK_UNLOCKED(tsk.perf_counter_ctx.lock), -#else -# define INIT_PERF_COUNTERS(tsk) -#endif - /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -183,7 +171,6 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ - INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index f612941ef46..07130900546 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -449,7 +449,6 @@ struct perf_counter { struct hw_perf_counter hw; struct perf_counter_context *ctx; - struct task_struct *task; struct file *filp; struct perf_counter *parent; @@ -498,7 +497,6 @@ struct perf_counter { * Used as a container for task counters and CPU counters as well: */ struct perf_counter_context { -#ifdef CONFIG_PERF_COUNTERS /* * Protect the states of the counters in the list, * nr_active, and the list: @@ -516,6 +514,7 @@ struct perf_counter_context { int nr_counters; int nr_active; int is_active; + atomic_t refcount; struct task_struct *task; /* @@ -523,7 +522,6 @@ struct perf_counter_context { */ u64 time; u64 timestamp; -#endif }; /** diff --git a/include/linux/sched.h b/include/linux/sched.h index ff59d123151..9714d450f41 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -71,7 +71,6 @@ struct sched_param { #include #include #include -#include #include #include #include @@ -99,6 +98,7 @@ struct robust_list_head; struct bio; struct bts_tracer; struct fs_struct; +struct perf_counter_context; /* * List of flags we want to share for kernel threads, @@ -1387,7 +1387,9 @@ struct task_struct { struct list_head pi_state_list; struct futex_pi_state *pi_state_cache; #endif - struct perf_counter_context perf_counter_ctx; +#ifdef CONFIG_PERF_COUNTERS + struct perf_counter_context *perf_counter_ctxp; +#endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; short il_next; diff --git a/kernel/exit.c b/kernel/exit.c index f9dfedd94af..99ad4063ee4 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include @@ -159,7 +160,7 @@ static void delayed_put_task_struct(struct rcu_head *rhp) struct task_struct *tsk = container_of(rhp, struct task_struct, rcu); #ifdef CONFIG_PERF_COUNTERS - WARN_ON_ONCE(!list_empty(&tsk->perf_counter_ctx.counter_list)); + WARN_ON_ONCE(tsk->perf_counter_ctxp); #endif trace_sched_process_free(tsk); put_task_struct(tsk); diff --git a/kernel/fork.c b/kernel/fork.c index d32fef4d38e..e72a09f5355 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -63,6 +63,7 @@ #include #include #include +#include #include #include diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 08584c16049..06ea3eae886 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -97,6 +97,17 @@ void perf_enable(void) hw_perf_enable(); } +static void get_ctx(struct perf_counter_context *ctx) +{ + atomic_inc(&ctx->refcount); +} + +static void put_ctx(struct perf_counter_context *ctx) +{ + if (atomic_dec_and_test(&ctx->refcount)) + kfree(ctx); +} + static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -118,11 +129,17 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) ctx->nr_counters++; } +/* + * Remove a counter from the lists for its context. + * Must be called with counter->mutex and ctx->mutex held. + */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { struct perf_counter *sibling, *tmp; + if (list_empty(&counter->list_entry)) + return; ctx->nr_counters--; list_del_init(&counter->list_entry); @@ -216,8 +233,6 @@ static void __perf_counter_remove_from_context(void *info) counter_sched_out(counter, cpuctx, ctx); - counter->task = NULL; - list_del_counter(counter, ctx); if (!ctx->task) { @@ -279,7 +294,6 @@ retry: */ if (!list_empty(&counter->list_entry)) { list_del_counter(counter, ctx); - counter->task = NULL; } spin_unlock_irq(&ctx->lock); } @@ -568,11 +582,17 @@ static void __perf_install_in_context(void *info) * If this is a task context, we need to check whether it is * the current task context of this cpu. If not it has been * scheduled out before the smp call arrived. + * Or possibly this is the right context but it isn't + * on this cpu because it had no counters. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); /* @@ -653,7 +673,6 @@ perf_install_in_context(struct perf_counter_context *ctx, return; } - counter->task = task; retry: task_oncpu_function_call(task, __perf_install_in_context, counter); @@ -693,10 +712,14 @@ static void __perf_counter_enable(void *info) * If this is a per-task counter, need to check whether this * counter's task is the current task on this cpu. */ - if (ctx->task && cpuctx->task_ctx != ctx) - return; + if (ctx->task && cpuctx->task_ctx != ctx) { + if (cpuctx->task_ctx || ctx->task != current) + return; + cpuctx->task_ctx = ctx; + } spin_lock_irqsave(&ctx->lock, flags); + ctx->is_active = 1; update_context_time(ctx); counter->prev_state = counter->state; @@ -852,10 +875,10 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, void perf_counter_task_sched_out(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; struct pt_regs *regs; - if (likely(!cpuctx->task_ctx)) + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); @@ -871,6 +894,8 @@ static void __perf_counter_task_sched_out(struct perf_counter_context *ctx) { struct perf_cpu_context *cpuctx = &__get_cpu_var(perf_cpu_context); + if (!cpuctx->task_ctx) + return; __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; } @@ -969,8 +994,10 @@ __perf_counter_sched_in(struct perf_counter_context *ctx, void perf_counter_task_sched_in(struct task_struct *task, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); - struct perf_counter_context *ctx = &task->perf_counter_ctx; + struct perf_counter_context *ctx = task->perf_counter_ctxp; + if (likely(!ctx)) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -985,11 +1012,11 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) int perf_counter_task_disable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1020,12 +1047,12 @@ int perf_counter_task_disable(void) int perf_counter_task_enable(void) { struct task_struct *curr = current; - struct perf_counter_context *ctx = &curr->perf_counter_ctx; + struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; unsigned long flags; int cpu; - if (likely(!ctx->nr_counters)) + if (!ctx || !ctx->nr_counters) return 0; local_irq_save(flags); @@ -1128,19 +1155,23 @@ void perf_counter_task_tick(struct task_struct *curr, int cpu) return; cpuctx = &per_cpu(perf_cpu_context, cpu); - ctx = &curr->perf_counter_ctx; + ctx = curr->perf_counter_ctxp; perf_adjust_freq(&cpuctx->ctx); - perf_adjust_freq(ctx); + if (ctx) + perf_adjust_freq(ctx); perf_counter_cpu_sched_out(cpuctx); - __perf_counter_task_sched_out(ctx); + if (ctx) + __perf_counter_task_sched_out(ctx); rotate_ctx(&cpuctx->ctx); - rotate_ctx(ctx); + if (ctx) + rotate_ctx(ctx); perf_counter_cpu_sched_in(cpuctx, cpu); - perf_counter_task_sched_in(curr, cpu); + if (ctx) + perf_counter_task_sched_in(curr, cpu); } /* @@ -1176,6 +1207,22 @@ static u64 perf_counter_read(struct perf_counter *counter) return atomic64_read(&counter->count); } +/* + * Initialize the perf_counter context in a task_struct: + */ +static void +__perf_counter_init_context(struct perf_counter_context *ctx, + struct task_struct *task) +{ + memset(ctx, 0, sizeof(*ctx)); + spin_lock_init(&ctx->lock); + mutex_init(&ctx->mutex); + INIT_LIST_HEAD(&ctx->counter_list); + INIT_LIST_HEAD(&ctx->event_list); + atomic_set(&ctx->refcount, 1); + ctx->task = task; +} + static void put_context(struct perf_counter_context *ctx) { if (ctx->task) @@ -1186,6 +1233,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) { struct perf_cpu_context *cpuctx; struct perf_counter_context *ctx; + struct perf_counter_context *tctx; struct task_struct *task; /* @@ -1225,15 +1273,36 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu) if (!task) return ERR_PTR(-ESRCH); - ctx = &task->perf_counter_ctx; - ctx->task = task; - /* Reuse ptrace permission checks for now. */ if (!ptrace_may_access(task, PTRACE_MODE_READ)) { - put_context(ctx); + put_task_struct(task); return ERR_PTR(-EACCES); } + ctx = task->perf_counter_ctxp; + if (!ctx) { + ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!ctx) { + put_task_struct(task); + return ERR_PTR(-ENOMEM); + } + __perf_counter_init_context(ctx, task); + /* + * Make sure other cpus see correct values for *ctx + * once task->perf_counter_ctxp is visible to them. + */ + smp_wmb(); + tctx = cmpxchg(&task->perf_counter_ctxp, NULL, ctx); + if (tctx) { + /* + * We raced with some other task; use + * the context they set. + */ + kfree(ctx); + ctx = tctx; + } + } + return ctx; } @@ -1242,6 +1311,7 @@ static void free_counter_rcu(struct rcu_head *head) struct perf_counter *counter; counter = container_of(head, struct perf_counter, rcu_head); + put_ctx(counter->ctx); kfree(counter); } @@ -2247,7 +2317,7 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event) perf_counter_comm_ctx(&cpuctx->ctx, comm_event); put_cpu_var(perf_cpu_context); - perf_counter_comm_ctx(¤t->perf_counter_ctx, comm_event); + perf_counter_comm_ctx(current->perf_counter_ctxp, comm_event); } void perf_counter_comm(struct task_struct *task) @@ -2256,7 +2326,9 @@ void perf_counter_comm(struct task_struct *task) if (!atomic_read(&nr_comm_tracking)) return; - + if (!current->perf_counter_ctxp) + return; + comm_event = (struct perf_comm_event){ .task = task, .event = { @@ -2372,7 +2444,7 @@ got_name: perf_counter_mmap_ctx(&cpuctx->ctx, mmap_event); put_cpu_var(perf_cpu_context); - perf_counter_mmap_ctx(¤t->perf_counter_ctx, mmap_event); + perf_counter_mmap_ctx(current->perf_counter_ctxp, mmap_event); kfree(buf); } @@ -2384,6 +2456,8 @@ void perf_counter_mmap(unsigned long addr, unsigned long len, if (!atomic_read(&nr_mmap_tracking)) return; + if (!current->perf_counter_ctxp) + return; mmap_event = (struct perf_mmap_event){ .file = file, @@ -2985,6 +3059,7 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; if (hw_event->disabled) @@ -3149,21 +3224,6 @@ err_put_context: goto out_fput; } -/* - * Initialize the perf_counter context in a task_struct: - */ -static void -__perf_counter_init_context(struct perf_counter_context *ctx, - struct task_struct *task) -{ - memset(ctx, 0, sizeof(*ctx)); - spin_lock_init(&ctx->lock); - mutex_init(&ctx->mutex); - INIT_LIST_HEAD(&ctx->counter_list); - INIT_LIST_HEAD(&ctx->event_list); - ctx->task = task; -} - /* * inherit a counter from parent task to child task: */ @@ -3195,7 +3255,6 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link it up in the child's context: */ - child_counter->task = child; add_counter_to_ctx(child_counter, child_ctx); child_counter->parent = parent_counter; @@ -3294,40 +3353,15 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; /* - * If we do not self-reap then we have to wait for the - * child task to unschedule (it will happen for sure), - * so that its counter is at its final count. (This - * condition triggers rarely - child tasks usually get - * off their CPU before the parent has a chance to - * get this far into the reaping action) + * Protect against concurrent operations on child_counter + * due its fd getting closed, etc. */ - if (child != current) { - wait_task_inactive(child, 0); - update_counter_times(child_counter); - list_del_counter(child_counter, child_ctx); - } else { - struct perf_cpu_context *cpuctx; - unsigned long flags; + mutex_lock(&child_counter->mutex); - /* - * Disable and unlink this counter. - * - * Be careful about zapping the list - IRQ/NMI context - * could still be processing it: - */ - local_irq_save(flags); - perf_disable(); + update_counter_times(child_counter); + list_del_counter(child_counter, child_ctx); - cpuctx = &__get_cpu_var(perf_cpu_context); - - group_sched_out(child_counter, cpuctx, child_ctx); - update_counter_times(child_counter); - - list_del_counter(child_counter, child_ctx); - - perf_enable(); - local_irq_restore(flags); - } + mutex_unlock(&child_counter->mutex); parent_counter = child_counter->parent; /* @@ -3346,19 +3380,29 @@ __perf_counter_exit_task(struct task_struct *child, * * Note: we may be running in child context, but the PID is not hashed * anymore so new counters will not be added. + * (XXX not sure that is true when we get called from flush_old_exec. + * -- paulus) */ void perf_counter_exit_task(struct task_struct *child) { struct perf_counter *child_counter, *tmp; struct perf_counter_context *child_ctx; + unsigned long flags; WARN_ON_ONCE(child != current); - child_ctx = &child->perf_counter_ctx; + child_ctx = child->perf_counter_ctxp; - if (likely(!child_ctx->nr_counters)) + if (likely(!child_ctx)) return; + local_irq_save(flags); + __perf_counter_task_sched_out(child_ctx); + child->perf_counter_ctxp = NULL; + local_irq_restore(flags); + + mutex_lock(&child_ctx->mutex); + again: list_for_each_entry_safe(child_counter, tmp, &child_ctx->counter_list, list_entry) @@ -3371,6 +3415,10 @@ again: */ if (!list_empty(&child_ctx->counter_list)) goto again; + + mutex_unlock(&child_ctx->mutex); + + put_ctx(child_ctx); } /* @@ -3382,19 +3430,25 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter *counter; struct task_struct *parent = current; - child_ctx = &child->perf_counter_ctx; - parent_ctx = &parent->perf_counter_ctx; - - __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = NULL; /* * This is executed from the parent task context, so inherit - * counters that have been marked for cloning: + * counters that have been marked for cloning. + * First allocate and initialize a context for the child. */ - if (likely(!parent_ctx->nr_counters)) + child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); + if (!child_ctx) return; + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) + return; + + __perf_counter_init_context(child_ctx, child); + child->perf_counter_ctxp = child_ctx; + /* * Lock the parent list. No need to lock the child - not PID * hashed yet and not running, so nobody can access it. From 564c2b210add41df9a3a5aaa365c1d97cff6110d Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 22 May 2009 14:27:22 +1000 Subject: [PATCH 1184/2061] perf_counter: Optimize context switch between identical inherited contexts When monitoring a process and its descendants with a set of inherited counters, we can often get the situation in a context switch where both the old (outgoing) and new (incoming) process have the same set of counters, and their values are ultimately going to be added together. In that situation it doesn't matter which set of counters are used to count the activity for the new process, so there is really no need to go through the process of reading the hardware counters and updating the old task's counters and then setting up the PMU for the new task. This optimizes the context switch in this situation. Instead of scheduling out the perf_counter_context for the old task and scheduling in the new context, we simply transfer the old context to the new task and keep using it without interruption. The new context gets transferred to the old task. This means that both tasks still have a valid perf_counter_context, so no special case is introduced when the old task gets scheduled in again, either on this CPU or another CPU. The equivalence of contexts is detected by keeping a pointer in each cloned context pointing to the context it was cloned from. To cope with the situation where a context is changed by adding or removing counters after it has been cloned, we also keep a generation number on each context which is incremented every time a context is changed. When a context is cloned we take a copy of the parent's generation number, and two cloned contexts are equivalent only if they have the same parent and the same generation number. In order that the parent context pointer remains valid (and is not reused), we increment the parent context's reference count for each context cloned from it. Since we don't have individual fds for the counters in a cloned context, the only thing that can make two clones of a given parent different after they have been cloned is enabling or disabling all counters with prctl. To account for this, we keep a count of the number of enabled counters in each context. Two contexts must have the same number of enabled counters to be considered equivalent. Here are some measurements of the context switch time as measured with the lat_ctx benchmark from lmbench, comparing the times obtained with and without this patch series: -----Unmodified----- With this patch series Counters: none 2 HW 4H+4S none 2 HW 4H+4S 2 processes: Average 3.44 6.45 11.24 3.12 3.39 3.60 St dev 0.04 0.04 0.13 0.05 0.17 0.19 8 processes: Average 6.45 8.79 14.00 5.57 6.23 7.57 St dev 1.27 1.04 0.88 1.42 1.46 1.42 32 processes: Average 5.56 8.43 13.78 5.28 5.55 7.15 St dev 0.41 0.47 0.53 0.54 0.57 0.81 The numbers are the mean and standard deviation of 20 runs of lat_ctx. The "none" columns are lat_ctx run directly without any counters. The "2 HW" columns are with lat_ctx run under perfstat, counting cycles and instructions. The "4H+4S" columns are lat_ctx run under perfstat with 4 hardware counters and 4 software counters (cycles, instructions, cache references, cache misses, task clock, context switch, cpu migrations, and page faults). [ Impact: performance optimization of counter context-switches ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 12 +++- kernel/perf_counter.c | 109 ++++++++++++++++++++++++++++++----- kernel/sched.c | 2 +- 3 files changed, 107 insertions(+), 16 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 07130900546..4cae01a5045 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -513,6 +513,7 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; + int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; @@ -522,6 +523,14 @@ struct perf_counter_context { */ u64 time; u64 timestamp; + + /* + * These fields let us detect when two contexts have both + * been cloned (inherited) from a common ancestor. + */ + struct perf_counter_context *parent_ctx; + u32 parent_gen; + u32 generation; }; /** @@ -552,7 +561,8 @@ extern int perf_max_counters; extern const struct pmu *hw_perf_counter_init(struct perf_counter *counter); extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); -extern void perf_counter_task_sched_out(struct task_struct *task, int cpu); +extern void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); extern void perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 06ea3eae886..c10055416de 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -104,8 +104,11 @@ static void get_ctx(struct perf_counter_context *ctx) static void put_ctx(struct perf_counter_context *ctx) { - if (atomic_dec_and_test(&ctx->refcount)) + if (atomic_dec_and_test(&ctx->refcount)) { + if (ctx->parent_ctx) + put_ctx(ctx->parent_ctx); kfree(ctx); + } } static void @@ -127,6 +130,8 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled++; } /* @@ -141,6 +146,8 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; + if (counter->state >= PERF_COUNTER_STATE_INACTIVE) + ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -203,6 +210,22 @@ group_sched_out(struct perf_counter *group_counter, cpuctx->exclusive = 0; } +/* + * Mark this context as not being a clone of another. + * Called when counters are added to or removed from this context. + * We also increment our generation number so that anything that + * was cloned from this context before this will not match anything + * cloned from this context after this. + */ +static void unclone_ctx(struct perf_counter_context *ctx) +{ + ++ctx->generation; + if (!ctx->parent_ctx) + return; + put_ctx(ctx->parent_ctx); + ctx->parent_ctx = NULL; +} + /* * Cross CPU call to remove a performance counter * @@ -263,6 +286,7 @@ static void perf_counter_remove_from_context(struct perf_counter *counter) struct perf_counter_context *ctx = counter->ctx; struct task_struct *task = ctx->task; + unclone_ctx(ctx); if (!task) { /* * Per cpu counters are removed via an smp call and @@ -378,6 +402,7 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -419,6 +444,7 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; + ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -727,6 +753,7 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -817,6 +844,7 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; + ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -861,6 +889,25 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, spin_unlock(&ctx->lock); } +/* + * Test whether two contexts are equivalent, i.e. whether they + * have both been cloned from the same version of the same context + * and they both have the same number of enabled counters. + * If the number of enabled counters is the same, then the set + * of enabled counters should be the same, because these are both + * inherited contexts, therefore we can't access individual counters + * in them directly with an fd; we can only enable/disable all + * counters via prctl, or enable/disable all counters in a family + * via ioctl, which will have the same effect on both contexts. + */ +static int context_equiv(struct perf_counter_context *ctx1, + struct perf_counter_context *ctx2) +{ + return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx + && ctx1->parent_gen == ctx2->parent_gen + && ctx1->nr_enabled == ctx2->nr_enabled; +} + /* * Called from scheduler to remove the counters of the current task, * with interrupts disabled. @@ -872,10 +919,12 @@ void __perf_counter_sched_out(struct perf_counter_context *ctx, * accessing the counter control register. If a NMI hits, then it will * not restart the counter. */ -void perf_counter_task_sched_out(struct task_struct *task, int cpu) +void perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { struct perf_cpu_context *cpuctx = &per_cpu(perf_cpu_context, cpu); struct perf_counter_context *ctx = task->perf_counter_ctxp; + struct perf_counter_context *next_ctx; struct pt_regs *regs; if (likely(!ctx || !cpuctx->task_ctx)) @@ -885,6 +934,16 @@ void perf_counter_task_sched_out(struct task_struct *task, int cpu) regs = task_pt_regs(task); perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + + next_ctx = next->perf_counter_ctxp; + if (next_ctx && context_equiv(ctx, next_ctx)) { + task->perf_counter_ctxp = next_ctx; + next->perf_counter_ctxp = ctx; + ctx->task = next; + next_ctx->task = task; + return; + } + __perf_counter_sched_out(ctx, cpuctx); cpuctx->task_ctx = NULL; @@ -998,6 +1057,8 @@ void perf_counter_task_sched_in(struct task_struct *task, int cpu) if (likely(!ctx)) return; + if (cpuctx->task_ctx == ctx) + return; __perf_counter_sched_in(ctx, cpuctx, cpu); cpuctx->task_ctx = ctx; } @@ -3252,6 +3313,16 @@ inherit_counter(struct perf_counter *parent_counter, if (IS_ERR(child_counter)) return child_counter; + /* + * Make the child state follow the state of the parent counter, + * not its hw_event.disabled bit. We hold the parent's mutex, + * so we won't race with perf_counter_{en,dis}able_family. + */ + if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) + child_counter->state = PERF_COUNTER_STATE_INACTIVE; + else + child_counter->state = PERF_COUNTER_STATE_OFF; + /* * Link it up in the child's context: */ @@ -3277,16 +3348,6 @@ inherit_counter(struct perf_counter *parent_counter, mutex_lock(&parent_counter->mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - /* - * Make the child state follow the state of the parent counter, - * not its hw_event.disabled bit. We hold the parent's mutex, - * so we won't race with perf_counter_{en,dis}able_family. - */ - if (parent_counter->state >= PERF_COUNTER_STATE_INACTIVE) - child_counter->state = PERF_COUNTER_STATE_INACTIVE; - else - child_counter->state = PERF_COUNTER_STATE_OFF; - mutex_unlock(&parent_counter->mutex); return child_counter; @@ -3429,6 +3490,7 @@ void perf_counter_init_task(struct task_struct *child) struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; + int inherited_all = 1; child->perf_counter_ctxp = NULL; @@ -3463,12 +3525,31 @@ void perf_counter_init_task(struct task_struct *child) if (counter != counter->group_leader) continue; - if (!counter->hw_event.inherit) + if (!counter->hw_event.inherit) { + inherited_all = 0; continue; + } if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) + parent_ctx, child, child_ctx)) { + inherited_all = 0; break; + } + } + + if (inherited_all) { + /* + * Mark the child context as a clone of the parent + * context, or of whatever the parent is a clone of. + */ + if (parent_ctx->parent_ctx) { + child_ctx->parent_ctx = parent_ctx->parent_ctx; + child_ctx->parent_gen = parent_ctx->parent_gen; + } else { + child_ctx->parent_ctx = parent_ctx; + child_ctx->parent_gen = parent_ctx->generation; + } + get_ctx(child_ctx->parent_ctx); } mutex_unlock(&parent_ctx->mutex); diff --git a/kernel/sched.c b/kernel/sched.c index 419a39d0988..4c0d58bce6b 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -5091,7 +5091,7 @@ need_resched_nonpreemptible: if (likely(prev != next)) { sched_info_switch(prev, next); - perf_counter_task_sched_out(prev, cpu); + perf_counter_task_sched_out(prev, next, cpu); rq->nr_switches++; rq->curr = next; From 910431c7f2e963017d767b29c80ae706421e569f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2009 12:32:15 +0200 Subject: [PATCH 1185/2061] perf_counter: fix !PERF_COUNTERS build failure Update the !CONFIG_PERF_COUNTERS prototype too, for perf_counter_task_sched_out(). [ Impact: build fix ] Signed-off-by: Paul Mackerras Acked-by: Peter Zijlstra Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo LKML-Reference: <18966.10666.517218.332164@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4cae01a5045..2eedae8498d 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -625,7 +625,8 @@ extern void perf_counter_init(void); static inline void perf_counter_task_sched_in(struct task_struct *task, int cpu) { } static inline void -perf_counter_task_sched_out(struct task_struct *task, int cpu) { } +perf_counter_task_sched_out(struct task_struct *task, + struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } static inline void perf_counter_init_task(struct task_struct *child) { } From c6eb13847ba081552d2af644219bddeff7110caf Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 22 May 2009 18:18:28 +0200 Subject: [PATCH 1186/2061] perf_counter tools: increase limits I tried to run with 300 active counters and the tools bailed out because our limit was at 64. So increase the counter limit to 1024 and the CPU limit to 4096. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 6fa3656399f..81a737444c8 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -54,8 +54,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, group_fd, flags); } -#define MAX_COUNTERS 64 -#define MAX_NR_CPUS 256 +#define MAX_COUNTERS 1024 +#define MAX_NR_CPUS 4096 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) From e1defc4ff0cf57aca6c5e3ff99fa503f5943c1f1 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:49 -0400 Subject: [PATCH 1187/2061] block: Do away with the notion of hardsect_size Until now we have had a 1:1 mapping between storage device physical block size and the logical block sized used when addressing the device. With SATA 4KB drives coming out that will no longer be the case. The sector size will be 4KB but the logical block size will remain 512-bytes. Hence we need to distinguish between the physical block size and the logical ditto. This patch renames hardsect_size to logical_block_size. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- arch/powerpc/sysdev/axonram.c | 2 +- block/blk-integrity.c | 2 +- block/blk-settings.c | 21 ++++++++++----------- block/blk-sysfs.c | 12 +++++++++--- block/compat_ioctl.c | 2 +- block/ioctl.c | 2 +- drivers/block/cciss.c | 6 +++--- drivers/block/cpqarray.c | 4 ++-- drivers/block/hd.c | 2 +- drivers/block/mg_disk.c | 2 +- drivers/block/pktcdvd.c | 2 +- drivers/block/ps3disk.c | 2 +- drivers/block/ub.c | 6 +++--- drivers/block/virtio_blk.c | 2 +- drivers/block/xen-blkfront.c | 2 +- drivers/block/xsysace.c | 2 +- drivers/cdrom/gdrom.c | 2 +- drivers/cdrom/viocd.c | 4 ++-- drivers/char/raw.c | 2 +- drivers/ide/ide-cd.c | 12 ++++++------ drivers/md/bitmap.c | 4 ++-- drivers/md/dm-exception-store.c | 2 +- drivers/md/dm-log.c | 3 ++- drivers/md/dm-snap-persistent.c | 2 +- drivers/md/dm-table.c | 12 +++++++----- drivers/md/md.c | 2 +- drivers/memstick/core/mspro_block.c | 2 +- drivers/message/i2o/i2o_block.c | 5 +++-- drivers/mmc/card/block.c | 2 +- drivers/mtd/mtd_blkdevs.c | 2 +- drivers/s390/block/dasd.c | 2 +- drivers/s390/block/dcssblk.c | 2 +- drivers/s390/block/xpram.c | 2 +- drivers/s390/char/tape_block.c | 2 +- drivers/scsi/sd.c | 2 +- drivers/scsi/sr.c | 2 +- fs/bio.c | 3 ++- fs/block_dev.c | 6 +++--- fs/buffer.c | 6 +++--- fs/direct-io.c | 2 +- fs/ext3/super.c | 4 ++-- fs/ext4/super.c | 2 +- fs/gfs2/ops_fstype.c | 4 ++-- fs/gfs2/rgrp.c | 2 +- fs/nilfs2/the_nilfs.c | 2 +- fs/ntfs/super.c | 6 +++--- fs/ocfs2/cluster/heartbeat.c | 2 +- fs/ocfs2/super.c | 2 +- fs/partitions/ibm.c | 2 +- fs/partitions/msdos.c | 4 ++-- fs/udf/super.c | 2 +- fs/xfs/linux-2.6/xfs_buf.c | 2 +- include/linux/blkdev.h | 14 +++++++------- include/linux/device-mapper.h | 2 +- 54 files changed, 108 insertions(+), 98 deletions(-) diff --git a/arch/powerpc/sysdev/axonram.c b/arch/powerpc/sysdev/axonram.c index 9e105cbc5e5..a4779912a5c 100644 --- a/arch/powerpc/sysdev/axonram.c +++ b/arch/powerpc/sysdev/axonram.c @@ -250,7 +250,7 @@ axon_ram_probe(struct of_device *device, const struct of_device_id *device_id) set_capacity(bank->disk, bank->size >> AXON_RAM_SECTOR_SHIFT); blk_queue_make_request(bank->disk->queue, axon_ram_make_request); - blk_queue_hardsect_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); + blk_queue_logical_block_size(bank->disk->queue, AXON_RAM_SECTOR_SIZE); add_disk(bank->disk); bank->irq_id = irq_of_parse_and_map(device->node, 0); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index 91fa8e06b6a..73e28d35568 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -340,7 +340,7 @@ int blk_integrity_register(struct gendisk *disk, struct blk_integrity *template) kobject_uevent(&bi->kobj, KOBJ_ADD); bi->flags |= INTEGRITY_FLAG_READ | INTEGRITY_FLAG_WRITE; - bi->sector_size = disk->queue->hardsect_size; + bi->sector_size = queue_logical_block_size(disk->queue); disk->integrity = bi; } else bi = disk->integrity; diff --git a/block/blk-settings.c b/block/blk-settings.c index 57af728d94b..15c3164537b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -134,7 +134,7 @@ void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn) q->backing_dev_info.state = 0; q->backing_dev_info.capabilities = BDI_CAP_MAP_COPY; blk_queue_max_sectors(q, SAFE_MAX_SECTORS); - blk_queue_hardsect_size(q, 512); + blk_queue_logical_block_size(q, 512); blk_queue_dma_alignment(q, 511); blk_queue_congestion_threshold(q); q->nr_batching = BLK_BATCH_REQ; @@ -288,21 +288,20 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) EXPORT_SYMBOL(blk_queue_max_segment_size); /** - * blk_queue_hardsect_size - set hardware sector size for the queue + * blk_queue_logical_block_size - set logical block size for the queue * @q: the request queue for the device - * @size: the hardware sector size, in bytes + * @size: the logical block size, in bytes * * Description: - * This should typically be set to the lowest possible sector size - * that the hardware can operate on (possible without reverting to - * even internal read-modify-write operations). Usually the default - * of 512 covers most hardware. + * This should be set to the lowest possible block size that the + * storage device can address. The default of 512 covers most + * hardware. **/ -void blk_queue_hardsect_size(struct request_queue *q, unsigned short size) +void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { - q->hardsect_size = size; + q->logical_block_size = size; } -EXPORT_SYMBOL(blk_queue_hardsect_size); +EXPORT_SYMBOL(blk_queue_logical_block_size); /* * Returns the minimum that is _not_ zero, unless both are zero. @@ -324,7 +323,7 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments); t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments); t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->hardsect_size = max(t->hardsect_size, b->hardsect_size); + t->logical_block_size = max(t->logical_block_size, b->logical_block_size); if (!t->queue_lock) WARN_ON_ONCE(1); else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ff9bba3379..13d38b7e4d0 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -100,9 +100,9 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) return queue_var_show(max_sectors_kb, (page)); } -static ssize_t queue_hw_sector_size_show(struct request_queue *q, char *page) +static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) { - return queue_var_show(q->hardsect_size, page); + return queue_var_show(queue_logical_block_size(q), page); } static ssize_t @@ -249,7 +249,12 @@ static struct queue_sysfs_entry queue_iosched_entry = { static struct queue_sysfs_entry queue_hw_sector_size_entry = { .attr = {.name = "hw_sector_size", .mode = S_IRUGO }, - .show = queue_hw_sector_size_show, + .show = queue_logical_block_size_show, +}; + +static struct queue_sysfs_entry queue_logical_block_size_entry = { + .attr = {.name = "logical_block_size", .mode = S_IRUGO }, + .show = queue_logical_block_size_show, }; static struct queue_sysfs_entry queue_nonrot_entry = { @@ -283,6 +288,7 @@ static struct attribute *default_attrs[] = { &queue_max_sectors_entry.attr, &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, + &queue_logical_block_size_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index f87615dea46..9eaa1940273 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -763,7 +763,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ return compat_put_int(arg, block_size(bdev)); case BLKSSZGET: /* get block device hardware sector size */ - return compat_put_int(arg, bdev_hardsect_size(bdev)); + return compat_put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return compat_put_ushort(arg, bdev_get_queue(bdev)->max_sectors); diff --git a/block/ioctl.c b/block/ioctl.c index ad474d4bbcc..7aa97f65da8 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -311,7 +311,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKBSZGET: /* get the logical block size (cf. BLKSSZGET) */ return put_int(arg, block_size(bdev)); case BLKSSZGET: /* get block device hardware sector size */ - return put_int(arg, bdev_hardsect_size(bdev)); + return put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return put_ushort(arg, bdev_get_queue(bdev)->max_sectors); case BLKRASET: diff --git a/drivers/block/cciss.c b/drivers/block/cciss.c index e714e7cce6f..94474f5f8bc 100644 --- a/drivers/block/cciss.c +++ b/drivers/block/cciss.c @@ -1389,8 +1389,8 @@ static void cciss_add_disk(ctlr_info_t *h, struct gendisk *disk, disk->queue->queuedata = h; - blk_queue_hardsect_size(disk->queue, - h->drv[drv_index].block_size); + blk_queue_logical_block_size(disk->queue, + h->drv[drv_index].block_size); /* Make sure all queue data is written out before */ /* setting h->drv[drv_index].queue, as setting this */ @@ -2298,7 +2298,7 @@ static int cciss_revalidate(struct gendisk *disk) cciss_geometry_inquiry(h->ctlr, logvol, 1, total_size, block_size, inq_buff, drv); - blk_queue_hardsect_size(drv->queue, drv->block_size); + blk_queue_logical_block_size(drv->queue, drv->block_size); set_capacity(disk, drv->nr_blocks); kfree(inq_buff); diff --git a/drivers/block/cpqarray.c b/drivers/block/cpqarray.c index a02dcfc00f1..44fa2018f6b 100644 --- a/drivers/block/cpqarray.c +++ b/drivers/block/cpqarray.c @@ -474,7 +474,7 @@ static int __init cpqarray_register_ctlr( int i, struct pci_dev *pdev) disk->fops = &ida_fops; if (j && !drv->nr_blks) continue; - blk_queue_hardsect_size(hba[i]->queue, drv->blk_size); + blk_queue_logical_block_size(hba[i]->queue, drv->blk_size); set_capacity(disk, drv->nr_blks); disk->queue = hba[i]->queue; disk->private_data = drv; @@ -1546,7 +1546,7 @@ static int revalidate_allvol(ctlr_info_t *host) drv_info_t *drv = &host->drv[i]; if (i && !drv->nr_blks) continue; - blk_queue_hardsect_size(host->queue, drv->blk_size); + blk_queue_logical_block_size(host->queue, drv->blk_size); set_capacity(disk, drv->nr_blks); disk->queue = host->queue; disk->private_data = drv; diff --git a/drivers/block/hd.c b/drivers/block/hd.c index 961de56d00a..f65b3f369eb 100644 --- a/drivers/block/hd.c +++ b/drivers/block/hd.c @@ -724,7 +724,7 @@ static int __init hd_init(void) blk_queue_max_sectors(hd_queue, 255); init_timer(&device_timer); device_timer.function = hd_times_out; - blk_queue_hardsect_size(hd_queue, 512); + blk_queue_logical_block_size(hd_queue, 512); if (!NR_HD) { /* diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index c0cd0a03f69..60de5a01e71 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -996,7 +996,7 @@ static int mg_probe(struct platform_device *plat_dev) goto probe_err_6; } blk_queue_max_sectors(host->breq, MG_MAX_SECTS); - blk_queue_hardsect_size(host->breq, MG_SECTOR_SIZE); + blk_queue_logical_block_size(host->breq, MG_SECTOR_SIZE); init_timer(&host->timer); host->timer.function = mg_times_out; diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index dc7a8c352da..293f5858921 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -2657,7 +2657,7 @@ static void pkt_init_queue(struct pktcdvd_device *pd) struct request_queue *q = pd->disk->queue; blk_queue_make_request(q, pkt_make_request); - blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_logical_block_size(q, CD_FRAMESIZE); blk_queue_max_sectors(q, PACKET_MAX_SECTORS); blk_queue_merge_bvec(q, pkt_merge_bvec); q->queuedata = pd; diff --git a/drivers/block/ps3disk.c b/drivers/block/ps3disk.c index 338cee4cc0b..aaeeb544228 100644 --- a/drivers/block/ps3disk.c +++ b/drivers/block/ps3disk.c @@ -477,7 +477,7 @@ static int __devinit ps3disk_probe(struct ps3_system_bus_device *_dev) blk_queue_max_sectors(queue, dev->bounce_size >> 9); blk_queue_segment_boundary(queue, -1UL); blk_queue_dma_alignment(queue, dev->blk_size-1); - blk_queue_hardsect_size(queue, dev->blk_size); + blk_queue_logical_block_size(queue, dev->blk_size); blk_queue_ordered(queue, QUEUE_ORDERED_DRAIN_FLUSH, ps3disk_prepare_flush); diff --git a/drivers/block/ub.c b/drivers/block/ub.c index e67bbae9547..cc54473b8e7 100644 --- a/drivers/block/ub.c +++ b/drivers/block/ub.c @@ -722,7 +722,7 @@ static void ub_cmd_build_block(struct ub_dev *sc, struct ub_lun *lun, /* * build the command * - * The call to blk_queue_hardsect_size() guarantees that request + * The call to blk_queue_logical_block_size() guarantees that request * is aligned, but it is given in terms of 512 byte units, always. */ block = blk_rq_pos(rq) >> lun->capacity.bshift; @@ -1749,7 +1749,7 @@ static int ub_bd_revalidate(struct gendisk *disk) ub_revalidate(lun->udev, lun); /* XXX Support sector size switching like in sr.c */ - blk_queue_hardsect_size(disk->queue, lun->capacity.bsize); + blk_queue_logical_block_size(disk->queue, lun->capacity.bsize); set_capacity(disk, lun->capacity.nsec); // set_disk_ro(sdkp->disk, lun->readonly); @@ -2324,7 +2324,7 @@ static int ub_probe_lun(struct ub_dev *sc, int lnum) blk_queue_max_phys_segments(q, UB_MAX_REQ_SG); blk_queue_segment_boundary(q, 0xffffffff); /* Dubious. */ blk_queue_max_sectors(q, UB_MAX_SECTORS); - blk_queue_hardsect_size(q, lun->capacity.bsize); + blk_queue_logical_block_size(q, lun->capacity.bsize); lun->disk = disk; q->queuedata = lun; diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 511d4ae2d17..c4845b16946 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -347,7 +347,7 @@ static int virtblk_probe(struct virtio_device *vdev) offsetof(struct virtio_blk_config, blk_size), &blk_size); if (!err) - blk_queue_hardsect_size(vblk->disk->queue, blk_size); + blk_queue_logical_block_size(vblk->disk->queue, blk_size); add_disk(vblk->disk); return 0; diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 132120ae4bd..c1996829d5e 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -344,7 +344,7 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size) queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); /* Hard sector size and max sectors impersonate the equiv. hardware. */ - blk_queue_hardsect_size(rq, sector_size); + blk_queue_logical_block_size(rq, sector_size); blk_queue_max_sectors(rq, 512); /* Each segment in a request is up to an aligned page in size. */ diff --git a/drivers/block/xsysace.c b/drivers/block/xsysace.c index 3a4397edab7..f08491a3a81 100644 --- a/drivers/block/xsysace.c +++ b/drivers/block/xsysace.c @@ -984,7 +984,7 @@ static int __devinit ace_setup(struct ace_device *ace) ace->queue = blk_init_queue(ace_request, &ace->lock); if (ace->queue == NULL) goto err_blk_initq; - blk_queue_hardsect_size(ace->queue, 512); + blk_queue_logical_block_size(ace->queue, 512); /* * Allocate and initialize GD structure diff --git a/drivers/cdrom/gdrom.c b/drivers/cdrom/gdrom.c index 1e366ad8f68..b5621f27c4b 100644 --- a/drivers/cdrom/gdrom.c +++ b/drivers/cdrom/gdrom.c @@ -739,7 +739,7 @@ static void __devinit probe_gdrom_setupdisk(void) static int __devinit probe_gdrom_setupqueue(void) { - blk_queue_hardsect_size(gd.gdrom_rq, GDROM_HARD_SECTOR); + blk_queue_logical_block_size(gd.gdrom_rq, GDROM_HARD_SECTOR); /* using DMA so memory will need to be contiguous */ blk_queue_max_hw_segments(gd.gdrom_rq, 1); /* set a large max size to get most from DMA */ diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index f177c2d4017..0fff646cc2f 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -469,8 +469,8 @@ static void vio_handle_cd_event(struct HvLpEvent *event) case viocdopen: if (event->xRc == 0) { di = &viocd_diskinfo[bevent->disk]; - blk_queue_hardsect_size(di->viocd_disk->queue, - bevent->block_size); + blk_queue_logical_block_size(di->viocd_disk->queue, + bevent->block_size); set_capacity(di->viocd_disk, bevent->media_size * bevent->block_size / 512); diff --git a/drivers/char/raw.c b/drivers/char/raw.c index 20d90e6a6e5..db32f0e4c7d 100644 --- a/drivers/char/raw.c +++ b/drivers/char/raw.c @@ -71,7 +71,7 @@ static int raw_open(struct inode *inode, struct file *filp) err = bd_claim(bdev, raw_open); if (err) goto out1; - err = set_blocksize(bdev, bdev_hardsect_size(bdev)); + err = set_blocksize(bdev, bdev_logical_block_size(bdev)); if (err) goto out2; filp->f_flags |= O_DIRECT; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 1799328decf..424140c6c40 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -182,7 +182,7 @@ static void cdrom_analyze_sense_data(ide_drive_t *drive, (sense->information[2] << 8) | (sense->information[3]); - if (drive->queue->hardsect_size == 2048) + if (queue_logical_block_size(drive->queue) == 2048) /* device sector size is 2K */ sector <<= 2; @@ -737,7 +737,7 @@ static ide_startstop_t cdrom_start_rw(ide_drive_t *drive, struct request *rq) struct request_queue *q = drive->queue; int write = rq_data_dir(rq) == WRITE; unsigned short sectors_per_frame = - queue_hardsect_size(q) >> SECTOR_BITS; + queue_logical_block_size(q) >> SECTOR_BITS; ide_debug_log(IDE_DBG_RQ, "rq->cmd[0]: 0x%x, rq->cmd_flags: 0x%x, " "secs_per_frame: %u", @@ -1021,8 +1021,8 @@ int ide_cd_read_toc(ide_drive_t *drive, struct request_sense *sense) /* save a private copy of the TOC capacity for error handling */ drive->probed_capacity = toc->capacity * sectors_per_frame; - blk_queue_hardsect_size(drive->queue, - sectors_per_frame << SECTOR_BITS); + blk_queue_logical_block_size(drive->queue, + sectors_per_frame << SECTOR_BITS); /* first read just the header, so we know how long the TOC is */ stat = cdrom_read_tocentry(drive, 0, 1, 0, (char *) &toc->hdr, @@ -1338,7 +1338,7 @@ static int ide_cdrom_probe_capabilities(ide_drive_t *drive) /* standard prep_rq_fn that builds 10 byte cmds */ static int ide_cdrom_prep_fs(struct request_queue *q, struct request *rq) { - int hard_sect = queue_hardsect_size(q); + int hard_sect = queue_logical_block_size(q); long block = (long)blk_rq_pos(rq) / (hard_sect >> 9); unsigned long blocks = blk_rq_sectors(rq) / (hard_sect >> 9); @@ -1543,7 +1543,7 @@ static int ide_cdrom_setup(ide_drive_t *drive) nslots = ide_cdrom_probe_capabilities(drive); - blk_queue_hardsect_size(q, CD_FRAMESIZE); + blk_queue_logical_block_size(q, CD_FRAMESIZE); if (ide_cdrom_register(drive, nslots)) { printk(KERN_ERR PFX "%s: %s failed to register device with the" diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 47c68bc75a1..06b0ded1ce2 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -232,7 +232,7 @@ static struct page *read_sb_page(mddev_t *mddev, long offset, target = rdev->sb_start + offset + index * (PAGE_SIZE/512); if (sync_page_io(rdev->bdev, target, - roundup(size, bdev_hardsect_size(rdev->bdev)), + roundup(size, bdev_logical_block_size(rdev->bdev)), page, READ)) { page->index = index; attach_page_buffers(page, NULL); /* so that free_buffer will @@ -287,7 +287,7 @@ static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait) int size = PAGE_SIZE; if (page->index == bitmap->file_pages-1) size = roundup(bitmap->last_page_size, - bdev_hardsect_size(rdev->bdev)); + bdev_logical_block_size(rdev->bdev)); /* Just make sure we aren't corrupting data or * metadata */ diff --git a/drivers/md/dm-exception-store.c b/drivers/md/dm-exception-store.c index a2e26c24214..75d8081a904 100644 --- a/drivers/md/dm-exception-store.c +++ b/drivers/md/dm-exception-store.c @@ -178,7 +178,7 @@ static int set_chunk_size(struct dm_exception_store *store, } /* Validate the chunk size against the device block size */ - if (chunk_size_ulong % (bdev_hardsect_size(store->cow->bdev) >> 9)) { + if (chunk_size_ulong % (bdev_logical_block_size(store->cow->bdev) >> 9)) { *error = "Chunk size is not a multiple of device blocksize"; return -EINVAL; } diff --git a/drivers/md/dm-log.c b/drivers/md/dm-log.c index be233bc4d91..6fa8ccf91c7 100644 --- a/drivers/md/dm-log.c +++ b/drivers/md/dm-log.c @@ -413,7 +413,8 @@ static int create_log_context(struct dm_dirty_log *log, struct dm_target *ti, * Buffer holds both header and bitset. */ buf_size = dm_round_up((LOG_OFFSET << SECTOR_SHIFT) + - bitset_size, ti->limits.hardsect_size); + bitset_size, + ti->limits.logical_block_size); if (buf_size > dev->bdev->bd_inode->i_size) { DMWARN("log device %s too small: need %llu bytes", diff --git a/drivers/md/dm-snap-persistent.c b/drivers/md/dm-snap-persistent.c index e75c6dd76a9..2662a41337e 100644 --- a/drivers/md/dm-snap-persistent.c +++ b/drivers/md/dm-snap-persistent.c @@ -282,7 +282,7 @@ static int read_header(struct pstore *ps, int *new_snapshot) */ if (!ps->store->chunk_size) { ps->store->chunk_size = max(DM_CHUNK_SIZE_DEFAULT_SECTORS, - bdev_hardsect_size(ps->store->cow->bdev) >> 9); + bdev_logical_block_size(ps->store->cow->bdev) >> 9); ps->store->chunk_mask = ps->store->chunk_size - 1; ps->store->chunk_shift = ffs(ps->store->chunk_size) - 1; chunk_size_supplied = 0; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 429b50b975d..65e2d975985 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -108,7 +108,8 @@ static void combine_restrictions_low(struct io_restrictions *lhs, lhs->max_hw_segments = min_not_zero(lhs->max_hw_segments, rhs->max_hw_segments); - lhs->hardsect_size = max(lhs->hardsect_size, rhs->hardsect_size); + lhs->logical_block_size = max(lhs->logical_block_size, + rhs->logical_block_size); lhs->max_segment_size = min_not_zero(lhs->max_segment_size, rhs->max_segment_size); @@ -529,7 +530,8 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_hw_segments = min_not_zero(rs->max_hw_segments, q->max_hw_segments); - rs->hardsect_size = max(rs->hardsect_size, q->hardsect_size); + rs->logical_block_size = max(rs->logical_block_size, + queue_logical_block_size(q)); rs->max_segment_size = min_not_zero(rs->max_segment_size, q->max_segment_size); @@ -683,8 +685,8 @@ static void check_for_valid_limits(struct io_restrictions *rs) rs->max_phys_segments = MAX_PHYS_SEGMENTS; if (!rs->max_hw_segments) rs->max_hw_segments = MAX_HW_SEGMENTS; - if (!rs->hardsect_size) - rs->hardsect_size = 1 << SECTOR_SHIFT; + if (!rs->logical_block_size) + rs->logical_block_size = 1 << SECTOR_SHIFT; if (!rs->max_segment_size) rs->max_segment_size = MAX_SEGMENT_SIZE; if (!rs->seg_boundary_mask) @@ -914,7 +916,7 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) blk_queue_max_sectors(q, t->limits.max_sectors); q->max_phys_segments = t->limits.max_phys_segments; q->max_hw_segments = t->limits.max_hw_segments; - q->hardsect_size = t->limits.hardsect_size; + q->logical_block_size = t->limits.logical_block_size; q->max_segment_size = t->limits.max_segment_size; q->max_hw_sectors = t->limits.max_hw_sectors; q->seg_boundary_mask = t->limits.seg_boundary_mask; diff --git a/drivers/md/md.c b/drivers/md/md.c index fccc8343a25..4cbc19f5c30 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1202,7 +1202,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version) atomic_set(&rdev->corrected_errors, le32_to_cpu(sb->cnt_corrected_read)); rdev->sb_size = le32_to_cpu(sb->max_dev) * 2 + 256; - bmask = queue_hardsect_size(rdev->bdev->bd_disk->queue)-1; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; if (rdev->sb_size & bmask) rdev->sb_size = (rdev->sb_size | bmask) + 1; diff --git a/drivers/memstick/core/mspro_block.c b/drivers/memstick/core/mspro_block.c index c0bebc6a2f2..7847bbc1440 100644 --- a/drivers/memstick/core/mspro_block.c +++ b/drivers/memstick/core/mspro_block.c @@ -1242,7 +1242,7 @@ static int mspro_block_init_disk(struct memstick_dev *card) sprintf(msb->disk->disk_name, "mspblk%d", disk_id); - blk_queue_hardsect_size(msb->queue, msb->page_size); + blk_queue_logical_block_size(msb->queue, msb->page_size); capacity = be16_to_cpu(sys_info->user_block_count); capacity *= be16_to_cpu(sys_info->block_size); diff --git a/drivers/message/i2o/i2o_block.c b/drivers/message/i2o/i2o_block.c index 6573ef4408f..335d4c78a77 100644 --- a/drivers/message/i2o/i2o_block.c +++ b/drivers/message/i2o/i2o_block.c @@ -794,8 +794,9 @@ static int i2o_block_transfer(struct request *req) if (c->adaptec) { u8 cmd[10]; u32 scsi_flags; - u16 hwsec = queue_hardsect_size(req->q) >> KERNEL_SECTOR_SHIFT; + u16 hwsec; + hwsec = queue_logical_block_size(req->q) >> KERNEL_SECTOR_SHIFT; memset(cmd, 0, 10); sgl_offset = SGL_OFFSET_12; @@ -1078,7 +1079,7 @@ static int i2o_block_probe(struct device *dev) */ if (!i2o_parm_field_get(i2o_dev, 0x0004, 1, &blocksize, 4) || !i2o_parm_field_get(i2o_dev, 0x0000, 3, &blocksize, 4)) { - blk_queue_hardsect_size(queue, le32_to_cpu(blocksize)); + blk_queue_logical_block_size(queue, le32_to_cpu(blocksize)); } else osm_warn("unable to get blocksize of %s\n", gd->disk_name); diff --git a/drivers/mmc/card/block.c b/drivers/mmc/card/block.c index c5df8654645..98ffc41eaf2 100644 --- a/drivers/mmc/card/block.c +++ b/drivers/mmc/card/block.c @@ -521,7 +521,7 @@ static struct mmc_blk_data *mmc_blk_alloc(struct mmc_card *card) sprintf(md->disk->disk_name, "mmcblk%d", devidx); - blk_queue_hardsect_size(md->queue.queue, 512); + blk_queue_logical_block_size(md->queue.queue, 512); if (!mmc_card_sd(card) && mmc_card_blockaddr(card)) { /* diff --git a/drivers/mtd/mtd_blkdevs.c b/drivers/mtd/mtd_blkdevs.c index 502622f628b..aaac3b6800b 100644 --- a/drivers/mtd/mtd_blkdevs.c +++ b/drivers/mtd/mtd_blkdevs.c @@ -378,7 +378,7 @@ int register_mtd_blktrans(struct mtd_blktrans_ops *tr) } tr->blkcore_priv->rq->queuedata = tr; - blk_queue_hardsect_size(tr->blkcore_priv->rq, tr->blksize); + blk_queue_logical_block_size(tr->blkcore_priv->rq, tr->blksize); if (tr->discard) blk_queue_set_discard(tr->blkcore_priv->rq, blktrans_discard_request); diff --git a/drivers/s390/block/dasd.c b/drivers/s390/block/dasd.c index e64f62d5e0f..27a1be0cd4d 100644 --- a/drivers/s390/block/dasd.c +++ b/drivers/s390/block/dasd.c @@ -1990,7 +1990,7 @@ static void dasd_setup_queue(struct dasd_block *block) { int max; - blk_queue_hardsect_size(block->request_queue, block->bp_block); + blk_queue_logical_block_size(block->request_queue, block->bp_block); max = block->base->discipline->max_blocks << block->s2b_shift; blk_queue_max_sectors(block->request_queue, max); blk_queue_max_phys_segments(block->request_queue, -1L); diff --git a/drivers/s390/block/dcssblk.c b/drivers/s390/block/dcssblk.c index cfdcf1aed33..a4c7ffcd998 100644 --- a/drivers/s390/block/dcssblk.c +++ b/drivers/s390/block/dcssblk.c @@ -602,7 +602,7 @@ dcssblk_add_store(struct device *dev, struct device_attribute *attr, const char dev_info->gd->private_data = dev_info; dev_info->gd->driverfs_dev = &dev_info->dev; blk_queue_make_request(dev_info->dcssblk_queue, dcssblk_make_request); - blk_queue_hardsect_size(dev_info->dcssblk_queue, 4096); + blk_queue_logical_block_size(dev_info->dcssblk_queue, 4096); seg_byte_size = (dev_info->end - dev_info->start + 1); set_capacity(dev_info->gd, seg_byte_size >> 9); // size in sectors diff --git a/drivers/s390/block/xpram.c b/drivers/s390/block/xpram.c index 76814f3e898..0ae0c83ef87 100644 --- a/drivers/s390/block/xpram.c +++ b/drivers/s390/block/xpram.c @@ -343,7 +343,7 @@ static int __init xpram_setup_blkdev(void) goto out; } blk_queue_make_request(xpram_queues[i], xpram_make_request); - blk_queue_hardsect_size(xpram_queues[i], 4096); + blk_queue_logical_block_size(xpram_queues[i], 4096); } /* diff --git a/drivers/s390/char/tape_block.c b/drivers/s390/char/tape_block.c index 1e796767598..47ff695255e 100644 --- a/drivers/s390/char/tape_block.c +++ b/drivers/s390/char/tape_block.c @@ -222,7 +222,7 @@ tapeblock_setup_device(struct tape_device * device) if (rc) goto cleanup_queue; - blk_queue_hardsect_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE); + blk_queue_logical_block_size(blkdat->request_queue, TAPEBLOCK_HSEC_SIZE); blk_queue_max_sectors(blkdat->request_queue, TAPEBLOCK_MAX_SEC); blk_queue_max_phys_segments(blkdat->request_queue, -1L); blk_queue_max_hw_segments(blkdat->request_queue, -1L); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 40d2860f235..bcf3bd40bbd 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1510,7 +1510,7 @@ got_data: */ sector_size = 512; } - blk_queue_hardsect_size(sdp->request_queue, sector_size); + blk_queue_logical_block_size(sdp->request_queue, sector_size); { char cap_str_2[10], cap_str_10[10]; diff --git a/drivers/scsi/sr.c b/drivers/scsi/sr.c index fddba53c7fe..cd350dfc121 100644 --- a/drivers/scsi/sr.c +++ b/drivers/scsi/sr.c @@ -727,7 +727,7 @@ static void get_sectorsize(struct scsi_cd *cd) } queue = cd->device->request_queue; - blk_queue_hardsect_size(queue, sector_size); + blk_queue_logical_block_size(queue, sector_size); return; } diff --git a/fs/bio.c b/fs/bio.c index 81dc93e7253..4445c382173 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1490,11 +1490,12 @@ struct bio_pair *bio_split(struct bio *bi, int first_sectors) sector_t bio_sector_offset(struct bio *bio, unsigned short index, unsigned int offset) { - unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + unsigned int sector_sz; struct bio_vec *bv; sector_t sectors; int i; + sector_sz = queue_logical_block_size(bio->bi_bdev->bd_disk->queue); sectors = 0; if (index >= bio->bi_idx) diff --git a/fs/block_dev.c b/fs/block_dev.c index a85fe310fc6..a29b4dcc1bc 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -76,7 +76,7 @@ int set_blocksize(struct block_device *bdev, int size) return -EINVAL; /* Size cannot be smaller than the size supported by the device */ - if (size < bdev_hardsect_size(bdev)) + if (size < bdev_logical_block_size(bdev)) return -EINVAL; /* Don't change the size if it is same as current */ @@ -106,7 +106,7 @@ EXPORT_SYMBOL(sb_set_blocksize); int sb_min_blocksize(struct super_block *sb, int size) { - int minsize = bdev_hardsect_size(sb->s_bdev); + int minsize = bdev_logical_block_size(sb->s_bdev); if (size < minsize) size = minsize; return sb_set_blocksize(sb, size); @@ -1117,7 +1117,7 @@ EXPORT_SYMBOL(check_disk_change); void bd_set_size(struct block_device *bdev, loff_t size) { - unsigned bsize = bdev_hardsect_size(bdev); + unsigned bsize = bdev_logical_block_size(bdev); bdev->bd_inode->i_size = size; while (bsize < PAGE_CACHE_SIZE) { diff --git a/fs/buffer.c b/fs/buffer.c index aed297739eb..36e2bbc60ec 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1085,12 +1085,12 @@ static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, int size) { /* Size must be multiple of hard sectorsize */ - if (unlikely(size & (bdev_hardsect_size(bdev)-1) || + if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { printk(KERN_ERR "getblk(): invalid block size %d requested\n", size); - printk(KERN_ERR "hardsect size: %d\n", - bdev_hardsect_size(bdev)); + printk(KERN_ERR "logical block size: %d\n", + bdev_logical_block_size(bdev)); dump_stack(); return NULL; diff --git a/fs/direct-io.c b/fs/direct-io.c index 05763bbc205..8b10b87dc01 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1127,7 +1127,7 @@ __blockdev_direct_IO(int rw, struct kiocb *iocb, struct inode *inode, rw = WRITE_ODIRECT; if (bdev) - bdev_blkbits = blksize_bits(bdev_hardsect_size(bdev)); + bdev_blkbits = blksize_bits(bdev_logical_block_size(bdev)); if (offset & blocksize_mask) { if (bdev) diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 599dbfe504c..acbb94fdf90 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -1696,7 +1696,7 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount; } - hblock = bdev_hardsect_size(sb->s_bdev); + hblock = bdev_logical_block_size(sb->s_bdev); if (sb->s_blocksize != blocksize) { /* * Make sure the blocksize for the filesystem is larger @@ -2119,7 +2119,7 @@ static journal_t *ext3_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT3-fs: blocksize too small for journal device.\n"); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 2958f4e6f22..a30549f7a30 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2962,7 +2962,7 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, } blocksize = sb->s_blocksize; - hblock = bdev_hardsect_size(bdev); + hblock = bdev_logical_block_size(bdev); if (blocksize < hblock) { printk(KERN_ERR "EXT4-fs: blocksize too small for journal device.\n"); diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index 1ff9473ea75..a3b2ac989fc 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -526,11 +526,11 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) } /* Set up the buffer cache and SB for real */ - if (sdp->sd_sb.sb_bsize < bdev_hardsect_size(sb->s_bdev)) { + if (sdp->sd_sb.sb_bsize < bdev_logical_block_size(sb->s_bdev)) { ret = -EINVAL; fs_err(sdp, "FS block size (%u) is too small for device " "block size (%u)\n", - sdp->sd_sb.sb_bsize, bdev_hardsect_size(sb->s_bdev)); + sdp->sd_sb.sb_bsize, bdev_logical_block_size(sb->s_bdev)); goto out; } if (sdp->sd_sb.sb_bsize > PAGE_SIZE) { diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index 565038243fa..a971d24e10c 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -845,7 +845,7 @@ static void gfs2_rgrp_send_discards(struct gfs2_sbd *sdp, u64 offset, struct super_block *sb = sdp->sd_vfs; struct block_device *bdev = sb->s_bdev; const unsigned int sects_per_blk = sdp->sd_sb.sb_bsize / - bdev_hardsect_size(sb->s_bdev); + bdev_logical_block_size(sb->s_bdev); u64 blk; sector_t start = 0; sector_t nr_sects = 0; diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index 7f65b3be4aa..a91f15b8673 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -515,7 +515,7 @@ int init_nilfs(struct the_nilfs *nilfs, struct nilfs_sb_info *sbi, char *data) blocksize = BLOCK_SIZE << le32_to_cpu(sbp->s_log_block_size); if (sb->s_blocksize != blocksize) { - int hw_blocksize = bdev_hardsect_size(sb->s_bdev); + int hw_blocksize = bdev_logical_block_size(sb->s_bdev); if (blocksize < hw_blocksize) { printk(KERN_ERR diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index f76951dcd4a..6aa7c471353 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -25,7 +25,7 @@ #include #include #include -#include /* For bdev_hardsect_size(). */ +#include /* For bdev_logical_block_size(). */ #include #include #include @@ -2785,13 +2785,13 @@ static int ntfs_fill_super(struct super_block *sb, void *opt, const int silent) goto err_out_now; /* We support sector sizes up to the PAGE_CACHE_SIZE. */ - if (bdev_hardsect_size(sb->s_bdev) > PAGE_CACHE_SIZE) { + if (bdev_logical_block_size(sb->s_bdev) > PAGE_CACHE_SIZE) { if (!silent) ntfs_error(sb, "Device has unsupported sector size " "(%i). The maximum supported sector " "size on this architecture is %lu " "bytes.", - bdev_hardsect_size(sb->s_bdev), + bdev_logical_block_size(sb->s_bdev), PAGE_CACHE_SIZE); goto err_out_now; } diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 4f85eceab37..09cc25d0461 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -1371,7 +1371,7 @@ static ssize_t o2hb_region_dev_write(struct o2hb_region *reg, bdevname(reg->hr_bdev, reg->hr_dev_name); - sectsize = bdev_hardsect_size(reg->hr_bdev); + sectsize = bdev_logical_block_size(reg->hr_bdev); if (sectsize != reg->hr_block_bytes) { mlog(ML_ERROR, "blocksize %u incorrect for device, expected %d", diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 79ff8d9d37e..5c6163f5503 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -713,7 +713,7 @@ static int ocfs2_sb_probe(struct super_block *sb, *bh = NULL; /* may be > 512 */ - *sector_size = bdev_hardsect_size(sb->s_bdev); + *sector_size = bdev_logical_block_size(sb->s_bdev); if (*sector_size > OCFS2_MAX_BLOCKSIZE) { mlog(ML_ERROR, "Hardware sector size too large: %d (max=%d)\n", *sector_size, OCFS2_MAX_BLOCKSIZE); diff --git a/fs/partitions/ibm.c b/fs/partitions/ibm.c index 46297683cd3..fc71aab0846 100644 --- a/fs/partitions/ibm.c +++ b/fs/partitions/ibm.c @@ -76,7 +76,7 @@ ibm_partition(struct parsed_partitions *state, struct block_device *bdev) Sector sect; res = 0; - blocksize = bdev_hardsect_size(bdev); + blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_exit; i_size = i_size_read(bdev->bd_inode); diff --git a/fs/partitions/msdos.c b/fs/partitions/msdos.c index 796511886f2..0028d2ef066 100644 --- a/fs/partitions/msdos.c +++ b/fs/partitions/msdos.c @@ -110,7 +110,7 @@ parse_extended(struct parsed_partitions *state, struct block_device *bdev, Sector sect; unsigned char *data; u32 this_sector, this_size; - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; int loopct = 0; /* number of links followed without finding a data partition */ int i; @@ -415,7 +415,7 @@ static struct { int msdos_partition(struct parsed_partitions *state, struct block_device *bdev) { - int sector_size = bdev_hardsect_size(bdev) / 512; + int sector_size = bdev_logical_block_size(bdev) / 512; Sector sect; unsigned char *data; struct partition *p; diff --git a/fs/udf/super.c b/fs/udf/super.c index 72348cc855a..0ba44107d8f 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -1915,7 +1915,7 @@ static int udf_fill_super(struct super_block *sb, void *options, int silent) if (uopt.flags & (1 << UDF_FLAG_BLOCKSIZE_SET)) { ret = udf_load_vrs(sb, &uopt, silent, &fileset); } else { - uopt.blocksize = bdev_hardsect_size(sb->s_bdev); + uopt.blocksize = bdev_logical_block_size(sb->s_bdev); ret = udf_load_vrs(sb, &uopt, silent, &fileset); if (!ret && uopt.blocksize != UDF_DEFAULT_BLOCKSIZE) { if (!silent) diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index e28800a9f2b..1418b916fc2 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -1501,7 +1501,7 @@ xfs_setsize_buftarg_early( struct block_device *bdev) { return xfs_setsize_buftarg_flags(btp, - PAGE_CACHE_SIZE, bdev_hardsect_size(bdev), 0); + PAGE_CACHE_SIZE, bdev_logical_block_size(bdev), 0); } int diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 56ce53fce72..872b78b7a10 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -391,7 +391,7 @@ struct request_queue unsigned int max_hw_sectors; unsigned short max_phys_segments; unsigned short max_hw_segments; - unsigned short hardsect_size; + unsigned short logical_block_size; unsigned int max_segment_size; unsigned long seg_boundary_mask; @@ -901,7 +901,7 @@ extern void blk_queue_max_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); -extern void blk_queue_hardsect_size(struct request_queue *, unsigned short); +extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); @@ -988,19 +988,19 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) -static inline int queue_hardsect_size(struct request_queue *q) +static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; - if (q && q->hardsect_size) - retval = q->hardsect_size; + if (q && q->logical_block_size) + retval = q->logical_block_size; return retval; } -static inline int bdev_hardsect_size(struct block_device *bdev) +static inline unsigned short bdev_logical_block_size(struct block_device *bdev) { - return queue_hardsect_size(bdev_get_queue(bdev)); + return queue_logical_block_size(bdev_get_queue(bdev)); } static inline int queue_dma_alignment(struct request_queue *q) diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h index ded2d7c4266..49c2362977f 100644 --- a/include/linux/device-mapper.h +++ b/include/linux/device-mapper.h @@ -149,7 +149,7 @@ struct io_restrictions { unsigned max_hw_sectors; unsigned max_sectors; unsigned max_segment_size; - unsigned short hardsect_size; + unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; unsigned char no_cluster; /* inverted so that 0 is default */ From ae03bf639a5027d27270123f5f6e3ee6a412781d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:50 -0400 Subject: [PATCH 1188/2061] block: Use accessor functions for queue limits Convert all external users of queue limits to using wrapper functions instead of poking the request queue variables directly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-barrier.c | 8 ++++---- block/blk-core.c | 16 +++++++-------- block/blk-map.c | 4 ++-- block/blk-merge.c | 27 +++++++++++++------------ block/blk-settings.c | 15 +++++++++++--- block/blk-sysfs.c | 8 ++++---- block/compat_ioctl.c | 2 +- block/ioctl.c | 10 +++++----- block/scsi_ioctl.c | 8 ++++---- drivers/block/pktcdvd.c | 6 ++++-- drivers/cdrom/cdrom.c | 4 ++-- drivers/md/dm-table.c | 28 +++++++++++++------------- drivers/md/linear.c | 2 +- drivers/md/multipath.c | 4 ++-- drivers/md/raid0.c | 2 +- drivers/md/raid1.c | 4 ++-- drivers/md/raid10.c | 8 ++++---- drivers/md/raid5.c | 4 ++-- drivers/scsi/sg.c | 15 +++++++------- drivers/scsi/st.c | 4 ++-- drivers/usb/storage/scsiglue.c | 4 ++-- fs/bio.c | 19 +++++++++--------- include/linux/bio.h | 2 +- include/linux/blkdev.h | 36 ++++++++++++++++++++++++++++++++++ mm/bounce.c | 4 ++-- 25 files changed, 147 insertions(+), 97 deletions(-) diff --git a/block/blk-barrier.c b/block/blk-barrier.c index 0d98054cdbd..30022b4e2f6 100644 --- a/block/blk-barrier.c +++ b/block/blk-barrier.c @@ -388,10 +388,10 @@ int blkdev_issue_discard(struct block_device *bdev, bio->bi_sector = sector; - if (nr_sects > q->max_hw_sectors) { - bio->bi_size = q->max_hw_sectors << 9; - nr_sects -= q->max_hw_sectors; - sector += q->max_hw_sectors; + if (nr_sects > queue_max_hw_sectors(q)) { + bio->bi_size = queue_max_hw_sectors(q) << 9; + nr_sects -= queue_max_hw_sectors(q); + sector += queue_max_hw_sectors(q); } else { bio->bi_size = nr_sects << 9; nr_sects = 0; diff --git a/block/blk-core.c b/block/blk-core.c index 59c4af52311..7a4c40184a6 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -1437,11 +1437,11 @@ static inline void __generic_make_request(struct bio *bio) goto end_io; } - if (unlikely(nr_sectors > q->max_hw_sectors)) { + if (unlikely(nr_sectors > queue_max_hw_sectors(q))) { printk(KERN_ERR "bio too big device %s (%u > %u)\n", - bdevname(bio->bi_bdev, b), - bio_sectors(bio), - q->max_hw_sectors); + bdevname(bio->bi_bdev, b), + bio_sectors(bio), + queue_max_hw_sectors(q)); goto end_io; } @@ -1608,8 +1608,8 @@ EXPORT_SYMBOL(submit_bio); */ int blk_rq_check_limits(struct request_queue *q, struct request *rq) { - if (blk_rq_sectors(rq) > q->max_sectors || - blk_rq_bytes(rq) > q->max_hw_sectors << 9) { + if (blk_rq_sectors(rq) > queue_max_sectors(q) || + blk_rq_bytes(rq) > queue_max_hw_sectors(q) << 9) { printk(KERN_ERR "%s: over max size limit.\n", __func__); return -EIO; } @@ -1621,8 +1621,8 @@ int blk_rq_check_limits(struct request_queue *q, struct request *rq) * limitation. */ blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > q->max_phys_segments || - rq->nr_phys_segments > q->max_hw_segments) { + if (rq->nr_phys_segments > queue_max_phys_segments(q) || + rq->nr_phys_segments > queue_max_hw_segments(q)) { printk(KERN_ERR "%s: over max segments limit.\n", __func__); return -EIO; } diff --git a/block/blk-map.c b/block/blk-map.c index ef2492adca7..9083cf0180c 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -115,7 +115,7 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct bio *bio = NULL; int ret; - if (len > (q->max_hw_sectors << 9)) + if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len) return -EINVAL; @@ -292,7 +292,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, struct bio *bio; int ret; - if (len > (q->max_hw_sectors << 9)) + if (len > (queue_max_hw_sectors(q) << 9)) return -EINVAL; if (!len || !kbuf) return -EINVAL; diff --git a/block/blk-merge.c b/block/blk-merge.c index 4974dd5767e..39ce64432ba 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -32,11 +32,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q, * never considered part of another segment, since that * might change with the bounce page. */ - high = page_to_pfn(bv->bv_page) > q->bounce_pfn; + high = page_to_pfn(bv->bv_page) > queue_bounce_pfn(q); if (high || highprv) goto new_segment; if (cluster) { - if (seg_size + bv->bv_len > q->max_segment_size) + if (seg_size + bv->bv_len + > queue_max_segment_size(q)) goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprv, bv)) goto new_segment; @@ -91,7 +92,7 @@ static int blk_phys_contig_segment(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_seg_back_size + nxt->bi_seg_front_size > - q->max_segment_size) + queue_max_segment_size(q)) return 0; if (!bio_has_data(bio)) @@ -134,7 +135,7 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq, int nbytes = bvec->bv_len; if (bvprv && cluster) { - if (sg->length + nbytes > q->max_segment_size) + if (sg->length + nbytes > queue_max_segment_size(q)) goto new_segment; if (!BIOVEC_PHYS_MERGEABLE(bvprv, bvec)) @@ -205,8 +206,8 @@ static inline int ll_new_hw_segment(struct request_queue *q, { int nr_phys_segs = bio_phys_segments(q, bio); - if (req->nr_phys_segments + nr_phys_segs > q->max_hw_segments - || req->nr_phys_segments + nr_phys_segs > q->max_phys_segments) { + if (req->nr_phys_segments + nr_phys_segs > queue_max_hw_segments(q) || + req->nr_phys_segments + nr_phys_segs > queue_max_phys_segments(q)) { req->cmd_flags |= REQ_NOMERGE; if (req == q->last_merge) q->last_merge = NULL; @@ -227,9 +228,9 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req, unsigned short max_sectors; if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; + max_sectors = queue_max_hw_sectors(q); else - max_sectors = q->max_sectors; + max_sectors = queue_max_sectors(q); if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { req->cmd_flags |= REQ_NOMERGE; @@ -251,9 +252,9 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req, unsigned short max_sectors; if (unlikely(blk_pc_request(req))) - max_sectors = q->max_hw_sectors; + max_sectors = queue_max_hw_sectors(q); else - max_sectors = q->max_sectors; + max_sectors = queue_max_sectors(q); if (blk_rq_sectors(req) + bio_sectors(bio) > max_sectors) { @@ -287,7 +288,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, /* * Will it become too large? */ - if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > q->max_sectors) + if ((blk_rq_sectors(req) + blk_rq_sectors(next)) > queue_max_sectors(q)) return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; @@ -299,10 +300,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, total_phys_segments--; } - if (total_phys_segments > q->max_phys_segments) + if (total_phys_segments > queue_max_phys_segments(q)) return 0; - if (total_phys_segments > q->max_hw_segments) + if (total_phys_segments > queue_max_hw_segments(q)) return 0; /* Merge is OK... */ diff --git a/block/blk-settings.c b/block/blk-settings.c index 15c3164537b..0b32f984eed 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -219,6 +219,15 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) } EXPORT_SYMBOL(blk_queue_max_sectors); +void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) +{ + if (BLK_DEF_MAX_SECTORS > max_sectors) + q->max_hw_sectors = BLK_DEF_MAX_SECTORS; + else + q->max_hw_sectors = max_sectors; +} +EXPORT_SYMBOL(blk_queue_max_hw_sectors); + /** * blk_queue_max_phys_segments - set max phys segments for a request for this queue * @q: the request queue for the device @@ -395,11 +404,11 @@ int blk_queue_dma_drain(struct request_queue *q, dma_drain_needed_fn *dma_drain_needed, void *buf, unsigned int size) { - if (q->max_hw_segments < 2 || q->max_phys_segments < 2) + if (queue_max_hw_segments(q) < 2 || queue_max_phys_segments(q) < 2) return -EINVAL; /* make room for appending the drain */ - --q->max_hw_segments; - --q->max_phys_segments; + blk_queue_max_hw_segments(q, queue_max_hw_segments(q) - 1); + blk_queue_max_phys_segments(q, queue_max_phys_segments(q) - 1); q->dma_drain_needed = dma_drain_needed; q->dma_drain_buffer = buf; q->dma_drain_size = size; diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 13d38b7e4d0..142a4acddd4 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -95,7 +95,7 @@ queue_ra_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { - int max_sectors_kb = q->max_sectors >> 1; + int max_sectors_kb = queue_max_sectors(q) >> 1; return queue_var_show(max_sectors_kb, (page)); } @@ -109,7 +109,7 @@ static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { unsigned long max_sectors_kb, - max_hw_sectors_kb = q->max_hw_sectors >> 1, + max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, page_kb = 1 << (PAGE_CACHE_SHIFT - 10); ssize_t ret = queue_var_store(&max_sectors_kb, page, count); @@ -117,7 +117,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) return -EINVAL; spin_lock_irq(q->queue_lock); - q->max_sectors = max_sectors_kb << 1; + blk_queue_max_sectors(q, max_sectors_kb << 1); spin_unlock_irq(q->queue_lock); return ret; @@ -125,7 +125,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { - int max_hw_sectors_kb = q->max_hw_sectors >> 1; + int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; return queue_var_show(max_hw_sectors_kb, (page)); } diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c index 9eaa1940273..df18a156d01 100644 --- a/block/compat_ioctl.c +++ b/block/compat_ioctl.c @@ -766,7 +766,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) return compat_put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: return compat_put_ushort(arg, - bdev_get_queue(bdev)->max_sectors); + queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: /* compatible, but no compat_ptr (!) */ case BLKFRASET: if (!capable(CAP_SYS_ADMIN)) diff --git a/block/ioctl.c b/block/ioctl.c index 7aa97f65da8..500e4c73cc5 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -152,10 +152,10 @@ static int blk_ioctl_discard(struct block_device *bdev, uint64_t start, bio->bi_private = &wait; bio->bi_sector = start; - if (len > q->max_hw_sectors) { - bio->bi_size = q->max_hw_sectors << 9; - len -= q->max_hw_sectors; - start += q->max_hw_sectors; + if (len > queue_max_hw_sectors(q)) { + bio->bi_size = queue_max_hw_sectors(q) << 9; + len -= queue_max_hw_sectors(q); + start += queue_max_hw_sectors(q); } else { bio->bi_size = len << 9; len = 0; @@ -313,7 +313,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKSSZGET: /* get block device hardware sector size */ return put_int(arg, bdev_logical_block_size(bdev)); case BLKSECTGET: - return put_ushort(arg, bdev_get_queue(bdev)->max_sectors); + return put_ushort(arg, queue_max_sectors(bdev_get_queue(bdev))); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c index a9670dd4b5d..5f8e798ede4 100644 --- a/block/scsi_ioctl.c +++ b/block/scsi_ioctl.c @@ -75,7 +75,7 @@ static int sg_set_timeout(struct request_queue *q, int __user *p) static int sg_get_reserved_size(struct request_queue *q, int __user *p) { - unsigned val = min(q->sg_reserved_size, q->max_sectors << 9); + unsigned val = min(q->sg_reserved_size, queue_max_sectors(q) << 9); return put_user(val, p); } @@ -89,8 +89,8 @@ static int sg_set_reserved_size(struct request_queue *q, int __user *p) if (size < 0) return -EINVAL; - if (size > (q->max_sectors << 9)) - size = q->max_sectors << 9; + if (size > (queue_max_sectors(q) << 9)) + size = queue_max_sectors(q) << 9; q->sg_reserved_size = size; return 0; @@ -264,7 +264,7 @@ static int sg_io(struct request_queue *q, struct gendisk *bd_disk, if (hdr->cmd_len > BLK_MAX_CDB) return -EINVAL; - if (hdr->dxfer_len > (q->max_hw_sectors << 9)) + if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) return -EIO; if (hdr->dxfer_len) diff --git a/drivers/block/pktcdvd.c b/drivers/block/pktcdvd.c index 293f5858921..d57f1175948 100644 --- a/drivers/block/pktcdvd.c +++ b/drivers/block/pktcdvd.c @@ -991,13 +991,15 @@ static void pkt_iosched_process_queue(struct pktcdvd_device *pd) */ static int pkt_set_segment_merging(struct pktcdvd_device *pd, struct request_queue *q) { - if ((pd->settings.size << 9) / CD_FRAMESIZE <= q->max_phys_segments) { + if ((pd->settings.size << 9) / CD_FRAMESIZE + <= queue_max_phys_segments(q)) { /* * The cdrom device can handle one segment/frame */ clear_bit(PACKET_MERGE_SEGS, &pd->flags); return 0; - } else if ((pd->settings.size << 9) / PAGE_SIZE <= q->max_phys_segments) { + } else if ((pd->settings.size << 9) / PAGE_SIZE + <= queue_max_phys_segments(q)) { /* * We can handle this case at the expense of some extra memory * copies during write operations diff --git a/drivers/cdrom/cdrom.c b/drivers/cdrom/cdrom.c index cceace61ef2..71d1b9bab70 100644 --- a/drivers/cdrom/cdrom.c +++ b/drivers/cdrom/cdrom.c @@ -2101,8 +2101,8 @@ static int cdrom_read_cdda_bpc(struct cdrom_device_info *cdi, __u8 __user *ubuf, nr = nframes; if (cdi->cdda_method == CDDA_BPC_SINGLE) nr = 1; - if (nr * CD_FRAMESIZE_RAW > (q->max_sectors << 9)) - nr = (q->max_sectors << 9) / CD_FRAMESIZE_RAW; + if (nr * CD_FRAMESIZE_RAW > (queue_max_sectors(q) << 9)) + nr = (queue_max_sectors(q) << 9) / CD_FRAMESIZE_RAW; len = nr * CD_FRAMESIZE_RAW; diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c index 65e2d975985..e9a73bb242b 100644 --- a/drivers/md/dm-table.c +++ b/drivers/md/dm-table.c @@ -510,7 +510,7 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) * combine_restrictions_low() */ rs->max_sectors = - min_not_zero(rs->max_sectors, q->max_sectors); + min_not_zero(rs->max_sectors, queue_max_sectors(q)); /* * Check if merge fn is supported. @@ -525,25 +525,25 @@ void dm_set_device_limits(struct dm_target *ti, struct block_device *bdev) rs->max_phys_segments = min_not_zero(rs->max_phys_segments, - q->max_phys_segments); + queue_max_phys_segments(q)); rs->max_hw_segments = - min_not_zero(rs->max_hw_segments, q->max_hw_segments); + min_not_zero(rs->max_hw_segments, queue_max_hw_segments(q)); rs->logical_block_size = max(rs->logical_block_size, queue_logical_block_size(q)); rs->max_segment_size = - min_not_zero(rs->max_segment_size, q->max_segment_size); + min_not_zero(rs->max_segment_size, queue_max_segment_size(q)); rs->max_hw_sectors = - min_not_zero(rs->max_hw_sectors, q->max_hw_sectors); + min_not_zero(rs->max_hw_sectors, queue_max_hw_sectors(q)); rs->seg_boundary_mask = min_not_zero(rs->seg_boundary_mask, - q->seg_boundary_mask); + queue_segment_boundary(q)); - rs->bounce_pfn = min_not_zero(rs->bounce_pfn, q->bounce_pfn); + rs->bounce_pfn = min_not_zero(rs->bounce_pfn, queue_bounce_pfn(q)); rs->no_cluster |= !test_bit(QUEUE_FLAG_CLUSTER, &q->queue_flags); } @@ -914,13 +914,13 @@ void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q) * restrictions. */ blk_queue_max_sectors(q, t->limits.max_sectors); - q->max_phys_segments = t->limits.max_phys_segments; - q->max_hw_segments = t->limits.max_hw_segments; - q->logical_block_size = t->limits.logical_block_size; - q->max_segment_size = t->limits.max_segment_size; - q->max_hw_sectors = t->limits.max_hw_sectors; - q->seg_boundary_mask = t->limits.seg_boundary_mask; - q->bounce_pfn = t->limits.bounce_pfn; + blk_queue_max_phys_segments(q, t->limits.max_phys_segments); + blk_queue_max_hw_segments(q, t->limits.max_hw_segments); + blk_queue_logical_block_size(q, t->limits.logical_block_size); + blk_queue_max_segment_size(q, t->limits.max_segment_size); + blk_queue_max_hw_sectors(q, t->limits.max_hw_sectors); + blk_queue_segment_boundary(q, t->limits.seg_boundary_mask); + blk_queue_bounce_limit(q, t->limits.bounce_pfn); if (t->limits.no_cluster) queue_flag_clear_unlocked(QUEUE_FLAG_CLUSTER, q); diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 7a36e38393a..64f1f3e046e 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -146,7 +146,7 @@ static linear_conf_t *linear_conf(mddev_t *mddev, int raid_disks) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->num_sectors = rdev->sectors; diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 41ced0cbe82..4ee31aa13c4 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -303,7 +303,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * merge_bvec_fn will be involved in multipath.) */ if (q->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(q) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); conf->working_disks++; @@ -467,7 +467,7 @@ static int multipath_run (mddev_t *mddev) * violating it, not that we ever expect a device with * a merge_bvec_fn to be involved in multipath */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!test_bit(Faulty, &rdev->flags)) diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index c08d7559be5..925507e7d67 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -144,7 +144,7 @@ static int create_strip_zones (mddev_t *mddev) */ if (rdev1->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); if (!smallest || (rdev1->sectors < smallest->sectors)) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 36df9109cde..e23758b4a34 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1130,7 +1130,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; @@ -1996,7 +1996,7 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 499620afb44..750550c1166 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1158,8 +1158,8 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); p->head_position = 0; rdev->raid_disk = mirror; @@ -2145,8 +2145,8 @@ static int run(mddev_t *mddev) * a one page request is never in violation. */ if (rdev->bdev->bd_disk->queue->merge_bvec_fn && - mddev->queue->max_sectors > (PAGE_SIZE>>9)) - mddev->queue->max_sectors = (PAGE_SIZE>>9); + queue_max_sectors(mddev->queue) > (PAGE_SIZE>>9)) + blk_queue_max_sectors(mddev->queue, PAGE_SIZE>>9); disk->head_position = 0; } diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 4616bc3a6e7..7970dc8c522 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3463,10 +3463,10 @@ static int bio_fits_rdev(struct bio *bi) { struct request_queue *q = bdev_get_queue(bi->bi_bdev); - if ((bi->bi_size>>9) > q->max_sectors) + if ((bi->bi_size>>9) > queue_max_sectors(q)) return 0; blk_recount_segments(q, bi); - if (bi->bi_phys_segments > q->max_phys_segments) + if (bi->bi_phys_segments > queue_max_phys_segments(q)) return 0; if (q->merge_bvec_fn) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0fc2c0ae769..9bd407fa98e 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -289,8 +289,8 @@ sg_open(struct inode *inode, struct file *filp) if (list_empty(&sdp->sfds)) { /* no existing opens on this device */ sdp->sgdebug = 0; q = sdp->device->request_queue; - sdp->sg_tablesize = min(q->max_hw_segments, - q->max_phys_segments); + sdp->sg_tablesize = min(queue_max_hw_segments(q), + queue_max_phys_segments(q)); } if ((sfp = sg_add_sfp(sdp, dev))) filp->private_data = sfp; @@ -909,7 +909,7 @@ sg_ioctl(struct inode *inode, struct file *filp, if (val < 0) return -EINVAL; val = min_t(int, val, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); if (val != sfp->reserve.bufflen) { if (sg_res_in_use(sfp) || sfp->mmap_called) return -EBUSY; @@ -919,7 +919,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return 0; case SG_GET_RESERVED_SIZE: val = min_t(int, sfp->reserve.bufflen, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); return put_user(val, ip); case SG_SET_COMMAND_Q: result = get_user(val, ip); @@ -1059,7 +1059,7 @@ sg_ioctl(struct inode *inode, struct file *filp, return -ENODEV; return scsi_ioctl(sdp->device, cmd_in, p); case BLKSECTGET: - return put_user(sdp->device->request_queue->max_sectors * 512, + return put_user(queue_max_sectors(sdp->device->request_queue) * 512, ip); case BLKTRACESETUP: return blk_trace_setup(sdp->device->request_queue, @@ -1377,7 +1377,8 @@ static Sg_device *sg_alloc(struct gendisk *disk, struct scsi_device *scsidp) sdp->device = scsidp; INIT_LIST_HEAD(&sdp->sfds); init_waitqueue_head(&sdp->o_excl_wait); - sdp->sg_tablesize = min(q->max_hw_segments, q->max_phys_segments); + sdp->sg_tablesize = min(queue_max_hw_segments(q), + queue_max_phys_segments(q)); sdp->index = k; kref_init(&sdp->d_ref); @@ -2055,7 +2056,7 @@ sg_add_sfp(Sg_device * sdp, int dev) sg_big_buff = def_reserved_size; bufflen = min_t(int, sg_big_buff, - sdp->device->request_queue->max_sectors * 512); + queue_max_sectors(sdp->device->request_queue) * 512); sg_build_reserve(sfp, bufflen); SCSI_LOG_TIMEOUT(3, printk("sg_add_sfp: bufflen=%d, k_use_sg=%d\n", sfp->reserve.bufflen, sfp->reserve.k_use_sg)); diff --git a/drivers/scsi/st.c b/drivers/scsi/st.c index 8681b708344..89bd438e1fe 100644 --- a/drivers/scsi/st.c +++ b/drivers/scsi/st.c @@ -3983,8 +3983,8 @@ static int st_probe(struct device *dev) return -ENODEV; } - i = min(SDp->request_queue->max_hw_segments, - SDp->request_queue->max_phys_segments); + i = min(queue_max_hw_segments(SDp->request_queue), + queue_max_phys_segments(SDp->request_queue)); if (st_max_sg_segs < i) i = st_max_sg_segs; buffer = new_tape_buffer((SDp->host)->unchecked_isa_dma, i); diff --git a/drivers/usb/storage/scsiglue.c b/drivers/usb/storage/scsiglue.c index 4ca3b586064..cfa26d56ce6 100644 --- a/drivers/usb/storage/scsiglue.c +++ b/drivers/usb/storage/scsiglue.c @@ -132,7 +132,7 @@ static int slave_configure(struct scsi_device *sdev) if (us->fflags & US_FL_MAX_SECTORS_MIN) max_sectors = PAGE_CACHE_SIZE >> 9; - if (sdev->request_queue->max_sectors > max_sectors) + if (queue_max_sectors(sdev->request_queue) > max_sectors) blk_queue_max_sectors(sdev->request_queue, max_sectors); } else if (sdev->type == TYPE_TAPE) { @@ -483,7 +483,7 @@ static ssize_t show_max_sectors(struct device *dev, struct device_attribute *att { struct scsi_device *sdev = to_scsi_device(dev); - return sprintf(buf, "%u\n", sdev->request_queue->max_sectors); + return sprintf(buf, "%u\n", queue_max_sectors(sdev->request_queue)); } /* Input routine for the sysfs max_sectors file */ diff --git a/fs/bio.c b/fs/bio.c index 4445c382173..ab423a1024a 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -499,11 +499,11 @@ int bio_get_nr_vecs(struct block_device *bdev) struct request_queue *q = bdev_get_queue(bdev); int nr_pages; - nr_pages = ((q->max_sectors << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; - if (nr_pages > q->max_phys_segments) - nr_pages = q->max_phys_segments; - if (nr_pages > q->max_hw_segments) - nr_pages = q->max_hw_segments; + nr_pages = ((queue_max_sectors(q) << 9) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (nr_pages > queue_max_phys_segments(q)) + nr_pages = queue_max_phys_segments(q); + if (nr_pages > queue_max_hw_segments(q)) + nr_pages = queue_max_hw_segments(q); return nr_pages; } @@ -562,8 +562,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page * make this too complex. */ - while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_phys_segments >= q->max_hw_segments) { + while (bio->bi_phys_segments >= queue_max_phys_segments(q) + || bio->bi_phys_segments >= queue_max_hw_segments(q)) { if (retried_segments) return 0; @@ -634,7 +634,8 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page int bio_add_pc_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { - return __bio_add_page(q, bio, page, len, offset, q->max_hw_sectors); + return __bio_add_page(q, bio, page, len, offset, + queue_max_hw_sectors(q)); } /** @@ -654,7 +655,7 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { struct request_queue *q = bdev_get_queue(bio->bi_bdev); - return __bio_add_page(q, bio, page, len, offset, q->max_sectors); + return __bio_add_page(q, bio, page, len, offset, queue_max_sectors(q)); } struct bio_map_data { diff --git a/include/linux/bio.h b/include/linux/bio.h index d30ec6f30dd..12737be5860 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -279,7 +279,7 @@ static inline int bio_has_allocated_vec(struct bio *bio) #define __BIO_SEG_BOUNDARY(addr1, addr2, mask) \ (((addr1) | (mask)) == (((addr2) - 1) | (mask))) #define BIOVEC_SEG_BOUNDARY(q, b1, b2) \ - __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, (q)->seg_boundary_mask) + __BIO_SEG_BOUNDARY(bvec_to_phys((b1)), bvec_to_phys((b2)) + (b2)->bv_len, queue_segment_boundary((q))) #define BIO_SEG_BOUNDARY(q, b1, b2) \ BIOVEC_SEG_BOUNDARY((q), __BVEC_END((b1)), __BVEC_START((b2))) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 872b78b7a10..29b48f7b4ba 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -898,6 +898,7 @@ extern void blk_cleanup_queue(struct request_queue *); extern void blk_queue_make_request(struct request_queue *, make_request_fn *); extern void blk_queue_bounce_limit(struct request_queue *, u64); extern void blk_queue_max_sectors(struct request_queue *, unsigned int); +extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); @@ -988,6 +989,41 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); #define blkdev_entry_to_request(entry) list_entry((entry), struct request, queuelist) +static inline unsigned long queue_bounce_pfn(struct request_queue *q) +{ + return q->bounce_pfn; +} + +static inline unsigned long queue_segment_boundary(struct request_queue *q) +{ + return q->seg_boundary_mask; +} + +static inline unsigned int queue_max_sectors(struct request_queue *q) +{ + return q->max_sectors; +} + +static inline unsigned int queue_max_hw_sectors(struct request_queue *q) +{ + return q->max_hw_sectors; +} + +static inline unsigned short queue_max_hw_segments(struct request_queue *q) +{ + return q->max_hw_segments; +} + +static inline unsigned short queue_max_phys_segments(struct request_queue *q) +{ + return q->max_phys_segments; +} + +static inline unsigned int queue_max_segment_size(struct request_queue *q) +{ + return q->max_segment_size; +} + static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; diff --git a/mm/bounce.c b/mm/bounce.c index e590272fe7a..8dcd4315e01 100644 --- a/mm/bounce.c +++ b/mm/bounce.c @@ -192,7 +192,7 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, /* * is destination page below bounce pfn? */ - if (page_to_pfn(page) <= q->bounce_pfn) + if (page_to_pfn(page) <= queue_bounce_pfn(q)) continue; /* @@ -284,7 +284,7 @@ void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) * don't waste time iterating over bio segments */ if (!(q->bounce_gfp & GFP_DMA)) { - if (q->bounce_pfn >= blk_max_pfn) + if (queue_bounce_pfn(q) >= blk_max_pfn) return; pool = page_pool; } else { From 025146e13b63483add912706c101fb0fb6f015cc Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:51 -0400 Subject: [PATCH 1189/2061] block: Move queue limits to an embedded struct To accommodate stacking drivers that do not have an associated request queue we're moving the limits to a separate, embedded structure. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 53 ++++++++++++++++++++++++++---------------- include/linux/blkdev.h | 44 +++++++++++++++++++++-------------- 2 files changed, 59 insertions(+), 38 deletions(-) diff --git a/block/blk-settings.c b/block/blk-settings.c index 0b32f984eed..b0f547cecfb 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -179,16 +179,16 @@ void blk_queue_bounce_limit(struct request_queue *q, u64 dma_mask) */ if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) dma = 1; - q->bounce_pfn = max_low_pfn; + q->limits.bounce_pfn = max_low_pfn; #else if (b_pfn < blk_max_low_pfn) dma = 1; - q->bounce_pfn = b_pfn; + q->limits.bounce_pfn = b_pfn; #endif if (dma) { init_emergency_isa_pool(); q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->bounce_pfn = b_pfn; + q->limits.bounce_pfn = b_pfn; } } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -211,10 +211,10 @@ void blk_queue_max_sectors(struct request_queue *q, unsigned int max_sectors) } if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = q->max_sectors = max_sectors; + q->limits.max_hw_sectors = q->limits.max_sectors = max_sectors; else { - q->max_sectors = BLK_DEF_MAX_SECTORS; - q->max_hw_sectors = max_sectors; + q->limits.max_sectors = BLK_DEF_MAX_SECTORS; + q->limits.max_hw_sectors = max_sectors; } } EXPORT_SYMBOL(blk_queue_max_sectors); @@ -222,9 +222,9 @@ EXPORT_SYMBOL(blk_queue_max_sectors); void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_sectors) { if (BLK_DEF_MAX_SECTORS > max_sectors) - q->max_hw_sectors = BLK_DEF_MAX_SECTORS; + q->limits.max_hw_sectors = BLK_DEF_MAX_SECTORS; else - q->max_hw_sectors = max_sectors; + q->limits.max_hw_sectors = max_sectors; } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -247,7 +247,7 @@ void blk_queue_max_phys_segments(struct request_queue *q, __func__, max_segments); } - q->max_phys_segments = max_segments; + q->limits.max_phys_segments = max_segments; } EXPORT_SYMBOL(blk_queue_max_phys_segments); @@ -271,7 +271,7 @@ void blk_queue_max_hw_segments(struct request_queue *q, __func__, max_segments); } - q->max_hw_segments = max_segments; + q->limits.max_hw_segments = max_segments; } EXPORT_SYMBOL(blk_queue_max_hw_segments); @@ -292,7 +292,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) __func__, max_size); } - q->max_segment_size = max_size; + q->limits.max_segment_size = max_size; } EXPORT_SYMBOL(blk_queue_max_segment_size); @@ -308,7 +308,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { - q->logical_block_size = size; + q->limits.logical_block_size = size; } EXPORT_SYMBOL(blk_queue_logical_block_size); @@ -325,14 +325,27 @@ EXPORT_SYMBOL(blk_queue_logical_block_size); void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) { /* zero is "infinity" */ - t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); - t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); - t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); + t->limits.max_sectors = min_not_zero(queue_max_sectors(t), + queue_max_sectors(b)); + + t->limits.max_hw_sectors = min_not_zero(queue_max_hw_sectors(t), + queue_max_hw_sectors(b)); + + t->limits.seg_boundary_mask = min_not_zero(queue_segment_boundary(t), + queue_segment_boundary(b)); + + t->limits.max_phys_segments = min_not_zero(queue_max_phys_segments(t), + queue_max_phys_segments(b)); + + t->limits.max_hw_segments = min_not_zero(queue_max_hw_segments(t), + queue_max_hw_segments(b)); + + t->limits.max_segment_size = min_not_zero(queue_max_segment_size(t), + queue_max_segment_size(b)); + + t->limits.logical_block_size = max(queue_logical_block_size(t), + queue_logical_block_size(b)); - t->max_phys_segments = min_not_zero(t->max_phys_segments, b->max_phys_segments); - t->max_hw_segments = min_not_zero(t->max_hw_segments, b->max_hw_segments); - t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); - t->logical_block_size = max(t->logical_block_size, b->logical_block_size); if (!t->queue_lock) WARN_ON_ONCE(1); else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { @@ -430,7 +443,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) __func__, mask); } - q->seg_boundary_mask = mask; + q->limits.seg_boundary_mask = mask; } EXPORT_SYMBOL(blk_queue_segment_boundary); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 29b48f7b4ba..b7bb6fdba12 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -307,6 +307,21 @@ struct blk_cmd_filter { struct kobject kobj; }; +struct queue_limits { + unsigned long bounce_pfn; + unsigned long seg_boundary_mask; + + unsigned int max_hw_sectors; + unsigned int max_sectors; + unsigned int max_segment_size; + + unsigned short logical_block_size; + unsigned short max_hw_segments; + unsigned short max_phys_segments; + + unsigned char no_cluster; +}; + struct request_queue { /* @@ -358,7 +373,6 @@ struct request_queue /* * queue needs bounce pages for pages above this limit */ - unsigned long bounce_pfn; gfp_t bounce_gfp; /* @@ -387,14 +401,6 @@ struct request_queue unsigned int nr_congestion_off; unsigned int nr_batching; - unsigned int max_sectors; - unsigned int max_hw_sectors; - unsigned short max_phys_segments; - unsigned short max_hw_segments; - unsigned short logical_block_size; - unsigned int max_segment_size; - - unsigned long seg_boundary_mask; void *dma_drain_buffer; unsigned int dma_drain_size; unsigned int dma_pad_mask; @@ -410,6 +416,8 @@ struct request_queue struct timer_list timeout; struct list_head timeout_list; + struct queue_limits limits; + /* * sg stuff */ @@ -991,45 +999,45 @@ extern void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter); static inline unsigned long queue_bounce_pfn(struct request_queue *q) { - return q->bounce_pfn; + return q->limits.bounce_pfn; } static inline unsigned long queue_segment_boundary(struct request_queue *q) { - return q->seg_boundary_mask; + return q->limits.seg_boundary_mask; } static inline unsigned int queue_max_sectors(struct request_queue *q) { - return q->max_sectors; + return q->limits.max_sectors; } static inline unsigned int queue_max_hw_sectors(struct request_queue *q) { - return q->max_hw_sectors; + return q->limits.max_hw_sectors; } static inline unsigned short queue_max_hw_segments(struct request_queue *q) { - return q->max_hw_segments; + return q->limits.max_hw_segments; } static inline unsigned short queue_max_phys_segments(struct request_queue *q) { - return q->max_phys_segments; + return q->limits.max_phys_segments; } static inline unsigned int queue_max_segment_size(struct request_queue *q) { - return q->max_segment_size; + return q->limits.max_segment_size; } static inline unsigned short queue_logical_block_size(struct request_queue *q) { int retval = 512; - if (q && q->logical_block_size) - retval = q->logical_block_size; + if (q && q->limits.logical_block_size) + retval = q->limits.logical_block_size; return retval; } From cd43e26f071524647e660706b784ebcbefbd2e44 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:52 -0400 Subject: [PATCH 1190/2061] block: Expose stacked device queues in sysfs Currently stacking devices do not have a queue directory in sysfs. However, many of the I/O characteristics like sector size, maximum request size, etc. are queue properties. This patch enables the queue directory for MD/DM devices. The elevator code has been modified to deal with queues that do not have an I/O scheduler. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-sysfs.c | 6 +++--- block/elevator.c | 13 ++++++++++++- 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 142a4acddd4..3ccdadb8e20 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -395,9 +395,6 @@ int blk_register_queue(struct gendisk *disk) if (WARN_ON(!q)) return -ENXIO; - if (!q->request_fn) - return 0; - ret = kobject_add(&q->kobj, kobject_get(&disk_to_dev(disk)->kobj), "%s", "queue"); if (ret < 0) @@ -405,6 +402,9 @@ int blk_register_queue(struct gendisk *disk) kobject_uevent(&q->kobj, KOBJ_ADD); + if (!q->request_fn) + return 0; + ret = elv_register_queue(q); if (ret) { kobject_uevent(&q->kobj, KOBJ_REMOVE); diff --git a/block/elevator.c b/block/elevator.c index ebee948293e..bd78f693fcf 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -575,6 +575,9 @@ void elv_drain_elevator(struct request_queue *q) */ void elv_quiesce_start(struct request_queue *q) { + if (!q->elevator) + return; + queue_flag_set(QUEUE_FLAG_ELVSWITCH, q); /* @@ -1050,6 +1053,9 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; + if (!q->elevator) + return count; + strlcpy(elevator_name, name, sizeof(elevator_name)); strstrip(elevator_name); @@ -1073,10 +1079,15 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *name, ssize_t elv_iosched_show(struct request_queue *q, char *name) { struct elevator_queue *e = q->elevator; - struct elevator_type *elv = e->elevator_type; + struct elevator_type *elv; struct elevator_type *__e; int len = 0; + if (!q->elevator) + return sprintf(name, "none\n"); + + elv = e->elevator_type; + spin_lock(&elv_list_lock); list_for_each_entry(__e, &elv_list, list) { if (!strcmp(elv->elevator_name, __e->elevator_name)) From c72758f33784e5e2a1a4bb9421ef3e6de8f9fcf3 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Fri, 22 May 2009 17:17:53 -0400 Subject: [PATCH 1191/2061] block: Export I/O topology for block devices and partitions To support devices with physical block sizes bigger than 512 bytes we need to ensure proper alignment. This patch adds support for exposing I/O topology characteristics as devices are stacked. logical_block_size is the smallest unit the device can address. physical_block_size indicates the smallest I/O the device can write without incurring a read-modify-write penalty. The io_min parameter is the smallest preferred I/O size reported by the device. In many cases this is the same as the physical block size. However, the io_min parameter can be scaled up when stacking (RAID5 chunk size > physical block size). The io_opt characteristic indicates the optimal I/O size reported by the device. This is usually the stripe width for arrays. The alignment_offset parameter indicates the number of bytes the start of the device/partition is offset from the device's natural alignment. Partition tools and MD/DM utilities can use this to pad their offsets so filesystems start on proper boundaries. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- Documentation/ABI/testing/sysfs-block | 59 ++++++++ block/blk-settings.c | 186 ++++++++++++++++++++++++++ block/blk-sysfs.c | 33 +++++ block/genhd.c | 11 ++ fs/partitions/check.c | 10 ++ include/linux/blkdev.h | 47 +++++++ include/linux/genhd.h | 1 + 7 files changed, 347 insertions(+) diff --git a/Documentation/ABI/testing/sysfs-block b/Documentation/ABI/testing/sysfs-block index 44f52a4f590..cbbd3e06994 100644 --- a/Documentation/ABI/testing/sysfs-block +++ b/Documentation/ABI/testing/sysfs-block @@ -60,3 +60,62 @@ Description: Indicates whether the block layer should automatically generate checksums for write requests bound for devices that support receiving integrity metadata. + +What: /sys/block//alignment_offset +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). This parameter + indicates how many bytes the beginning of the device is + offset from the disk's natural alignment. + +What: /sys/block///alignment_offset +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report a physical block size that is + bigger than the logical block size (for instance a drive + with 4KB physical sectors exposing 512-byte logical + blocks to the operating system). This parameter + indicates how many bytes the beginning of the partition + is offset from the disk's natural alignment. + +What: /sys/block//queue/logical_block_size +Date: May 2009 +Contact: Martin K. Petersen +Description: + This is the smallest unit the storage device can + address. It is typically 512 bytes. + +What: /sys/block//queue/physical_block_size +Date: May 2009 +Contact: Martin K. Petersen +Description: + This is the smallest unit the storage device can write + without resorting to read-modify-write operation. It is + usually the same as the logical block size but may be + bigger. One example is SATA drives with 4KB sectors + that expose a 512-byte logical block size to the + operating system. + +What: /sys/block//queue/minimum_io_size +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report a preferred minimum I/O size, + which is the smallest request the device can perform + without incurring a read-modify-write penalty. For disk + drives this is often the physical block size. For RAID + arrays it is often the stripe chunk size. + +What: /sys/block//queue/optimal_io_size +Date: April 2009 +Contact: Martin K. Petersen +Description: + Storage devices may report an optimal I/O size, which is + the device's preferred unit of receiving I/O. This is + rarely reported for disk drives. For RAID devices it is + usually the stripe width or the internal block size. diff --git a/block/blk-settings.c b/block/blk-settings.c index b0f547cecfb..5649f34adb4 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -309,9 +309,94 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); void blk_queue_logical_block_size(struct request_queue *q, unsigned short size) { q->limits.logical_block_size = size; + + if (q->limits.physical_block_size < size) + q->limits.physical_block_size = size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; } EXPORT_SYMBOL(blk_queue_logical_block_size); +/** + * blk_queue_physical_block_size - set physical block size for the queue + * @q: the request queue for the device + * @size: the physical block size, in bytes + * + * Description: + * This should be set to the lowest possible sector size that the + * hardware can operate on without reverting to read-modify-write + * operations. + */ +void blk_queue_physical_block_size(struct request_queue *q, unsigned short size) +{ + q->limits.physical_block_size = size; + + if (q->limits.physical_block_size < q->limits.logical_block_size) + q->limits.physical_block_size = q->limits.logical_block_size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_physical_block_size); + +/** + * blk_queue_alignment_offset - set physical block alignment offset + * @q: the request queue for the device + * @alignment: alignment offset in bytes + * + * Description: + * Some devices are naturally misaligned to compensate for things like + * the legacy DOS partition table 63-sector offset. Low-level drivers + * should call this function for devices whose first sector is not + * naturally aligned. + */ +void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) +{ + q->limits.alignment_offset = + offset & (q->limits.physical_block_size - 1); + q->limits.misaligned = 0; +} +EXPORT_SYMBOL(blk_queue_alignment_offset); + +/** + * blk_queue_io_min - set minimum request size for the queue + * @q: the request queue for the device + * @io_min: smallest I/O size in bytes + * + * Description: + * Some devices have an internal block size bigger than the reported + * hardware sector size. This function can be used to signal the + * smallest I/O the device can perform without incurring a performance + * penalty. + */ +void blk_queue_io_min(struct request_queue *q, unsigned int min) +{ + q->limits.io_min = min; + + if (q->limits.io_min < q->limits.logical_block_size) + q->limits.io_min = q->limits.logical_block_size; + + if (q->limits.io_min < q->limits.physical_block_size) + q->limits.io_min = q->limits.physical_block_size; +} +EXPORT_SYMBOL(blk_queue_io_min); + +/** + * blk_queue_io_opt - set optimal request size for the queue + * @q: the request queue for the device + * @io_opt: optimal request size in bytes + * + * Description: + * Drivers can call this function to set the preferred I/O request + * size for devices that report such a value. + */ +void blk_queue_io_opt(struct request_queue *q, unsigned int opt) +{ + q->limits.io_opt = opt; +} +EXPORT_SYMBOL(blk_queue_io_opt); + /* * Returns the minimum that is _not_ zero, unless both are zero. */ @@ -357,6 +442,107 @@ void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) } EXPORT_SYMBOL(blk_queue_stack_limits); +/** + * blk_stack_limits - adjust queue_limits for stacked devices + * @t: the stacking driver limits (top) + * @bdev: the underlying queue limits (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges two queue_limit structs. Returns 0 if alignment didn't + * change. Returns -1 if adding the bottom device caused + * misalignment. + */ +int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset) +{ + t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); + t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); + + t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, + b->seg_boundary_mask); + + t->max_phys_segments = min_not_zero(t->max_phys_segments, + b->max_phys_segments); + + t->max_hw_segments = min_not_zero(t->max_hw_segments, + b->max_hw_segments); + + t->max_segment_size = min_not_zero(t->max_segment_size, + b->max_segment_size); + + t->logical_block_size = max(t->logical_block_size, + b->logical_block_size); + + t->physical_block_size = max(t->physical_block_size, + b->physical_block_size); + + t->io_min = max(t->io_min, b->io_min); + t->no_cluster |= b->no_cluster; + + /* Bottom device offset aligned? */ + if (offset && + (offset & (b->physical_block_size - 1)) != b->alignment_offset) { + t->misaligned = 1; + return -1; + } + + /* If top has no alignment offset, inherit from bottom */ + if (!t->alignment_offset) + t->alignment_offset = + b->alignment_offset & (b->physical_block_size - 1); + + /* Top device aligned on logical block boundary? */ + if (t->alignment_offset & (t->logical_block_size - 1)) { + t->misaligned = 1; + return -1; + } + + return 0; +} + +/** + * disk_stack_limits - adjust queue limits for stacked drivers + * @t: MD/DM gendisk (top) + * @bdev: the underlying block device (bottom) + * @offset: offset to beginning of data within component device + * + * Description: + * Merges the limits for two queues. Returns 0 if alignment + * didn't change. Returns -1 if adding the bottom device caused + * misalignment. + */ +void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset) +{ + struct request_queue *t = disk->queue; + struct request_queue *b = bdev_get_queue(bdev); + + offset += get_start_sect(bdev) << 9; + + if (blk_stack_limits(&t->limits, &b->limits, offset) < 0) { + char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; + + disk_name(disk, 0, top); + bdevname(bdev, bottom); + + printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", + top, bottom); + } + + if (!t->queue_lock) + WARN_ON_ONCE(1); + else if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) { + unsigned long flags; + + spin_lock_irqsave(t->queue_lock, flags); + if (!test_bit(QUEUE_FLAG_CLUSTER, &b->queue_flags)) + queue_flag_clear(QUEUE_FLAG_CLUSTER, t); + spin_unlock_irqrestore(t->queue_lock, flags); + } +} +EXPORT_SYMBOL(disk_stack_limits); + /** * blk_queue_dma_pad - set pad mask * @q: the request queue for the device diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 3ccdadb8e20..9337e17f911 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -105,6 +105,21 @@ static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page return queue_var_show(queue_logical_block_size(q), page); } +static ssize_t queue_physical_block_size_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_physical_block_size(q), page); +} + +static ssize_t queue_io_min_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_min(q), page); +} + +static ssize_t queue_io_opt_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_io_opt(q), page); +} + static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { @@ -257,6 +272,21 @@ static struct queue_sysfs_entry queue_logical_block_size_entry = { .show = queue_logical_block_size_show, }; +static struct queue_sysfs_entry queue_physical_block_size_entry = { + .attr = {.name = "physical_block_size", .mode = S_IRUGO }, + .show = queue_physical_block_size_show, +}; + +static struct queue_sysfs_entry queue_io_min_entry = { + .attr = {.name = "minimum_io_size", .mode = S_IRUGO }, + .show = queue_io_min_show, +}; + +static struct queue_sysfs_entry queue_io_opt_entry = { + .attr = {.name = "optimal_io_size", .mode = S_IRUGO }, + .show = queue_io_opt_show, +}; + static struct queue_sysfs_entry queue_nonrot_entry = { .attr = {.name = "rotational", .mode = S_IRUGO | S_IWUSR }, .show = queue_nonrot_show, @@ -289,6 +319,9 @@ static struct attribute *default_attrs[] = { &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, + &queue_physical_block_size_entry.attr, + &queue_io_min_entry.attr, + &queue_io_opt_entry.attr, &queue_nonrot_entry.attr, &queue_nomerges_entry.attr, &queue_rq_affinity_entry.attr, diff --git a/block/genhd.c b/block/genhd.c index 1a4916e0173..fe7ccc0a618 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -852,11 +852,21 @@ static ssize_t disk_capability_show(struct device *dev, return sprintf(buf, "%x\n", disk->flags); } +static ssize_t disk_alignment_offset_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); +} + static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); static DEVICE_ATTR(ext_range, S_IRUGO, disk_ext_range_show, NULL); static DEVICE_ATTR(removable, S_IRUGO, disk_removable_show, NULL); static DEVICE_ATTR(ro, S_IRUGO, disk_ro_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, disk_alignment_offset_show, NULL); static DEVICE_ATTR(capability, S_IRUGO, disk_capability_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -875,6 +885,7 @@ static struct attribute *disk_attrs[] = { &dev_attr_removable.attr, &dev_attr_ro.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_capability.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 99e33ef40be..0af36085eb2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,6 +219,13 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } +ssize_t part_alignment_offset_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct hd_struct *p = dev_to_part(dev); + return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); +} + ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { @@ -272,6 +279,7 @@ ssize_t part_fail_store(struct device *dev, static DEVICE_ATTR(partition, S_IRUGO, part_partition_show, NULL); static DEVICE_ATTR(start, S_IRUGO, part_start_show, NULL); static DEVICE_ATTR(size, S_IRUGO, part_size_show, NULL); +static DEVICE_ATTR(alignment_offset, S_IRUGO, part_alignment_offset_show, NULL); static DEVICE_ATTR(stat, S_IRUGO, part_stat_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST static struct device_attribute dev_attr_fail = @@ -282,6 +290,7 @@ static struct attribute *part_attrs[] = { &dev_attr_partition.attr, &dev_attr_start.attr, &dev_attr_size.attr, + &dev_attr_alignment_offset.attr, &dev_attr_stat.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, @@ -383,6 +392,7 @@ struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev = part_to_dev(p); p->start_sect = start; + p->alignment_offset = queue_sector_alignment_offset(disk->queue, start); p->nr_sects = len; p->partno = partno; p->policy = get_disk_ro(disk); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b7bb6fdba12..5e740a135e7 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -314,11 +314,16 @@ struct queue_limits { unsigned int max_hw_sectors; unsigned int max_sectors; unsigned int max_segment_size; + unsigned int physical_block_size; + unsigned int alignment_offset; + unsigned int io_min; + unsigned int io_opt; unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; + unsigned char misaligned; unsigned char no_cluster; }; @@ -911,6 +916,15 @@ extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); +extern void blk_queue_alignment_offset(struct request_queue *q, + unsigned int alignment); +extern void blk_queue_io_min(struct request_queue *q, unsigned int min); +extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); +extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, + sector_t offset); +extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, + sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); extern void blk_queue_dma_pad(struct request_queue *, unsigned int); extern void blk_queue_update_dma_pad(struct request_queue *, unsigned int); @@ -1047,6 +1061,39 @@ static inline unsigned short bdev_logical_block_size(struct block_device *bdev) return queue_logical_block_size(bdev_get_queue(bdev)); } +static inline unsigned int queue_physical_block_size(struct request_queue *q) +{ + return q->limits.physical_block_size; +} + +static inline unsigned int queue_io_min(struct request_queue *q) +{ + return q->limits.io_min; +} + +static inline unsigned int queue_io_opt(struct request_queue *q) +{ + return q->limits.io_opt; +} + +static inline int queue_alignment_offset(struct request_queue *q) +{ + if (q && q->limits.misaligned) + return -1; + + if (q && q->limits.alignment_offset) + return q->limits.alignment_offset; + + return 0; +} + +static inline int queue_sector_alignment_offset(struct request_queue *q, + sector_t sector) +{ + return ((sector << 9) - q->limits.alignment_offset) + & (q->limits.io_min - 1); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index a1a28caed23..149fda264c8 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -90,6 +90,7 @@ struct disk_stats { struct hd_struct { sector_t start_sect; sector_t nr_sects; + sector_t alignment_offset; struct device __dev; struct kobject *holder_dir; int policy, partno; From 3b77f777b8f1c001b63e317c4ce317292ff0ff94 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Sat, 23 May 2009 08:23:16 +0200 Subject: [PATCH 1192/2061] ide-disk: fix missing max_sectors accessor function MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The recent move to accessor functions for querying queue limits missed an entry in ide-disk.c: drivers/ide/ide-disk.c: In function ‘ide_disk_setup’: drivers/ide/ide-disk.c:642: error: ‘struct request_queue’ has no member named ‘max_sectors’ Fix it. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- drivers/ide/ide-disk.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ide/ide-disk.c b/drivers/ide/ide-disk.c index ad18e14043c..c6f7fcfb9d6 100644 --- a/drivers/ide/ide-disk.c +++ b/drivers/ide/ide-disk.c @@ -639,7 +639,7 @@ static void ide_disk_setup(ide_drive_t *drive) } printk(KERN_INFO "%s: max request size: %dKiB\n", drive->name, - q->max_sectors / 2); + queue_max_sectors(q) / 2); if (ata_id_is_ssd(id)) queue_flag_set_unlocked(QUEUE_FLAG_NONROT, q); From e220d2dcb944c5c488b6855d15ec66d76900514f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:55 +0200 Subject: [PATCH 1193/2061] perf_counter: Fix dynamic irq_period logging We call perf_adjust_freq() from perf_counter_task_tick() which is is called under the rq->lock causing lock recursion. However, it's no longer required to be called under the rq->lock, so remove it from under it. Also, fix up some related comments. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.476197912@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 + kernel/perf_counter.c | 3 ++- kernel/sched.c | 3 ++- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2eedae8498d..23ddd29730f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -260,6 +260,7 @@ enum perf_event_type { /* * struct { * struct perf_event_header header; + * u64 time; * u64 irq_period; * }; */ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index c10055416de..2f410ea2cb3 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2559,7 +2559,8 @@ void perf_counter_munmap(unsigned long addr, unsigned long len, } /* - * + * Log irq_period changes so that analyzing tools can re-normalize the + * event flow. */ static void perf_log_period(struct perf_counter *counter, u64 period) diff --git a/kernel/sched.c b/kernel/sched.c index 4c0d58bce6b..ad079f07c9c 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4875,9 +4875,10 @@ void scheduler_tick(void) update_rq_clock(rq); update_cpu_load(rq); curr->sched_class->task_tick(rq, curr, 0); - perf_counter_task_tick(curr, cpu); spin_unlock(&rq->lock); + perf_counter_task_tick(curr, cpu); + #ifdef CONFIG_SMP rq->idle_at_tick = idle_cpu(cpu); trigger_load_balance(rq, cpu); From fccc714b3148ab9741fafc1e90c3876d50df6093 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:56 +0200 Subject: [PATCH 1194/2061] perf_counter: Sanitize counter->mutex s/counter->mutex/counter->child_mutex/ and make sure its only used to protect child_list. The usage in __perf_counter_exit_task() doesn't appear to be problematic since ctx->mutex also covers anything related to fd tear-down. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.533186528@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 7 +++--- kernel/perf_counter.c | 47 +++++++++++++++--------------------- 2 files changed, 22 insertions(+), 32 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 23ddd29730f..4ab8050eb9e 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -452,9 +452,6 @@ struct perf_counter { struct perf_counter_context *ctx; struct file *filp; - struct perf_counter *parent; - struct list_head child_list; - /* * These accumulate total time (in nanoseconds) that children * counters have been enabled and running, respectively. @@ -465,7 +462,9 @@ struct perf_counter { /* * Protect attach/detach and child_list: */ - struct mutex mutex; + struct mutex child_mutex; + struct list_head child_list; + struct perf_counter *parent; int oncpu; int cpu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 2f410ea2cb3..679c3b5bb7d 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -111,6 +111,10 @@ static void put_ctx(struct perf_counter_context *ctx) } } +/* + * Add a counter from the lists for its context. + * Must be called with ctx->mutex and ctx->lock held. + */ static void list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) { @@ -136,7 +140,7 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) /* * Remove a counter from the lists for its context. - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex and ctx->lock held. */ static void list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) @@ -276,7 +280,7 @@ static void __perf_counter_remove_from_context(void *info) /* * Remove the counter from a task's (or a CPU's) list of counters. * - * Must be called with counter->mutex and ctx->mutex held. + * Must be called with ctx->mutex held. * * CPU counters are removed with a smp call. For task counters we only * call when the task is on a CPU. @@ -1407,11 +1411,7 @@ static int perf_release(struct inode *inode, struct file *file) file->private_data = NULL; mutex_lock(&ctx->mutex); - mutex_lock(&counter->mutex); - perf_counter_remove_from_context(counter); - - mutex_unlock(&counter->mutex); mutex_unlock(&ctx->mutex); free_counter(counter); @@ -1437,7 +1437,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->state == PERF_COUNTER_STATE_ERROR) return 0; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); values[0] = perf_counter_read(counter); n = 1; if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_ENABLED) @@ -1446,7 +1446,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count) if (counter->hw_event.read_format & PERF_FORMAT_TOTAL_TIME_RUNNING) values[n++] = counter->total_time_running + atomic64_read(&counter->child_total_time_running); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); if (count < n * sizeof(u64)) return -EINVAL; @@ -1510,11 +1510,11 @@ static void perf_counter_for_each_child(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); func(counter); list_for_each_entry(child, &counter->child_list, child_list) func(child); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static void perf_counter_for_each(struct perf_counter *counter, @@ -1522,11 +1522,11 @@ static void perf_counter_for_each(struct perf_counter *counter, { struct perf_counter *child; - mutex_lock(&counter->mutex); + mutex_lock(&counter->child_mutex); perf_counter_for_each_sibling(counter, func); list_for_each_entry(child, &counter->child_list, child_list) perf_counter_for_each_sibling(child, func); - mutex_unlock(&counter->mutex); + mutex_unlock(&counter->child_mutex); } static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3106,7 +3106,9 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, if (!group_leader) group_leader = counter; - mutex_init(&counter->mutex); + mutex_init(&counter->child_mutex); + INIT_LIST_HEAD(&counter->child_list); + INIT_LIST_HEAD(&counter->list_entry); INIT_LIST_HEAD(&counter->event_entry); INIT_LIST_HEAD(&counter->sibling_list); @@ -3114,8 +3116,6 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, mutex_init(&counter->mmap_mutex); - INIT_LIST_HEAD(&counter->child_list); - counter->cpu = cpu; counter->hw_event = *hw_event; counter->group_leader = group_leader; @@ -3346,10 +3346,9 @@ inherit_counter(struct perf_counter *parent_counter, /* * Link this into the parent counter's child list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_add_tail(&child_counter->child_list, &parent_counter->child_list); - - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); return child_counter; } @@ -3396,9 +3395,9 @@ static void sync_child_counter(struct perf_counter *child_counter, /* * Remove this counter from the parent's list */ - mutex_lock(&parent_counter->mutex); + mutex_lock(&parent_counter->child_mutex); list_del_init(&child_counter->child_list); - mutex_unlock(&parent_counter->mutex); + mutex_unlock(&parent_counter->child_mutex); /* * Release the parent counter, if this was the last @@ -3414,17 +3413,9 @@ __perf_counter_exit_task(struct task_struct *child, { struct perf_counter *parent_counter; - /* - * Protect against concurrent operations on child_counter - * due its fd getting closed, etc. - */ - mutex_lock(&child_counter->mutex); - update_counter_times(child_counter); list_del_counter(child_counter, child_ctx); - mutex_unlock(&child_counter->mutex); - parent_counter = child_counter->parent; /* * It can happen that parent exits first, and has counters From 682076ae1de0aba9c2da509f7b19dc03e30a6e1f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:57 +0200 Subject: [PATCH 1195/2061] perf_counter: Sanitize context locking Ensure we're consistent with the context locks. context->mutex context->lock list_{add,del}_counter(); so that either lock is sufficient to stabilize the context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.618790733@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 679c3b5bb7d..d162d2f0b27 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -597,6 +597,8 @@ static void add_counter_to_ctx(struct perf_counter *counter, /* * Cross CPU call to install and enable a performance counter + * + * Must be called with ctx->mutex held */ static void __perf_install_in_context(void *info) { @@ -1496,13 +1498,13 @@ static void perf_counter_for_each_sibling(struct perf_counter *counter, struct perf_counter_context *ctx = counter->ctx; struct perf_counter *sibling; - spin_lock_irq(&ctx->lock); + mutex_lock(&ctx->mutex); counter = counter->group_leader; func(counter); list_for_each_entry(sibling, &counter->sibling_list, list_entry) func(sibling); - spin_unlock_irq(&ctx->lock); + mutex_unlock(&ctx->mutex); } static void perf_counter_for_each_child(struct perf_counter *counter, @@ -3414,7 +3416,10 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; update_counter_times(child_counter); + + spin_lock_irq(&child_ctx->lock); list_del_counter(child_counter, child_ctx); + spin_unlock_irq(&child_ctx->lock); parent_counter = child_counter->parent; /* From 1a482f38c5aafeb3576079a38a5b21b46619f3d2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:58 +0200 Subject: [PATCH 1196/2061] perf_counter: Fix userspace build recent userspace (F11) seems to already include the linux/unistd.h bits which means we cannot include the version in the kernel sources due to the header guards being the same. Ensure we include the kernel version first. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.739756497@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-record.c | 3 +- Documentation/perf_counter/builtin-stat.c | 5 +--- Documentation/perf_counter/builtin-top.c | 5 +--- Documentation/perf_counter/perf.h | 31 +++++++++++++-------- 4 files changed, 22 insertions(+), 22 deletions(-) diff --git a/Documentation/perf_counter/builtin-record.c b/Documentation/perf_counter/builtin-record.c index efb87595f3c..1b19f187d35 100644 --- a/Documentation/perf_counter/builtin-record.c +++ b/Documentation/perf_counter/builtin-record.c @@ -1,5 +1,6 @@ +#include "perf.h" #include "util/util.h" #include @@ -30,9 +31,7 @@ #include #include -#include "../../include/linux/perf_counter.h" -#include "perf.h" #define ALIGN(x,a) __ALIGN_MASK(x,(typeof(x))(a)-1) #define __ALIGN_MASK(x,mask) (((x)+(mask))&~(mask)) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 03518d75d86..8ae01d51f29 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -61,6 +61,7 @@ * Released under the GPL v2. (and only v2, not any later version) */ +#include "perf.h" #include "util/util.h" #include @@ -83,10 +84,6 @@ #include #include -#include "../../include/linux/perf_counter.h" - -#include "perf.h" - #define EVENT_MASK_KERNEL 1 #define EVENT_MASK_USER 2 diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 814b2e4925e..a3216a6018c 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -42,6 +42,7 @@ * Released under the GPL v2. (and only v2, not any later version) */ +#include "perf.h" #include "util/util.h" #include @@ -64,10 +65,6 @@ #include #include -#include "../../include/linux/perf_counter.h" - -#include "perf.h" - static int system_wide = 0; static int nr_counters = 0; diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index 81a737444c8..a517683fc66 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -1,6 +1,25 @@ #ifndef _PERF_PERF_H #define _PERF_PERF_H +#if defined(__x86_64__) || defined(__i386__) +#include "../../arch/x86/include/asm/unistd.h" +#define rmb() asm volatile("lfence" ::: "memory") +#define cpu_relax() asm volatile("rep; nop" ::: "memory"); +#endif + +#ifdef __powerpc__ +#include "../../arch/powerpc/include/asm/unistd.h" +#define rmb() asm volatile ("sync" ::: "memory") +#define cpu_relax() asm volatile ("" ::: "memory"); +#endif + +#include +#include +#include +#include + +#include "../../include/linux/perf_counter.h" + /* * prctl(PR_TASK_PERF_COUNTERS_DISABLE) will (cheaply) disable all * counters in the current task. @@ -26,18 +45,6 @@ static inline unsigned long long rdclock(void) #define __user #define asmlinkage -#if defined(__x86_64__) || defined(__i386__) -#include "../../arch/x86/include/asm/unistd.h" -#define rmb() asm volatile("lfence" ::: "memory") -#define cpu_relax() asm volatile("rep; nop" ::: "memory"); -#endif - -#ifdef __powerpc__ -#include "../../arch/powerpc/include/asm/unistd.h" -#define rmb() asm volatile ("sync" ::: "memory") -#define cpu_relax() asm volatile ("" ::: "memory"); -#endif - #define unlikely(x) __builtin_expect(!!(x), 0) #define min(x, y) ({ \ typeof(x) _min1 = (x); \ From aa9c67f53d1969cf1db4c9c2db3a78c4ceb96469 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:28:59 +0200 Subject: [PATCH 1197/2061] perf_counter: Simplify context cleanup Use perf_counter_remove_from_context() to remove counters from the context. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.796275849@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index d162d2f0b27..0e97f896133 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3416,10 +3416,7 @@ __perf_counter_exit_task(struct task_struct *child, struct perf_counter *parent_counter; update_counter_times(child_counter); - - spin_lock_irq(&child_ctx->lock); - list_del_counter(child_counter, child_ctx); - spin_unlock_irq(&child_ctx->lock); + perf_counter_remove_from_context(child_counter); parent_counter = child_counter->parent; /* From 79f52b77b89e8b7aa9fbe62135eea198a2ecbd5b Mon Sep 17 00:00:00 2001 From: Dave Kleikamp Date: Sat, 23 May 2009 20:28:41 -0500 Subject: [PATCH 1198/2061] jfs: Add missing mutex_unlock call to error path Jan Kucera found an missing call to mutex_unlock() with his static code checker. It's an unlikely error path to hit in the real world, but it should be fixed. Signed-off-by: Dave Kleikamp Reported-by: Jan Kucera --- fs/jfs/jfs_imap.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index 346057218ed..0fc30407f03 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -2571,6 +2571,7 @@ diNewIAG(struct inomap * imap, int *iagnop, int agno, struct metapage ** mpp) txAbort(tid, 0); txEnd(tid); + mutex_unlock(&JFS_IP(ipimap)->commit_mutex); /* release the inode map lock */ IWRITE_UNLOCK(ipimap); From 082ff5a2767a0679ee543f14883adbafb631ffbe Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:00 +0200 Subject: [PATCH 1199/2061] perf_counter: Change pctrl() behaviour Instead of en/dis-abling all counters acting on a particular task, en/dis- able all counters we created. [ v2: fix crash on first counter enable ] Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163012.916937244@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/init_task.h | 10 +++++ include/linux/perf_counter.h | 3 ++ include/linux/sched.h | 2 + kernel/perf_counter.c | 87 ++++++++++-------------------------- 4 files changed, 39 insertions(+), 63 deletions(-) diff --git a/include/linux/init_task.h b/include/linux/init_task.h index d87247d2641..353c0ac7723 100644 --- a/include/linux/init_task.h +++ b/include/linux/init_task.h @@ -108,6 +108,15 @@ extern struct group_info init_groups; extern struct cred init_cred; +#ifdef CONFIG_PERF_COUNTERS +# define INIT_PERF_COUNTERS(tsk) \ + .perf_counter_mutex = \ + __MUTEX_INITIALIZER(tsk.perf_counter_mutex), \ + .perf_counter_list = LIST_HEAD_INIT(tsk.perf_counter_list), +#else +# define INIT_PERF_COUNTERS(tsk) +#endif + /* * INIT_TASK is used to set up the first task table, touch at * your own risk!. Base=0, limit=0x1fffff (=2MB) @@ -171,6 +180,7 @@ extern struct cred init_cred; }, \ .dirties = INIT_PROP_LOCAL_SINGLE(dirties), \ INIT_IDS \ + INIT_PERF_COUNTERS(tsk) \ INIT_TRACE_IRQFLAGS \ INIT_LOCKDEP \ INIT_FTRACE_GRAPH \ diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4ab8050eb9e..4159ee5940f 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -469,6 +469,9 @@ struct perf_counter { int oncpu; int cpu; + struct list_head owner_entry; + struct task_struct *owner; + /* mmap bits */ struct mutex mmap_mutex; atomic_t mmap_count; diff --git a/include/linux/sched.h b/include/linux/sched.h index 9714d450f41..bc9326dcdde 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1389,6 +1389,8 @@ struct task_struct { #endif #ifdef CONFIG_PERF_COUNTERS struct perf_counter_context *perf_counter_ctxp; + struct mutex perf_counter_mutex; + struct list_head perf_counter_list; #endif #ifdef CONFIG_NUMA struct mempolicy *mempolicy; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 0e97f896133..4c86a636976 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1076,79 +1076,26 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_disable(void) +int perf_counter_task_enable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state != PERF_COUNTER_STATE_ERROR) { - update_group_times(counter); - counter->state = PERF_COUNTER_STATE_OFF; - } - } - - perf_enable(); - - spin_unlock_irqrestore(&ctx->lock, flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_enable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } -int perf_counter_task_enable(void) +int perf_counter_task_disable(void) { - struct task_struct *curr = current; - struct perf_counter_context *ctx = curr->perf_counter_ctxp; struct perf_counter *counter; - unsigned long flags; - int cpu; - if (!ctx || !ctx->nr_counters) - return 0; - - local_irq_save(flags); - cpu = smp_processor_id(); - - __perf_counter_task_sched_out(ctx); - - spin_lock(&ctx->lock); - - /* - * Disable all the counters: - */ - perf_disable(); - - list_for_each_entry(counter, &ctx->counter_list, list_entry) { - if (counter->state > PERF_COUNTER_STATE_OFF) - continue; - counter->state = PERF_COUNTER_STATE_INACTIVE; - counter->tstamp_enabled = - ctx->time - counter->total_time_enabled; - counter->hw_event.disabled = 0; - } - perf_enable(); - - spin_unlock(&ctx->lock); - - perf_counter_task_sched_in(curr, cpu); - - local_irq_restore(flags); + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_disable(counter); + mutex_unlock(¤t->perf_counter_mutex); return 0; } @@ -1416,6 +1363,11 @@ static int perf_release(struct inode *inode, struct file *file) perf_counter_remove_from_context(counter); mutex_unlock(&ctx->mutex); + mutex_lock(&counter->owner->perf_counter_mutex); + list_del_init(&counter->owner_entry); + mutex_unlock(&counter->owner->perf_counter_mutex); + put_task_struct(counter->owner); + free_counter(counter); put_context(ctx); @@ -3272,6 +3224,12 @@ SYSCALL_DEFINE5(perf_counter_open, perf_install_in_context(ctx, counter, cpu); mutex_unlock(&ctx->mutex); + counter->owner = current; + get_task_struct(current); + mutex_lock(¤t->perf_counter_mutex); + list_add_tail(&counter->owner_entry, ¤t->perf_counter_list); + mutex_unlock(¤t->perf_counter_mutex); + fput_light(counter_file, fput_needed2); out_fput: @@ -3488,6 +3446,9 @@ void perf_counter_init_task(struct task_struct *child) child->perf_counter_ctxp = NULL; + mutex_init(&child->perf_counter_mutex); + INIT_LIST_HEAD(&child->perf_counter_list); + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. From 475c55797323b67435083f6e2eb8ee670f6410ec Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Sat, 23 May 2009 18:29:01 +0200 Subject: [PATCH 1200/2061] perf_counter: Remove perf_counter_context::nr_enabled now that pctrl() no longer disables other people's counters, remove the PMU cache code that deals with that. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090523163013.032998331@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 1 - kernel/perf_counter.c | 11 +---------- 2 files changed, 1 insertion(+), 11 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 4159ee5940f..2ddf5e3c551 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -516,7 +516,6 @@ struct perf_counter_context { struct list_head event_list; int nr_counters; int nr_active; - int nr_enabled; int is_active; atomic_t refcount; struct task_struct *task; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 4c86a636976..cb4062559b4 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -134,8 +134,6 @@ list_add_counter(struct perf_counter *counter, struct perf_counter_context *ctx) list_add_rcu(&counter->event_entry, &ctx->event_list); ctx->nr_counters++; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled++; } /* @@ -150,8 +148,6 @@ list_del_counter(struct perf_counter *counter, struct perf_counter_context *ctx) if (list_empty(&counter->list_entry)) return; ctx->nr_counters--; - if (counter->state >= PERF_COUNTER_STATE_INACTIVE) - ctx->nr_enabled--; list_del_init(&counter->list_entry); list_del_rcu(&counter->event_entry); @@ -406,7 +402,6 @@ static void __perf_counter_disable(void *info) else counter_sched_out(counter, cpuctx, ctx); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irqrestore(&ctx->lock, flags); @@ -448,7 +443,6 @@ static void perf_counter_disable(struct perf_counter *counter) if (counter->state == PERF_COUNTER_STATE_INACTIVE) { update_counter_times(counter); counter->state = PERF_COUNTER_STATE_OFF; - ctx->nr_enabled--; } spin_unlock_irq(&ctx->lock); @@ -759,7 +753,6 @@ static void __perf_counter_enable(void *info) goto unlock; counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; /* * If the counter is in a group and isn't the group leader, @@ -850,7 +843,6 @@ static void perf_counter_enable(struct perf_counter *counter) counter->state = PERF_COUNTER_STATE_INACTIVE; counter->tstamp_enabled = ctx->time - counter->total_time_enabled; - ctx->nr_enabled++; } out: spin_unlock_irq(&ctx->lock); @@ -910,8 +902,7 @@ static int context_equiv(struct perf_counter_context *ctx1, struct perf_counter_context *ctx2) { return ctx1->parent_ctx && ctx1->parent_ctx == ctx2->parent_ctx - && ctx1->parent_gen == ctx2->parent_gen - && ctx1->nr_enabled == ctx2->nr_enabled; + && ctx1->parent_gen == ctx2->parent_gen; } /* From c2990a2a582d73562d4dcf2502c39892a19a691d Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sun, 24 May 2009 08:35:49 +0200 Subject: [PATCH 1201/2061] perf top: fix segfault c6eb13 increased stack usage such that perf-top now croaks on startup. Take event_array and mmap_array off the stack to prevent segfault on boxen with smallish ulimit -s setting. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-top.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index a3216a6018c..74021ac90f5 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -1035,10 +1035,11 @@ static void mmap_read(struct mmap_data *md) md->prev = old; } +static struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; +static struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; + int cmd_top(int argc, char **argv, const char *prefix) { - struct pollfd event_array[MAX_NR_CPUS * MAX_COUNTERS]; - struct mmap_data mmap_array[MAX_NR_CPUS][MAX_COUNTERS]; struct perf_counter_hw_event hw_event; pthread_t thread; int i, counter, group_fd, nr_poll = 0; From a3862d3f814ce7dfca9eed56ac23d29db3aee8d5 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 May 2009 09:02:37 +0200 Subject: [PATCH 1202/2061] perf_counter: Increase mmap limit In a default 'perf top' run the tool will create a counter for each online CPU. With enough CPUs this will eventually exhaust the default limit. So scale it up with the number of online CPUs. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index cb4062559b4..6cdf8248eda 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1704,6 +1704,12 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma) user_extra = nr_pages + 1; user_lock_limit = sysctl_perf_counter_mlock >> (PAGE_SHIFT - 10); + + /* + * Increase the limit linearly with more CPUs: + */ + user_lock_limit *= num_online_cpus(); + user_locked = atomic_long_read(&user->locked_vm) + user_extra; extra = 0; From 85a9f9200226ddffc2ea50dae6a8df04c033ecd4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 09:59:50 +0200 Subject: [PATCH 1203/2061] perf_counter tools: increase limits, fix NR_CPUS and NR_COUNTERS goes up quadratic ... 1024x4096 was far too ambitious upper limit - go for 256x256 which is still plenty. [ Impact: reduce perf tool memory consumption ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/perf.h b/Documentation/perf_counter/perf.h index a517683fc66..5a2520bb7e5 100644 --- a/Documentation/perf_counter/perf.h +++ b/Documentation/perf_counter/perf.h @@ -61,8 +61,8 @@ sys_perf_counter_open(struct perf_counter_hw_event *hw_event_uptr, group_fd, flags); } -#define MAX_COUNTERS 1024 -#define MAX_NR_CPUS 4096 +#define MAX_COUNTERS 256 +#define MAX_NR_CPUS 256 #define EID(type, id) (((__u64)(type) << PERF_COUNTER_TYPE_SHIFT) | (id)) From d94b943054721c346b0881865d645f000cd19880 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Mon, 25 May 2009 09:57:56 +0200 Subject: [PATCH 1204/2061] perf top: Reduce display overhead Iterate over the symbol table once per display interval, and copy/sort/tally/decay only those symbols which are active. Before: top - 10:14:53 up 4:08, 17 users, load average: 1.17, 1.53, 1.49 Tasks: 273 total, 5 running, 268 sleeping, 0 stopped, 0 zombie Cpu(s): 6.9%us, 38.2%sy, 0.0%ni, 19.9%id, 0.0%wa, 0.0%hi, 35.0%si, 0.0%st PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P COMMAND 28504 root 20 0 1044 260 164 S 58 0.0 0:04.19 2 netserver 28499 root 20 0 1040 412 316 R 51 0.0 0:04.15 0 netperf 28500 root 20 0 1040 408 316 R 50 0.0 0:04.14 1 netperf 28503 root 20 0 1044 260 164 S 50 0.0 0:04.01 1 netserver 28501 root 20 0 1044 260 164 S 49 0.0 0:03.99 0 netserver 28502 root 20 0 1040 412 316 S 43 0.0 0:03.96 2 netperf 28468 root 20 0 1892m 325m 972 S 16 10.8 0:10.50 3 perf 28467 root 20 0 1892m 325m 972 R 2 10.8 0:00.72 3 perf After: top - 10:16:30 up 4:10, 17 users, load average: 2.27, 1.88, 1.62 Tasks: 273 total, 6 running, 267 sleeping, 0 stopped, 0 zombie Cpu(s): 2.5%us, 39.7%sy, 0.0%ni, 24.6%id, 0.0%wa, 0.0%hi, 33.3%si, 0.0%st PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ P COMMAND 28590 root 20 0 1040 412 316 S 54 0.0 0:07.85 2 netperf 28589 root 20 0 1044 260 164 R 54 0.0 0:07.84 0 netserver 28588 root 20 0 1040 412 316 R 50 0.0 0:07.89 1 netperf 28591 root 20 0 1044 256 164 S 50 0.0 0:07.82 1 netserver 28587 root 20 0 1040 408 316 R 47 0.0 0:07.61 0 netperf 28592 root 20 0 1044 260 164 R 47 0.0 0:07.85 2 netserver 28378 root 20 0 8732 1300 860 R 2 0.0 0:01.81 3 top 28577 root 20 0 1892m 165m 972 R 2 5.5 0:00.48 3 perf 28578 root 20 0 1892m 165m 972 S 2 5.5 0:00.04 3 perf [ Impact: optimization ] Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-top.c | 52 ++++++++++++------------ 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/Documentation/perf_counter/builtin-top.c b/Documentation/perf_counter/builtin-top.c index 74021ac90f5..4bed265926d 100644 --- a/Documentation/perf_counter/builtin-top.c +++ b/Documentation/perf_counter/builtin-top.c @@ -374,18 +374,26 @@ static struct sym_entry tmp[MAX_SYMS]; static void print_sym_table(void) { - int i, printed; + int i, j, active_count, printed; int counter; float events_per_sec = events/delay_secs; float kevents_per_sec = (events-userspace_events)/delay_secs; float sum_kevents = 0.0; events = userspace_events = 0; - memcpy(tmp, sym_table, sizeof(sym_table[0])*sym_table_count); - qsort(tmp, sym_table_count, sizeof(tmp[0]), compare); - for (i = 0; i < sym_table_count && tmp[i].count[0]; i++) - sum_kevents += tmp[i].count[0]; + /* Iterate over symbol table and copy/tally/decay active symbols. */ + for (i = 0, active_count = 0; i < sym_table_count; i++) { + if (sym_table[i].count[0]) { + tmp[active_count++] = sym_table[i]; + sum_kevents += sym_table[i].count[0]; + + for (j = 0; j < nr_counters; j++) + sym_table[i].count[j] = zero ? 0 : sym_table[i].count[j] * 7 / 8; + } + } + + qsort(tmp, active_count + 1, sizeof(tmp[0]), compare); write(1, CONSOLE_CLEAR, strlen(CONSOLE_CLEAR)); @@ -433,29 +441,23 @@ static void print_sym_table(void) " ______ ______ _____ ________________ _______________\n\n" ); - for (i = 0, printed = 0; i < sym_table_count; i++) { + for (i = 0, printed = 0; i < active_count; i++) { float pcnt; - int count; - if (printed <= 18 && tmp[i].count[0] >= count_filter) { - pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); + if (++printed > 18 || tmp[i].count[0] < count_filter) + break; - if (nr_counters == 1) - printf("%19.2f - %4.1f%% - %016llx : %s\n", - sym_weight(tmp + i), - pcnt, tmp[i].addr, tmp[i].sym); - else - printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", - sym_weight(tmp + i), - tmp[i].count[0], - pcnt, tmp[i].addr, tmp[i].sym); - printed++; - } - /* - * Add decay to the counts: - */ - for (count = 0; count < nr_counters; count++) - sym_table[i].count[count] = zero ? 0 : sym_table[i].count[count] * 7 / 8; + pcnt = 100.0 - (100.0*((sum_kevents-tmp[i].count[0])/sum_kevents)); + + if (nr_counters == 1) + printf("%19.2f - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + pcnt, tmp[i].addr, tmp[i].sym); + else + printf("%8.1f %10ld - %4.1f%% - %016llx : %s\n", + sym_weight(tmp + i), + tmp[i].count[0], + pcnt, tmp[i].addr, tmp[i].sym); } if (sym_filter_entry) From e4cbb4e3ac8b09fdb11e39e5a5611bfab0a7cd1a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 19 May 2009 15:50:30 +0200 Subject: [PATCH 1205/2061] perf_counter: Move child perfcounter init to after scheduler init Initialize a task's perfcounters (inherit from parent, etc.) after the child task's scheduler fields have been initialized already. [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/fork.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/fork.c b/kernel/fork.c index e72a09f5355..675e01e9072 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -984,7 +984,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, goto fork_out; rt_mutex_init_task(p); - perf_counter_init_task(p); #ifdef CONFIG_PROVE_LOCKING DEBUG_LOCKS_WARN_ON(!p->hardirqs_enabled); @@ -1096,6 +1095,7 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); + perf_counter_init_task(p); if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; From d3f4b3855ba87caff8f35e738c7e7e3bad0a6ab1 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 14:40:01 +0200 Subject: [PATCH 1206/2061] perf stat: flip around ':k' and ':u' flags This output: $ perf stat -e 0:1:k -e 0:1:u ./hello Performance counter stats for './hello': 140131 instructions (events) 1906968 instructions (events) Is quite confusing - as :k means "user instructions", :u means "kernel instructions". Flip them around - as the 'exclude' property is not intuitive in the flag naming. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur Cc: Mike Galbraith LKML-Reference: Signed-off-by: Ingo Molnar --- Documentation/perf_counter/builtin-stat.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/perf_counter/builtin-stat.c b/Documentation/perf_counter/builtin-stat.c index 8ae01d51f29..88c70be9903 100644 --- a/Documentation/perf_counter/builtin-stat.c +++ b/Documentation/perf_counter/builtin-stat.c @@ -266,9 +266,9 @@ static __u64 match_event_symbols(char *str) switch (sscanf(str, "%d:%llu:%2s", &type, &id, mask_str)) { case 3: - if (strchr(mask_str, 'u')) - event_mask[nr_counters] |= EVENT_MASK_USER; if (strchr(mask_str, 'k')) + event_mask[nr_counters] |= EVENT_MASK_USER; + if (strchr(mask_str, 'u')) event_mask[nr_counters] |= EVENT_MASK_KERNEL; case 2: return EID(type, id); From 266dfb0b58bc4181b6158ee63a0069abaa9f3a98 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:24 +0200 Subject: [PATCH 1207/2061] perf_counter: Fix perf-$cmd invokation Fix: $ perf-top fatal: cannot handle -top internally Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124559.995591577@chello.nl> Signed-off-by: Ingo Molnar --- Documentation/perf_counter/perf.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Documentation/perf_counter/perf.c b/Documentation/perf_counter/perf.c index 594d270be39..1d6d7aa575a 100644 --- a/Documentation/perf_counter/perf.c +++ b/Documentation/perf_counter/perf.c @@ -357,7 +357,7 @@ int main(int argc, const char **argv) * die if that one cannot handle it. */ if (!prefixcmp(cmd, "perf-")) { - cmd += 4; + cmd += 5; argv[0] = cmd; handle_internal_command(argc, argv); die("cannot handle %s internally", cmd); From e527ea312f31e88a7fa5472b71db71c565b0d44f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:25 +0200 Subject: [PATCH 1208/2061] perf_counter: Remove unused ABI bits extra_config_len isn't used for anything, remove it. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.116035832@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2ddf5e3c551..b1f2bac09f9 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -154,11 +154,11 @@ struct perf_counter_hw_event { __reserved_1 : 51; - __u32 extra_config_len; __u32 wakeup_events; /* wakeup every n events */ + __u32 __reserved_2; - __u64 __reserved_2; __u64 __reserved_3; + __u64 __reserved_4; }; /* From 771d7cde144d87f2d1fbee4da3c6234d61f7e42a Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:26 +0200 Subject: [PATCH 1209/2061] perf_counter: Make pctrl() affect inherited counters too Paul noted that the new ptcrl() didn't work on child counters. Reported-by: Paul Mackerras Signed-off-by: Peter Zijlstra Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.203151469@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 48 +++++++++++++++++++++---------------------- 1 file changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 6cdf8248eda..217dbcce2eb 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -1067,30 +1067,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } -int perf_counter_task_enable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_enable(counter); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - -int perf_counter_task_disable(void) -{ - struct perf_counter *counter; - - mutex_lock(¤t->perf_counter_mutex); - list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) - perf_counter_disable(counter); - mutex_unlock(¤t->perf_counter_mutex); - - return 0; -} - static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) @@ -1505,6 +1481,30 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return 0; } +int perf_counter_task_enable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_enable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + +int perf_counter_task_disable(void) +{ + struct perf_counter *counter; + + mutex_lock(¤t->perf_counter_mutex); + list_for_each_entry(counter, ¤t->perf_counter_list, owner_entry) + perf_counter_for_each_child(counter, perf_counter_disable); + mutex_unlock(¤t->perf_counter_mutex); + + return 0; +} + /* * Callers need to ensure there can be no nesting of this function, otherwise * the seqlock logic goes bad. We can not serialize this because the arch From 6ab423e0eaca827fbd201ca4ae7d4f8573a366b2 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:27 +0200 Subject: [PATCH 1210/2061] perf_counter: Propagate inheritance failures down the fork() path Fail fork() when we fail inheritance for some reason (-ENOMEM most likely). Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.324656474@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 4 ++-- kernel/fork.c | 6 +++++- kernel/perf_counter.c | 20 ++++++++++++-------- 3 files changed, 19 insertions(+), 11 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index b1f2bac09f9..d3e85de9bf1 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -566,7 +566,7 @@ extern void perf_counter_task_sched_in(struct task_struct *task, int cpu); extern void perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu); extern void perf_counter_task_tick(struct task_struct *task, int cpu); -extern void perf_counter_init_task(struct task_struct *child); +extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); @@ -631,7 +631,7 @@ perf_counter_task_sched_out(struct task_struct *task, struct task_struct *next, int cpu) { } static inline void perf_counter_task_tick(struct task_struct *task, int cpu) { } -static inline void perf_counter_init_task(struct task_struct *child) { } +static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } diff --git a/kernel/fork.c b/kernel/fork.c index 675e01e9072..c07c3335cea 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1095,7 +1095,10 @@ static struct task_struct *copy_process(unsigned long clone_flags, /* Perform scheduler related setup. Assign this task to a CPU. */ sched_fork(p, clone_flags); - perf_counter_init_task(p); + + retval = perf_counter_init_task(p); + if (retval) + goto bad_fork_cleanup_policy; if ((retval = audit_alloc(p))) goto bad_fork_cleanup_policy; @@ -1295,6 +1298,7 @@ bad_fork_cleanup_semundo: bad_fork_cleanup_audit: audit_free(p); bad_fork_cleanup_policy: + perf_counter_exit_task(p); #ifdef CONFIG_NUMA mpol_put(p->mempolicy); bad_fork_cleanup_cgroup: diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 217dbcce2eb..7a7a144870e 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3434,18 +3434,23 @@ again: /* * Initialize the perf_counter context in task_struct */ -void perf_counter_init_task(struct task_struct *child) +int perf_counter_init_task(struct task_struct *child) { struct perf_counter_context *child_ctx, *parent_ctx; struct perf_counter *counter; struct task_struct *parent = current; int inherited_all = 1; + int ret = 0; child->perf_counter_ctxp = NULL; mutex_init(&child->perf_counter_mutex); INIT_LIST_HEAD(&child->perf_counter_list); + parent_ctx = parent->perf_counter_ctxp; + if (likely(!parent_ctx || !parent_ctx->nr_counters)) + return 0; + /* * This is executed from the parent task context, so inherit * counters that have been marked for cloning. @@ -3454,11 +3459,7 @@ void perf_counter_init_task(struct task_struct *child) child_ctx = kmalloc(sizeof(struct perf_counter_context), GFP_KERNEL); if (!child_ctx) - return; - - parent_ctx = parent->perf_counter_ctxp; - if (likely(!parent_ctx || !parent_ctx->nr_counters)) - return; + return -ENOMEM; __perf_counter_init_context(child_ctx, child); child->perf_counter_ctxp = child_ctx; @@ -3482,8 +3483,9 @@ void perf_counter_init_task(struct task_struct *child) continue; } - if (inherit_group(counter, parent, - parent_ctx, child, child_ctx)) { + ret = inherit_group(counter, parent, parent_ctx, + child, child_ctx); + if (ret) { inherited_all = 0; break; } @@ -3505,6 +3507,8 @@ void perf_counter_init_task(struct task_struct *child) } mutex_unlock(&parent_ctx->mutex); + + return ret; } static void __cpuinit perf_counter_init_cpu(int cpu) From 10989fb2451763fae6f42d85fa6106c8fd010cf5 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 14:45:28 +0200 Subject: [PATCH 1211/2061] perf_counter: Fix PERF_COUNTER_CONTEXT_SWITCHES for cpu counters Ingo noticed that cpu counters had 0 context switches, even though there was plenty scheduling on the cpu. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525124600.419025548@chello.nl> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 7a7a144870e..14b1fe98483 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -924,14 +924,13 @@ void perf_counter_task_sched_out(struct task_struct *task, struct perf_counter_context *next_ctx; struct pt_regs *regs; + regs = task_pt_regs(task); + perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); + if (likely(!ctx || !cpuctx->task_ctx)) return; update_context_time(ctx); - - regs = task_pt_regs(task); - perf_swcounter_event(PERF_COUNT_CONTEXT_SWITCHES, 1, 1, regs, 0); - next_ctx = next->perf_counter_ctxp; if (next_ctx && context_equiv(ctx, next_ctx)) { task->perf_counter_ctxp = next_ctx; From 759d427aa5a9d88a81afd11817cdeb40aea85234 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:51:00 -0400 Subject: [PATCH 1212/2061] ext4: remove unused function __ext4_write_dirty_metadata The __ext4_write_dirty_metadata() function was introduced by commit 0390131b, "ext4: Allow ext4 to run without a journal", but nothing ever used the function, either then or since. So let's remove it and save a bit of space. Cc: Frank Mayhar Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 19 ------------------- 1 file changed, 19 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index dadd3f995db..14c00fff371 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -4790,25 +4790,6 @@ int ext4_write_inode(struct inode *inode, int wait) return ext4_force_commit(inode->i_sb); } -int __ext4_write_dirty_metadata(struct inode *inode, struct buffer_head *bh) -{ - int err = 0; - - mark_buffer_dirty(bh); - if (inode && inode_needs_sync(inode)) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - ext4_error(inode->i_sb, __func__, - "IO error syncing inode, " - "inode=%lu, block=%llu", - inode->i_ino, - (unsigned long long)bh->b_blocknr); - err = -EIO; - } - } - return err; -} - /* * ext4_setattr() * From 88b6edd17c62b7d346d21f4087893ce7d4ef828a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 25 May 2009 11:50:39 -0400 Subject: [PATCH 1213/2061] ext4: Clean up calls to ext4_get_group_desc() If the caller isn't planning on modifying the block group descriptors, there's no need to pass in a pointer to a struct buffer_head. Nuking this saves a tiny amount of CPU time and stack space usage. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 3 +-- fs/ext4/super.c | 3 +-- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 82f7d1d7eae..3743bd849bc 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -347,7 +347,6 @@ static int find_group_flex(struct super_block *sb, struct inode *parent, { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *desc; - struct buffer_head *bh; struct flex_groups *flex_group = sbi->s_flex_groups; ext4_group_t parent_group = EXT4_I(parent)->i_block_group; ext4_group_t parent_fbg_group = ext4_flex_group(sbi, parent_group); @@ -402,7 +401,7 @@ find_close_to_parent: found_flexbg: for (i = best_flex * flex_size; i < ngroups && i < (best_flex + 1) * flex_size; i++) { - desc = ext4_get_group_desc(sb, i, &bh); + desc = ext4_get_group_desc(sb, i, NULL); if (ext4_free_inodes_count(sb, desc)) { *best_group = i; goto out; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index eca6c057b11..91b98b58ccb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1660,7 +1660,6 @@ static int ext4_fill_flex_info(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_group_desc *gdp = NULL; - struct buffer_head *bh; ext4_group_t flex_group_count; ext4_group_t flex_group; int groups_per_flex = 0; @@ -1693,7 +1692,7 @@ static int ext4_fill_flex_info(struct super_block *sb) } for (i = 0; i < sbi->s_groups_count; i++) { - gdp = ext4_get_group_desc(sb, i, &bh); + gdp = ext4_get_group_desc(sb, i, NULL); flex_group = ext4_flex_group(sbi, i); atomic_set(&sbi->s_flex_groups[flex_group].free_inodes, From ff99be573e02e9f7edc23b472c7f9a5ddba12795 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:03 +0200 Subject: [PATCH 1214/2061] perf_counter: x86: Expose INV and EDGE bits Expose the INV and EDGE bits of the PMU to raw configs. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.494709027@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 6cc1660db8d..c14437faf5d 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -87,11 +87,15 @@ static u64 intel_pmu_raw_event(u64 event) { #define CORE_EVNTSEL_EVENT_MASK 0x000000FFULL #define CORE_EVNTSEL_UNIT_MASK 0x0000FF00ULL +#define CORE_EVNTSEL_EDGE_MASK 0x00040000ULL +#define CORE_EVNTSEL_INV_MASK 0x00800000ULL #define CORE_EVNTSEL_COUNTER_MASK 0xFF000000ULL #define CORE_EVNTSEL_MASK \ (CORE_EVNTSEL_EVENT_MASK | \ CORE_EVNTSEL_UNIT_MASK | \ + CORE_EVNTSEL_EDGE_MASK | \ + CORE_EVNTSEL_INV_MASK | \ CORE_EVNTSEL_COUNTER_MASK) return event & CORE_EVNTSEL_MASK; @@ -119,11 +123,15 @@ static u64 amd_pmu_raw_event(u64 event) { #define K7_EVNTSEL_EVENT_MASK 0x7000000FFULL #define K7_EVNTSEL_UNIT_MASK 0x00000FF00ULL +#define K7_EVNTSEL_EDGE_MASK 0x000040000ULL +#define K7_EVNTSEL_INV_MASK 0x000800000ULL #define K7_EVNTSEL_COUNTER_MASK 0x0FF000000ULL #define K7_EVNTSEL_MASK \ (K7_EVNTSEL_EVENT_MASK | \ K7_EVNTSEL_UNIT_MASK | \ + K7_EVNTSEL_EDGE_MASK | \ + K7_EVNTSEL_INV_MASK | \ K7_EVNTSEL_COUNTER_MASK) return event & K7_EVNTSEL_MASK; From 48e22d56ecdeddd1ffb42a02fccba5c6ef42b133 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:04 +0200 Subject: [PATCH 1215/2061] perf_counter: x86: Remove interrupt throttle remove the x86 specific interrupt throttle Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.616671838@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/apic/apic.c | 2 -- arch/x86/kernel/cpu/perf_counter.c | 47 ++++-------------------------- include/linux/perf_counter.h | 2 -- 3 files changed, 5 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index b4f64402a82..89b63b5fad3 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -763,8 +763,6 @@ static void local_apic_timer_interrupt(void) inc_irq_stat(apic_timer_irqs); evt->event_handler(evt); - - perf_counter_unthrottle(); } /* diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c14437faf5d..8c8177f859f 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -718,11 +718,6 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } -/* - * Maximum interrupt frequency of 100KHz per CPU - */ -#define PERFMON_MAX_INTERRUPTS (100000/HZ) - /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -775,15 +770,14 @@ again: if (status) goto again; - if (++cpuc->interrupts != PERFMON_MAX_INTERRUPTS) - perf_enable(); + perf_enable(); return 1; } static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) { - int cpu, idx, throttle = 0, handled = 0; + int cpu, idx, handled = 0; struct cpu_hw_counters *cpuc; struct perf_counter *counter; struct hw_perf_counter *hwc; @@ -792,16 +786,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) cpu = smp_processor_id(); cpuc = &per_cpu(cpu_hw_counters, cpu); - if (++cpuc->interrupts == PERFMON_MAX_INTERRUPTS) { - throttle = 1; - __perf_disable(); - cpuc->enabled = 0; - barrier(); - } - for (idx = 0; idx < x86_pmu.num_counters; idx++) { - int disable = 0; - if (!test_bit(idx, cpuc->active_mask)) continue; @@ -809,45 +794,23 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) hwc = &counter->hw; if (counter->hw_event.nmi != nmi) - goto next; + continue; val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) - goto next; + continue; /* counter overflow */ x86_perf_counter_set_period(counter, hwc, idx); handled = 1; inc_irq_stat(apic_perf_irqs); - disable = perf_counter_overflow(counter, nmi, regs, 0); - -next: - if (disable || throttle) + if (perf_counter_overflow(counter, nmi, regs, 0)) amd_pmu_disable_counter(hwc, idx); } return handled; } -void perf_counter_unthrottle(void) -{ - struct cpu_hw_counters *cpuc; - - if (!x86_pmu_initialized()) - return; - - cpuc = &__get_cpu_var(cpu_hw_counters); - if (cpuc->interrupts >= PERFMON_MAX_INTERRUPTS) { - /* - * Clear them before re-enabling irqs/NMIs again: - */ - cpuc->interrupts = 0; - perf_enable(); - } else { - cpuc->interrupts = 0; - } -} - void smp_perf_counter_interrupt(struct pt_regs *regs) { irq_enter(); diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index d3e85de9bf1..0c160be2078 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -570,7 +570,6 @@ extern int perf_counter_init_task(struct task_struct *child); extern void perf_counter_exit_task(struct task_struct *child); extern void perf_counter_do_pending(void); extern void perf_counter_print_debug(void); -extern void perf_counter_unthrottle(void); extern void __perf_disable(void); extern bool __perf_enable(void); extern void perf_disable(void); @@ -635,7 +634,6 @@ static inline int perf_counter_init_task(struct task_struct *child) { } static inline void perf_counter_exit_task(struct task_struct *child) { } static inline void perf_counter_do_pending(void) { } static inline void perf_counter_print_debug(void) { } -static inline void perf_counter_unthrottle(void) { } static inline void perf_disable(void) { } static inline void perf_enable(void) { } static inline int perf_counter_task_disable(void) { return -EINVAL; } From a78ac3258782f3e64cb40beb5990808e1febcc0c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 25 May 2009 17:39:05 +0200 Subject: [PATCH 1216/2061] perf_counter: Generic per counter interrupt throttle Introduce a generic per counter interrupt throttle. This uses the perf_counter_overflow() quick disable to throttle a specific counter when its going too fast when a pmu->unthrottle() method is provided which can undo the quick disable. Power needs to implement both the quick disable and the unthrottle method. Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 13 +++++++ include/linux/perf_counter.h | 11 ++++++ kernel/perf_counter.c | 59 ++++++++++++++++++++++++++++-- kernel/sysctl.c | 8 ++++ 4 files changed, 87 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 8c8177f859f..c4b543d1a86 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -623,6 +623,18 @@ try_generic: return 0; } +static void x86_pmu_unthrottle(struct perf_counter *counter) +{ + struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters); + struct hw_perf_counter *hwc = &counter->hw; + + if (WARN_ON_ONCE(hwc->idx >= X86_PMC_IDX_MAX || + cpuc->counters[hwc->idx] != counter)) + return; + + x86_pmu.enable(hwc, hwc->idx); +} + void perf_counter_print_debug(void) { u64 ctrl, status, overflow, pmc_ctrl, pmc_count, prev_left, fixed; @@ -1038,6 +1050,7 @@ static const struct pmu pmu = { .enable = x86_pmu_enable, .disable = x86_pmu_disable, .read = x86_pmu_read, + .unthrottle = x86_pmu_unthrottle, }; const struct pmu *hw_perf_counter_init(struct perf_counter *counter) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 0c160be2078..e3a7585d3e4 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -266,6 +266,15 @@ enum perf_event_type { */ PERF_EVENT_PERIOD = 4, + /* + * struct { + * struct perf_event_header header; + * u64 time; + * }; + */ + PERF_EVENT_THROTTLE = 5, + PERF_EVENT_UNTHROTTLE = 6, + /* * When header.misc & PERF_EVENT_MISC_OVERFLOW the event_type field * will be PERF_RECORD_* @@ -367,6 +376,7 @@ struct pmu { int (*enable) (struct perf_counter *counter); void (*disable) (struct perf_counter *counter); void (*read) (struct perf_counter *counter); + void (*unthrottle) (struct perf_counter *counter); }; /** @@ -613,6 +623,7 @@ extern struct perf_callchain_entry *perf_callchain(struct pt_regs *regs); extern int sysctl_perf_counter_priv; extern int sysctl_perf_counter_mlock; +extern int sysctl_perf_counter_limit; extern void perf_counter_init(void); diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 14b1fe98483..ec9c4007a7f 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -46,6 +46,7 @@ static atomic_t nr_comm_tracking __read_mostly; int sysctl_perf_counter_priv __read_mostly; /* do we need to be privileged */ int sysctl_perf_counter_mlock __read_mostly = 512; /* 'free' kb per user */ +int sysctl_perf_counter_limit __read_mostly = 100000; /* max NMIs per second */ /* * Lock for (sysadmin-configurable) counter reservations: @@ -1066,12 +1067,15 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu) __perf_counter_sched_in(ctx, cpuctx, cpu); } +#define MAX_INTERRUPTS (~0ULL) + +static void perf_log_throttle(struct perf_counter *counter, int enable); static void perf_log_period(struct perf_counter *counter, u64 period); static void perf_adjust_freq(struct perf_counter_context *ctx) { struct perf_counter *counter; - u64 irq_period; + u64 interrupts, irq_period; u64 events, period; s64 delta; @@ -1080,10 +1084,19 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) if (counter->state != PERF_COUNTER_STATE_ACTIVE) continue; + interrupts = counter->hw.interrupts; + counter->hw.interrupts = 0; + + if (interrupts == MAX_INTERRUPTS) { + perf_log_throttle(counter, 1); + counter->pmu->unthrottle(counter); + interrupts = 2*sysctl_perf_counter_limit/HZ; + } + if (!counter->hw_event.freq || !counter->hw_event.irq_freq) continue; - events = HZ * counter->hw.interrupts * counter->hw.irq_period; + events = HZ * interrupts * counter->hw.irq_period; period = div64_u64(events, counter->hw_event.irq_freq); delta = (s64)(1 + period - counter->hw.irq_period); @@ -1097,7 +1110,6 @@ static void perf_adjust_freq(struct perf_counter_context *ctx) perf_log_period(counter, irq_period); counter->hw.irq_period = irq_period; - counter->hw.interrupts = 0; } spin_unlock(&ctx->lock); } @@ -2543,6 +2555,35 @@ static void perf_log_period(struct perf_counter *counter, u64 period) perf_output_end(&handle); } +/* + * IRQ throttle logging + */ + +static void perf_log_throttle(struct perf_counter *counter, int enable) +{ + struct perf_output_handle handle; + int ret; + + struct { + struct perf_event_header header; + u64 time; + } throttle_event = { + .header = { + .type = PERF_EVENT_THROTTLE + 1, + .misc = 0, + .size = sizeof(throttle_event), + }, + .time = sched_clock(), + }; + + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + if (ret) + return; + + perf_output_put(&handle, throttle_event); + perf_output_end(&handle); +} + /* * Generic counter overflow handling. */ @@ -2551,9 +2592,19 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, struct pt_regs *regs, u64 addr) { int events = atomic_read(&counter->event_limit); + int throttle = counter->pmu->unthrottle != NULL; int ret = 0; - counter->hw.interrupts++; + if (!throttle) { + counter->hw.interrupts++; + } else if (counter->hw.interrupts != MAX_INTERRUPTS) { + counter->hw.interrupts++; + if (HZ*counter->hw.interrupts > (u64)sysctl_perf_counter_limit) { + counter->hw.interrupts = MAX_INTERRUPTS; + perf_log_throttle(counter, 0); + ret = 1; + } + } /* * XXX event_limit might not quite work as expected on inherited diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 3cb1849f598..0c4bf863afa 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -930,6 +930,14 @@ static struct ctl_table kern_table[] = { .mode = 0644, .proc_handler = &proc_dointvec, }, + { + .ctl_name = CTL_UNNUMBERED, + .procname = "perf_counter_int_limit", + .data = &sysctl_perf_counter_limit, + .maxlen = sizeof(sysctl_perf_counter_limit), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #endif /* * NOTE: do not add new entries to this table unless you have read From 53b441a565bf4036ab49c8ea04c5ad06ace7dd6b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 21:41:28 +0200 Subject: [PATCH 1217/2061] Revert "perf_counter, x86: speed up the scheduling fast-path" This reverts commit b68f1d2e7aa21029d73c7d453a8046e95d351740. It is causing problems (stuck/stuttering profiling) - when mixed NMI and non-NMI counters are used. Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index c4b543d1a86..189bf9d7cda 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -293,7 +293,6 @@ static int __hw_perf_counter_init(struct perf_counter *counter) return -EACCES; hwc->nmi = 1; } - perf_counters_lapic_init(hwc->nmi); if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -612,6 +611,8 @@ try_generic: hwc->counter_base = x86_pmu.perfctr; } + perf_counters_lapic_init(hwc->nmi); + x86_pmu.disable(hwc, idx); cpuc->counters[idx] = counter; @@ -1037,7 +1038,7 @@ void __init init_hw_perf_counters(void) pr_info("... counter mask: %016Lx\n", perf_counter_mask); - perf_counters_lapic_init(1); + perf_counters_lapic_init(0); register_die_notifier(&perf_counter_nmi_notifier); } From 0127c3ea082ee9f1034789b978dfc7fd83254617 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Mon, 25 May 2009 22:03:26 +0200 Subject: [PATCH 1218/2061] perf_counter: fix warning & lockup - remove bogus warning - fix wakeup from NMI path lockup - also fix up whitespace noise in perf_counter.h Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Corey Ashford Cc: Arnaldo Carvalho de Melo Cc: John Kacur LKML-Reference: <20090525153931.703093461@chello.nl> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 78 ++++++++++++++++++------------------ kernel/perf_counter.c | 4 +- 2 files changed, 40 insertions(+), 42 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index e3a7585d3e4..2b16ed37b74 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -73,7 +73,7 @@ enum sw_event_ids { PERF_SW_EVENTS_MAX = 7, }; -#define __PERF_COUNTER_MASK(name) \ +#define __PERF_COUNTER_MASK(name) \ (((1ULL << PERF_COUNTER_##name##_BITS) - 1) << \ PERF_COUNTER_##name##_SHIFT) @@ -98,14 +98,14 @@ enum sw_event_ids { * in the overflow packets. */ enum perf_counter_record_format { - PERF_RECORD_IP = 1U << 0, - PERF_RECORD_TID = 1U << 1, - PERF_RECORD_TIME = 1U << 2, - PERF_RECORD_ADDR = 1U << 3, - PERF_RECORD_GROUP = 1U << 4, - PERF_RECORD_CALLCHAIN = 1U << 5, - PERF_RECORD_CONFIG = 1U << 6, - PERF_RECORD_CPU = 1U << 7, + PERF_RECORD_IP = 1U << 0, + PERF_RECORD_TID = 1U << 1, + PERF_RECORD_TIME = 1U << 2, + PERF_RECORD_ADDR = 1U << 3, + PERF_RECORD_GROUP = 1U << 4, + PERF_RECORD_CALLCHAIN = 1U << 5, + PERF_RECORD_CONFIG = 1U << 6, + PERF_RECORD_CPU = 1U << 7, }; /* @@ -235,13 +235,13 @@ enum perf_event_type { * correlate userspace IPs to code. They have the following structure: * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * u64 addr; - * u64 len; - * u64 pgoff; - * char filename[]; + * u32 pid, tid; + * u64 addr; + * u64 len; + * u64 pgoff; + * char filename[]; * }; */ PERF_EVENT_MMAP = 1, @@ -249,27 +249,27 @@ enum perf_event_type { /* * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * u32 pid, tid; - * char comm[]; + * u32 pid, tid; + * char comm[]; * }; */ PERF_EVENT_COMM = 3, /* * struct { - * struct perf_event_header header; - * u64 time; - * u64 irq_period; + * struct perf_event_header header; + * u64 time; + * u64 irq_period; * }; */ PERF_EVENT_PERIOD = 4, /* * struct { - * struct perf_event_header header; - * u64 time; + * struct perf_event_header header; + * u64 time; * }; */ PERF_EVENT_THROTTLE = 5, @@ -280,23 +280,23 @@ enum perf_event_type { * will be PERF_RECORD_* * * struct { - * struct perf_event_header header; + * struct perf_event_header header; * - * { u64 ip; } && PERF_RECORD_IP - * { u32 pid, tid; } && PERF_RECORD_TID - * { u64 time; } && PERF_RECORD_TIME - * { u64 addr; } && PERF_RECORD_ADDR - * { u64 config; } && PERF_RECORD_CONFIG - * { u32 cpu, res; } && PERF_RECORD_CPU + * { u64 ip; } && PERF_RECORD_IP + * { u32 pid, tid; } && PERF_RECORD_TID + * { u64 time; } && PERF_RECORD_TIME + * { u64 addr; } && PERF_RECORD_ADDR + * { u64 config; } && PERF_RECORD_CONFIG + * { u32 cpu, res; } && PERF_RECORD_CPU * - * { u64 nr; - * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP + * { u64 nr; + * { u64 event, val; } cnt[nr]; } && PERF_RECORD_GROUP * - * { u16 nr, - * hv, - * kernel, - * user; - * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN + * { u16 nr, + * hv, + * kernel, + * user; + * u64 ips[nr]; } && PERF_RECORD_CALLCHAIN * }; */ }; @@ -406,7 +406,7 @@ struct perf_mmap_data { atomic_t wakeup; /* needs a wakeup */ struct perf_counter_mmap_page *user_page; - void *data_pages[0]; + void *data_pages[0]; }; struct perf_pending_entry { @@ -422,7 +422,7 @@ struct perf_counter { struct list_head list_entry; struct list_head event_entry; struct list_head sibling_list; - int nr_siblings; + int nr_siblings; struct perf_counter *group_leader; const struct pmu *pmu; diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index ec9c4007a7f..070f92d3232 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2576,7 +2576,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable) .time = sched_clock(), }; - ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 0, 0); + ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0); if (ret) return; @@ -3449,8 +3449,6 @@ void perf_counter_exit_task(struct task_struct *child) struct perf_counter_context *child_ctx; unsigned long flags; - WARN_ON_ONCE(child != current); - child_ctx = child->perf_counter_ctxp; if (likely(!child_ctx)) From 4f5359685af6de7dca101393dc606620adbe963f Mon Sep 17 00:00:00 2001 From: Lai Jiangshan Date: Mon, 18 May 2009 19:35:34 +0800 Subject: [PATCH 1219/2061] tracing: add trace_event_read_lock() I found that there is nothing to protect event_hash in ftrace_find_event(). Rcu protects the event hashlist but not the event itself while we use it after its extraction through ftrace_find_event(). This lack of a proper locking in this spot opens a race window between any event dereferencing and module removal. Eg: --Task A-- print_trace_line(trace) { event = find_ftrace_event(trace) --Task B-- trace_module_remove_events(mod) { list_trace_events_module(ev, mod) { unregister_ftrace_event(ev->event) { hlist_del(ev->event->node) list_del(....) } } } |--> module removed, the event has been dropped --Task A-- event->print(trace); // Dereferencing freed memory If the event retrieved belongs to a module and this module is concurrently removed, we may end up dereferencing a data from a freed module. RCU could solve this, but it would add latency to the kernel and forbid tracers output callbacks to call any sleepable code. So this fix converts 'trace_event_mutex' to a read/write semaphore, and adds trace_event_read_lock() to protect ftrace_find_event(). [ Impact: fix possible freed memory dereference in ftrace ] Signed-off-by: Lai Jiangshan Acked-by: Steven Rostedt LKML-Reference: <4A114806.7090302@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace.c | 8 ++++++++ kernel/trace/trace_output.c | 25 ++++++++++++++++++------- kernel/trace/trace_output.h | 2 ++ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index dd40d232034..02d32baa23a 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -1569,12 +1569,14 @@ static void *s_start(struct seq_file *m, loff_t *pos) p = s_next(m, p, &l); } + trace_event_read_lock(); return p; } static void s_stop(struct seq_file *m, void *p) { atomic_dec(&trace_record_cmdline_disabled); + trace_event_read_unlock(); } static void print_lat_help_header(struct seq_file *m) @@ -1817,6 +1819,7 @@ static int trace_empty(struct trace_iterator *iter) return 1; } +/* Called with trace_event_read_lock() held. */ static enum print_line_t print_trace_line(struct trace_iterator *iter) { enum print_line_t ret; @@ -3008,6 +3011,7 @@ waitagain: offsetof(struct trace_iterator, seq)); iter->pos = -1; + trace_event_read_lock(); while (find_next_entry_inc(iter) != NULL) { enum print_line_t ret; int len = iter->seq.len; @@ -3024,6 +3028,7 @@ waitagain: if (iter->seq.len >= cnt) break; } + trace_event_read_unlock(); /* Now copy what we have to the user */ sret = trace_seq_to_user(&iter->seq, ubuf, cnt); @@ -3146,6 +3151,8 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, goto out_err; } + trace_event_read_lock(); + /* Fill as many pages as possible. */ for (i = 0, rem = len; i < PIPE_BUFFERS && rem; i++) { pages[i] = alloc_page(GFP_KERNEL); @@ -3168,6 +3175,7 @@ static ssize_t tracing_splice_read_pipe(struct file *filp, trace_seq_init(&iter->seq); } + trace_event_read_unlock(); mutex_unlock(&iter->mutex); spd.nr_pages = i; diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 489c0e8ada0..7136420603a 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -14,7 +14,7 @@ /* must be a power of 2 */ #define EVENT_HASHSIZE 128 -static DEFINE_MUTEX(trace_event_mutex); +static DECLARE_RWSEM(trace_event_mutex); static struct hlist_head event_hash[EVENT_HASHSIZE] __read_mostly; static int next_event_type = __TRACE_LAST_TYPE + 1; @@ -466,6 +466,7 @@ static int task_state_char(unsigned long state) * @type: the type of event to look for * * Returns an event of type @type otherwise NULL + * Called with trace_event_read_lock() held. */ struct trace_event *ftrace_find_event(int type) { @@ -475,7 +476,7 @@ struct trace_event *ftrace_find_event(int type) key = type & (EVENT_HASHSIZE - 1); - hlist_for_each_entry_rcu(event, n, &event_hash[key], node) { + hlist_for_each_entry(event, n, &event_hash[key], node) { if (event->type == type) return event; } @@ -513,6 +514,16 @@ static int trace_search_list(struct list_head **list) return last + 1; } +void trace_event_read_lock(void) +{ + down_read(&trace_event_mutex); +} + +void trace_event_read_unlock(void) +{ + up_read(&trace_event_mutex); +} + /** * register_ftrace_event - register output for an event type * @event: the event type to register @@ -533,7 +544,7 @@ int register_ftrace_event(struct trace_event *event) unsigned key; int ret = 0; - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); if (WARN_ON(!event)) goto out; @@ -581,11 +592,11 @@ int register_ftrace_event(struct trace_event *event) key = event->type & (EVENT_HASHSIZE - 1); - hlist_add_head_rcu(&event->node, &event_hash[key]); + hlist_add_head(&event->node, &event_hash[key]); ret = event->type; out: - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return ret; } @@ -597,10 +608,10 @@ EXPORT_SYMBOL_GPL(register_ftrace_event); */ int unregister_ftrace_event(struct trace_event *event) { - mutex_lock(&trace_event_mutex); + down_write(&trace_event_mutex); hlist_del(&event->node); list_del(&event->list); - mutex_unlock(&trace_event_mutex); + up_write(&trace_event_mutex); return 0; } diff --git a/kernel/trace/trace_output.h b/kernel/trace/trace_output.h index 6e220a8e570..ac240e76eb0 100644 --- a/kernel/trace/trace_output.h +++ b/kernel/trace/trace_output.h @@ -20,6 +20,8 @@ extern int seq_print_user_ip(struct trace_seq *s, struct mm_struct *mm, extern int trace_print_context(struct trace_iterator *iter); extern int trace_print_lat_context(struct trace_iterator *iter); +extern void trace_event_read_lock(void); +extern void trace_event_read_unlock(void); extern struct trace_event *ftrace_find_event(int type); extern enum print_line_t trace_nop_print(struct trace_iterator *iter, From b0aae68cc5508f3c2fbf728988c954db4c8b8a53 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 21 May 2009 13:59:18 +0800 Subject: [PATCH 1220/2061] tracing/events: change the type of __str_loc_item to unsigned short When defining a dynamic size string, we add __str_loc_##item to the trace entry, and it stores the location of the actual string in entry->_str_data[] 'unsigned short' should be sufficient to store this information, thus we save 2 bytes per dyn-size string in the ring buffer. [ Impact: reduce memory occupied by dyn-size strings in ring buffer ] Signed-off-by: Li Zefan Cc: Steven Rostedt LKML-Reference: <4A14EDB6.2050507@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- include/trace/ftrace.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index edb02bc9f8f..b5ff2e8229e 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -25,7 +25,7 @@ #define __field(type, item) type item; #undef __string -#define __string(item, src) int __str_loc_##item; +#define __string(item, src) unsigned short __str_loc_##item; #undef TP_STRUCT__entry #define TP_STRUCT__entry(args...) args From 29fcefba8a2f0fea11e2b721fe174a1832801284 Mon Sep 17 00:00:00 2001 From: Pekka Enberg Date: Sun, 24 May 2009 11:13:17 +0300 Subject: [PATCH 1221/2061] kmemtrace: fix kernel parameter documentation The kmemtrace.enable kernel parameter no longer works. To enable kmemtrace at boot-time, you must pass "ftrace=kmemtrace" instead. [ Impact: remove obsolete kernel parameter documentation ] Cc: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg LKML-Reference: Signed-off-by: Frederic Weisbecker --- Documentation/kernel-parameters.txt | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index e87bdbfbcc7..9243dd84f4d 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -56,7 +56,6 @@ parameter is applicable: ISAPNP ISA PnP code is enabled. ISDN Appropriate ISDN support is enabled. JOY Appropriate joystick support is enabled. - KMEMTRACE kmemtrace is enabled. LIBATA Libata driver is enabled LP Printer support is enabled. LOOP Loopback device support is enabled. @@ -1054,15 +1053,6 @@ and is between 256 and 4096 characters. It is defined in the file use the HighMem zone if it exists, and the Normal zone if it does not. - kmemtrace.enable= [KNL,KMEMTRACE] Format: { yes | no } - Controls whether kmemtrace is enabled - at boot-time. - - kmemtrace.subbufs=n [KNL,KMEMTRACE] Overrides the number of - subbufs kmemtrace's relay channel has. Set this - higher than default (KMEMTRACE_N_SUBBUFS in code) if - you experience buffer overruns. - kgdboc= [HW] kgdb over consoles. Requires a tty driver that supports console polling. (only serial suported for now) From b11c53e12f94a46b50bccc7a1a953d7ca1d54a31 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:11:59 +0800 Subject: [PATCH 1222/2061] ftrace: Add task_comm support for trace_event If we enable a trace event alone without any tracer running (such as function tracer, sched switch tracer, etc...) it can't output enough task command information. We need to use the tracing_{start/stop}_cmdline_record() helpers which are designed to keep track of cmdlines for any tasks that were scheduled during the tracing. Before this patch: # echo 1 > debugfs/tracing/events/sched/sched_switch/enable # cat debugfs/tracing/trace # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | <...>-2289 [000] 526276.724790: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.725231: sched_switch: task sshd:2287 [120] ==> bash:2289 [120] <...>-2289 [000] 526276.725452: sched_switch: task bash:2289 [120] ==> sshd:2287 [120] <...>-2287 [000] 526276.727181: sched_switch: task sshd:2287 [120] ==> swapper:0 [140] -0 [000] 526277.032734: sched_switch: task swapper:0 [140] ==> events/0:5 [115] <...>-5 [000] 526277.032782: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... After this patch: # tracer: nop # # TASK-PID CPU# TIMESTAMP FUNCTION # | | | | | bash-2269 [000] 527347.989229: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.990960: sched_switch: task sshd:2267 [120] ==> bash:2269 [120] bash-2269 [000] 527347.991143: sched_switch: task bash:2269 [120] ==> sshd:2267 [120] sshd-2267 [000] 527347.992959: sched_switch: task sshd:2267 [120] ==> swapper:0 [140] -0 [000] 527348.531989: sched_switch: task swapper:0 [140] ==> events/0:5 [115] events/0-5 [000] 527348.532115: sched_switch: task events/0:5 [115] ==> swapper:0 [140] ... Changelog: v1->v2: Update Kconfig to select CONTEXT_SWITCH_TRACER in ENABLE_EVENT_TRACING v2->v3: v2 can solve problem that was caused by config EVENT_TRACING alone, but when CONFIG_FTRACE is off and CONFIG_TRACING is selected by other config, compile fail happened again. This version solves it. [ Impact: fix incomplete output of event tracing ] Signed-off-by: Zhao Lei Cc: Tom Zanussi Cc: Steven Rostedt LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/Kconfig | 9 +++++++-- kernel/trace/trace_events.c | 6 ++++++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index f61be301578..a508b9d2adb 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -49,6 +49,11 @@ config FTRACE_NMI_ENTER default y config EVENT_TRACING + select CONTEXT_SWITCH_TRACER + bool + +config CONTEXT_SWITCH_TRACER + select MARKERS bool config TRACING @@ -176,10 +181,10 @@ config SCHED_TRACER This tracer tracks the latency of the highest priority task to be scheduled in, starting from the point it has woken up. -config CONTEXT_SWITCH_TRACER +config ENABLE_CONTEXT_SWITCH_TRACER bool "Trace process context switches" select TRACING - select MARKERS + select CONTEXT_SWITCH_TRACER help This tracer gets called from the context switch and records all switching of tasks. diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9e91c4ad7c8..9b246eb01d5 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -85,6 +85,7 @@ static void ftrace_clear_events(void) if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } } @@ -99,12 +100,14 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, case 0: if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } break; case 1: if (!call->enabled) { call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); } break; @@ -1058,6 +1061,7 @@ static void trace_module_remove_events(struct module *mod) found = true; if (call->enabled) { call->enabled = 0; + tracing_stop_cmdline_record(); call->unregfunc(); } if (call->event) @@ -1262,11 +1266,13 @@ static __init void event_trace_self_tests(void) } call->enabled = 1; + tracing_start_cmdline_record(); call->regfunc(); event_test_stuff(); call->unregfunc(); + tracing_stop_cmdline_record(); call->enabled = 0; pr_cont("OK\n"); From 0e907c99391362385c8e3af2c43b904dd1fd5d73 Mon Sep 17 00:00:00 2001 From: Zhaolei Date: Mon, 25 May 2009 18:13:59 +0800 Subject: [PATCH 1223/2061] ftrace: clean up of using ftrace_event_enable_disable() Always use ftrace_event_enable_disable() to enable/disable an event so that we can factorize out the event toggling code. [ Impact: factorize and cleanup event tracing code ] Signed-off-by: Zhao Lei Cc: Steven Rostedt Cc: Tom Zanussi LKML-Reference: <4A14FDFE.2080402@cn.fujitsu.com> Signed-off-by: Frederic Weisbecker --- kernel/trace/trace_events.c | 44 ++++++++++++------------------------- 1 file changed, 14 insertions(+), 30 deletions(-) diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index 9b246eb01d5..6c81f9c2142 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -76,26 +76,9 @@ static void trace_destroy_fields(struct ftrace_event_call *call) #endif /* CONFIG_MODULES */ -static void ftrace_clear_events(void) -{ - struct ftrace_event_call *call; - - mutex_lock(&event_mutex); - list_for_each_entry(call, &ftrace_events, list) { - - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } - } - mutex_unlock(&event_mutex); -} - static void ftrace_event_enable_disable(struct ftrace_event_call *call, int enable) { - switch (enable) { case 0: if (call->enabled) { @@ -114,6 +97,17 @@ static void ftrace_event_enable_disable(struct ftrace_event_call *call, } } +static void ftrace_clear_events(void) +{ + struct ftrace_event_call *call; + + mutex_lock(&event_mutex); + list_for_each_entry(call, &ftrace_events, list) { + ftrace_event_enable_disable(call, 0); + } + mutex_unlock(&event_mutex); +} + /* * __ftrace_set_clr_event(NULL, NULL, NULL, set) will set/unset all events. */ @@ -1059,11 +1053,7 @@ static void trace_module_remove_events(struct module *mod) list_for_each_entry_safe(call, p, &ftrace_events, list) { if (call->mod == mod) { found = true; - if (call->enabled) { - call->enabled = 0; - tracing_stop_cmdline_record(); - call->unregfunc(); - } + ftrace_event_enable_disable(call, 0); if (call->event) unregister_ftrace_event(call->event); debugfs_remove_recursive(call->dir); @@ -1265,15 +1255,9 @@ static __init void event_trace_self_tests(void) continue; } - call->enabled = 1; - tracing_start_cmdline_record(); - call->regfunc(); - + ftrace_event_enable_disable(call, 1); event_test_stuff(); - - call->unregfunc(); - tracing_stop_cmdline_record(); - call->enabled = 0; + ftrace_event_enable_disable(call, 0); pr_cont("OK\n"); } From 3709ab8dfa23cab553ea9ffea3372c8e0a28f332 Mon Sep 17 00:00:00 2001 From: Paul Mundt Date: Tue, 26 May 2009 13:52:28 +0900 Subject: [PATCH 1224/2061] sh: irq: Fix up imask build warnings. Signed-off-by: Paul Mundt --- arch/sh/kernel/cpu/irq/imask.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/sh/kernel/cpu/irq/imask.c b/arch/sh/kernel/cpu/irq/imask.c index 3bef3c2629a..6b5d191eec3 100644 --- a/arch/sh/kernel/cpu/irq/imask.c +++ b/arch/sh/kernel/cpu/irq/imask.c @@ -53,7 +53,7 @@ static inline void set_interrupt_registers(int ip) static void mask_imask_irq(unsigned int irq) { - clear_bit(irq, &imask_mask); + clear_bit(irq, imask_mask); if (interrupt_priority < IMASK_PRIORITY - irq) interrupt_priority = IMASK_PRIORITY - irq; set_interrupt_registers(interrupt_priority); @@ -61,7 +61,7 @@ static void mask_imask_irq(unsigned int irq) static void unmask_imask_irq(unsigned int irq) { - set_bit(irq, &imask_mask); + set_bit(irq, imask_mask); interrupt_priority = IMASK_PRIORITY - find_first_zero_bit(imask_mask, IMASK_PRIORITY); set_interrupt_registers(interrupt_priority); From 8a7b8cb91f26a671f22cedc7fd54508667f2d9b9 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 26 May 2009 16:27:59 +1000 Subject: [PATCH 1225/2061] perf_counter: powerpc: Implement interrupt throttling This implements interrupt throttling on powerpc. Since we don't have individual count enable/disable or interrupt enable/disable controls per counter, this simply sets the hardware counter to 0, meaning that it will not interrupt again until it has counted 2^31 counts, which will take at least 2^30 cycles assuming a maximum of 2 counts per cycle. Also, we set counter->hw.period_left to the maximum possible value (2^63 - 1), so we won't report overflows for this counter for the forseeable future. The unthrottle operation restores counter->hw.period_left and the hardware counter so that we will once again report a counter overflow after counter->hw.irq_period counts. [ Impact: new perfcounters robustness feature on PowerPC ] Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: Corey Ashford LKML-Reference: <18971.35823.643362.446774@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 48 ++++++++++++++++++++++++++---- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index fe21b2440f2..f96d55f55bd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -740,10 +740,37 @@ static void power_pmu_disable(struct perf_counter *counter) local_irq_restore(flags); } +/* + * Re-enable interrupts on a counter after they were throttled + * because they were coming too fast. + */ +static void power_pmu_unthrottle(struct perf_counter *counter) +{ + s64 val, left; + unsigned long flags; + + if (!counter->hw.idx || !counter->hw.irq_period) + return; + local_irq_save(flags); + perf_disable(); + power_pmu_read(counter); + left = counter->hw.irq_period; + val = 0; + if (left < 0x80000000L) + val = 0x80000000L - left; + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + perf_counter_update_userpage(counter); + perf_enable(); + local_irq_restore(flags); +} + struct pmu power_pmu = { .enable = power_pmu_enable, .disable = power_pmu_disable, .read = power_pmu_read, + .unthrottle = power_pmu_unthrottle, }; /* @@ -957,10 +984,6 @@ static void record_and_restart(struct perf_counter *counter, long val, if (left < 0x80000000L) val = 0x80000000L - left; } - write_pmc(counter->hw.idx, val); - atomic64_set(&counter->hw.prev_count, val); - atomic64_set(&counter->hw.period_left, left); - perf_counter_update_userpage(counter); /* * Finally record data if requested. @@ -983,8 +1006,23 @@ static void record_and_restart(struct perf_counter *counter, long val, if (!(mmcra & MMCRA_SAMPLE_ENABLE) || (mmcra & sdsync)) addr = mfspr(SPRN_SDAR); } - perf_counter_overflow(counter, nmi, regs, addr); + if (perf_counter_overflow(counter, nmi, regs, addr)) { + /* + * Interrupts are coming too fast - throttle them + * by setting the counter to 0, so it will be + * at least 2^30 cycles until the next interrupt + * (assuming each counter counts at most 2 counts + * per cycle). + */ + val = 0; + left = ~0ULL >> 1; + } } + + write_pmc(counter->hw.idx, val); + atomic64_set(&counter->hw.prev_count, val); + atomic64_set(&counter->hw.period_left, left); + perf_counter_update_userpage(counter); } /* From 79202ba9ff8cf570a75596f42e011167734d1c4b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: [PATCH 1226/2061] perf_counter, x86: Fix APIC NMI programming My Nehalem box locks up in certain situations (with an always-asserted NMI causing a lockup) if the PMU LVT entry is programmed between NMI and IRQ mode with a high frequency. Standardize exlusively on NMIs instead. [ Impact: fix lockup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index 189bf9d7cda..ece3813c7a3 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -285,14 +285,10 @@ static int __hw_perf_counter_init(struct perf_counter *counter) hwc->config |= ARCH_PERFMON_EVENTSEL_OS; /* - * If privileged enough, allow NMI events: + * Use NMI events all the time: */ - hwc->nmi = 0; - if (hw_event->nmi) { - if (sysctl_perf_counter_priv && !capable(CAP_SYS_ADMIN)) - return -EACCES; - hwc->nmi = 1; - } + hwc->nmi = 1; + hw_event->nmi = 1; if (!hwc->irq_period) hwc->irq_period = x86_pmu.max_period; @@ -553,9 +549,6 @@ fixed_mode_idx(struct perf_counter *counter, struct hw_perf_counter *hwc) if (!x86_pmu.num_counters_fixed) return -1; - if (unlikely(hwc->nmi)) - return -1; - event = hwc->config & ARCH_PERFMON_EVENT_MASK; if (unlikely(event == x86_pmu.event_map(PERF_COUNT_INSTRUCTIONS))) @@ -806,9 +799,6 @@ static int amd_pmu_handle_irq(struct pt_regs *regs, int nmi) counter = cpuc->counters[idx]; hwc = &counter->hw; - if (counter->hw_event.nmi != nmi) - continue; - val = x86_perf_counter_update(counter, hwc, idx); if (val & (1ULL << (x86_pmu.counter_bits - 1))) continue; From aaba98018b8295dfa2119345d17f833d74448cd0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: [PATCH 1227/2061] perf_counter, x86: Make NMI lockups more robust We have a debug check that detects stuck NMIs and returns with the PMU disabled in the global ctrl MSR - but i managed to trigger a situation where this was not enough to deassert the NMI. So clear/reset the full PMU and keep the disable count balanced when exiting from here. This way the box produces a debug warning but stays up and is more debuggable. [ Impact: in case of PMU related bugs, recover more gracefully ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_counter.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c index ece3813c7a3..2eeaa99add1 100644 --- a/arch/x86/kernel/cpu/perf_counter.c +++ b/arch/x86/kernel/cpu/perf_counter.c @@ -724,6 +724,30 @@ static void intel_pmu_save_and_restart(struct perf_counter *counter) intel_pmu_enable_counter(hwc, idx); } +static void intel_pmu_reset(void) +{ + unsigned long flags; + int idx; + + if (!x86_pmu.num_counters) + return; + + local_irq_save(flags); + + printk("clearing PMU state on CPU#%d\n", smp_processor_id()); + + for (idx = 0; idx < x86_pmu.num_counters; idx++) { + checking_wrmsrl(x86_pmu.eventsel + idx, 0ull); + checking_wrmsrl(x86_pmu.perfctr + idx, 0ull); + } + for (idx = 0; idx < x86_pmu.num_counters_fixed; idx++) { + checking_wrmsrl(MSR_ARCH_PERFMON_FIXED_CTR0 + idx, 0ull); + } + + local_irq_restore(flags); +} + + /* * This handler is triggered by the local APIC, so the APIC IRQ handling * rules apply: @@ -750,6 +774,8 @@ again: if (++loops > 100) { WARN_ONCE(1, "perfcounters: irq loop stuck!\n"); perf_counter_print_debug(); + intel_pmu_reset(); + perf_enable(); return 1; } From 329d876d6fd326109f191ae0fb2798b8834fb70b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Tue, 26 May 2009 08:10:00 +0200 Subject: [PATCH 1228/2061] perf_counter: Initialize ->oncpu properly This shouldnt matter normally (and i have not seen any misbehavior), because active counters always have a proper ->oncpu value - but nevertheless initialize the field properly to -1. [ Impact: cleanup ] Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Corey Ashford Cc: Marcelo Tosatti Cc: Arnaldo Carvalho de Melo Cc: Thomas Gleixner Cc: John Kacur LKML-Reference: Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 070f92d3232..367299f91aa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3122,6 +3122,8 @@ perf_counter_alloc(struct perf_counter_hw_event *hw_event, counter->group_leader = group_leader; counter->pmu = NULL; counter->ctx = ctx; + counter->oncpu = -1; + get_ctx(ctx); counter->state = PERF_COUNTER_STATE_INACTIVE; From 287c129716c2d4b75f3d8dd6e68732a3cd326ee6 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 26 May 2009 07:04:52 +0000 Subject: [PATCH 1229/2061] sh: Add ms7724se (SH7724) board support This adds preliminary support for the ms7724se solution engine board. Signed-off-by: Kuninori Morimoto Signed-off-by: Paul Mundt --- arch/sh/boards/Kconfig | 9 + arch/sh/boards/mach-se/7724/Makefile | 10 + arch/sh/boards/mach-se/7724/irq.c | 139 +++ arch/sh/boards/mach-se/7724/setup.c | 448 +++++++ arch/sh/boards/mach-se/Makefile | 1 + arch/sh/configs/se7724_defconfig | 1552 +++++++++++++++++++++++++ arch/sh/include/mach-se/mach/se7724.h | 67 ++ 7 files changed, 2226 insertions(+) create mode 100644 arch/sh/boards/mach-se/7724/Makefile create mode 100644 arch/sh/boards/mach-se/7724/irq.c create mode 100644 arch/sh/boards/mach-se/7724/setup.c create mode 100644 arch/sh/configs/se7724_defconfig create mode 100644 arch/sh/include/mach-se/mach/se7724.h diff --git a/arch/sh/boards/Kconfig b/arch/sh/boards/Kconfig index 99e2d47f79f..1c91b1f565d 100644 --- a/arch/sh/boards/Kconfig +++ b/arch/sh/boards/Kconfig @@ -46,6 +46,15 @@ config SH_7722_SOLUTION_ENGINE Select 7722 SolutionEngine if configuring for a Hitachi SH772 evaluation board. +config SH_7724_SOLUTION_ENGINE + bool "SolutionEngine7724" + select SOLUTION_ENGINE + depends on CPU_SUBTYPE_SH7724 + select ARCH_REQUIRE_GPIOLIB + help + Select 7724 SolutionEngine if configuring for a Hitachi SH7724 + evaluation board. + config SH_7751_SOLUTION_ENGINE bool "SolutionEngine7751" select SOLUTION_ENGINE diff --git a/arch/sh/boards/mach-se/7724/Makefile b/arch/sh/boards/mach-se/7724/Makefile new file mode 100644 index 00000000000..349cbd6ce82 --- /dev/null +++ b/arch/sh/boards/mach-se/7724/Makefile @@ -0,0 +1,10 @@ +# +# Makefile for the HITACHI UL SolutionEngine 7724 specific parts of the kernel +# +# This file is subject to the terms and conditions of the GNU General Public +# License. See the file "COPYING" in the main directory of this archive +# for more details. +# +# + +obj-y := setup.o irq.o \ No newline at end of file diff --git a/arch/sh/boards/mach-se/7724/irq.c b/arch/sh/boards/mach-se/7724/irq.c new file mode 100644 index 00000000000..f76cf3b49f2 --- /dev/null +++ b/arch/sh/boards/mach-se/7724/irq.c @@ -0,0 +1,139 @@ +/* + * linux/arch/sh/boards/se/7724/irq.c + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * Kuninori Morimoto + * + * Based on linux/arch/sh/boards/se/7722/irq.c + * Copyright (C) 2007 Nobuhiro Iwamatsu + * + * Hitachi UL SolutionEngine 7724 Support. + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ +#include +#include +#include +#include +#include +#include + +struct fpga_irq { + unsigned long sraddr; + unsigned long mraddr; + unsigned short mask; + unsigned int base; +}; + +static unsigned int fpga2irq(unsigned int irq) +{ + if (irq >= IRQ0_BASE && + irq <= IRQ0_END) + return IRQ0_IRQ; + else if (irq >= IRQ1_BASE && + irq <= IRQ1_END) + return IRQ1_IRQ; + else + return IRQ2_IRQ; +} + +static struct fpga_irq get_fpga_irq(unsigned int irq) +{ + struct fpga_irq set; + + switch (irq) { + case IRQ0_IRQ: + set.sraddr = IRQ0_SR; + set.mraddr = IRQ0_MR; + set.mask = IRQ0_MASK; + set.base = IRQ0_BASE; + break; + case IRQ1_IRQ: + set.sraddr = IRQ1_SR; + set.mraddr = IRQ1_MR; + set.mask = IRQ1_MASK; + set.base = IRQ1_BASE; + break; + default: + set.sraddr = IRQ2_SR; + set.mraddr = IRQ2_MR; + set.mask = IRQ2_MASK; + set.base = IRQ2_BASE; + break; + } + + return set; +} + +static void disable_se7724_irq(unsigned int irq) +{ + struct fpga_irq set = get_fpga_irq(fpga2irq(irq)); + unsigned int bit = irq - set.base; + ctrl_outw(ctrl_inw(set.mraddr) | 0x0001 << bit, set.mraddr); +} + +static void enable_se7724_irq(unsigned int irq) +{ + struct fpga_irq set = get_fpga_irq(fpga2irq(irq)); + unsigned int bit = irq - set.base; + ctrl_outw(ctrl_inw(set.mraddr) & ~(0x0001 << bit), set.mraddr); +} + +static struct irq_chip se7724_irq_chip __read_mostly = { + .name = "SE7724-FPGA", + .mask = disable_se7724_irq, + .unmask = enable_se7724_irq, + .mask_ack = disable_se7724_irq, +}; + +static void se7724_irq_demux(unsigned int irq, struct irq_desc *desc) +{ + struct fpga_irq set = get_fpga_irq(irq); + unsigned short intv = ctrl_inw(set.sraddr); + struct irq_desc *ext_desc; + unsigned int ext_irq = set.base; + + intv &= set.mask; + + while (intv) { + if (intv & 0x0001) { + ext_desc = irq_desc + ext_irq; + handle_level_irq(ext_irq, ext_desc); + } + intv >>= 1; + ext_irq++; + } +} + +/* + * Initialize IRQ setting + */ +void __init init_se7724_IRQ(void) +{ + int i; + + ctrl_outw(0xffff, IRQ0_MR); /* mask all */ + ctrl_outw(0xffff, IRQ1_MR); /* mask all */ + ctrl_outw(0xffff, IRQ2_MR); /* mask all */ + ctrl_outw(0x0000, IRQ0_SR); /* clear irq */ + ctrl_outw(0x0000, IRQ1_SR); /* clear irq */ + ctrl_outw(0x0000, IRQ2_SR); /* clear irq */ + ctrl_outw(0x002a, IRQ_MODE); /* set irq type */ + + for (i = 0; i < SE7724_FPGA_IRQ_NR; i++) + set_irq_chip_and_handler_name(SE7724_FPGA_IRQ_BASE + i, + &se7724_irq_chip, + handle_level_irq, "level"); + + set_irq_chained_handler(IRQ0_IRQ, se7724_irq_demux); + set_irq_type(IRQ0_IRQ, IRQ_TYPE_LEVEL_LOW); + + set_irq_chained_handler(IRQ1_IRQ, se7724_irq_demux); + set_irq_type(IRQ1_IRQ, IRQ_TYPE_LEVEL_LOW); + + set_irq_chained_handler(IRQ2_IRQ, se7724_irq_demux); + set_irq_type(IRQ2_IRQ, IRQ_TYPE_LEVEL_LOW); +} diff --git a/arch/sh/boards/mach-se/7724/setup.c b/arch/sh/boards/mach-se/7724/setup.c new file mode 100644 index 00000000000..9cd04bd558b --- /dev/null +++ b/arch/sh/boards/mach-se/7724/setup.c @@ -0,0 +1,448 @@ +/* + * linux/arch/sh/boards/se/7724/setup.c + * + * Copyright (C) 2009 Renesas Solutions Corp. + * + * Kuninori Morimoto + * + * This file is subject to the terms and conditions of the GNU General Public + * License. See the file "COPYING" in the main directory of this archive + * for more details. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include