From ed54d0f98000ee03310150aa396e9ff8bcb394ce Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Dec 2009 22:08:26 +0100 Subject: [PATCH 01/57] hw-breakpoints: Add two reserved fields for future extensions Add two reserved fields for future extensions in the hardware breakpoints interface. Further needs may arise. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Benjamin Herrenschmidt Cc: "K. Prasad" --- include/linux/perf_event.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 43adbd7f001..a61e4de3448 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -220,6 +220,8 @@ struct perf_event_attr { __u64 bp_addr; __u32 bp_type; __u32 bp_len; + __u64 __bp_reserved_1; + __u64 __bp_reserved_2; }; }; From 189f202ed197dc25d627e8660de27ece325e9f68 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Dec 2009 23:16:07 +0100 Subject: [PATCH 02/57] perf: Remove pointless union that wraps the hw breakpoint fields It stands to anonymize a structure, but structures can already anonymize by themselves. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. 
Prasad" --- include/linux/perf_event.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index a61e4de3448..53230e99e9e 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -215,14 +215,12 @@ struct perf_event_attr { __u32 wakeup_watermark; /* bytes before wakeup */ }; - union { - struct { /* Hardware breakpoint info */ - __u64 bp_addr; - __u32 bp_type; - __u32 bp_len; - __u64 __bp_reserved_1; - __u64 __bp_reserved_2; - }; + struct { /* Hardware breakpoint info */ + __u64 bp_addr; + __u32 bp_type; + __u32 bp_len; + __u64 __bp_reserved_1; + __u64 __bp_reserved_2; }; __u32 __reserved_2; From 9cef30815b0f5b76e94a58d7674fcbf824d95579 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 3 Dec 2009 23:59:31 +0100 Subject: [PATCH 03/57] perf: Remove unused struct perf_event::event_callback This field might result from an older manual rebasing mistake. We don't use it. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" --- include/linux/perf_event.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 53230e99e9e..84bd28a0ffa 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -670,8 +670,6 @@ struct perf_event { perf_callback_t callback; - perf_callback_t event_callback; - #endif /* CONFIG_PERF_EVENTS */ }; From 2f0993e0fb663c49e4d1e02654f6203246be4817 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 07:06:10 +0100 Subject: [PATCH 04/57] hw-breakpoints: Drop callback and task parameters from modify helper Drop the callback and task parameters from modify_user_hw_breakpoint(). For now we have no user that need to modify a breakpoint to the point of changing its handler or its task context. Signed-off-by: Frederic Weisbecker Cc: "K. 
Prasad" --- arch/x86/kernel/ptrace.c | 4 ++-- include/linux/hw_breakpoint.h | 9 ++------- kernel/hw_breakpoint.c | 7 +++---- 3 files changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index 04d182a7cfd..dbb395572ae 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -618,7 +618,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, attr.bp_type = gen_type; attr.disabled = disabled; - return modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + return modify_user_hw_breakpoint(bp, &attr); } /* @@ -740,7 +740,7 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr, bp->callback, tsk); + bp = modify_user_hw_breakpoint(bp, &attr); } /* * CHECKME: the previous code returned -EIO if the addr wasn't a diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index a03daed08c5..d33096e0dbd 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -57,10 +57,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, /* FIXME: only change from the attr, and don't unregister */ extern struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, - struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk); +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); /* * Kernel breakpoints are not associated with any particular thread. 
@@ -97,9 +94,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, struct task_struct *tsk) { return NULL; } static inline struct perf_event * modify_user_hw_breakpoint(struct perf_event *bp, - struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) { return NULL; } + struct perf_event_attr *attr) { return NULL; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, perf_callback_t triggered, diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index cf5ee162841..2d10b012828 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -312,9 +312,7 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @tsk: pointer to 'task_struct' of the process to which the address belongs */ struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, - perf_callback_t triggered, - struct task_struct *tsk) +modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { /* * FIXME: do it without unregistering @@ -323,7 +321,8 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr, */ unregister_hw_breakpoint(bp); - return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); + return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, + bp->callback); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); From b326e9560a28fc3e950637ef51847ed8f05c1335 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:44:31 +0100 Subject: [PATCH 05/57] hw-breakpoints: Use overflow handler instead of the event callback struct perf_event::event callback was called when a breakpoint triggers. But this is a rather opaque callback, pretty tied-only to the breakpoint API and not really integrated into perf as it triggers even when we don't overflow. We prefer to use overflow_handler() as it fits into the perf events rules, being called only when we overflow. 
Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" --- arch/x86/kernel/hw_breakpoint.c | 5 ++--- arch/x86/kernel/ptrace.c | 9 ++++++--- include/linux/hw_breakpoint.h | 25 +++++++++++-------------- include/linux/perf_event.h | 13 +++++++------ kernel/hw_breakpoint.c | 17 +++++------------ kernel/perf_event.c | 24 +++++++++--------------- kernel/trace/trace_ksym.c | 5 +++-- samples/hw_breakpoint/data_breakpoint.c | 7 +++++-- 8 files changed, 48 insertions(+), 57 deletions(-) diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c index d42f65ac492..05d5fec64a9 100644 --- a/arch/x86/kernel/hw_breakpoint.c +++ b/arch/x86/kernel/hw_breakpoint.c @@ -362,8 +362,7 @@ int arch_validate_hwbkpt_settings(struct perf_event *bp, return ret; } - if (bp->callback) - ret = arch_store_info(bp); + ret = arch_store_info(bp); if (ret < 0) return ret; @@ -519,7 +518,7 @@ static int __kprobes hw_breakpoint_handler(struct die_args *args) break; } - (bp->callback)(bp, args->regs); + perf_bp_event(bp, args->regs); rcu_read_unlock(); } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index dbb395572ae..b361d28061d 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -555,7 +555,9 @@ static int genregs_set(struct task_struct *target, return ret; } -static void ptrace_triggered(struct perf_event *bp, void *data) +static void ptrace_triggered(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { int i; struct thread_struct *thread = &(current->thread); @@ -599,7 +601,7 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, { int err; int gen_len, gen_type; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; /* * We shoud have at least an inactive breakpoint at this @@ -721,9 +723,10 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, { struct perf_event *bp; struct thread_struct 
*t = &tsk->thread; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; if (!t->ptrace_bps[nr]) { + hw_breakpoint_init(&attr); /* * Put stub len and type to register (reserve) an inactive but * correct bp diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index d33096e0dbd..4d14a384a01 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -20,19 +20,16 @@ enum { #ifdef CONFIG_HAVE_HW_BREAKPOINT -/* As it's for in-kernel or ptrace use, we want it to be pinned */ -#define DEFINE_BREAKPOINT_ATTR(name) \ -struct perf_event_attr name = { \ - .type = PERF_TYPE_BREAKPOINT, \ - .size = sizeof(name), \ - .pinned = 1, \ -}; - static inline void hw_breakpoint_init(struct perf_event_attr *attr) { attr->type = PERF_TYPE_BREAKPOINT; attr->size = sizeof(*attr); + /* + * As it's for in-kernel or ptrace use, we want it to be pinned + * and to call its callback every hits. + */ attr->pinned = 1; + attr->sample_period = 1; } static inline unsigned long hw_breakpoint_addr(struct perf_event *bp) @@ -52,7 +49,7 @@ static inline int hw_breakpoint_len(struct perf_event *bp) extern struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ @@ -64,12 +61,12 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); */ extern struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, int cpu); extern struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered); + perf_overflow_handler_t triggered); extern int register_perf_hw_breakpoint(struct perf_event *bp); extern int __register_perf_hw_breakpoint(struct perf_event *bp); @@ -90,18 +87,18 @@ static inline struct arch_hw_breakpoint 
*counter_arch_bp(struct perf_event *bp) static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return NULL; } static inline struct perf_event * modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return NULL; } static inline struct perf_event * register_wide_hw_breakpoint_cpu(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, int cpu) { return NULL; } static inline struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) { return NULL; } + perf_overflow_handler_t triggered) { return NULL; } static inline int register_perf_hw_breakpoint(struct perf_event *bp) { return -ENOSYS; } static inline int diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 84bd28a0ffa..d2f2667430d 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -565,10 +565,13 @@ struct perf_pending_entry { void (*func)(struct perf_pending_entry *); }; -typedef void (*perf_callback_t)(struct perf_event *, void *); - struct perf_sample_data; +typedef void (*perf_callback_t)(struct perf_event *, void *); +typedef void (*perf_overflow_handler_t)(struct perf_event *, int, + struct perf_sample_data *, + struct pt_regs *regs); + /** * struct perf_event - performance event kernel representation: */ @@ -660,9 +663,7 @@ struct perf_event { struct pid_namespace *ns; u64 id; - void (*overflow_handler)(struct perf_event *event, - int nmi, struct perf_sample_data *data, - struct pt_regs *regs); + perf_overflow_handler_t overflow_handler; #ifdef CONFIG_EVENT_PROFILE struct event_filter *filter; @@ -779,7 +780,7 @@ extern struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, pid_t pid, - perf_callback_t callback); + perf_overflow_handler_t callback); extern u64 
perf_event_read_value(struct perf_event *event, u64 *enabled, u64 *running); diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 2d10b012828..b600fc27f16 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -259,7 +259,7 @@ void release_bp_slot(struct perf_event *bp) } -int __register_perf_hw_breakpoint(struct perf_event *bp) +int register_perf_hw_breakpoint(struct perf_event *bp) { int ret; @@ -276,19 +276,12 @@ int __register_perf_hw_breakpoint(struct perf_event *bp) * This is a quick hack that will be removed soon, once we remove * the tmp breakpoints from ptrace */ - if (!bp->attr.disabled || bp->callback == perf_bp_event) + if (!bp->attr.disabled || !bp->overflow_handler) ret = arch_validate_hwbkpt_settings(bp, bp->ctx->task); return ret; } -int register_perf_hw_breakpoint(struct perf_event *bp) -{ - bp->callback = perf_bp_event; - - return __register_perf_hw_breakpoint(bp); -} - /** * register_user_hw_breakpoint - register a hardware breakpoint for user space * @attr: breakpoint attributes @@ -297,7 +290,7 @@ int register_perf_hw_breakpoint(struct perf_event *bp) */ struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered, + perf_overflow_handler_t triggered, struct task_struct *tsk) { return perf_event_create_kernel_counter(attr, -1, tsk->pid, triggered); @@ -322,7 +315,7 @@ modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) unregister_hw_breakpoint(bp); return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->callback); + bp->overflow_handler); } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); @@ -347,7 +340,7 @@ EXPORT_SYMBOL_GPL(unregister_hw_breakpoint); */ struct perf_event ** register_wide_hw_breakpoint(struct perf_event_attr *attr, - perf_callback_t triggered) + perf_overflow_handler_t triggered) { struct perf_event **cpu_events, **pevent, *bp; long err; diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 
6b7ddba1dd6..fd43ff4ac86 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4286,15 +4286,8 @@ static void bp_perf_event_destroy(struct perf_event *event) static const struct pmu *bp_perf_event_init(struct perf_event *bp) { int err; - /* - * The breakpoint is already filled if we haven't created the counter - * through perf syscall - * FIXME: manage to get trigerred to NULL if it comes from syscalls - */ - if (!bp->callback) - err = register_perf_hw_breakpoint(bp); - else - err = __register_perf_hw_breakpoint(bp); + + err = register_perf_hw_breakpoint(bp); if (err) return ERR_PTR(err); @@ -4390,7 +4383,7 @@ perf_event_alloc(struct perf_event_attr *attr, struct perf_event_context *ctx, struct perf_event *group_leader, struct perf_event *parent_event, - perf_callback_t callback, + perf_overflow_handler_t overflow_handler, gfp_t gfpflags) { const struct pmu *pmu; @@ -4433,10 +4426,10 @@ perf_event_alloc(struct perf_event_attr *attr, event->state = PERF_EVENT_STATE_INACTIVE; - if (!callback && parent_event) - callback = parent_event->callback; + if (!overflow_handler && parent_event) + overflow_handler = parent_event->overflow_handler; - event->callback = callback; + event->overflow_handler = overflow_handler; if (attr->disabled) event->state = PERF_EVENT_STATE_OFF; @@ -4776,7 +4769,8 @@ err_put_context: */ struct perf_event * perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, - pid_t pid, perf_callback_t callback) + pid_t pid, + perf_overflow_handler_t overflow_handler) { struct perf_event *event; struct perf_event_context *ctx; @@ -4793,7 +4787,7 @@ perf_event_create_kernel_counter(struct perf_event_attr *attr, int cpu, } event = perf_event_alloc(attr, cpu, ctx, NULL, - NULL, callback, GFP_KERNEL); + NULL, overflow_handler, GFP_KERNEL); if (IS_ERR(event)) { err = PTR_ERR(event); goto err_put_context; diff --git a/kernel/trace/trace_ksym.c b/kernel/trace/trace_ksym.c index ddfa0fd43bc..acb87d4a4ac 100644 --- a/kernel/trace/trace_ksym.c 
+++ b/kernel/trace/trace_ksym.c @@ -79,11 +79,12 @@ void ksym_collect_stats(unsigned long hbp_hit_addr) } #endif /* CONFIG_PROFILE_KSYM_TRACER */ -void ksym_hbp_handler(struct perf_event *hbp, void *data) +void ksym_hbp_handler(struct perf_event *hbp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { struct ring_buffer_event *event; struct ksym_trace_entry *entry; - struct pt_regs *regs = data; struct ring_buffer *buffer; int pc; diff --git a/samples/hw_breakpoint/data_breakpoint.c b/samples/hw_breakpoint/data_breakpoint.c index 29525500df0..c69cbe9b242 100644 --- a/samples/hw_breakpoint/data_breakpoint.c +++ b/samples/hw_breakpoint/data_breakpoint.c @@ -41,7 +41,9 @@ module_param_string(ksym, ksym_name, KSYM_NAME_LEN, S_IRUGO); MODULE_PARM_DESC(ksym, "Kernel symbol to monitor; this module will report any" " write operations on the kernel symbol"); -static void sample_hbp_handler(struct perf_event *temp, void *data) +static void sample_hbp_handler(struct perf_event *bp, int nmi, + struct perf_sample_data *data, + struct pt_regs *regs) { printk(KERN_INFO "%s value is changed\n", ksym_name); dump_stack(); @@ -51,8 +53,9 @@ static void sample_hbp_handler(struct perf_event *temp, void *data) static int __init hw_break_module_init(void) { int ret; - DEFINE_BREAKPOINT_ATTR(attr); + struct perf_event_attr attr; + hw_breakpoint_init(&attr); attr.bp_addr = kallsyms_lookup_name(ksym_name); attr.bp_len = HW_BREAKPOINT_LEN_4; attr.bp_type = HW_BREAKPOINT_W | HW_BREAKPOINT_R; From c0dfb2feb632537cf0a9d2ce3c29bcf5778fec59 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 09:53:28 +0100 Subject: [PATCH 06/57] perf: Remove the "event" callback from perf events As it is not used anymore and has been superseded by overflow_handler. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. 
Prasad" --- include/linux/perf_event.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index d2f2667430d..89098e35a03 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -567,7 +567,6 @@ struct perf_pending_entry { struct perf_sample_data; -typedef void (*perf_callback_t)(struct perf_event *, void *); typedef void (*perf_overflow_handler_t)(struct perf_event *, int, struct perf_sample_data *, struct pt_regs *regs); @@ -669,8 +668,6 @@ struct perf_event { struct event_filter *filter; #endif - perf_callback_t callback; - #endif /* CONFIG_PERF_EVENTS */ }; From 7f33f9c5cc3c99aeaf4d266a7ed502b828115a53 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 5 Dec 2009 12:01:17 +0100 Subject: [PATCH 07/57] x86/perf: Exclude the debug stack from the callchains Dumping the callchains from breakpoint events with perf gives strange results: 3.75% perf [kernel] [k] _raw_read_unlock | --- _raw_read_unlock perf_callchain perf_prepare_sample __perf_event_overflow perf_swevent_overflow perf_swevent_add perf_bp_event hw_breakpoint_exceptions_notify notifier_call_chain __atomic_notifier_call_chain atomic_notifier_call_chain notify_die do_debug debug munmap The callchain is polluted with the whole debug stack. Like the NMI stack, the debug stack is undesired as it is part of the profiling path and not helpful for the user. Ignore it. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. 
Prasad" --- arch/x86/kernel/cpu/perf_event.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index c1bbed1021d..d35f26076ae 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2287,7 +2287,7 @@ void callchain_store(struct perf_callchain_entry *entry, u64 ip) static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_irq_entry); static DEFINE_PER_CPU(struct perf_callchain_entry, pmc_nmi_entry); -static DEFINE_PER_CPU(int, in_nmi_frame); +static DEFINE_PER_CPU(int, in_ignored_frame); static void @@ -2303,8 +2303,9 @@ static void backtrace_warning(void *data, char *msg) static int backtrace_stack(void *data, char *name) { - per_cpu(in_nmi_frame, smp_processor_id()) = - x86_is_stack_id(NMI_STACK, name); + per_cpu(in_ignored_frame, smp_processor_id()) = + x86_is_stack_id(NMI_STACK, name) || + x86_is_stack_id(DEBUG_STACK, name); return 0; } @@ -2313,7 +2314,7 @@ static void backtrace_address(void *data, unsigned long addr, int reliable) { struct perf_callchain_entry *entry = data; - if (per_cpu(in_nmi_frame, smp_processor_id())) + if (per_cpu(in_ignored_frame, smp_processor_id())) return; if (reliable) From b625b3b3b740e177a1148594cd3ad5ff52f35315 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 00:52:56 +0100 Subject: [PATCH 08/57] x86: Fixup wrong debug exception frame link in stacktraces While dumping a stacktrace, the end of the exception stack won't link the frame pointer to the previous stack. The interrupted stack will then be considered as unreliable and ignored by perf, as the frame pointer is unreliable itself. This happens because we overwrite the frame pointer that links to the interrupted frame with the address of the exception stack. This is done in order to reserve space inside. 
But rbp has been chosen here only because it is not a scratch register, so that the address of the exception stack remains in rbp after calling do_debug(), we can then release the exception stack space without the need to retrieve its address again. But we can pick another non-scratch register to do that, so that we preserve the link to the interrupted stack frame in the stacktraces. Just randomly choose r12. Every registers are saved just before and restored just after calling do_debug(). And r12 is not used in the middle, which makes it a perfect candidate. Example: perf record -g -a -c 1 -f -e mem:$(tasklist_lock_addr):rw Before: 44.18% [k] _raw_read_lock | | --- |--6.31%-- waitid | |--4.26%-- writev | |--3.63%-- __select | |--3.15%-- __waitpid | | | |--28.57%-- 0x8b52e00000139f | | | |--28.57%-- 0x8b52e0000013c6 | | | |--14.29%-- 0x7fde786dc000 | | | |--14.29%-- 0x62696c2f7273752f | | | --14.29%-- 0x1ea9df800000000 | |--3.00%-- __poll After: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce87 Note: it does not only affect perf events but also other stacktraces in x86-64. They were considered as unreliable once we quit the debug stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/kernel/entry_64.S | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 722df1b1152..0f08a0cea3e 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -1076,10 +1076,10 @@ ENTRY(\sym) TRACE_IRQS_OFF movq %rsp,%rdi /* pt_regs pointer */ xorl %esi,%esi /* no error code */ - PER_CPU(init_tss, %rbp) - subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + PER_CPU(init_tss, %r12) + subq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) call \do_sym - addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%rbp) + addq $EXCEPTION_STKSZ, TSS_ist + (\ist - 1) * 8(%r12) jmp paranoid_exit /* %ebx: no swapgs flag */ CFI_ENDPROC END(\sym) From af2d8289f57e427836be482c6f72cca674028121 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 6 Dec 2009 05:34:27 +0100 Subject: [PATCH 09/57] x86: Fixup wrong irq frame link in stacktraces When we enter in irq, two things can happen to preserve the link to the previous frame pointer: - If we were in an irq already, we don't switch to the irq stack as we are inside. We just need to save the previous frame pointer and to link the new one to the previous. - Otherwise we need another level of indirection. We enter the irq with the previous stack. We save the previous bp inside and make bp pointing to its saved address. Then we switch to the irq stack and push bp another time but to the new stack. This makes two levels to dereference instead of one. In the second case, the current stacktrace code omits the second level and loses the frame pointer accuracy. The stack that follows will then be considered as unreliable. Handling that makes the perf callchain happier. 
Before: 43.94% [k] _raw_read_lock | --- _read_lock | |--60.53%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | synaptics_process_byte | psmouse_handle_byte | psmouse_interrupt | serio_interrupt | i8042_interrupt | handle_IRQ_event | handle_edge_irq | handle_irq | __irqentry_text_start | ret_from_intr | | | |--30.43%-- __select | | | |--17.39%-- 0x454f15 | | | |--13.04%-- __read | | | |--13.04%-- vread_hpet | | | |--13.04%-- _xcb_lock_io | | | --13.04%-- 0x7f630878ce8 After: 50.00% [k] _raw_read_lock | --- _read_lock | |--98.97%-- send_sigio | __kill_fasync | kill_fasync | evdev_pass_event | evdev_event | input_pass_event | input_handle_event | input_event | | | |--96.88%-- synaptics_process_byte | | psmouse_handle_byte | | psmouse_interrupt | | serio_interrupt | | i8042_interrupt | | handle_IRQ_event | | handle_edge_irq | | handle_irq | | __irqentry_text_start | | ret_from_intr | | | | | |--39.78%-- __const_udelay | | | | | | | |--91.89%-- ath5k_hw_register_timeout | | | | ath5k_hw_noise_floor_calibration | | | | ath5k_hw_reset | | | | ath5k_reset | | | | ath5k_config | | | | ieee80211_hw_config | | | | | | | | | |--88.24%-- ieee80211_scan_work | | | | | worker_thread | | | | | kthread | | | | | child_rip | | | | | | | | | --11.76%-- ieee80211_scan_completed | | | | ieee80211_scan_work | | | | worker_thread | | | | kthread | | | | child_rip | | | | | | | --8.11%-- ath5k_hw_noise_floor_calibration | | | ath5k_hw_reset | | | ath5k_reset | | | ath5k_config Note: This does not only affect perf events but also x86-64 stacktraces. They were considered as unreliable once we quit the irq stack frame. Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: "K. Prasad" Cc: Thomas Gleixner Cc: "H. 
Peter Anvin" --- arch/x86/kernel/dumpstack_64.c | 33 ++++++++++++++++++++++++++++++++- 1 file changed, 32 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/dumpstack_64.c b/arch/x86/kernel/dumpstack_64.c index a071e6be177..004b8aa6a35 100644 --- a/arch/x86/kernel/dumpstack_64.c +++ b/arch/x86/kernel/dumpstack_64.c @@ -101,6 +101,35 @@ static unsigned long *in_exception_stack(unsigned cpu, unsigned long stack, return NULL; } +static inline int +in_irq_stack(unsigned long *stack, unsigned long *irq_stack, + unsigned long *irq_stack_end) +{ + return (stack >= irq_stack && stack < irq_stack_end); +} + +/* + * We are returning from the irq stack and go to the previous one. + * If the previous stack is also in the irq stack, then bp in the first + * frame of the irq stack points to the previous, interrupted one. + * Otherwise we have another level of indirection: We first save + * the bp of the previous stack, then we switch the stack to the irq one + * and save a new bp that links to the previous one. 
+ * (See save_args()) + */ +static inline unsigned long +fixup_bp_irq_link(unsigned long bp, unsigned long *stack, + unsigned long *irq_stack, unsigned long *irq_stack_end) +{ +#ifdef CONFIG_FRAME_POINTER + struct stack_frame *frame = (struct stack_frame *)bp; + + if (!in_irq_stack(stack, irq_stack, irq_stack_end)) + return (unsigned long)frame->next_frame; +#endif + return bp; +} + /* * x86-64 can have up to three kernel stacks: * process stack @@ -173,7 +202,7 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, irq_stack = irq_stack_end - (IRQ_STACK_SIZE - 64) / sizeof(*irq_stack); - if (stack >= irq_stack && stack < irq_stack_end) { + if (in_irq_stack(stack, irq_stack, irq_stack_end)) { if (ops->stack(data, "IRQ") < 0) break; bp = print_context_stack(tinfo, stack, bp, @@ -184,6 +213,8 @@ void dump_trace(struct task_struct *task, struct pt_regs *regs, * pointer (index -1 to end) in the IRQ stack: */ stack = (unsigned long *) (irq_stack_end[-1]); + bp = fixup_bp_irq_link(bp, stack, irq_stack, + irq_stack_end); irq_stack_end = NULL; ops->stack(data, "EOI"); continue; From 59b4caeb797494043f5f3b98a610f5d9b75eefa3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 6 Dec 2009 10:16:30 +0100 Subject: [PATCH 10/57] perf tools: Correct size computation in tracepoint_id_to_path() The size argument to zalloc should be the size of desired structure, not the pointer to it. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression *x; @@ x = <+... 
-sizeof(x) +sizeof(*x) ...+>// Signed-off-by: Julia Lawall Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 9e5dbd66d34..448a13b5201 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -197,7 +197,7 @@ struct tracepoint_path *tracepoint_id_to_path(u64 config) if (id == config) { closedir(evt_dir); closedir(sys_dir); - path = zalloc(sizeof(path)); + path = zalloc(sizeof(*path)); path->system = malloc(MAX_EVENT_LENGTH); if (!path->system) { free(path); From be2bf0a2dfbba785860284968fa055006eb1610e Mon Sep 17 00:00:00 2001 From: Jean Delvare Date: Sun, 6 Dec 2009 12:40:59 +0100 Subject: [PATCH 11/57] x86, perf probe: Fix warning in test_get_len() Fix the following warning: arch/x86/tools/test_get_len.c: In function "main": arch/x86/tools/test_get_len.c:116: warning: unused variable "c" Signed-off-by: Jean Delvare Cc: Masami Hiramatsu Signed-off-by: Ingo Molnar --- arch/x86/tools/test_get_len.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/tools/test_get_len.c b/arch/x86/tools/test_get_len.c index d8214dc03fa..bee8d6ac269 100644 --- a/arch/x86/tools/test_get_len.c +++ b/arch/x86/tools/test_get_len.c @@ -113,7 +113,7 @@ int main(int argc, char **argv) char line[BUFSIZE], sym[BUFSIZE] = ""; unsigned char insn_buf[16]; struct insn insn; - int insns = 0, c; + int insns = 0; int warnings = 0; parse_args(argc, argv); From 028c515253761084c6594bf9ac9b194b51d87065 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 6 Dec 2009 20:07:29 +0900 Subject: [PATCH 12/57] perf timechart: Fix header handling Update "struct trace_entry" to match with current one. And remove "size" field from it. 
If it has "size", it causes an alignment mismatch between this structure and the kernel's. Signed-off-by: OGAWA Hirofumi Acked-by: Arjan van de Ven Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <87ljhg8ioe.fsf@devron.myhome.or.jp> Signed-off-by: Ingo Molnar --- tools/perf/builtin-timechart.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index cb58b6605fc..c0f29ed0996 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -302,12 +302,11 @@ process_exit_event(event_t *event) } struct trace_entry { - u32 size; unsigned short type; unsigned char flags; unsigned char preempt_count; int pid; - int tgid; + int lock_depth; }; struct power_entry { @@ -489,6 +488,7 @@ process_sample_event(event_t *event) u64 stamp = 0; u32 cpu = 0; u32 pid = 0; + u32 size, *size_ptr; struct trace_entry *te; if (sample_type & PERF_SAMPLE_IP) @@ -518,9 +518,13 @@ process_sample_event(event_t *event) if (sample_type & PERF_SAMPLE_PERIOD) cursor++; - te = (void *)&event->sample.array[cursor]; + size_ptr = (void *)&event->sample.array[cursor]; - if (sample_type & PERF_SAMPLE_RAW && te->size > 0) { + size = *size_ptr; + size_ptr++; + + te = (void *)size_ptr; + if (sample_type & PERF_SAMPLE_RAW && size > 0) { char *event_str; struct power_entry *pe; From 180f95e29aa8782c019caa64ede2a28d8ab62564 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 6 Dec 2009 20:08:24 +0900 Subject: [PATCH 13/57] perf: Make common SAMPLE_EVENT parser Currently, sample event data is parsed separately by each command, and each parser assumes that the data does not include any other fields. (E.g. timechart, trace, etc. can't parse the event if it has PERF_SAMPLE_CALLCHAIN) So, even if we record the superset data for multiple commands at a time, the commands can't parse it, etc. To fix it, this adds a common sample event parser, and uses it to parse sample events correctly. 
(PERF_SAMPLE_READ is unsupported for now though, it seems to be not using.) Signed-off-by: OGAWA Hirofumi Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <87hbs48imv.fsf@devron.myhome.or.jp> Signed-off-by: Ingo Molnar --- tools/perf/builtin-kmem.c | 36 ++++++------------ tools/perf/builtin-report.c | 39 +++++++++----------- tools/perf/builtin-sched.c | 38 ++++++------------- tools/perf/builtin-timechart.c | 56 ++++++++-------------------- tools/perf/builtin-trace.c | 48 ++++++++---------------- tools/perf/util/event.c | 67 ++++++++++++++++++++++++++++++++++ tools/perf/util/event.h | 17 ++++++++- 7 files changed, 155 insertions(+), 146 deletions(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 047fef74bd5..f218990de0c 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -320,35 +320,23 @@ process_raw_event(event_t *raw_event __used, void *more_data, static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct thread *thread = threads__findnew(event->ip.pid); + struct sample_data data; + struct thread *thread; - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = 1; - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - more_data += sizeof(u32); /* reserved */ - } - - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = 
threads__findnew(event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -357,7 +345,7 @@ static int process_sample_event(event_t *event) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, more_data, cpu, timestamp, thread); + process_raw_event(event, data.raw_data, data.cpu, data.time, thread); return 0; } diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 383c4ab4f9a..2b9eb3a553e 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -605,44 +605,41 @@ static int validate_chain(struct ip_callchain *chain, event_t *event) static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct ip_callchain *chain = NULL; + struct sample_data data; int cpumode; struct addr_location al; - struct thread *thread = threads__findnew(event->ip.pid); + struct thread *thread; - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + memset(&data, 0, sizeof(data)); + data.period = 1; + + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); if (sample_type & PERF_SAMPLE_CALLCHAIN) { unsigned int i; - chain = (void *)more_data; + dump_printf("... chain: nr:%Lu\n", data.callchain->nr); - dump_printf("... chain: nr:%Lu\n", chain->nr); - - if (validate_chain(chain, event) < 0) { + if (validate_chain(data.callchain, event) < 0) { pr_debug("call-chain problem with event, " "skipping it.\n"); return 0; } if (dump_trace) { - for (i = 0; i < chain->nr; i++) - dump_printf("..... %2d: %016Lx\n", i, chain->ips[i]); + for (i = 0; i < data.callchain->nr; i++) + dump_printf("..... 
%2d: %016Lx\n", + i, data.callchain->ips[i]); } } + thread = threads__findnew(data.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -657,7 +654,7 @@ static int process_sample_event(event_t *event) cpumode = event->header.misc & PERF_RECORD_MISC_CPUMODE_MASK; thread__find_addr_location(thread, cpumode, - MAP__FUNCTION, ip, &al, NULL); + MAP__FUNCTION, data.ip, &al, NULL); /* * We have to do this here as we may have a dso with no symbol hit that * has a name longer than the ones with symbols sampled. @@ -675,12 +672,12 @@ static int process_sample_event(event_t *event) if (sym_list && al.sym && !strlist__has_entry(sym_list, al.sym->name)) return 0; - if (hist_entry__add(&al, chain, period)) { + if (hist_entry__add(&al, data.callchain, data.period)) { pr_debug("problem incrementing symbol count, skipping event\n"); return -1; } - event__stats.total += period; + event__stats.total += data.period; return 0; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 26b782f26ee..45c46c79049 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1598,40 +1598,26 @@ process_raw_event(event_t *raw_event __used, void *more_data, static int process_sample_event(event_t *event) { + struct sample_data data; struct thread *thread; - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; if (!(sample_type & PERF_SAMPLE_RAW)) return 0; - thread = threads__findnew(event->ip.pid); + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = -1; - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } - - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - more_data += sizeof(u32); /* reserved */ - } - - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + 
event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = threads__findnew(data.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); @@ -1640,10 +1626,10 @@ static int process_sample_event(event_t *event) dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - if (profile_cpu != -1 && profile_cpu != (int) cpu) + if (profile_cpu != -1 && profile_cpu != (int)data.cpu) return 0; - process_raw_event(event, more_data, cpu, timestamp, thread); + process_raw_event(event, data.raw_data, data.cpu, data.time, thread); return 0; } diff --git a/tools/perf/builtin-timechart.c b/tools/perf/builtin-timechart.c index c0f29ed0996..f472df9561e 100644 --- a/tools/perf/builtin-timechart.c +++ b/tools/perf/builtin-timechart.c @@ -483,48 +483,22 @@ static void sched_switch(int cpu, u64 timestamp, struct trace_entry *te) static int process_sample_event(event_t *event) { - int cursor = 0; - u64 addr = 0; - u64 stamp = 0; - u32 cpu = 0; - u32 pid = 0; - u32 size, *size_ptr; + struct sample_data data; struct trace_entry *te; - if (sample_type & PERF_SAMPLE_IP) - cursor++; + memset(&data, 0, sizeof(data)); + + event__parse_sample(event, sample_type, &data); - if (sample_type & PERF_SAMPLE_TID) { - pid = event->sample.array[cursor]>>32; - cursor++; - } if (sample_type & PERF_SAMPLE_TIME) { - stamp = event->sample.array[cursor++]; - - if (!first_time || first_time > stamp) - first_time = stamp; - if (last_time < stamp) - last_time = stamp; - + if (!first_time || first_time > data.time) + first_time = data.time; + if (last_time < data.time) + last_time = data.time; } - if (sample_type & PERF_SAMPLE_ADDR) - addr = event->sample.array[cursor++]; - if (sample_type & PERF_SAMPLE_ID) - cursor++; - if (sample_type & 
PERF_SAMPLE_STREAM_ID) - cursor++; - if (sample_type & PERF_SAMPLE_CPU) - cpu = event->sample.array[cursor++] & 0xFFFFFFFF; - if (sample_type & PERF_SAMPLE_PERIOD) - cursor++; - size_ptr = (void *)&event->sample.array[cursor]; - - size = *size_ptr; - size_ptr++; - - te = (void *)size_ptr; - if (sample_type & PERF_SAMPLE_RAW && size > 0) { + te = (void *)data.raw_data; + if (sample_type & PERF_SAMPLE_RAW && data.raw_size > 0) { char *event_str; struct power_entry *pe; @@ -536,19 +510,19 @@ process_sample_event(event_t *event) return 0; if (strcmp(event_str, "power:power_start") == 0) - c_state_start(cpu, stamp, pe->value); + c_state_start(data.cpu, data.time, pe->value); if (strcmp(event_str, "power:power_end") == 0) - c_state_end(cpu, stamp); + c_state_end(data.cpu, data.time); if (strcmp(event_str, "power:power_frequency") == 0) - p_state_change(cpu, stamp, pe->value); + p_state_change(data.cpu, data.time, pe->value); if (strcmp(event_str, "sched:sched_wakeup") == 0) - sched_wakeup(cpu, stamp, pid, te); + sched_wakeup(data.cpu, data.time, data.pid, te); if (strcmp(event_str, "sched:sched_switch") == 0) - sched_switch(cpu, stamp, te); + sched_switch(data.cpu, data.time, te); } return 0; } diff --git a/tools/perf/builtin-trace.c b/tools/perf/builtin-trace.c index abb914aa7be..c2fcc34486f 100644 --- a/tools/perf/builtin-trace.c +++ b/tools/perf/builtin-trace.c @@ -66,58 +66,40 @@ static u64 sample_type; static int process_sample_event(event_t *event) { - u64 ip = event->ip.ip; - u64 timestamp = -1; - u32 cpu = -1; - u64 period = 1; - void *more_data = event->ip.__more_data; - struct thread *thread = threads__findnew(event->ip.pid); + struct sample_data data; + struct thread *thread; - if (sample_type & PERF_SAMPLE_TIME) { - timestamp = *(u64 *)more_data; - more_data += sizeof(u64); - } + memset(&data, 0, sizeof(data)); + data.time = -1; + data.cpu = -1; + data.period = 1; - if (sample_type & PERF_SAMPLE_CPU) { - cpu = *(u32 *)more_data; - more_data += sizeof(u32); - 
more_data += sizeof(u32); /* reserved */ - } - - if (sample_type & PERF_SAMPLE_PERIOD) { - period = *(u64 *)more_data; - more_data += sizeof(u64); - } + event__parse_sample(event, sample_type, &data); dump_printf("(IP, %d): %d/%d: %p period: %Ld\n", event->header.misc, - event->ip.pid, event->ip.tid, - (void *)(long)ip, - (long long)period); + data.pid, data.tid, + (void *)(long)data.ip, + (long long)data.period); + thread = threads__findnew(event->ip.pid); if (thread == NULL) { pr_debug("problem processing %d event, skipping it.\n", event->header.type); return -1; } - dump_printf(" ... thread: %s:%d\n", thread->comm, thread->pid); - if (sample_type & PERF_SAMPLE_RAW) { - struct { - u32 size; - char data[0]; - } *raw = more_data; - /* * FIXME: better resolve from pid from the struct trace_entry * field, although it should be the same than this perf * event pid */ - scripting_ops->process_event(cpu, raw->data, raw->size, - timestamp, thread->comm); + scripting_ops->process_event(data.cpu, data.raw_data, + data.raw_size, + data.time, thread->comm); } - event__stats.total += period; + event__stats.total += data.period; return 0; } diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 414b89d1bde..4dcecafa85d 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -310,3 +310,70 @@ int event__preprocess_sample(const event_t *self, struct addr_location *al, al->level == 'H' ? 
"[hypervisor]" : ""); return 0; } + +int event__parse_sample(event_t *event, u64 type, struct sample_data *data) +{ + u64 *array = event->sample.array; + + if (type & PERF_SAMPLE_IP) { + data->ip = event->ip.ip; + array++; + } + + if (type & PERF_SAMPLE_TID) { + u32 *p = (u32 *)array; + data->pid = p[0]; + data->tid = p[1]; + array++; + } + + if (type & PERF_SAMPLE_TIME) { + data->time = *array; + array++; + } + + if (type & PERF_SAMPLE_ADDR) { + data->addr = *array; + array++; + } + + if (type & PERF_SAMPLE_ID) { + data->id = *array; + array++; + } + + if (type & PERF_SAMPLE_STREAM_ID) { + data->stream_id = *array; + array++; + } + + if (type & PERF_SAMPLE_CPU) { + u32 *p = (u32 *)array; + data->cpu = *p; + array++; + } + + if (type & PERF_SAMPLE_PERIOD) { + data->period = *array; + array++; + } + + if (type & PERF_SAMPLE_READ) { + pr_debug("PERF_SAMPLE_READ is unsuported for now\n"); + return -1; + } + + if (type & PERF_SAMPLE_CALLCHAIN) { + data->callchain = (struct ip_callchain *)array; + array += 1 + data->callchain->nr; + } + + if (type & PERF_SAMPLE_RAW) { + u32 *p = (u32 *)array; + data->raw_size = *p; + p++; + data->raw_data = p; + } + + return 0; +} diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index a4cc8105cf6..c7a78eef8e5 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -56,11 +56,25 @@ struct read_event { u64 id; }; -struct sample_event{ +struct sample_event { struct perf_event_header header; u64 array[]; }; +struct sample_data { + u64 ip; + u32 pid, tid; + u64 time; + u64 addr; + u64 id; + u64 stream_id; + u32 cpu; + u64 period; + struct ip_callchain *callchain; + u32 raw_size; + void *raw_data; +}; + #define BUILD_ID_SIZE 20 struct build_id_event { @@ -155,5 +169,6 @@ int event__process_task(event_t *self); struct addr_location; int event__preprocess_sample(const event_t *self, struct addr_location *al, symbol_filter_t filter); +int event__parse_sample(event_t *event, u64 type, struct sample_data *data); #endif 
/* __PERF_RECORD_H */ From 7691b1ec2e4a8d4bd88dcf88b29792399ebe1c91 Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 6 Dec 2009 20:10:49 +0900 Subject: [PATCH 14/57] perf tools: Misc small fixes - util/header.c "len" is aligned to 64, so it tries to write out bytes from beyond the end of the long_name buffer. Use "zero_buf" to write the alignment padding instead. - util/trace-event-read.c "size" does not include the NUL byte, so allocate one extra byte and set it to '\0'. - util/trace-event-parse.c Parentheses are needed to calculate the correct size. Signed-off-by: OGAWA Hirofumi Cc: Peter Zijlstra Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <87d42s8iiu.fsf_-_@devron.myhome.or.jp> Signed-off-by: Ingo Molnar --- tools/perf/util/header.c | 9 +++++++-- tools/perf/util/trace-event-parse.c | 2 +- tools/perf/util/trace-event-read.c | 3 ++- 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 4805e6dfd23..08b6759287f 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -187,7 +187,9 @@ static int do_write(int fd, const void *buf, size_t size) static int __dsos__write_buildid_table(struct list_head *head, int fd) { +#define NAME_ALIGN 64 struct dso *pos; + static const char zero_buf[NAME_ALIGN]; list_for_each_entry(pos, head, node) { int err; @@ -197,14 +199,17 @@ static int __dsos__write_buildid_table(struct list_head *head, int fd) if (!pos->has_build_id) continue; len = pos->long_name_len + 1; - len = ALIGN(len, 64); + len = ALIGN(len, NAME_ALIGN); memset(&b, 0, sizeof(b)); memcpy(&b.build_id, pos->build_id, sizeof(pos->build_id)); b.header.size = sizeof(b) + len; err = do_write(fd, &b, sizeof(b)); if (err < 0) return err; - err = do_write(fd, pos->long_name, len); + err = do_write(fd, pos->long_name, pos->long_name_len + 1); + if (err < 0) + return err; + err = do_write(fd, zero_buf, len - pos->long_name_len + 1); if (err < 0) return err; } diff --git a/tools/perf/util/trace-event-parse.c 
b/tools/perf/util/trace-event-parse.c index 0302405aa2c..6ffe9d63d85 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -177,7 +177,7 @@ void parse_proc_kallsyms(char *file, unsigned int size __unused) func_count++; } - func_list = malloc_or_die(sizeof(*func_list) * func_count + 1); + func_list = malloc_or_die(sizeof(*func_list) * (func_count + 1)); i = 0; while (list) { diff --git a/tools/perf/util/trace-event-read.c b/tools/perf/util/trace-event-read.c index 342dfdd43f8..1744422cafc 100644 --- a/tools/perf/util/trace-event-read.c +++ b/tools/perf/util/trace-event-read.c @@ -145,8 +145,9 @@ static void read_proc_kallsyms(void) if (!size) return; - buf = malloc_or_die(size); + buf = malloc_or_die(size + 1); read_or_die(buf, size); + buf[size] = '\0'; parse_proc_kallsyms(buf, size); From b9b1e1c71a9481c0c34ed5bed42f1bfa730fd39e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Sun, 6 Dec 2009 18:03:10 -0200 Subject: [PATCH 15/57] perf buildid-list: Fix copy'n'paste help message MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Arnaldo Carvalho de Melo Cc: Frédéric Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1260129790-11520-1-git-send-email-acme@infradead.org> Signed-off-by: Ingo Molnar --- tools/perf/builtin-buildid-list.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-buildid-list.c b/tools/perf/builtin-buildid-list.c index 7dee9d19ab7..dcb6143a000 100644 --- a/tools/perf/builtin-buildid-list.c +++ b/tools/perf/builtin-buildid-list.c @@ -19,7 +19,7 @@ static char const *input_name = "perf.data"; static int force; static const char *const buildid_list_usage[] = { - "perf report []", + "perf buildid-list []", NULL }; From c0777c5aa835a97ccc77d82e55388940f0140a61 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Dec 2009 12:04:49 +0800 Subject: [PATCH 16/57] perf/sched: 
Fix 'perf sched trace' If we use 'perf sched trace', it will call symbol__init() again, and can lead to a perf tool crash: [root@localhost perf]# ./perf sched trace *** glibc detected *** ./perf: free(): invalid next size (normal): 0x094c1898 *** ======= Backtrace: ========= /lib/libc.so.6[0xb7602404] /lib/libc.so.6(cfree+0x96)[0xb76043b6] ./perf[0x80730fe] ./perf[0x8074c97] ./perf[0x805eb59] ./perf[0x80536fd] ./perf[0x804b618] ./perf[0x804bdc3] /lib/libc.so.6(__libc_start_main+0xe5)[0xb75a9735] ./perf[0x804af81] ======= Memory map: ======== 08048000-08158000 r-xp 00000000 fe:00 556831 /home/eric/.... 08158000-08168000 rw-p 0010f000 fe:00 556831 /home/eric/... 08168000-085fe000 rw-p 00000000 00:00 0 094ab000-094cc000 rw-p 00000000 00:00 0 [heap] Signed-off-by: Xiao Guangrong LKML-Reference: <4B1C7EE1.8030906@cn.fujitsu.com> Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 45c46c79049..7481ebdb17e 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1888,13 +1888,18 @@ static int __cmd_record(int argc, const char **argv) int cmd_sched(int argc, const char **argv, const char *prefix __used) { - symbol__init(0); - argc = parse_options(argc, argv, sched_options, sched_usage, PARSE_OPT_STOP_AT_NON_OPTION); if (!argc) usage_with_options(sched_usage, sched_options); + /* + * Aliased to 'perf trace' for now: + */ + if (!strcmp(argv[0], "trace")) + return cmd_trace(argc, argv, prefix); + + symbol__init(0); if (!strncmp(argv[0], "rec", 3)) { return __cmd_record(argc, argv); } else if (!strncmp(argv[0], "lat", 3)) { @@ -1918,11 +1923,6 @@ int cmd_sched(int argc, const char **argv, const char *prefix __used) usage_with_options(replay_usage, replay_options); } __cmd_replay(); - } else if 
(!strcmp(argv[0], "trace")) { - /* - * Aliased to 'perf trace' for now: - */ - return cmd_trace(argc, argv, prefix); } else { usage_with_options(sched_usage, sched_options); } From d8bd9e0aedabcb47887712497bc386a06ddcbd12 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Dec 2009 12:06:29 +0800 Subject: [PATCH 17/57] perf_event: Fix raw event processing We pass 'data.raw_data' to process_raw_event(), but the data.raw_data buffer does not include the data size, which can make the perf tool crash. This bug was introduced by commit 180f95e29a ("perf: Make common SAMPLE_EVENT parser"). Signed-off-by: Xiao Guangrong Cc: Pekka Enberg Cc: Eduard - Gabriel Munteanu Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: OGAWA Hirofumi Cc: Peter Zijlstra Cc: Li Zefan LKML-Reference: <4B1C7F45.5080105@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-kmem.c | 11 ++++++++--- tools/perf/builtin-sched.c | 11 ++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index f218990de0c..f84d7a3db68 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -289,13 +289,17 @@ static void process_free_event(struct raw_event_sample *raw, } static void -process_raw_event(event_t *raw_event __used, void *more_data, +process_raw_event(event_t *raw_event __used, u32 size, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw = more_data; + struct raw_event_sample *raw; struct event *event; int type; + raw = malloc_or_die(sizeof(*raw)+size); + raw->size = size; + memcpy(raw->data, data, size); + type = trace_parse_common_type(raw->data); event = trace_find_event(type); @@ -345,7 +349,8 @@ static int process_sample_event(event_t *event) dump_printf(" ... 
thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, data.raw_data, data.cpu, data.time, thread); + process_raw_event(event, data.raw_size, data.raw_data, data.cpu, + data.time, thread); return 0; } diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 7481ebdb17e..4655e16b929 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -1570,13 +1570,17 @@ process_sched_migrate_task_event(struct raw_event_sample *raw, } static void -process_raw_event(event_t *raw_event __used, void *more_data, +process_raw_event(event_t *raw_event __used, u32 size, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw = more_data; + struct raw_event_sample *raw; struct event *event; int type; + raw = malloc_or_die(sizeof(*raw)+size); + raw->size = size; + memcpy(raw->data, data, size); + type = trace_parse_common_type(raw->data); event = trace_find_event(type); @@ -1629,7 +1633,8 @@ static int process_sample_event(event_t *event) if (profile_cpu != -1 && profile_cpu != (int)data.cpu) return 0; - process_raw_event(event, data.raw_data, data.cpu, data.time, thread); + process_raw_event(event, data.raw_size, data.raw_data, data.cpu, + data.time, thread); return 0; } From d9541ed3241bb6c2b805d3ea0e87563cf2a0c5c3 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Dec 2009 12:07:15 +0800 Subject: [PATCH 18/57] perf_event: Fix __dsos__write_buildid_table() The remain buff size is 'len - pos->long_name_len - 1', not 'len - pos->long_name_len + 1' This bug was introduced by commit 7691b1e ("perf tools: Misc small fixes"). 
Signed-off-by: Xiao Guangrong Acked-by: OGAWA Hirofumi Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Li Zefan LKML-Reference: <4B1C7F73.80707@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/util/header.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 08b6759287f..59a9c0b3033 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -209,7 +209,7 @@ static int __dsos__write_buildid_table(struct list_head *head, int fd) err = do_write(fd, pos->long_name, pos->long_name_len + 1); if (err < 0) return err; - err = do_write(fd, zero_buf, len - pos->long_name_len + 1); + err = do_write(fd, zero_buf, len - pos->long_name_len - 1); if (err < 0) return err; } From f48f669d42e133db839af16656fd720107ef6742 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Mon, 7 Dec 2009 13:04:04 +0800 Subject: [PATCH 19/57] perf_event: Eliminate raw->size raw->size is not used, this patch just cleans it up. 
Signed-off-by: Xiao Guangrong Cc: Frederic Weisbecker Cc: Paul Mackerras Cc: OGAWA Hirofumi Cc: Peter Zijlstra Cc: Li Zefan LKML-Reference: <4B1C8CC4.4050007@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-kmem.c | 38 ++++++--------- tools/perf/builtin-sched.c | 94 +++++++++++++++++--------------------- 2 files changed, 56 insertions(+), 76 deletions(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index f84d7a3db68..7551a5f834b 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -57,11 +57,6 @@ static struct rb_root root_caller_sorted; static unsigned long total_requested, total_allocated; static unsigned long nr_allocs, nr_cross_allocs; -struct raw_event_sample { - u32 size; - char data[0]; -}; - #define PATH_SYS_NODE "/sys/devices/system/node" static void init_cpunode_map(void) @@ -201,7 +196,7 @@ static void insert_caller_stat(unsigned long call_site, } } -static void process_alloc_event(struct raw_event_sample *raw, +static void process_alloc_event(void *data, struct event *event, int cpu, u64 timestamp __used, @@ -214,10 +209,10 @@ static void process_alloc_event(struct raw_event_sample *raw, int bytes_alloc; int node1, node2; - ptr = raw_field_value(event, "ptr", raw->data); - call_site = raw_field_value(event, "call_site", raw->data); - bytes_req = raw_field_value(event, "bytes_req", raw->data); - bytes_alloc = raw_field_value(event, "bytes_alloc", raw->data); + ptr = raw_field_value(event, "ptr", data); + call_site = raw_field_value(event, "call_site", data); + bytes_req = raw_field_value(event, "bytes_req", data); + bytes_alloc = raw_field_value(event, "bytes_alloc", data); insert_alloc_stat(call_site, ptr, bytes_req, bytes_alloc, cpu); insert_caller_stat(call_site, bytes_req, bytes_alloc); @@ -227,7 +222,7 @@ static void process_alloc_event(struct raw_event_sample *raw, if (node) { node1 = cpunode_map[cpu]; - node2 = raw_field_value(event, "node", raw->data); + node2 = raw_field_value(event, 
"node", data); if (node1 != node2) nr_cross_allocs++; } @@ -262,7 +257,7 @@ static struct alloc_stat *search_alloc_stat(unsigned long ptr, return NULL; } -static void process_free_event(struct raw_event_sample *raw, +static void process_free_event(void *data, struct event *event, int cpu, u64 timestamp __used, @@ -271,7 +266,7 @@ static void process_free_event(struct raw_event_sample *raw, unsigned long ptr; struct alloc_stat *s_alloc, *s_caller; - ptr = raw_field_value(event, "ptr", raw->data); + ptr = raw_field_value(event, "ptr", data); s_alloc = search_alloc_stat(ptr, 0, &root_alloc_stat, ptr_cmp); if (!s_alloc) @@ -289,35 +284,30 @@ static void process_free_event(struct raw_event_sample *raw, } static void -process_raw_event(event_t *raw_event __used, u32 size, void *data, +process_raw_event(event_t *raw_event __used, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw; struct event *event; int type; - raw = malloc_or_die(sizeof(*raw)+size); - raw->size = size; - memcpy(raw->data, data, size); - - type = trace_parse_common_type(raw->data); + type = trace_parse_common_type(data); event = trace_find_event(type); if (!strcmp(event->name, "kmalloc") || !strcmp(event->name, "kmem_cache_alloc")) { - process_alloc_event(raw, event, cpu, timestamp, thread, 0); + process_alloc_event(data, event, cpu, timestamp, thread, 0); return; } if (!strcmp(event->name, "kmalloc_node") || !strcmp(event->name, "kmem_cache_alloc_node")) { - process_alloc_event(raw, event, cpu, timestamp, thread, 1); + process_alloc_event(data, event, cpu, timestamp, thread, 1); return; } if (!strcmp(event->name, "kfree") || !strcmp(event->name, "kmem_cache_free")) { - process_free_event(raw, event, cpu, timestamp, thread); + process_free_event(data, event, cpu, timestamp, thread); return; } } @@ -349,7 +339,7 @@ static int process_sample_event(event_t *event) dump_printf(" ... 
thread: %s:%d\n", thread->comm, thread->pid); - process_raw_event(event, data.raw_size, data.raw_data, data.cpu, + process_raw_event(event, data.raw_data, data.cpu, data.time, thread); return 0; diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 4655e16b929..19f43faa9f8 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -628,11 +628,6 @@ static void test_calibrations(void) printf("the sleep test took %Ld nsecs\n", T1-T0); } -struct raw_event_sample { - u32 size; - char data[0]; -}; - #define FILL_FIELD(ptr, field, event, data) \ ptr.field = (typeof(ptr.field)) raw_field_value(event, #field, data) @@ -1356,7 +1351,7 @@ static void sort_lat(void) static struct trace_sched_handler *trace_handler; static void -process_sched_wakeup_event(struct raw_event_sample *raw, +process_sched_wakeup_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1364,13 +1359,13 @@ process_sched_wakeup_event(struct raw_event_sample *raw, { struct trace_wakeup_event wakeup_event; - FILL_COMMON_FIELDS(wakeup_event, event, raw->data); + FILL_COMMON_FIELDS(wakeup_event, event, data); - FILL_ARRAY(wakeup_event, comm, event, raw->data); - FILL_FIELD(wakeup_event, pid, event, raw->data); - FILL_FIELD(wakeup_event, prio, event, raw->data); - FILL_FIELD(wakeup_event, success, event, raw->data); - FILL_FIELD(wakeup_event, cpu, event, raw->data); + FILL_ARRAY(wakeup_event, comm, event, data); + FILL_FIELD(wakeup_event, pid, event, data); + FILL_FIELD(wakeup_event, prio, event, data); + FILL_FIELD(wakeup_event, success, event, data); + FILL_FIELD(wakeup_event, cpu, event, data); if (trace_handler->wakeup_event) trace_handler->wakeup_event(&wakeup_event, event, cpu, timestamp, thread); @@ -1469,7 +1464,7 @@ map_switch_event(struct trace_switch_event *switch_event, static void -process_sched_switch_event(struct raw_event_sample *raw, +process_sched_switch_event(void *data, struct event *event, int this_cpu, u64 timestamp __used, 
@@ -1477,15 +1472,15 @@ process_sched_switch_event(struct raw_event_sample *raw, { struct trace_switch_event switch_event; - FILL_COMMON_FIELDS(switch_event, event, raw->data); + FILL_COMMON_FIELDS(switch_event, event, data); - FILL_ARRAY(switch_event, prev_comm, event, raw->data); - FILL_FIELD(switch_event, prev_pid, event, raw->data); - FILL_FIELD(switch_event, prev_prio, event, raw->data); - FILL_FIELD(switch_event, prev_state, event, raw->data); - FILL_ARRAY(switch_event, next_comm, event, raw->data); - FILL_FIELD(switch_event, next_pid, event, raw->data); - FILL_FIELD(switch_event, next_prio, event, raw->data); + FILL_ARRAY(switch_event, prev_comm, event, data); + FILL_FIELD(switch_event, prev_pid, event, data); + FILL_FIELD(switch_event, prev_prio, event, data); + FILL_FIELD(switch_event, prev_state, event, data); + FILL_ARRAY(switch_event, next_comm, event, data); + FILL_FIELD(switch_event, next_pid, event, data); + FILL_FIELD(switch_event, next_prio, event, data); if (curr_pid[this_cpu] != (u32)-1) { /* @@ -1502,7 +1497,7 @@ process_sched_switch_event(struct raw_event_sample *raw, } static void -process_sched_runtime_event(struct raw_event_sample *raw, +process_sched_runtime_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1510,17 +1505,17 @@ process_sched_runtime_event(struct raw_event_sample *raw, { struct trace_runtime_event runtime_event; - FILL_ARRAY(runtime_event, comm, event, raw->data); - FILL_FIELD(runtime_event, pid, event, raw->data); - FILL_FIELD(runtime_event, runtime, event, raw->data); - FILL_FIELD(runtime_event, vruntime, event, raw->data); + FILL_ARRAY(runtime_event, comm, event, data); + FILL_FIELD(runtime_event, pid, event, data); + FILL_FIELD(runtime_event, runtime, event, data); + FILL_FIELD(runtime_event, vruntime, event, data); if (trace_handler->runtime_event) trace_handler->runtime_event(&runtime_event, event, cpu, timestamp, thread); } static void -process_sched_fork_event(struct raw_event_sample 
*raw, +process_sched_fork_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1528,12 +1523,12 @@ process_sched_fork_event(struct raw_event_sample *raw, { struct trace_fork_event fork_event; - FILL_COMMON_FIELDS(fork_event, event, raw->data); + FILL_COMMON_FIELDS(fork_event, event, data); - FILL_ARRAY(fork_event, parent_comm, event, raw->data); - FILL_FIELD(fork_event, parent_pid, event, raw->data); - FILL_ARRAY(fork_event, child_comm, event, raw->data); - FILL_FIELD(fork_event, child_pid, event, raw->data); + FILL_ARRAY(fork_event, parent_comm, event, data); + FILL_FIELD(fork_event, parent_pid, event, data); + FILL_ARRAY(fork_event, child_comm, event, data); + FILL_FIELD(fork_event, child_pid, event, data); if (trace_handler->fork_event) trace_handler->fork_event(&fork_event, event, cpu, timestamp, thread); @@ -1550,7 +1545,7 @@ process_sched_exit_event(struct event *event, } static void -process_sched_migrate_task_event(struct raw_event_sample *raw, +process_sched_migrate_task_event(void *data, struct event *event, int cpu __used, u64 timestamp __used, @@ -1558,46 +1553,42 @@ process_sched_migrate_task_event(struct raw_event_sample *raw, { struct trace_migrate_task_event migrate_task_event; - FILL_COMMON_FIELDS(migrate_task_event, event, raw->data); + FILL_COMMON_FIELDS(migrate_task_event, event, data); - FILL_ARRAY(migrate_task_event, comm, event, raw->data); - FILL_FIELD(migrate_task_event, pid, event, raw->data); - FILL_FIELD(migrate_task_event, prio, event, raw->data); - FILL_FIELD(migrate_task_event, cpu, event, raw->data); + FILL_ARRAY(migrate_task_event, comm, event, data); + FILL_FIELD(migrate_task_event, pid, event, data); + FILL_FIELD(migrate_task_event, prio, event, data); + FILL_FIELD(migrate_task_event, cpu, event, data); if (trace_handler->migrate_task_event) trace_handler->migrate_task_event(&migrate_task_event, event, cpu, timestamp, thread); } static void -process_raw_event(event_t *raw_event __used, u32 size, void 
*data, +process_raw_event(event_t *raw_event __used, void *data, int cpu, u64 timestamp, struct thread *thread) { - struct raw_event_sample *raw; struct event *event; int type; - raw = malloc_or_die(sizeof(*raw)+size); - raw->size = size; - memcpy(raw->data, data, size); - type = trace_parse_common_type(raw->data); + type = trace_parse_common_type(data); event = trace_find_event(type); if (!strcmp(event->name, "sched_switch")) - process_sched_switch_event(raw, event, cpu, timestamp, thread); + process_sched_switch_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_stat_runtime")) - process_sched_runtime_event(raw, event, cpu, timestamp, thread); + process_sched_runtime_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup")) - process_sched_wakeup_event(raw, event, cpu, timestamp, thread); + process_sched_wakeup_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_wakeup_new")) - process_sched_wakeup_event(raw, event, cpu, timestamp, thread); + process_sched_wakeup_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_fork")) - process_sched_fork_event(raw, event, cpu, timestamp, thread); + process_sched_fork_event(data, event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_process_exit")) process_sched_exit_event(event, cpu, timestamp, thread); if (!strcmp(event->name, "sched_migrate_task")) - process_sched_migrate_task_event(raw, event, cpu, timestamp, thread); + process_sched_migrate_task_event(data, event, cpu, timestamp, thread); } static int process_sample_event(event_t *event) @@ -1633,8 +1624,7 @@ static int process_sample_event(event_t *event) if (profile_cpu != -1 && profile_cpu != (int)data.cpu) return 0; - process_raw_event(event, data.raw_size, data.raw_data, data.cpu, - data.time, thread); + process_raw_event(event, data.raw_data, data.cpu, data.time, thread); return 0; } From 3a9a0beba2913edaae39ff8b4645fee10c3acf37 Mon Sep 17 00:00:00 
2001 From: Tom Zanussi Date: Sun, 6 Dec 2009 20:41:52 -0600 Subject: [PATCH 20/57] perf trace/scripting: Fix compile error when libperl not installed When I added the xs callbacks into perf, I forgot to re-check the no-libperl case. This patch fixes the undefined reference error for that. Reported-by: Arnaldo Carvalho de Melo Signed-off-by: Tom Zanussi Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1260153712.6564.4.camel@tropicana> Signed-off-by: Ingo Molnar --- tools/perf/util/trace-event-perl.c | 3 --- tools/perf/util/trace-event-perl.h | 4 ++++ 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/trace-event-perl.c b/tools/perf/util/trace-event-perl.c index 51e833fd58c..59564b22d9c 100644 --- a/tools/perf/util/trace-event-perl.c +++ b/tools/perf/util/trace-event-perl.c @@ -32,9 +32,6 @@ void xs_init(pTHX); -void boot_Perf__Trace__Context(pTHX_ CV *cv); -void boot_DynaLoader(pTHX_ CV *cv); - void xs_init(pTHX) { const char *file = __FILE__; diff --git a/tools/perf/util/trace-event-perl.h b/tools/perf/util/trace-event-perl.h index 8fe0d866fe1..e88fb26137b 100644 --- a/tools/perf/util/trace-event-perl.h +++ b/tools/perf/util/trace-event-perl.h @@ -34,9 +34,13 @@ typedef int INTERP; #define dXSUB_SYS #define pTHX_ static inline void newXS(const char *a, void *b, const char *c) {} +static void boot_Perf__Trace__Context(pTHX_ CV *cv) {} +static void boot_DynaLoader(pTHX_ CV *cv) {} #else #include #include +void boot_Perf__Trace__Context(pTHX_ CV *cv); +void boot_DynaLoader(pTHX_ CV *cv); typedef PerlInterpreter * INTERP; #endif From ed872d09effd54aa8ecb4ceedbc4dbab9592f337 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2009 03:14:17 +0100 Subject: [PATCH 21/57] hw-breakpoints: Zeroe the breakpoint attrs on initialization The perf attrs used to set up breakpoint parameters are often allocated in the stack and not zeroed out before calling hw_breakpoint_init(). 
Handle it from this helper to avoid random attributes set by the stack. Signed-off-by: Frederic Weisbecker Cc: Prasad --- include/linux/hw_breakpoint.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 4d14a384a01..42da1ce19ec 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -22,6 +22,8 @@ enum { static inline void hw_breakpoint_init(struct perf_event_attr *attr) { + memset(attr, 0, sizeof(*attr)); + attr->type = PERF_TYPE_BREAKPOINT; attr->size = sizeof(*attr); /* From 56053170ea2a2c0dc17420e9b94aa3ca51d80408 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 7 Dec 2009 06:46:48 +0100 Subject: [PATCH 22/57] hw-breakpoints: Fix task-bound breakpoint slot allocation Whatever the context nature of a breakpoint, we always perform the following constraint checks before allocating it a slot: - Check the number of pinned breakpoints bound to the concerned cpus - Check the max number of task-bound breakpoints that belong to a task. - Add both and see if we have a remaining slot for the new breakpoint This is the right thing to do when we are about to register a cpu-only bound breakpoint. But not if we are dealing with a task bound breakpoint. What we want in this case is: - Check the number of pinned breakpoints bound to the concerned cpus - Check the number of breakpoints that already belong to the task to which the breakpoint to register is bound. - Add both This fixes a regression that makes the "firefox -g" command fail to register breakpoints once we deal with a secondary thread. 
Reported-by: Walt Signed-off-by: Frederic Weisbecker Cc: Prasad --- kernel/hw_breakpoint.c | 96 ++++++++++++++++++++++++------------------ 1 file changed, 56 insertions(+), 40 deletions(-) diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index b600fc27f16..02b492504a5 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -83,50 +83,16 @@ static unsigned int max_task_bp_pinned(int cpu) return 0; } -/* - * Report the number of pinned/un-pinned breakpoints we have in - * a given cpu (cpu > -1) or in all of them (cpu = -1). - */ -static void fetch_bp_busy_slots(struct bp_busy_slots *slots, int cpu) +static int task_bp_pinned(struct task_struct *tsk) { - if (cpu >= 0) { - slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); - slots->pinned += max_task_bp_pinned(cpu); - slots->flexible = per_cpu(nr_bp_flexible, cpu); - - return; - } - - for_each_online_cpu(cpu) { - unsigned int nr; - - nr = per_cpu(nr_cpu_bp_pinned, cpu); - nr += max_task_bp_pinned(cpu); - - if (nr > slots->pinned) - slots->pinned = nr; - - nr = per_cpu(nr_bp_flexible, cpu); - - if (nr > slots->flexible) - slots->flexible = nr; - } -} - -/* - * Add a pinned breakpoint for the given task in our constraint table - */ -static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) -{ - int count = 0; - struct perf_event *bp; struct perf_event_context *ctx = tsk->perf_event_ctxp; - unsigned int *tsk_pinned; struct list_head *list; + struct perf_event *bp; unsigned long flags; + int count = 0; if (WARN_ONCE(!ctx, "No perf context for this task")) - return; + return 0; list = &ctx->event_list; @@ -143,8 +109,58 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) spin_unlock_irqrestore(&ctx->lock, flags); - if (WARN_ONCE(count < 0, "No breakpoint counter found in the counter list")) + return count; +} + +/* + * Report the number of pinned/un-pinned breakpoints we have in + * a given cpu (cpu > -1) or in all of them (cpu = -1). 
+ */ +static void +fetch_bp_busy_slots(struct bp_busy_slots *slots, struct perf_event *bp) +{ + int cpu = bp->cpu; + struct task_struct *tsk = bp->ctx->task; + + if (cpu >= 0) { + slots->pinned = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + slots->pinned += max_task_bp_pinned(cpu); + else + slots->pinned += task_bp_pinned(tsk); + slots->flexible = per_cpu(nr_bp_flexible, cpu); + return; + } + + for_each_online_cpu(cpu) { + unsigned int nr; + + nr = per_cpu(nr_cpu_bp_pinned, cpu); + if (!tsk) + nr += max_task_bp_pinned(cpu); + else + nr += task_bp_pinned(tsk); + + if (nr > slots->pinned) + slots->pinned = nr; + + nr = per_cpu(nr_bp_flexible, cpu); + + if (nr > slots->flexible) + slots->flexible = nr; + } +} + +/* + * Add a pinned breakpoint for the given task in our constraint table + */ +static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) +{ + unsigned int *tsk_pinned; + int count = 0; + + count = task_bp_pinned(tsk); tsk_pinned = per_cpu(task_bp_pinned, cpu); if (enable) { @@ -233,7 +249,7 @@ int reserve_bp_slot(struct perf_event *bp) mutex_lock(&nr_bp_mutex); - fetch_bp_busy_slots(&slots, bp->cpu); + fetch_bp_busy_slots(&slots, bp); /* Flexible counters need to keep at least one slot */ if (slots.pinned + (!!slots.flexible) == HBP_NUM) { From 67a6259ec97b8408f86f2fe8459d2233f0b0987d Mon Sep 17 00:00:00 2001 From: Tom Zanussi Date: Sun, 6 Dec 2009 23:31:59 -0600 Subject: [PATCH 23/57] perf trace/scripting: Don't display 'scripting unsupported' msg unnecessarily The 'scripting unsupported' message should only be displayed when the -s or -g options are used, and not when they aren't, as the current code does. 
Signed-off-by: Tom Zanussi Cc: rostedt@goodmis.org Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <1260163919-6679-3-git-send-email-tzanussi@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/util/trace-event-perl.c | 64 +++++++++++++++++++++++++----- 1 file changed, 55 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/trace-event-perl.c b/tools/perf/util/trace-event-perl.c index 59564b22d9c..a5ffe60db5d 100644 --- a/tools/perf/util/trace-event-perl.c +++ b/tools/perf/util/trace-event-perl.c @@ -570,26 +570,72 @@ struct scripting_ops perl_scripting_ops = { .generate_script = perl_generate_script, }; -#ifdef NO_LIBPERL -void setup_perl_scripting(void) +static void print_unsupported_msg(void) { fprintf(stderr, "Perl scripting not supported." - " Install libperl and rebuild perf to enable it. e.g. " - "apt-get install libperl-dev (ubuntu), yum install " - "perl-ExtUtils-Embed (Fedora), etc.\n"); + " Install libperl and rebuild perf to enable it.\n" + "For example:\n # apt-get install libperl-dev (ubuntu)" + "\n # yum install perl-ExtUtils-Embed (Fedora)" + "\n etc.\n"); } -#else -void setup_perl_scripting(void) + +static int perl_start_script_unsupported(const char *script __unused) +{ + print_unsupported_msg(); + + return -1; +} + +static int perl_stop_script_unsupported(void) +{ + return 0; +} + +static void perl_process_event_unsupported(int cpu __unused, + void *data __unused, + int size __unused, + unsigned long long nsecs __unused, + char *comm __unused) +{ +} + +static int perl_generate_script_unsupported(const char *outfile __unused) +{ + print_unsupported_msg(); + + return -1; +} + +struct scripting_ops perl_scripting_unsupported_ops = { + .name = "Perl", + .start_script = perl_start_script_unsupported, + .stop_script = perl_stop_script_unsupported, + .process_event = perl_process_event_unsupported, + .generate_script = perl_generate_script_unsupported, +}; + +static 
void register_perl_scripting(struct scripting_ops *scripting_ops) { int err; - err = script_spec_register("Perl", &perl_scripting_ops); + err = script_spec_register("Perl", scripting_ops); if (err) die("error registering Perl script extension"); - err = script_spec_register("pl", &perl_scripting_ops); + err = script_spec_register("pl", scripting_ops); if (err) die("error registering pl script extension"); scripting_context = malloc(sizeof(struct scripting_context)); } + +#ifdef NO_LIBPERL +void setup_perl_scripting(void) +{ + register_perl_scripting(&perl_scripting_unsupported_ops); +} +#else +void setup_perl_scripting(void) +{ + register_perl_scripting(&perl_scripting_ops); +} #endif From 180570fdb7a3c404b599f0a318c2ccf86e4827ed Mon Sep 17 00:00:00 2001 From: Ulrich Drepper Date: Sun, 6 Dec 2009 13:25:30 -0500 Subject: [PATCH 24/57] perf tools: Optimize parse_subsystem_tracepoint_event() Uses of strcat are almost always signs that someone is too lazy to think about the code a bit more carefully. One always has to know about the lengths of the strings involved to avoid buffer overflows. This is one case where the size of the object code for me is reduced by 38 bytes. The code should also be faster, especially if flags is non-NULL. 
Signed-off-by: Ulrich Drepper Cc: a.p.zijlstra@chello.nl Cc: fweisbec@gmail.com Cc: jaswinderrajput@gmail.com Cc: paulus@samba.org LKML-Reference: <200912061825.nB6IPUa1023306@hs20-bc2-1.build.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 448a13b5201..e5bc0fb016b 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -467,7 +467,6 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) while ((evt_ent = readdir(evt_dir))) { char event_opt[MAX_EVOPT_LEN + 1]; int len; - unsigned int rem = MAX_EVOPT_LEN; if (!strcmp(evt_ent->d_name, ".") || !strcmp(evt_ent->d_name, "..") @@ -475,20 +474,12 @@ parse_subsystem_tracepoint_event(char *sys_name, char *flags) || !strcmp(evt_ent->d_name, "filter")) continue; - len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s", sys_name, - evt_ent->d_name); + len = snprintf(event_opt, MAX_EVOPT_LEN, "%s:%s%s%s", sys_name, + evt_ent->d_name, flags ? ":" : "", + flags ?: ""); if (len < 0) return EVT_FAILED; - rem -= len; - if (flags) { - if (rem < strlen(flags) + 1) - return EVT_FAILED; - - strcat(event_opt, ":"); - strcat(event_opt, flags); - } - if (parse_events(NULL, event_opt, 0)) return EVT_FAILED; } From cbe5c34c8c1f8ed1afbe6273f4ad57fcfad7822f Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Sun, 6 Dec 2009 20:14:29 +0900 Subject: [PATCH 25/57] x86: Compile insn.c and inat.c only for KPROBES At least, insn.c and inat.c is needed for kprobe for now. So, this compile those only if KPROBES is enabled. 
Signed-off-by: OGAWA Hirofumi Cc: Masami Hiramatsu LKML-Reference: <878wdg8icq.fsf@devron.myhome.or.jp> Signed-off-by: Ingo Molnar --- arch/x86/Kconfig.debug | 4 ++-- arch/x86/lib/Makefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug index 7d0b681a132..0e90929da40 100644 --- a/arch/x86/Kconfig.debug +++ b/arch/x86/Kconfig.debug @@ -187,8 +187,8 @@ config HAVE_MMIOTRACE_SUPPORT def_bool y config X86_DECODER_SELFTEST - bool "x86 instruction decoder selftest" - depends on DEBUG_KERNEL + bool "x86 instruction decoder selftest" + depends on DEBUG_KERNEL && KPROBES ---help--- Perform x86 instruction decoder selftests at build time. This option is useful for checking the sanity of x86 instruction diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index a2d6472895f..442b3b3b2d8 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -20,7 +20,7 @@ lib-y := delay.o lib-y += thunk_$(BITS).o lib-y += usercopy_$(BITS).o getuser.o putuser.o lib-y += memcpy_$(BITS).o -lib-y += insn.o inat.o +lib-$(CONFIG_KPROBES) += insn.o inat.o obj-y += msr-reg.o msr-reg-export.o From d32ba45503acf9c23b301eba2397ca2ee322627b Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 7 Dec 2009 12:00:33 -0500 Subject: [PATCH 26/57] x86 insn: Delete empty or incomplete inat-tables.c Delete empty or incomplete inat-tables.c if gen-insn-attr-x86.awk failed, because it causes a build error if user tries to build kernel next time. 
Reported-by: Arkadiusz Miskiewicz Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Jens Axboe Cc: Frederic Weisbecker LKML-Reference: <20091207170033.19230.37688.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- arch/x86/lib/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile index 442b3b3b2d8..45b20e486c2 100644 --- a/arch/x86/lib/Makefile +++ b/arch/x86/lib/Makefile @@ -5,7 +5,7 @@ inat_tables_script = $(srctree)/arch/x86/tools/gen-insn-attr-x86.awk inat_tables_maps = $(srctree)/arch/x86/lib/x86-opcode-map.txt quiet_cmd_inat_tables = GEN $@ - cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ + cmd_inat_tables = $(AWK) -f $(inat_tables_script) $(inat_tables_maps) > $@ || rm -f $@ $(obj)/inat-tables.c: $(inat_tables_script) $(inat_tables_maps) $(call cmd,inat_tables) From d56728b8d7fb3e1e5e5f97b88fdf6b43a35b4f5e Mon Sep 17 00:00:00 2001 From: Juha Leppanen Date: Mon, 7 Dec 2009 12:00:40 -0500 Subject: [PATCH 27/57] perf probe: Fix strtailcmp() to compare s1[0] and s2[0] Fix strtailcmp() to compare s1[0] and s2[0]. strtailcmp() returns 0 when comparing "a" with "b" or "a" with "ab", which is wrong behavior. This patch fixes it. 
Signed-off-by: "Juha Leppanen" Acked-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Juha Leppanen Cc: Frederic Weisbecker LKML-Reference: <20091207170040.19230.37464.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-finder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/probe-finder.c b/tools/perf/util/probe-finder.c index 293cdfc1b8c..4585f1d8679 100644 --- a/tools/perf/util/probe-finder.c +++ b/tools/perf/util/probe-finder.c @@ -106,7 +106,7 @@ static int strtailcmp(const char *s1, const char *s2) { int i1 = strlen(s1); int i2 = strlen(s2); - while (--i1 > 0 && --i2 > 0) { + while (--i1 >= 0 && --i2 >= 0) { if (s1[i1] != s2[i2]) return s1[i1] - s2[i2]; } From e1d2017b24fb31602f1128e6a8b2afc54c9283cd Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 7 Dec 2009 12:00:46 -0500 Subject: [PATCH 28/57] perf probe: Fix event namelist to duplicate string Fix event namelist to duplicate string. Without duplicating, adding multiple probes causes stack overwrite bug, because it reuses a buffer on stack while the buffer is already added in the namelist. String duplication solves this bug because only contents of the buffer is copied to the namelist. 
Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091207170046.19230.55557.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index cd7fbda5e2a..de0d91385c9 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -413,12 +413,13 @@ static struct strlist *get_perf_event_names(int fd) rawlist = get_trace_kprobe_event_rawlist(fd); - sl = strlist__new(false, NULL); + sl = strlist__new(true, NULL); for (i = 0; i < strlist__nr_entries(rawlist); i++) { ent = strlist__entry(rawlist, i); parse_trace_kprobe_event(ent->s, &group, &event, NULL); strlist__add(sl, event); free(group); + free(event); } strlist__delete(rawlist); @@ -480,5 +481,6 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) strlist__add(namelist, event); } } + strlist__delete(namelist); close(fd); } From 849884508ecbe2d220131840e4cc7c32ca24ebe3 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 7 Dec 2009 12:00:53 -0500 Subject: [PATCH 29/57] perf probe: Check e_snprintf() format string Check e_snprintf() format string by gcc, and fix a bug of e_snprintf() caller. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091207170053.19230.7690.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index de0d91385c9..88e18044993 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -47,6 +47,9 @@ #define semantic_error(msg ...) 
die("Semantic error :" msg) /* If there is no space to write, returns -E2BIG. */ +static int e_snprintf(char *str, size_t size, const char *format, ...) + __attribute__((format(printf, 3, 4))); + static int e_snprintf(char *str, size_t size, const char *format, ...) { int ret; @@ -258,7 +261,7 @@ int synthesize_perf_probe_event(struct probe_point *pp) ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->function, offs, pp->retprobe ? "%return" : "", line); else - ret = e_snprintf(buf, MAX_CMDLEN, "%s%s%s%s", pp->file, line); + ret = e_snprintf(buf, MAX_CMDLEN, "%s%s", pp->file, line); if (ret <= 0) goto error; len = ret; From d3a2dbf844d81b4b9c9ad6044563c294e7a48cac Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 7 Dec 2009 12:00:59 -0500 Subject: [PATCH 30/57] perf probe: Use pr_debug for debug message Use pr_debug() for "missing vmlinux" debugging message. Signed-off-by: Masami Hiramatsu Cc: systemtap Cc: DLE Cc: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091207170059.19230.51459.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-probe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index a58e11b7ea8..8993a1f4e1c 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -194,8 +194,8 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) if (session.need_dwarf) die("Could not open vmlinux/module file."); - pr_warning("Could not open vmlinux/module file." - " Try to use symbols.\n"); + pr_debug("Could not open vmlinux/module file." 
+ " Try to use symbols.\n"); goto end_dwarf; } From 2ff6cfd70720780234fdfea636218c2a62b31287 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Mon, 7 Dec 2009 17:12:58 +0100 Subject: [PATCH 31/57] perf events: hw_breakpoints: Don't include asm/hw_breakpoint.h in user space asm/hw_breakpoint.h is evidently a kernel internal file and should not be included globally, not even under an #ifdef. Reported-by: Geert Uytterhoeven Signed-off-by: Arnd Bergmann Cc: Frederic Weisbecker Cc: Alan Stern Cc: K.Prasad LKML-Reference: <200912071712.58650.arnd@arndb.de> Signed-off-by: Ingo Molnar --- include/linux/perf_event.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index 89098e35a03..bf3329413e1 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -18,10 +18,6 @@ #include #include -#ifdef CONFIG_HAVE_HW_BREAKPOINT -#include -#endif - /* * User-space ABI bits: */ @@ -451,6 +447,10 @@ enum perf_callchain_context { # include #endif +#ifdef CONFIG_HAVE_HW_BREAKPOINT +#include +#endif + #include #include #include From 6ab8886326a1b9a3a8d164d8174e3c20703a03a2 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Tue, 8 Dec 2009 18:25:15 +1100 Subject: [PATCH 32/57] perf: hw_breakpoints: Fix percpu namespace clash Today's linux-next build failed with: kernel/hw_breakpoint.c:86: error: 'task_bp_pinned' redeclared as different kind of symbol ... Caused by commit dd17c8f72993f9461e9c19250e3f155d6d99df22 ("percpu: remove per_cpu__ prefix") from the percpu tree interacting with commit 56053170ea2a2c0dc17420e9b94aa3ca51d80408 ("hw-breakpoints: Fix task-bound breakpoint slot allocation") from the tip tree. 
Signed-off-by: Stephen Rothwell Acked-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Tejun Heo Cc: Rusty Russell Cc: Christoph Lameter Cc: Andrew Morton Cc: Linus Torvalds LKML-Reference: <20091208182515.bb6dda4a.sfr@canb.auug.org.au> Signed-off-by: Ingo Molnar --- kernel/hw_breakpoint.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 02b492504a5..03a0773ac2b 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -52,7 +52,7 @@ static DEFINE_PER_CPU(unsigned int, nr_cpu_bp_pinned); /* Number of pinned task breakpoints in a cpu */ -static DEFINE_PER_CPU(unsigned int, task_bp_pinned[HBP_NUM]); +static DEFINE_PER_CPU(unsigned int, nr_task_bp_pinned[HBP_NUM]); /* Number of non-pinned cpu/task breakpoints in a cpu */ static DEFINE_PER_CPU(unsigned int, nr_bp_flexible); @@ -73,7 +73,7 @@ static DEFINE_MUTEX(nr_bp_mutex); static unsigned int max_task_bp_pinned(int cpu) { int i; - unsigned int *tsk_pinned = per_cpu(task_bp_pinned, cpu); + unsigned int *tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); for (i = HBP_NUM -1; i >= 0; i--) { if (tsk_pinned[i] > 0) @@ -162,7 +162,7 @@ static void toggle_bp_task_slot(struct task_struct *tsk, int cpu, bool enable) count = task_bp_pinned(tsk); - tsk_pinned = per_cpu(task_bp_pinned, cpu); + tsk_pinned = per_cpu(nr_task_bp_pinned, cpu); if (enable) { tsk_pinned[count]++; if (count > 0) @@ -209,7 +209,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * (per_cpu(nr_bp_flexible, cpu) || (per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu)))) < HBP_NUM * * -> If there are already non-pinned counters in this cpu, it means * there is already a free slot for them. 
@@ -220,7 +220,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * (per_cpu(nr_bp_flexible, *) || (max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *)))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *)))) < HBP_NUM * * -> This is roughly the same, except we check the number of per cpu * bp for every cpu and we keep the max one. Same for the per tasks @@ -232,7 +232,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to a single cpu, check: * * ((per_cpu(nr_bp_flexible, cpu) > 1) + per_cpu(nr_cpu_bp_pinned, cpu) - * + max(per_cpu(task_bp_pinned, cpu))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, cpu))) < HBP_NUM * * -> Same checks as before. But now the nr_bp_flexible, if any, must keep * one register at least (or they will never be fed). @@ -240,7 +240,7 @@ static void toggle_bp_slot(struct perf_event *bp, bool enable) * - If attached to every cpus, check: * * ((per_cpu(nr_bp_flexible, *) > 1) + max(per_cpu(nr_cpu_bp_pinned, *)) - * + max(per_cpu(task_bp_pinned, *))) < HBP_NUM + * + max(per_cpu(nr_task_bp_pinned, *))) < HBP_NUM */ int reserve_bp_slot(struct perf_event *bp) { From 278498d438781426d8f315b65f7bca023a26fcc0 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:02:40 -0500 Subject: [PATCH 33/57] perf probe: Change event list format Change event list format for user readability. perf probe --list shows event list in "[GROUP:EVENT] EVENT-DEFINITION" format, but this format is different from the output of perf-list, and EVENT-DEFINITION is a bit blunt. This patch changes the format to more user friendly one. Before: [probe:schedule_0] schedule+10 prev cpu After: probe:schedule_0 (on schedule+10 with prev cpu) Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220240.10142.42916.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 88e18044993..a20e3827324 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -379,11 +379,29 @@ static void clear_probe_point(struct probe_point *pp) memset(pp, 0, sizeof(pp)); } +/* Show an event */ +static void show_perf_probe_event(const char *group, const char *event, + const char *place, struct probe_point *pp) +{ + int i; + char buf[128]; + + e_snprintf(buf, 128, "%s:%s", group, event); + printf(" %-40s (on %s", buf, place); + + if (pp->nr_args > 0) { + printf(" with"); + for (i = 0; i < pp->nr_args; i++) + printf(" %s", pp->args[i]); + } + printf(")\n"); +} + /* List up current perf-probe events */ void show_perf_probe_events(void) { unsigned int i; - int fd; + int fd, nr; char *group, *event; struct probe_point pp; struct strlist *rawlist; @@ -396,8 +414,13 @@ void show_perf_probe_events(void) for (i = 0; i < strlist__nr_entries(rawlist); i++) { ent = strlist__entry(rawlist, i); parse_trace_kprobe_event(ent->s, &group, &event, &pp); + /* Synthesize only event probe point */ + nr = pp.nr_args; + pp.nr_args = 0; synthesize_perf_probe_event(&pp); - printf("[%s:%s]\t%s\n", group, event, pp.probes[0]); + pp.nr_args = nr; + /* Show an event */ + show_perf_probe_event(group, event, pp.probes[0], &pp); free(group); free(event); clear_probe_point(&pp); From a9b495b0d35859971d6896293f6d0a0d880c7dfb Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:02:47 -0500 Subject: [PATCH 34/57] perf probe: Change probe-added message more user-friendly Change probe-added message 
more user-friendly expression and show usage of new events. Before: Added new event: p:probe/schedule_0 schedule+10 prev=%ax cpu=%bx After: Added new event: probe:schedule_1 (on schedule+1 with prev cpu) You can now use it on all perf tools, such as: perf record -e probe:schedule_1 -a sleep 1 Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220247.10142.91642.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index a20e3827324..2c4d3017441 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -453,17 +453,13 @@ static struct strlist *get_perf_event_names(int fd) return sl; } -static int write_trace_kprobe_event(int fd, const char *buf) +static void write_trace_kprobe_event(int fd, const char *buf) { int ret; ret = write(fd, buf, strlen(buf)); if (ret <= 0) die("Failed to create event."); - else - printf("Added new event: %s\n", buf); - - return ret; } static void get_new_event_name(char *buf, size_t len, const char *base, @@ -503,10 +499,19 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) PERFPROBE_GROUP, event, pp->probes[i]); write_trace_kprobe_event(fd, buf); + printf("Added new event:\n"); + /* Get the first parameter (probe-point) */ + sscanf(pp->probes[i], "%s", buf); + show_perf_probe_event(PERFPROBE_GROUP, event, + buf, pp); /* Add added event name to namelist */ strlist__add(namelist, event); } } + /* Show how to use the event. 
*/ + printf("\nYou can now use it on all perf tools, such as:\n\n"); + printf("\tperf record -e %s:%s -a sleep 1\n\n", PERFPROBE_GROUP, event); + strlist__delete(namelist); close(fd); } From d1bde3f755e8652faad59e264c466c4baab68fa8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:02:54 -0500 Subject: [PATCH 35/57] perf probe: Fix add-probe command syntax without --add option Fix add-probe command syntax without --add option. perf-probe supports add-probe command without --add option. But it treats each argument as an event definition. e.g. perf probe func arg1 arg2 is interpreted as perf probe --add func --add arg1 --add arg2 But it may be useless in many cases. This patch fixes this syntax to fold those arguments into one event definition if there is no --add option. With this change, above command is interpreted as below; perf probe --add "func arg1 arg2" Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220254.10142.73767.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-probe.c | 33 ++++++++++++++++++++++++++------- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 8993a1f4e1c..1347fdf5337 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -79,6 +79,25 @@ static void parse_probe_event(const char *str) pr_debug("%d arguments\n", pp->nr_args); } +static void parse_probe_event_argv(int argc, const char **argv) +{ + int i, len; + char *buf; + + /* Bind up rest arguments */ + len = 0; + for (i = 0; i < argc; i++) + len += strlen(argv[i]) + 1; + buf = zalloc(len + 1); + if (!buf) + die("Failed to allocate memory for binding arguments."); + len = 0; + for (i = 0; i < argc; i++) + len += sprintf(&buf[len], "%s ", argv[i]); + parse_probe_event(buf); + free(buf); +} + static int opt_add_probe_event(const struct option *opt __used, const char *str, int unset __used) { @@ -160,7 +179,7 @@ static const struct option options[] = { int cmd_probe(int argc, const char **argv, const char *prefix __used) { - int i, j, ret; + int i, ret; #ifndef NO_LIBDWARF int fd; #endif @@ -168,8 +187,8 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) argc = parse_options(argc, argv, options, probe_usage, PARSE_OPT_STOP_AT_NON_OPTION); - for (i = 0; i < argc; i++) - parse_probe_event(argv[i]); + if (argc > 0) + parse_probe_event_argv(argc, argv); if ((session.nr_probe == 0 && !listing) || (session.nr_probe != 0 && listing)) @@ -200,8 +219,8 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) } /* Searching probe points */ - for (j = 0; j < session.nr_probe; j++) { - pp = &session.probes[j]; + for (i = 0; i < session.nr_probe; i++) { + pp = 
&session.probes[i]; if (pp->found) continue; @@ -223,8 +242,8 @@ end_dwarf: #endif /* !NO_LIBDWARF */ /* Synthesize probes without dwarf */ - for (j = 0; j < session.nr_probe; j++) { - pp = &session.probes[j]; + for (i = 0; i < session.nr_probe; i++) { + pp = &session.probes[i]; if (pp->found) /* This probe is already found. */ continue; From 17f88fcd667a914b6f4dca146c9a09492fcd57b8 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:02 -0500 Subject: [PATCH 36/57] perf probe: Remove event suffix number _0 Remove event suffix number _0 if it is the first. The first event has no suffix, and from the second, each event has suffix number counted from _1. This reduces typing cost :-). Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220301.10142.50031.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 2c4d3017441..31beedcf61c 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -466,7 +466,16 @@ static void get_new_event_name(char *buf, size_t len, const char *base, struct strlist *namelist) { int i, ret; - for (i = 0; i < MAX_EVENT_INDEX; i++) { + + /* Try no suffix */ + ret = e_snprintf(buf, len, "%s", base); + if (ret < 0) + die("snprintf() failed: %s", strerror(-ret)); + if (!strlist__has_entry(namelist, buf)) + return; + + /* Try to add suffix */ + for (i = 1; i < MAX_EVENT_INDEX; i++) { ret = e_snprintf(buf, len, "%s_%d", base, i); if (ret < 0) die("snprintf() failed: %s", strerror(-ret)); From f984f03da35357b23d53e9cad29e909810857451 Mon Sep 17 00:00:00 2001 
From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:09 -0500 Subject: [PATCH 37/57] perf probe: Support vmlinux on cwd by default Support vmlinux on current working directory by default and also update file-open messages. Now perf probe searches ./vmlinux too. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220309.10142.33040.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-probe.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 1347fdf5337..1c97e133a3f 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -43,11 +43,12 @@ #include "util/probe-event.h" /* Default vmlinux search paths */ -#define NR_SEARCH_PATH 3 +#define NR_SEARCH_PATH 4 const char *default_search_path[NR_SEARCH_PATH] = { "/lib/modules/%s/build/vmlinux", /* Custom build kernel */ "/usr/lib/debug/lib/modules/%s/vmlinux", /* Red Hat debuginfo */ "/boot/vmlinux-debug-%s", /* Ubuntu */ +"./vmlinux", /* CWD */ }; #define MAX_PATH_LEN 256 @@ -205,13 +206,14 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) #else /* !NO_LIBDWARF */ pr_debug("Some probes require debuginfo.\n"); - if (session.vmlinux) + if (session.vmlinux) { + pr_debug("Try to open %s.", session.vmlinux); fd = open(session.vmlinux, O_RDONLY); - else + } else fd = open_default_vmlinux(); if (fd < 0) { if (session.need_dwarf) - die("Could not open vmlinux/module file."); + die("Could not open debuginfo file."); pr_debug("Could not open vmlinux/module file."
" Try to use symbols.\n"); From a7c312bed772c11138409c3a98531e85d690302e Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:16 -0500 Subject: [PATCH 38/57] trace-kprobe: Support delete probe syntax Support delete probe syntax. The syntax is "-:[group/]event". Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220316.10142.39192.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo --- kernel/trace/trace_kprobe.c | 37 ++++++++++++++++++++++++++++--------- 1 file changed, 28 insertions(+), 9 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index aff5f80b59b..bf05fb49a6f 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -606,23 +606,22 @@ static int create_trace_probe(int argc, char **argv) */ struct trace_probe *tp; int i, ret = 0; - int is_return = 0; + int is_return = 0, is_delete = 0; char *symbol = NULL, *event = NULL, *arg = NULL, *group = NULL; unsigned long offset = 0; void *addr = NULL; char buf[MAX_EVENT_NAME_LEN]; - if (argc < 2) { - pr_info("Probe point is not specified.\n"); - return -EINVAL; - } - + /* argc must be >= 1 */ if (argv[0][0] == 'p') is_return = 0; else if (argv[0][0] == 'r') is_return = 1; + else if (argv[0][0] == '-') + is_delete = 1; else { - pr_info("Probe definition must be started with 'p' or 'r'.\n"); + pr_info("Probe definition must be started with 'p', 'r' or" + " '-'.\n"); return -EINVAL; } @@ -642,7 +641,29 @@ static int create_trace_probe(int argc, 
char **argv) return -EINVAL; } } + if (!group) + group = KPROBE_EVENT_SYSTEM; + if (is_delete) { + if (!event) { + pr_info("Delete command needs an event name.\n"); + return -EINVAL; + } + tp = find_probe_event(event, group); + if (!tp) { + pr_info("Event %s/%s doesn't exist.\n", group, event); + return -ENOENT; + } + /* delete an event */ + unregister_trace_probe(tp); + free_trace_probe(tp); + return 0; + } + + if (argc < 2) { + pr_info("Probe point is not specified.\n"); + return -EINVAL; + } if (isdigit(argv[1][0])) { if (is_return) { pr_info("Return probe point must be a symbol.\n"); @@ -671,8 +692,6 @@ static int create_trace_probe(int argc, char **argv) argc -= 2; argv += 2; /* setup a probe */ - if (!group) - group = KPROBE_EVENT_SYSTEM; if (!event) { /* Make a new event name */ if (symbol) From fa28244d12337eebcc620b23852ec3cf29582ff9 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:23 -0500 Subject: [PATCH 39/57] perf probe: Support --del option Support perf probe --del option. Currently, perf probe can have only one event for each --del option. If you'd like to delete several probe events, you need to specify --del for each events. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. 
Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE LKML-Reference: <20091208220323.10142.62079.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-probe.c | 33 +++++++++++++++-- tools/perf/util/probe-event.c | 69 +++++++++++++++++++++++++++++++++-- tools/perf/util/probe-event.h | 1 + 3 files changed, 96 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-probe.c b/tools/perf/builtin-probe.c index 1c97e133a3f..5a47c1e11f7 100644 --- a/tools/perf/builtin-probe.c +++ b/tools/perf/builtin-probe.c @@ -35,6 +35,7 @@ #include "perf.h" #include "builtin.h" #include "util/util.h" +#include "util/strlist.h" #include "util/event.h" #include "util/debug.h" #include "util/parse-options.h" @@ -61,6 +62,7 @@ static struct { int need_dwarf; int nr_probe; struct probe_point probes[MAX_PROBES]; + struct strlist *dellist; } session; static bool listing; @@ -107,6 +109,17 @@ static int opt_add_probe_event(const struct option *opt __used, return 0; } +static int opt_del_probe_event(const struct option *opt __used, + const char *str, int unset __used) +{ + if (str) { + if (!session.dellist) + session.dellist = strlist__new(true, NULL); + strlist__add(session.dellist, str); + } + return 0; +} + #ifndef NO_LIBDWARF static int open_default_vmlinux(void) { @@ -141,6 +154,7 @@ static int open_default_vmlinux(void) static const char * const probe_usage[] = { "perf probe [] 'PROBEDEF' ['PROBEDEF' ...]", "perf probe [] --add 'PROBEDEF' [--add 'PROBEDEF' ...]", + "perf probe [] --del '[GROUP:]EVENT' ...", "perf probe --list", NULL }; @@ -152,7 +166,9 @@ static const struct option options[] = { OPT_STRING('k', "vmlinux", &session.vmlinux, "file", "vmlinux/module pathname"), #endif - OPT_BOOLEAN('l', "list", &listing, "list up current probes"), + OPT_BOOLEAN('l', "list", &listing, "list up current probe events"), + OPT_CALLBACK('d', "del", NULL, 
"[GROUP:]EVENT", "delete a probe event.", + opt_del_probe_event), OPT_CALLBACK('a', "add", NULL, #ifdef NO_LIBDWARF "FUNC[+OFFS|%return] [ARG ...]", @@ -191,15 +207,26 @@ int cmd_probe(int argc, const char **argv, const char *prefix __used) if (argc > 0) parse_probe_event_argv(argc, argv); - if ((session.nr_probe == 0 && !listing) || - (session.nr_probe != 0 && listing)) + if ((session.nr_probe == 0 && !session.dellist && !listing)) usage_with_options(probe_usage, options); if (listing) { + if (session.nr_probe != 0 || session.dellist) { + pr_warning(" Error: Don't use --list with" + " --add/--del.\n"); + usage_with_options(probe_usage, options); + } show_perf_probe_events(); return 0; } + if (session.dellist) { + del_trace_kprobe_events(session.dellist); + strlist__delete(session.dellist); + if (session.nr_probe == 0) + return 0; + } + if (session.need_dwarf) #ifdef NO_LIBDWARF die("Debuginfo-analysis is not supported"); diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 31beedcf61c..9480d9941cc 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -430,10 +430,11 @@ void show_perf_probe_events(void) } /* Get current perf-probe event names */ -static struct strlist *get_perf_event_names(int fd) +static struct strlist *get_perf_event_names(int fd, bool include_group) { unsigned int i; char *group, *event; + char buf[128]; struct strlist *sl, *rawlist; struct str_node *ent; @@ -443,7 +444,12 @@ static struct strlist *get_perf_event_names(int fd) for (i = 0; i < strlist__nr_entries(rawlist); i++) { ent = strlist__entry(rawlist, i); parse_trace_kprobe_event(ent->s, &group, &event, NULL); - strlist__add(sl, event); + if (include_group) { + if (e_snprintf(buf, 128, "%s:%s", group, event) < 0) + die("Failed to copy group:event name."); + strlist__add(sl, buf); + } else + strlist__add(sl, event); free(group); free(event); } @@ -457,9 +463,10 @@ static void write_trace_kprobe_event(int fd, const char *buf) { int 
ret; + pr_debug("Writing event: %s\n", buf); ret = write(fd, buf, strlen(buf)); if (ret <= 0) - die("Failed to create event."); + die("Failed to write event: %s", strerror(errno)); } static void get_new_event_name(char *buf, size_t len, const char *base, @@ -496,7 +503,7 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) fd = open_kprobe_events(O_RDWR, O_APPEND); /* Get current event names */ - namelist = get_perf_event_names(fd); + namelist = get_perf_event_names(fd, false); for (j = 0; j < nr_probes; j++) { pp = probes + j; @@ -524,3 +531,57 @@ void add_trace_kprobe_events(struct probe_point *probes, int nr_probes) strlist__delete(namelist); close(fd); } + +static void del_trace_kprobe_event(int fd, const char *group, + const char *event, struct strlist *namelist) +{ + char buf[128]; + + if (e_snprintf(buf, 128, "%s:%s", group, event) < 0) + die("Failed to copy event."); + if (!strlist__has_entry(namelist, buf)) { + pr_warning("Warning: event \"%s\" is not found.\n", buf); + return; + } + /* Convert from perf-probe event to trace-kprobe event */ + if (e_snprintf(buf, 128, "-:%s/%s", group, event) < 0) + die("Failed to copy event."); + + write_trace_kprobe_event(fd, buf); + printf("Remove event: %s:%s\n", group, event); +} + +void del_trace_kprobe_events(struct strlist *dellist) +{ + int fd; + unsigned int i; + const char *group, *event; + char *p, *str; + struct str_node *ent; + struct strlist *namelist; + + fd = open_kprobe_events(O_RDWR, O_APPEND); + /* Get current event names */ + namelist = get_perf_event_names(fd, true); + + for (i = 0; i < strlist__nr_entries(dellist); i++) { + ent = strlist__entry(dellist, i); + str = strdup(ent->s); + if (!str) + die("Failed to copy event."); + p = strchr(str, ':'); + if (p) { + group = str; + *p = '\0'; + event = p + 1; + } else { + group = PERFPROBE_GROUP; + event = str; + } + del_trace_kprobe_event(fd, group, event, namelist); + free(str); + } + strlist__delete(namelist); + close(fd); +} + diff 
--git a/tools/perf/util/probe-event.h b/tools/perf/util/probe-event.h index 0c6fe56fe38..f752159124a 100644 --- a/tools/perf/util/probe-event.h +++ b/tools/perf/util/probe-event.h @@ -10,6 +10,7 @@ extern void parse_trace_kprobe_event(const char *str, char **group, char **event, struct probe_point *pp); extern int synthesize_trace_kprobe_event(struct probe_point *pp); extern void add_trace_kprobe_events(struct probe_point *probes, int nr_probes); +extern void del_trace_kprobe_events(struct strlist *dellist); extern void show_perf_probe_events(void); /* Maximum index number of event-name postfix */ From c937fe20cb6d9e24c6ad5f9f0c64d64c78411057 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Tue, 8 Dec 2009 17:03:30 -0500 Subject: [PATCH 40/57] perf probe: Update perf-probe document Add --list and --del option descriptions to perf-probe.txt. Signed-off-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Jim Keniston Cc: Ananth N Mavinakayanahalli Cc: Christoph Hellwig Cc: Frank Ch. Eigler Cc: Frederic Weisbecker Cc: Jason Baron Cc: K.Prasad Cc: Peter Zijlstra Cc: Srikar Dronamraju Cc: Arnaldo Carvalho de Melo Cc: systemtap Cc: DLE Cc: Frederic Weisbecker LKML-Reference: <20091208220330.10142.73296.stgit@dhcp-100-2-132.bos.redhat.com> Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-probe.txt | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/tools/perf/Documentation/perf-probe.txt b/tools/perf/Documentation/perf-probe.txt index 9270594e6df..8fa6bf99fcb 100644 --- a/tools/perf/Documentation/perf-probe.txt +++ b/tools/perf/Documentation/perf-probe.txt @@ -8,10 +8,13 @@ perf-probe - Define new dynamic tracepoints SYNOPSIS -------- [verse] -'perf probe' [options] --add 'PROBE' [--add 'PROBE' ...] +'perf probe' [options] --add='PROBE' [...] or -'perf probe' [options] 'PROBE' ['PROBE' ...] - +'perf probe' [options] PROBE +or +'perf probe' [options] --del='[GROUP:]EVENT' [...] 
+or +'perf probe' --list DESCRIPTION ----------- @@ -31,8 +34,16 @@ OPTIONS Be more verbose (show parsed arguments, etc). -a:: ---add:: - Define a probe point (see PROBE SYNTAX for detail) +--add=:: + Define a probe event (see PROBE SYNTAX for detail). + +-d:: +--del=:: + Delete a probe event. + +-l:: +--list:: + List up current probe events. PROBE SYNTAX ------------ From 44234adcdce38f83c56e05f808ce656175b4beeb Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 09:25:48 +0100 Subject: [PATCH 41/57] hw-breakpoints: Modify breakpoints without unregistering them Currently, when ptrace needs to modify a breakpoint, like disabling it, changing its address, type or len, it calls modify_user_hw_breakpoint(). This latter will perform the heavy and racy task of unregistering the old breakpoint and registering a new one. This is racy as someone else might steal the reserved breakpoint slot under us, which is undesired as the breakpoint is only supposed to be modified, sometimes in the middle of a debugging workflow. We don't want our slot to be stolen in the middle. So instead of unregistering/registering the breakpoint, just disable it while we modify its breakpoint fields and re-enable it after if necessary. 
Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Prasad LKML-Reference: <1260347148-5519-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/ptrace.c | 57 ++++++++++++++++------------------- include/linux/hw_breakpoint.h | 4 +-- include/linux/perf_event.h | 5 ++- kernel/hw_breakpoint.c | 42 ++++++++++++++++++++------ kernel/perf_event.c | 4 +-- 5 files changed, 66 insertions(+), 46 deletions(-) diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index b361d28061d..7079ddaf073 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -595,7 +595,7 @@ static unsigned long ptrace_get_dr7(struct perf_event *bp[]) return dr7; } -static struct perf_event * +static int ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, struct task_struct *tsk, int disabled) { @@ -609,11 +609,11 @@ ptrace_modify_breakpoint(struct perf_event *bp, int len, int type, * written the address register first */ if (!bp) - return ERR_PTR(-EINVAL); + return -EINVAL; err = arch_bp_generic_fields(len, type, &gen_len, &gen_type); if (err) - return ERR_PTR(err); + return err; attr = bp->attr; attr.bp_len = gen_len; @@ -658,28 +658,17 @@ restore: if (!second_pass) continue; - thread->ptrace_bps[i] = NULL; - bp = ptrace_modify_breakpoint(bp, len, type, + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 1); - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + if (rc) break; - } - thread->ptrace_bps[i] = bp; } continue; } - bp = ptrace_modify_breakpoint(bp, len, type, tsk, 0); - - /* Incorrect bp, or we have a bug in bp API */ - if (IS_ERR(bp)) { - rc = PTR_ERR(bp); - thread->ptrace_bps[i] = NULL; + rc = ptrace_modify_breakpoint(bp, len, type, tsk, 0); + if (rc) break; - } - thread->ptrace_bps[i] = bp; } /* * Make a second pass to free the remaining unused breakpoints @@ -737,26 +726,32 @@ static int ptrace_set_breakpoint_addr(struct task_struct *tsk, int nr, attr.disabled = 1; bp = 
register_user_hw_breakpoint(&attr, ptrace_triggered, tsk); + + /* + * CHECKME: the previous code returned -EIO if the addr wasn't + * a valid task virtual addr. The new one will return -EINVAL in + * this case. + * -EINVAL may be what we want for in-kernel breakpoints users, + * but -EIO looks better for ptrace, since we refuse a register + * writing for the user. And anyway this is the previous + * behaviour. + */ + if (IS_ERR(bp)) + return PTR_ERR(bp); + + t->ptrace_bps[nr] = bp; } else { + int err; + bp = t->ptrace_bps[nr]; - t->ptrace_bps[nr] = NULL; attr = bp->attr; attr.bp_addr = addr; - bp = modify_user_hw_breakpoint(bp, &attr); + err = modify_user_hw_breakpoint(bp, &attr); + if (err) + return err; } - /* - * CHECKME: the previous code returned -EIO if the addr wasn't a - * valid task virtual addr. The new one will return -EINVAL in this - * case. - * -EINVAL may be what we want for in-kernel breakpoints users, but - * -EIO looks better for ptrace, since we refuse a register writing - * for the user. And anyway this is the previous behaviour. 
- */ - if (IS_ERR(bp)) - return PTR_ERR(bp); - t->ptrace_bps[nr] = bp; return 0; } diff --git a/include/linux/hw_breakpoint.h b/include/linux/hw_breakpoint.h index 42da1ce19ec..69f07a9f127 100644 --- a/include/linux/hw_breakpoint.h +++ b/include/linux/hw_breakpoint.h @@ -55,7 +55,7 @@ register_user_hw_breakpoint(struct perf_event_attr *attr, struct task_struct *tsk); /* FIXME: only change from the attr, and don't unregister */ -extern struct perf_event * +extern int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr); /* @@ -91,7 +91,7 @@ static inline struct perf_event * register_user_hw_breakpoint(struct perf_event_attr *attr, perf_overflow_handler_t triggered, struct task_struct *tsk) { return NULL; } -static inline struct perf_event * +static inline int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { return NULL; } static inline struct perf_event * diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index bf3329413e1..64a53f74c9a 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -872,6 +872,8 @@ extern void perf_output_copy(struct perf_output_handle *handle, const void *buf, unsigned int len); extern int perf_swevent_get_recursion_context(void); extern void perf_swevent_put_recursion_context(int rctx); +extern void perf_event_enable(struct perf_event *event); +extern void perf_event_disable(struct perf_event *event); #else static inline void perf_event_task_sched_in(struct task_struct *task, int cpu) { } @@ -902,7 +904,8 @@ static inline void perf_event_fork(struct task_struct *tsk) { } static inline void perf_event_init(void) { } static inline int perf_swevent_get_recursion_context(void) { return -1; } static inline void perf_swevent_put_recursion_context(int rctx) { } - +static inline void perf_event_enable(struct perf_event *event) { } +static inline void perf_event_disable(struct perf_event *event) { } #endif #define perf_output_put(handle, x) \ diff --git 
a/kernel/hw_breakpoint.c b/kernel/hw_breakpoint.c index 03a0773ac2b..366eedf949c 100644 --- a/kernel/hw_breakpoint.c +++ b/kernel/hw_breakpoint.c @@ -320,18 +320,40 @@ EXPORT_SYMBOL_GPL(register_user_hw_breakpoint); * @triggered: callback to trigger when we hit the breakpoint * @tsk: pointer to 'task_struct' of the process to which the address belongs */ -struct perf_event * -modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) +int modify_user_hw_breakpoint(struct perf_event *bp, struct perf_event_attr *attr) { - /* - * FIXME: do it without unregistering - * - We don't want to lose our slot - * - If the new bp is incorrect, don't lose the older one - */ - unregister_hw_breakpoint(bp); + u64 old_addr = bp->attr.bp_addr; + int old_type = bp->attr.bp_type; + int old_len = bp->attr.bp_len; + int err = 0; - return perf_event_create_kernel_counter(attr, -1, bp->ctx->task->pid, - bp->overflow_handler); + perf_event_disable(bp); + + bp->attr.bp_addr = attr->bp_addr; + bp->attr.bp_type = attr->bp_type; + bp->attr.bp_len = attr->bp_len; + + if (attr->disabled) + goto end; + + err = arch_validate_hwbkpt_settings(bp, bp->ctx->task); + if (!err) + perf_event_enable(bp); + + if (err) { + bp->attr.bp_addr = old_addr; + bp->attr.bp_type = old_type; + bp->attr.bp_len = old_len; + if (!bp->attr.disabled) + perf_event_enable(bp); + + return err; + } + +end: + bp->attr.disabled = attr->disabled; + + return 0; } EXPORT_SYMBOL_GPL(modify_user_hw_breakpoint); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index fd43ff4ac86..3b0cf86eee8 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -567,7 +567,7 @@ static void __perf_event_disable(void *info) * is the current context on this CPU and preemption is disabled, * hence we can't get into perf_event_task_sched_out for this context. 
*/ -static void perf_event_disable(struct perf_event *event) +void perf_event_disable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; @@ -971,7 +971,7 @@ static void __perf_event_enable(void *info) * perf_event_for_each_child or perf_event_for_each as described * for perf_event_disable. */ -static void perf_event_enable(struct perf_event *event) +void perf_event_enable(struct perf_event *event) { struct perf_event_context *ctx = event->ctx; struct task_struct *task = ctx->task; From aa5452d70c0d559310598b243b8b1033c10056e7 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:28:13 +0800 Subject: [PATCH 42/57] perf_event: Clean up __perf_event_init_context() Clean up the code a bit: - define 'perf_cpu_context' variable with 'static' - use kzalloc() instead of kmalloc() and memset() Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F194D.7080306@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3b0cf86eee8..2b06c45bfba 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -36,7 +36,7 @@ /* * Each CPU has a list of per CPU events: */ -DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); +static DEFINE_PER_CPU(struct perf_cpu_context, perf_cpu_context); int perf_max_events __read_mostly = 1; static int perf_reserved_percpu __read_mostly; @@ -1579,7 +1579,6 @@ static void __perf_event_init_context(struct perf_event_context *ctx, struct task_struct *task) { - memset(ctx, 0, sizeof(*ctx)); spin_lock_init(&ctx->lock); mutex_init(&ctx->mutex); INIT_LIST_HEAD(&ctx->group_list); @@ -1654,7 +1653,7 @@ static struct perf_event_context *find_get_context(pid_t pid, int cpu) } if (!ctx) { - ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + ctx = kzalloc(sizeof(struct 
perf_event_context), GFP_KERNEL); err = -ENOMEM; if (!ctx) goto errout; @@ -5105,7 +5104,7 @@ int perf_event_init_task(struct task_struct *child) * First allocate and initialize a context for the child. */ - child_ctx = kmalloc(sizeof(struct perf_event_context), GFP_KERNEL); + child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); if (!child_ctx) return -ENOMEM; From b93f7978ad6b46133e9453b90ccc057dc2429e75 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:29:44 +0800 Subject: [PATCH 43/57] perf_event: Allocate children's perf_event_ctxp at the right time In current code, children task will allocate memory for 'child->perf_event_ctxp' if the parent is counted, we can do it only if the parent allowed children inherit it. It can save memory and reduce overhead. Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19A8.5040805@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 37 ++++++++++++++++++++++--------------- 1 file changed, 22 insertions(+), 15 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 2b06c45bfba..77641ae6b23 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -5083,7 +5083,7 @@ again: */ int perf_event_init_task(struct task_struct *child) { - struct perf_event_context *child_ctx, *parent_ctx; + struct perf_event_context *child_ctx = NULL, *parent_ctx; struct perf_event_context *cloned_ctx; struct perf_event *event; struct task_struct *parent = current; @@ -5098,20 +5098,6 @@ int perf_event_init_task(struct task_struct *child) if (likely(!parent->perf_event_ctxp)) return 0; - /* - * This is executed from the parent task context, so inherit - * events that have been marked for cloning. - * First allocate and initialize a context for the child. 
- */ - - child_ctx = kzalloc(sizeof(struct perf_event_context), GFP_KERNEL); - if (!child_ctx) - return -ENOMEM; - - __perf_event_init_context(child_ctx, child); - child->perf_event_ctxp = child_ctx; - get_task_struct(child); - /* * If the parent's context is a clone, pin it so it won't get * swapped under us. @@ -5142,6 +5128,26 @@ int perf_event_init_task(struct task_struct *child) continue; } + if (!child->perf_event_ctxp) { + /* + * This is executed from the parent task context, so + * inherit events that have been marked for cloning. + * First allocate and initialize a context for the + * child. + */ + + child_ctx = kzalloc(sizeof(struct perf_event_context), + GFP_KERNEL); + if (!child_ctx) { + ret = -ENOMEM; + goto exit; + } + + __perf_event_init_context(child_ctx, child); + child->perf_event_ctxp = child_ctx; + get_task_struct(child); + } + ret = inherit_group(event, parent, parent_ctx, child, child_ctx); if (ret) { @@ -5170,6 +5176,7 @@ int perf_event_init_task(struct task_struct *child) get_ctx(child_ctx->parent_ctx); } +exit: mutex_unlock(&parent_ctx->mutex); perf_unpin_context(parent_ctx); From ec89a06fd4e12301f11ab039ee07d2353a18addc Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 11:30:36 +0800 Subject: [PATCH 44/57] perf_event: Cleanup for cpu_clock_perf_event_update() Using atomic64_xchg() instead of atomic64_read() and atomic64_set(). 
Signed-off-by: Xiao Guangrong Reviewed-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <4B1F19DC.90204@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 77641ae6b23..94e1b28333a 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4079,8 +4079,7 @@ static void cpu_clock_perf_event_update(struct perf_event *event) u64 now; now = cpu_clock(cpu); - prev = atomic64_read(&event->hw.prev_count); - atomic64_set(&event->hw.prev_count, now); + prev = atomic64_xchg(&event->hw.prev_count, now); atomic64_add(now - prev, &event->count); } From 822a6961112f0c9101d3359d8524604c3309ee6c Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Tue, 8 Dec 2009 10:00:04 +0100 Subject: [PATCH 45/57] tracing/kprobes: Fix field creation's bad error handling When we define the common event fields in kprobe, we invert the error handling and return immediately in case of success. Then we omit to define specific kprobes fields (ip and nargs), and specific kretprobes fields (func, ret_ip, nargs). And we only define them when we fail to create common fields. The most visible consequence is that we can't create filters for k(ret)probes specific fields. This patch re-inverts the success/error handling to fix it.
Reported-by: Lai Jiangshan Signed-off-by: Frederic Weisbecker Acked-by: Masami Hiramatsu Cc: Steven Rostedt Cc: Li Zefan LKML-Reference: <1260263815-5167-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/trace/trace_kprobe.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace_kprobe.c b/kernel/trace/trace_kprobe.c index bf05fb49a6f..b52d397e57e 100644 --- a/kernel/trace/trace_kprobe.c +++ b/kernel/trace/trace_kprobe.c @@ -1133,7 +1133,7 @@ static int kprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0); @@ -1151,7 +1151,7 @@ static int kretprobe_event_define_fields(struct ftrace_event_call *event_call) struct trace_probe *tp = (struct trace_probe *)event_call->data; ret = trace_define_common_fields(event_call); - if (!ret) + if (ret) return ret; DEFINE_FIELD(unsigned long, func, FIELD_STRING_FUNC, 0); From c0c9e72150c07b4a6766cd24a6f685ec2dc9c343 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Wed, 9 Dec 2009 17:51:30 +0800 Subject: [PATCH 46/57] perf sched: Fix for getting task's execution time In the current code, a task's execution time is obtained by reading the '/proc/<pid>/sched' file; this is wrong if the task is created by pthread_create(), because every thread task has the same pid. This way also has two demerits: 1: 'perf sched replay' can't work if the kernel is not compiled with the 'CONFIG_SCHED_DEBUG' option 2: the perf tool has to depend on the proc file system So, this patch uses PERF_COUNT_SW_TASK_CLOCK to get the task's execution time instead of reading the /proc file.
Changelog v2 -> v3: use PERF_COUNT_SW_TASK_CLOCK instead of rusage() as Ingo's suggestion Reported-by: Torok Edwin Signed-off-by: Xiao Guangrong Cc: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B1F7322.80103@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 55 +++++++++++++++++++------------------- 1 file changed, 27 insertions(+), 28 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index 19f43faa9f8..b12b23ac06f 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -13,7 +13,6 @@ #include "util/debug.h" #include "util/data_map.h" -#include #include #include @@ -414,34 +413,33 @@ static u64 get_cpu_usage_nsec_parent(void) return sum; } -static u64 get_cpu_usage_nsec_self(void) +static int self_open_counters(void) { - char filename [] = "/proc/1234567890/sched"; - unsigned long msecs, nsecs; - char *line = NULL; - u64 total = 0; - size_t len = 0; - ssize_t chars; - FILE *file; + struct perf_event_attr attr; + int fd; + + memset(&attr, 0, sizeof(attr)); + + attr.type = PERF_TYPE_SOFTWARE; + attr.config = PERF_COUNT_SW_TASK_CLOCK; + + fd = sys_perf_event_open(&attr, 0, -1, -1, 0); + + if (fd < 0) + die("Error: sys_perf_event_open() syscall returned" + "with %d (%s)\n", fd, strerror(errno)); + return fd; +} + +static u64 get_cpu_usage_nsec_self(int fd) +{ + u64 runtime; int ret; - sprintf(filename, "/proc/%d/sched", getpid()); - file = fopen(filename, "r"); - BUG_ON(!file); + ret = read(fd, &runtime, sizeof(runtime)); + BUG_ON(ret != sizeof(runtime)); - while ((chars = getline(&line, &len, file)) != -1) { - ret = sscanf(line, "se.sum_exec_runtime : %ld.%06ld\n", - &msecs, &nsecs); - if (ret == 2) { - total = msecs*1e6 + nsecs; - break; - } - } - if (line) - free(line); - fclose(file); - - return total; + return runtime; } static void *thread_func(void *ctx) @@ -450,9 +448,11 @@ static void *thread_func(void *ctx) u64 cpu_usage_0, 
cpu_usage_1; unsigned long i, ret; char comm2[22]; + int fd; sprintf(comm2, ":%s", this_task->comm); prctl(PR_SET_NAME, comm2); + fd = self_open_counters(); again: ret = sem_post(&this_task->ready_for_work); @@ -462,16 +462,15 @@ again: ret = pthread_mutex_unlock(&start_work_mutex); BUG_ON(ret); - cpu_usage_0 = get_cpu_usage_nsec_self(); + cpu_usage_0 = get_cpu_usage_nsec_self(fd); for (i = 0; i < this_task->nr_events; i++) { this_task->curr_event = i; process_sched_event(this_task, this_task->atoms[i]); } - cpu_usage_1 = get_cpu_usage_nsec_self(); + cpu_usage_1 = get_cpu_usage_nsec_self(fd); this_task->cpu_usage = cpu_usage_1 - cpu_usage_0; - ret = sem_post(&this_task->work_done_sem); BUG_ON(ret); From 21140f4d3387aa2213f1deea0128df1dbf924379 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 14:00:51 +0800 Subject: [PATCH 47/57] perf_event: Fix perf_swevent_hrtimer() variable initialization fix: [] ? printk+0x1d/0x24 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_common+0x71/0xd0 [] ? perf_prepare_sample+0x269/0x280 [] warn_slowpath_null+0x1a/0x20 [] perf_prepare_sample+0x269/0x280 [] ? cpu_clock+0x53/0x90 [] __perf_event_overflow+0x2a8/0x300 [] perf_event_overflow+0x1b/0x30 [] perf_swevent_hrtimer+0x7f/0x120 This is because the 'data.raw' variable is not initialized.
Signed-off-by: Xiao Guangrong Acked-by: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B208E93.1010801@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- kernel/perf_event.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 94e1b28333a..3a5d6c4786b 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4010,6 +4010,7 @@ static enum hrtimer_restart perf_swevent_hrtimer(struct hrtimer *hrtimer) event->pmu->read(event); data.addr = 0; + data.raw = NULL; data.period = event->hw.last_period; regs = get_irq_regs(); /* From 5660ce34241ab204bf78fbcaa5e09318c2748d37 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Wed, 9 Dec 2009 20:26:18 +0100 Subject: [PATCH 48/57] perf tools: Correct size given to memset Memset should be given the size of the structure, not the size of the pointer. The semantic patch that makes this change is as follows: (http://coccinelle.lip6.fr/) // @@ type T; T *x; expression E; @@ memset(x, E, sizeof( + * x)) // Signed-off-by: Julia Lawall Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/util/probe-event.c | 2 +- tools/perf/util/trace-event-parse.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/probe-event.c b/tools/perf/util/probe-event.c index 9480d9941cc..d14a4585bca 100644 --- a/tools/perf/util/probe-event.c +++ b/tools/perf/util/probe-event.c @@ -376,7 +376,7 @@ static void clear_probe_point(struct probe_point *pp) free(pp->args); for (i = 0; i < pp->found; i++) free(pp->probes[i]); - memset(pp, 0, sizeof(pp)); + memset(pp, 0, sizeof(*pp)); } /* Show an event */ diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 6ffe9d63d85..c5c32be040b 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -1477,7 +1477,7 @@ process_fields(struct event *event, struct print_flag_sym **list, char **tok) goto out_free; 
field = malloc_or_die(sizeof(*field)); - memset(field, 0, sizeof(field)); + memset(field, 0, sizeof(*field)); value = arg_eval(arg); field->value = strdup(value); From 3786310afe738070be31c439b8deeaeb69b9735d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Wed, 9 Dec 2009 21:40:08 +0100 Subject: [PATCH 49/57] perf sched: Add max delay time snapshot When we have a maximum latency reported for a task, we need a convenient way to find the matching location to the raw traces or to perf sched map that shows where the task has been eventually scheduled in. This gives a pointer to retrieve the events that occurred during this max latency. Signed-off-by: Frederic Weisbecker Reviewed-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Paul Mackerras LKML-Reference: <1260391208-6808-1-git-send-regression-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-sched.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-sched.c b/tools/perf/builtin-sched.c index b12b23ac06f..7cca7c15b40 100644 --- a/tools/perf/builtin-sched.c +++ b/tools/perf/builtin-sched.c @@ -140,6 +140,7 @@ struct work_atoms { struct thread *thread; struct rb_node node; u64 max_lat; + u64 max_lat_at; u64 total_lat; u64 nb_atoms; u64 total_runtime; @@ -1013,8 +1014,10 @@ add_sched_in_event(struct work_atoms *atoms, u64 timestamp) delta = atom->sched_in_time - atom->wake_up_time; atoms->total_lat += delta; - if (delta > atoms->max_lat) + if (delta > atoms->max_lat) { atoms->max_lat = delta; + atoms->max_lat_at = timestamp; + } atoms->nb_atoms++; } @@ -1210,10 +1213,11 @@ static void output_lat_thread(struct work_atoms *work_list) avg = work_list->total_lat / work_list->nb_atoms; - printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms |\n", + printf("|%11.3f ms |%9llu | avg:%9.3f ms | max:%9.3f ms | max at: %9.6f s\n", (double)work_list->total_runtime / 1e6, work_list->nb_atoms, (double)avg / 1e6, -
(double)work_list->max_lat / 1e6); + (double)work_list->max_lat / 1e6, + (double)work_list->max_lat_at / 1e9); } static int pid_cmp(struct work_atoms *l, struct work_atoms *r) @@ -1704,9 +1708,9 @@ static void __cmd_lat(void) read_events(); sort_lat(); - printf("\n -----------------------------------------------------------------------------------------\n"); - printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms |\n"); - printf(" -----------------------------------------------------------------------------------------\n"); + printf("\n ---------------------------------------------------------------------------------------------------------------\n"); + printf(" Task | Runtime ms | Switches | Average delay ms | Maximum delay ms | Maximum delay at |\n"); + printf(" ---------------------------------------------------------------------------------------------------------------\n"); next = rb_first(&sorted_atom_root); From 1bbfa6f25673019dc0acc9308b667c96f6cda8bf Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Wed, 9 Dec 2009 20:07:03 -0500 Subject: [PATCH 50/57] sched: Mark sched_clock() as notrace The core ftrace code (trace_clock_local) calls sched_clock() directly, so we don't want to recurisvely trigger the ftrace code. Rather than update every sched_clock() definition, tag the prototype for everyone as notrace. 
Signed-off-by: Mike Frysinger Cc: Peter Zijlstra LKML-Reference: <1260407223-10900-1-git-send-email-vapier@gentoo.org> Signed-off-by: Ingo Molnar --- include/linux/sched.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 75e6e60bf58..576d838adf6 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1836,7 +1836,8 @@ static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask) extern int sched_clock_stable; #endif -extern unsigned long long sched_clock(void); +/* ftrace calls sched_clock() directly */ +extern unsigned long long notrace sched_clock(void); extern void sched_clock_init(void); extern u64 sched_clock_cpu(int cpu); From 90b86a9f7dc22e7ff8e8c79ed553860454ff8dd9 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Dec 2009 15:21:57 +0800 Subject: [PATCH 51/57] perf kmem: Show usage if no option is specified As Ingo suggested, make "perf kmem" show help information. "perf kmem stat [--caller] [--alloc] .." will show memory statistics. Signed-off-by: Li Zefan Acked-by: Pekka Enberg LKML-Reference: <4B20A195.8030106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-kmem.txt | 13 ++++--- tools/perf/builtin-kmem.c | 52 ++++++++++++++------------ 2 files changed, 37 insertions(+), 28 deletions(-) diff --git a/tools/perf/Documentation/perf-kmem.txt b/tools/perf/Documentation/perf-kmem.txt index 44b0ce35c28..eac4d852e7c 100644 --- a/tools/perf/Documentation/perf-kmem.txt +++ b/tools/perf/Documentation/perf-kmem.txt @@ -8,16 +8,16 @@ perf-kmem - Tool to trace/measure kernel memory(slab) properties SYNOPSIS -------- [verse] -'perf kmem' {record} [] +'perf kmem' {record|stat} [] DESCRIPTION ----------- -There's two variants of perf kmem: +There are two variants of perf kmem: 'perf kmem record ' to record the kmem events of an arbitrary workload. - 'perf kmem' to report kernel memory statistics. 
+ 'perf kmem stat' to report kernel memory statistics. OPTIONS ------- @@ -25,8 +25,11 @@ OPTIONS --input=:: Select the input file (default: perf.data) ---stat=:: - Select per callsite or per allocation statistics +--caller:: + Show per-callsite statistics + +--alloc:: + Show per-allocation statistics -s :: --sort=:: diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 7551a5f834b..1b04787ed90 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -526,7 +526,7 @@ static int __cmd_kmem(void) } static const char * const kmem_usage[] = { - "perf kmem [] {record}", + "perf kmem [] {record|stat}", NULL }; @@ -686,18 +686,17 @@ static int parse_sort_opt(const struct option *opt __used, return 0; } -static int parse_stat_opt(const struct option *opt __used, +static int parse_caller_opt(const struct option *opt __used, const char *arg, int unset __used) { - if (!arg) - return -1; + caller_flag = (alloc_flag + 1); + return 0; +} - if (strcmp(arg, "alloc") == 0) - alloc_flag = (caller_flag + 1); - else if (strcmp(arg, "caller") == 0) - caller_flag = (alloc_flag + 1); - else - return -1; +static int parse_alloc_opt(const struct option *opt __used, + const char *arg, int unset __used) +{ + alloc_flag = (caller_flag + 1); return 0; } @@ -722,14 +721,17 @@ static int parse_line_opt(const struct option *opt __used, static const struct option kmem_options[] = { OPT_STRING('i', "input", &input_name, "file", "input file name"), - OPT_CALLBACK(0, "stat", NULL, "|", - "stat selector, Pass 'alloc' or 'caller'.", - parse_stat_opt), + OPT_CALLBACK_NOOPT(0, "caller", NULL, NULL, + "show per-callsite statistics", + parse_caller_opt), + OPT_CALLBACK_NOOPT(0, "alloc", NULL, NULL, + "show per-allocation statistics", + parse_alloc_opt), OPT_CALLBACK('s', "sort", NULL, "key[,key2...]", "sort by keys: ptr, call_site, bytes, hit, pingpong, frag", parse_sort_opt), OPT_CALLBACK('l', "line", NULL, "num", - "show n lins", + "show n lines", parse_line_opt), 
OPT_BOOLEAN(0, "raw-ip", &raw_ip, "show raw ip instead of symbol"), OPT_END() @@ -773,18 +775,22 @@ int cmd_kmem(int argc, const char **argv, const char *prefix __used) argc = parse_options(argc, argv, kmem_options, kmem_usage, 0); - if (argc && !strncmp(argv[0], "rec", 3)) - return __cmd_record(argc, argv); - else if (argc) + if (!argc) usage_with_options(kmem_usage, kmem_options); - if (list_empty(&caller_sort)) - setup_sorting(&caller_sort, default_sort_order); - if (list_empty(&alloc_sort)) - setup_sorting(&alloc_sort, default_sort_order); + if (!strncmp(argv[0], "rec", 3)) { + return __cmd_record(argc, argv); + } else if (!strcmp(argv[0], "stat")) { + setup_cpunode_map(); - setup_cpunode_map(); + if (list_empty(&caller_sort)) + setup_sorting(&caller_sort, default_sort_order); + if (list_empty(&alloc_sort)) + setup_sorting(&alloc_sort, default_sort_order); - return __cmd_kmem(); + return __cmd_kmem(); + } + + return 0; } From bc3abfb1b50964ffbbd0fc4e1ffe598b1b63a8c7 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 10 Dec 2009 15:22:17 +0800 Subject: [PATCH 52/57] perf tools: Align long options which have no short forms Before: $ ./perf kmem ... -l, --line show n lines --raw-ip show raw ip instead of symbol After: $ ./perf kmem ... 
-l, --line show n lines --raw-ip show raw ip instead of symbol Signed-off-by: Li Zefan Cc: Pekka Enberg LKML-Reference: <4B20A1A9.3040104@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/util/parse-options.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/parse-options.c b/tools/perf/util/parse-options.c index 6d8af48c925..efebd5b476b 100644 --- a/tools/perf/util/parse-options.c +++ b/tools/perf/util/parse-options.c @@ -430,6 +430,9 @@ int usage_with_options_internal(const char * const *usagestr, pos = fprintf(stderr, " "); if (opts->short_name) pos += fprintf(stderr, "-%c", opts->short_name); + else + pos += fprintf(stderr, " "); + if (opts->long_name && opts->short_name) pos += fprintf(stderr, ", "); if (opts->long_name) From 8b4825bf8da5c07e80496b749e9a50d675df4119 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 9 Dec 2009 20:09:37 -0200 Subject: [PATCH 53/57] perf symbols: dsos__read_build_ids() should read both user and kernel buildids MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Arnaldo Carvalho de Melo Cc: Frédéric Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1260396578-19116-1-git-send-email-acme@infradead.org> Signed-off-by: Ingo Molnar --- tools/perf/util/symbol.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index fffcb937cdc..e7508ad3450 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -938,8 +938,9 @@ static bool __dsos__read_build_ids(struct list_head *head) bool dsos__read_build_ids(void) { - return __dsos__read_build_ids(&dsos__kernel) || - __dsos__read_build_ids(&dsos__user); + bool kbuildids = __dsos__read_build_ids(&dsos__kernel), + ubuildids = __dsos__read_build_ids(&dsos__user); + return kbuildids || ubuildids; } /* From 716d69e4fda0563ef67d62ee44baa17b377b9b23 Mon Sep 17 00:00:00 2001 From: Arnaldo 
Carvalho de Melo Date: Wed, 9 Dec 2009 20:09:38 -0200 Subject: [PATCH 54/57] perf symbols: perf_header__read_build_ids() offset'n'size should be u64 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As off_t is a long, so breaking things on 32-bit land. Now buildids work on 32-bit land. [root@ana ~]# uname -a Linux ana.ghostprotocols.net 2.6.31.6-162.fc12.i686 #1 SMP Fri Dec 4 01:09:09 EST 2009 i686 i686 i386 GNU/Linux [root@ana ~]# perf buildid-list | tail -5 136ee6792ba2ae57870ecd87369f4ae3194d5b27 /lib/libreadline.so.6.0 d202dcb1ad48d140065783657d37ae3f2d9ab83f /usr/bin/gdb 0a56c0c00dcc2e9e581ae9997f31957c9c4671df /usr/lib/libdwarf.so.0.0 5f9e6ac95241cbb3227608e0ff2a2e0cbbe72439 /home/acme/bin/perf 925d19eccc2ddb1c9d74dd178a011426f1b124a8 /bin/sleep [root@ana ~]# Signed-off-by: Arnaldo Carvalho de Melo Cc: Frédéric Weisbecker Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1260396578-19116-2-git-send-email-acme@infradead.org> Signed-off-by: Ingo Molnar --- tools/perf/util/data_map.c | 4 ++-- tools/perf/util/data_map.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/data_map.c b/tools/perf/util/data_map.c index ca0bedf637c..59b65d0bd7c 100644 --- a/tools/perf/util/data_map.c +++ b/tools/perf/util/data_map.c @@ -100,11 +100,11 @@ process_event(event_t *event, unsigned long offset, unsigned long head) } } -int perf_header__read_build_ids(int input, off_t offset, off_t size) +int perf_header__read_build_ids(int input, u64 offset, u64 size) { struct build_id_event bev; char filename[PATH_MAX]; - off_t limit = offset + size; + u64 limit = offset + size; int err = -1; while (offset < limit) { diff --git a/tools/perf/util/data_map.h b/tools/perf/util/data_map.h index 3180ff7e363..258a87bcc4f 100644 --- a/tools/perf/util/data_map.h +++ b/tools/perf/util/data_map.h @@ -27,6 +27,6 @@ int mmap_dispatch_perf_file(struct perf_header **pheader, int full_paths, int *cwdlen, 
char **cwd); -int perf_header__read_build_ids(int input, off_t offset, off_t file_size); +int perf_header__read_build_ids(int input, u64 offset, u64 file_size); #endif From 7931241694b25589658b1ceb02218d2750540ae0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 10 Dec 2009 08:43:34 +0100 Subject: [PATCH 55/57] perf kmem: Fix unused argument build warning Fix: builtin-kmem.c: In function 'parse_caller_opt': builtin-kmem.c:690: error: unused parameter 'arg' builtin-kmem.c: In function 'parse_alloc_opt': builtin-kmem.c:697: error: unused parameter 'arg' Cc: Li Zefan Cc: Pekka Enberg LKML-Reference: <4B20A195.8030106@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-kmem.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-kmem.c b/tools/perf/builtin-kmem.c index 1b04787ed90..5f209514f65 100644 --- a/tools/perf/builtin-kmem.c +++ b/tools/perf/builtin-kmem.c @@ -687,14 +687,14 @@ static int parse_sort_opt(const struct option *opt __used, } static int parse_caller_opt(const struct option *opt __used, - const char *arg, int unset __used) + const char *arg __used, int unset __used) { caller_flag = (alloc_flag + 1); return 0; } static int parse_alloc_opt(const struct option *opt __used, - const char *arg, int unset __used) + const char *arg __used, int unset __used) { alloc_flag = (caller_flag + 1); return 0; From 5e855db5d8fec44e6604eb245aa9077bbd3f0d05 Mon Sep 17 00:00:00 2001 From: Xiao Guangrong Date: Thu, 10 Dec 2009 17:08:54 +0800 Subject: [PATCH 56/57] perf_event: Fix variable initialization in other codepaths Signed-off-by: Xiao Guangrong Cc: Peter Zijlstra Cc: Frederic Weisbecker Cc: Paul Mackerras LKML-Reference: <4B20BAA6.7010609@cn.fujitsu.com> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 4 ++++ kernel/perf_event.c | 1 + 2 files changed, 5 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index d35f26076ae..1342f236e32 100644 --- 
a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -1632,6 +1632,7 @@ static void intel_pmu_drain_bts_buffer(struct cpu_hw_events *cpuc) data.period = event->hw.last_period; data.addr = 0; + data.raw = NULL; regs.ip = 0; /* @@ -1749,6 +1750,7 @@ static int p6_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1794,6 +1796,7 @@ static int intel_pmu_handle_irq(struct pt_regs *regs) u64 ack, status; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); @@ -1857,6 +1860,7 @@ static int amd_pmu_handle_irq(struct pt_regs *regs) u64 val; data.addr = 0; + data.raw = NULL; cpuc = &__get_cpu_var(cpu_hw_events); diff --git a/kernel/perf_event.c b/kernel/perf_event.c index 3a5d6c4786b..d891ec4a810 100644 --- a/kernel/perf_event.c +++ b/kernel/perf_event.c @@ -4300,6 +4300,7 @@ void perf_bp_event(struct perf_event *bp, void *data) struct perf_sample_data sample; struct pt_regs *regs = data; + sample.raw = NULL; sample.addr = bp->attr.bp_addr; if (!perf_exclude_event(bp, regs)) From 125580380f418000b1a06d9a54700f1191b6e561 Mon Sep 17 00:00:00 2001 From: Cyrill Gorcunov Date: Thu, 10 Dec 2009 19:56:34 +0300 Subject: [PATCH 57/57] x86, perf events: Check if we have APIC enabled Ralf Hildebrandt reported this boot warning: | Running a vanilla 2.6.32 as Xen DomU, I'm getting: | | [ 0.000999] CPU: Physical Processor ID: 0 | [ 0.000999] CPU: Processor Core ID: 1 | [ 0.000999] Performance Events: AMD PMU driver. | [ 0.000999] ------------[ cut here ]------------ | [ 0.000999] WARNING: at arch/x86/kernel/apic/apic.c:249 native_apic_write_dummy So we need to check if APIC functionality is available, and not just in the P6 driver but elsewhere as well. 
Reported-by: Ralf Hildebrandt Signed-off-by: Cyrill Gorcunov Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker LKML-Reference: <20091210165634.GF5086@lenovo> Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index 1342f236e32..18f05eccbb6 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2066,12 +2066,6 @@ static __init int p6_pmu_init(void) x86_pmu = p6_pmu; - if (!cpu_has_apic) { - pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); - pr_info("no hardware sampling interrupt available.\n"); - x86_pmu.apic = 0; - } - return 0; } @@ -2163,6 +2157,16 @@ static __init int amd_pmu_init(void) return 0; } +static void __init pmu_check_apic(void) +{ + if (cpu_has_apic) + return; + + x86_pmu.apic = 0; + pr_info("no APIC, boot with the \"lapic\" boot parameter to force-enable it.\n"); + pr_info("no hardware sampling interrupt available.\n"); +} + void __init init_hw_perf_events(void) { int err; @@ -2184,6 +2188,8 @@ void __init init_hw_perf_events(void) return; } + pmu_check_apic(); + pr_cont("%s PMU driver.\n", x86_pmu.name); if (x86_pmu.num_events > X86_PMC_MAX_GENERIC) {