From 923c42c19944da214d697e312a040384a0e33e78 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Wed, 22 Jul 2009 20:36:03 +0200 Subject: [PATCH 01/27] perf_counter tools: Fix/resurrect perf top annotation in a simple interactive form perf top used to have annotation support, but it has bitrotted and removed. This patch restores that: it allows the user to select any symbol in kernel space for source level annotation on the fly, switch between event counters and alter display variables. When symbol details are being displayed, stopping annotation reverts to normal. known keys: [d] select display delay. [e] select display entries (lines). [E] select annotation event counter. [f] select normal display count filter. [F] select annotation display count filter (percentage). [qQ] quit. [s] select annotation symbol and start annotation. [S] stop annotation, revert to normal display. [z] toggle event count zeroing. Sample: ------------------------------------------------------------------------------ PerfTop: 16719 irqs/sec kernel:78.7% [cache-misses/cache-references/instructions/cycles], (all, 4 CPUs) ------------------------------------------------------------------------------ Showing cache-misses for e1000_clean_rx_irq Events Pcnt (>=3%) 0 0.0% /* adjust length to remove Ethernet CRC */ 0 0.0% if (!(adapter->flags2 & FLAG2_CRC_STRIPPING)) 0 0.0% length -= 4; 436 5.0% f039: 41 f6 84 24 5c 29 00 testb $0x1,0x295c(%r12) 0 0.0% f089: 8b 4d 84 mov -0x7c(%rbp),%ecx 0 0.0% f08c: 48 83 ef 02 sub $0x2,%rdi 0 0.0% f090: 48 83 ee 02 sub $0x2,%rsi 811 9.3% f094: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 0 0.0% 0 0.0% while (rx_desc->status & E1000_RXD_STAT_DD) { 0 0.0% f114: 41 f6 47 0c 01 testb $0x1,0xc(%r15) 7226 82.6% f119: 0f 85 24 fe ff ff jne ef43 Available events: 0 cache-misses 1 cache-references 2 instructions 3 cycles Enter details event counter: 2 ------------------------------------------------------------------------------ PerfTop: 15035 irqs/sec kernel:79.0% [cache-misses/cache-references/instructions/cycles], (all, 4 CPUs) ------------------------------------------------------------------------------ Showing instructions for e1000_clean_rx_irq Events Pcnt (>=3%) 0 0.0% int *work_done, int work_to_do) 0 0.0% { 175 0.9% eebf: 55 push %rbp 1898 9.8% eec0: 48 89 e5 mov %rsp,%rbp 0 0.0% 0 0.0% i = rx_ring->next_to_clean; 140 0.7% ef0a: 0f b7 41 1a movzwl 0x1a(%rcx),%eax 670 3.4% ef0e: 89 45 ac mov %eax,-0x54(%rbp) 0 0.0% { 0 0.0% memcpy(skb->data + offset, from, len); 91 0.5% f07b: 49 8b b6 e8 00 00 00 mov 0xe8(%r14),%rsi 1153 5.9% f082: 48 8b b8 e8 00 00 00 mov 0xe8(%rax),%rdi 42 0.2% f089: 8b 4d 84 mov -0x7c(%rbp),%ecx 14 0.1% f08c: 48 83 ef 02 sub $0x2,%rdi 0 0.0% f090: 48 83 ee 02 sub $0x2,%rsi 1618 8.3% f094: f3 a4 rep movsb %ds:(%rsi),%es:(%rdi) 0 0.0% 0 0.0% /* return some buffers to hardware, one at a time is too slow */ 0 0.0% if (cleaned_count >= E1000_RX_BUFFER_WRITE) { 867 4.5% f0e7: 83 7d b0 0f cmpl $0xf,-0x50(%rbp) 0 0.0% 0 0.0% while (rx_desc->status & E1000_RXD_STAT_DD) { 37 0.2% f114: 41 f6 47 0c 01 testb $0x1,0xc(%r15) 4047 20.8% f119: 0f 85 24 fe ff ff jne ef43 Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- tools/perf/builtin-top.c | 494 +++++++++++++++++++++++++++++++++++---- 1 file changed, 455 insertions(+), 39 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index f139f1ab933..d58701346b1 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -31,6 +31,8 @@ #include 
#include +#include +#include #include #include @@ -54,7 +56,7 @@ static int system_wide = 0; static int default_interval = 100000; -static u64 count_filter = 5; +static int count_filter = 5; static int print_entries = 15; static int target_pid = -1; @@ -69,14 +71,26 @@ static int freq = 0; static int verbose = 0; static char *vmlinux = NULL; -static char *sym_filter; -static unsigned long filter_start; -static unsigned long filter_end; - static int delay_secs = 2; static int zero; static int dump_symtab; +/* + * Source + */ + +struct source_line { + u64 eip; + unsigned long count[MAX_COUNTERS]; + char *line; + struct source_line *next; +}; + +static char *sym_filter = NULL; +struct sym_entry *sym_filter_entry = NULL; +static int sym_pcnt_filter = 5; +static int sym_counter = 0; + /* * Symbols */ @@ -91,9 +105,237 @@ struct sym_entry { unsigned long snap_count; double weight; int skip; + struct source_line *source; + struct source_line *lines; + struct source_line **lines_tail; + pthread_mutex_t source_lock; }; -struct sym_entry *sym_filter_entry; +/* + * Source functions + */ + +static void parse_source(struct sym_entry *syme) +{ + struct symbol *sym; + struct module *module; + struct section *section = NULL; + FILE *file; + char command[PATH_MAX*2], *path = vmlinux; + u64 start, end, len; + + if (!syme) + return; + + if (syme->lines) { + pthread_mutex_lock(&syme->source_lock); + goto out_assign; + } + + sym = (struct symbol *)(syme + 1); + module = sym->module; + + if (module) + path = module->path; + if (!path) + return; + + start = sym->obj_start; + if (!start) + start = sym->start; + + if (module) { + section = module->sections->find_section(module->sections, ".text"); + if (section) + start -= section->vma; + } + + end = start + sym->end - sym->start + 1; + len = sym->end - sym->start; + + sprintf(command, "objdump --start-address=0x%016Lx --stop-address=0x%016Lx -dS %s", start, end, path); + + file = popen(command, "r"); + if (!file) + return; + + pthread_mutex_lock(&syme->source_lock); + syme->lines_tail = &syme->lines; + while (!feof(file)) { + struct source_line *src; + size_t dummy = 0; + char *c; + + src = malloc(sizeof(struct source_line)); + assert(src != NULL); + memset(src, 0, sizeof(struct source_line)); + + if (getline(&src->line, &dummy, file) < 0) + break; + if (!src->line) + break; + + c = strchr(src->line, '\n'); + if (c) + *c = 0; + + src->next = NULL; + *syme->lines_tail = src; + syme->lines_tail = &src->next; + + if (strlen(src->line)>8 && src->line[8] == ':') { + src->eip = strtoull(src->line, NULL, 16); + if (section) + src->eip += section->vma; + } + if (strlen(src->line)>8 && src->line[16] == ':') { + src->eip = strtoull(src->line, NULL, 16); + if (section) + src->eip += section->vma; + } + } + pclose(file); +out_assign: + sym_filter_entry = syme; + pthread_mutex_unlock(&syme->source_lock); +} + +static void __zero_source_counters(struct sym_entry *syme) +{ + int i; + struct source_line *line; + + line = syme->lines; + while (line) { + for (i = 0; i < nr_counters; i++) + line->count[i] = 0; + line = line->next; + } +} + +static void record_precise_ip(struct sym_entry *syme, int counter, u64 ip) +{ + struct source_line *line; + + if (syme != sym_filter_entry) + return; + + if (pthread_mutex_trylock(&syme->source_lock)) + return; + + if (!syme->source) + goto out_unlock; + + for (line = syme->lines; line; line = line->next) { + if (line->eip == ip) { + line->count[counter]++; + break; + } + if (line->eip > ip) + break; + } +out_unlock: + 
pthread_mutex_unlock(&syme->source_lock); +} + +static void lookup_sym_source(struct sym_entry *syme) +{ + struct symbol *symbol = (struct symbol *)(syme + 1); + struct source_line *line; + char pattern[PATH_MAX]; + char *idx; + + sprintf(pattern, "<%s>:", symbol->name); + + if (symbol->module) { + idx = strstr(pattern, "\t"); + if (idx) + *idx = 0; + } + + pthread_mutex_lock(&syme->source_lock); + for (line = syme->lines; line; line = line->next) { + if (strstr(line->line, pattern)) { + syme->source = line; + break; + } + } + pthread_mutex_unlock(&syme->source_lock); +} + +static void show_lines(struct source_line *queue, int count, int total) +{ + int i; + struct source_line *line; + + line = queue; + for (i = 0; i < count; i++) { + float pcnt = 100.0*(float)line->count[sym_counter]/(float)total; + + printf("%8li %4.1f%%\t%s\n", line->count[sym_counter], pcnt, line->line); + line = line->next; + } +} + +#define TRACE_COUNT 3 + +static void show_details(struct sym_entry *syme) +{ + struct symbol *symbol; + struct source_line *line; + struct source_line *line_queue = NULL; + int displayed = 0; + int line_queue_count = 0, total = 0, more = 0; + + if (!syme) + return; + + if (!syme->source) + lookup_sym_source(syme); + + if (!syme->source) + return; + + symbol = (struct symbol *)(syme + 1); + printf("Showing %s for %s\n", event_name(sym_counter), symbol->name); + printf(" Events Pcnt (>=%d%%)\n", sym_pcnt_filter); + + pthread_mutex_lock(&syme->source_lock); + line = syme->source; + while (line) { + total += line->count[sym_counter]; + line = line->next; + } + + line = syme->source; + while (line) { + float pcnt = 0.0; + + if (!line_queue_count) + line_queue = line; + line_queue_count++; + + if (line->count[sym_counter]) + pcnt = 100.0 * line->count[sym_counter] / (float)total; + if (pcnt >= (float)sym_pcnt_filter) { + if (displayed <= print_entries) + show_lines(line_queue, line_queue_count, total); + else more++; + displayed += line_queue_count; + line_queue_count = 0; + line_queue = NULL; + } else if (line_queue_count > TRACE_COUNT) { + line_queue = line_queue->next; + line_queue_count--; + } + + line->count[sym_counter] = zero ? 
0 : line->count[sym_counter] * 7 / 8; + line = line->next; + } + pthread_mutex_unlock(&syme->source_lock); + if (more) + printf("%d lines not displayed, maybe increase display entries [e]\n", more); +} struct dso *kernel_dso; @@ -228,6 +470,11 @@ static void print_sym_table(void) printf("------------------------------------------------------------------------------\n\n"); + if (sym_filter_entry) { + show_details(sym_filter_entry); + return; + } + if (nr_counters == 1) printf(" samples pcnt"); else @@ -242,7 +489,7 @@ static void print_sym_table(void) struct symbol *sym = (struct symbol *)(syme + 1); double pcnt; - if (++printed > print_entries || syme->snap_count < count_filter) + if (++printed > print_entries || (int)syme->snap_count < count_filter) continue; pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / @@ -261,19 +508,208 @@ static void print_sym_table(void) } } +static void prompt_integer(int *target, const char *msg) +{ + char *buf = malloc(0), *p; + size_t dummy = 0; + int tmp; + + fprintf(stdout, "\n%s: ", msg); + if (getline(&buf, &dummy, stdin) < 0) + return; + + p = strchr(buf, '\n'); + if (p) + *p = 0; + + p = buf; + while(*p) { + if (!isdigit(*p)) + goto out_free; + p++; + } + tmp = strtoul(buf, NULL, 10); + *target = tmp; +out_free: + free(buf); +} + +static void prompt_percent(int *target, const char *msg) +{ + int tmp = 0; + + prompt_integer(&tmp, msg); + if (tmp >= 0 && tmp <= 100) + *target = tmp; +} + +static void prompt_symbol(struct sym_entry **target, const char *msg) +{ + char *buf = malloc(0), *p; + struct sym_entry *syme = *target, *n, *found = NULL; + size_t dummy = 0; + + /* zero counters of active symbol */ + if (syme) { + pthread_mutex_lock(&syme->source_lock); + __zero_source_counters(syme); + *target = NULL; + pthread_mutex_unlock(&syme->source_lock); + } + + fprintf(stdout, "\n%s: ", msg); + if (getline(&buf, &dummy, stdin) < 0) + goto out_free; + + p = strchr(buf, '\n'); + if (p) + *p = 0; + + pthread_mutex_lock(&active_symbols_lock); + syme = list_entry(active_symbols.next, struct sym_entry, node); + pthread_mutex_unlock(&active_symbols_lock); + + list_for_each_entry_safe_from(syme, n, &active_symbols, node) { + struct symbol *sym = (struct symbol *)(syme + 1); + + if (!strcmp(buf, sym->name)) { + found = syme; + break; + } + } + + if (!found) { + fprintf(stderr, "Sorry, %s is not active.\n", sym_filter); + sleep(1); + return; + } else + parse_source(found); + +out_free: + free(buf); +} + +static void print_known_keys(void) +{ + fprintf(stdout, "\nknown keys:\n"); + fprintf(stdout, "\t[d] select display delay.\n"); + fprintf(stdout, "\t[e] select display entries (lines).\n"); + fprintf(stdout, "\t[E] select annotation event counter.\n"); + fprintf(stdout, "\t[f] select normal display count filter.\n"); + fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); + fprintf(stdout, "\t[qQ] quit.\n"); + fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); + fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); + fprintf(stdout, "\t[z] toggle event count zeroing.\n"); +} + +static void handle_keypress(int c) +{ + int once = 0; +repeat: + switch (c) { + case 'd': + prompt_integer(&delay_secs, "Enter display delay"); + break; + case 'e': + prompt_integer(&print_entries, "Enter display entries (lines)"); + break; + case 'E': + if (nr_counters > 1) { + int i; + + fprintf(stderr, "\nAvailable events:"); + for (i = 0; i < nr_counters; i++) + fprintf(stderr, "\n\t%d %s", i, event_name(i)); + + 
prompt_integer(&sym_counter, "Enter details event counter"); + + if (sym_counter >= nr_counters) { + fprintf(stderr, "Sorry, no such event, using %s.\n", event_name(0)); + sym_counter = 0; + sleep(1); + } + } else sym_counter = 0; + break; + case 'f': + prompt_integer(&count_filter, "Enter display event count filter"); + break; + case 'F': + prompt_percent(&sym_pcnt_filter, "Enter details display event filter (percent)"); + break; + case 'q': + case 'Q': + printf("exiting.\n"); + exit(0); + case 's': + prompt_symbol(&sym_filter_entry, "Enter details symbol"); + break; + case 'S': + if (!sym_filter_entry) + break; + else { + struct sym_entry *syme = sym_filter_entry; + + pthread_mutex_lock(&syme->source_lock); + sym_filter_entry = NULL; + __zero_source_counters(syme); + pthread_mutex_unlock(&syme->source_lock); + } + break; + case 'z': + zero = ~zero; + break; + default: { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + struct termios tc, save; + + if (!once) { + print_known_keys(); + once++; + } + + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; + tcsetattr(0, TCSANOW, &tc); + + poll(&stdin_poll, 1, -1); + c = getc(stdin); + + tcsetattr(0, TCSAFLUSH, &save); + goto repeat; + } + } +} + static void *display_thread(void *arg __used) { struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; - int delay_msecs = delay_secs * 1000; + struct termios tc, save; + int delay_msecs, c; - printf("PerfTop refresh period: %d seconds\n", delay_secs); + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; +repeat: + delay_msecs = delay_secs * 1000; + tcsetattr(0, TCSANOW, &tc); + /* trash return*/ + getc(stdin); do { print_sym_table(); } while (!poll(&stdin_poll, 1, delay_msecs) == 1); - printf("key pressed - exiting.\n"); - exit(0); + c = getc(stdin); + tcsetattr(0, TCSAFLUSH, &save); + + handle_keypress(c); + goto repeat; return NULL; } @@ -293,7 +729,6 @@ static const char *skip_symbols[] = { static int symbol_filter(struct dso *self, struct symbol *sym) { - static int filter_match; struct sym_entry *syme; const char *name = sym->name; int i; @@ -315,6 +750,10 @@ static int symbol_filter(struct dso *self, struct symbol *sym) return 1; syme = dso__sym_priv(self, sym); + pthread_mutex_init(&syme->source_lock, NULL); + if (!sym_filter_entry && sym_filter && !strcmp(name, sym_filter)) + sym_filter_entry = syme; + for (i = 0; skip_symbols[i]; i++) { if (!strcmp(skip_symbols[i], name)) { syme->skip = 1; @@ -322,29 +761,6 @@ static int symbol_filter(struct dso *self, struct symbol *sym) } } - if (filter_match == 1) { - filter_end = sym->start; - filter_match = -1; - if (filter_end - filter_start > 10000) { - fprintf(stderr, - "hm, too large filter symbol <%s> - skipping.\n", - sym_filter); - fprintf(stderr, "symbol filter start: %016lx\n", - filter_start); - fprintf(stderr, " end: %016lx\n", - filter_end); - filter_end = filter_start = 0; - sym_filter = NULL; - sleep(1); - } - } - - if (filter_match == 0 && sym_filter && !strcmp(name, sym_filter)) { - filter_match = 1; - filter_start = sym->start; - } - - return 0; } @@ -380,8 +796,6 @@ out_delete_dso: return -1; } -#define TRACE_COUNT 3 - /* * Binary search in the histogram table and record the hit: */ @@ -394,6 +808,7 @@ static void record_ip(u64 ip, int counter) if (!syme->skip) { syme->count[counter]++; + record_precise_ip(syme, counter, ip); pthread_mutex_lock(&active_symbols_lock); if (list_empty(&syme->node) || 
!syme->node.next) __list_insert_active_sym(syme); @@ -690,8 +1105,8 @@ static const struct option options[] = { "put the counters into a counter group"), OPT_BOOLEAN('i', "inherit", &inherit, "child tasks inherit counters"), - OPT_STRING('s', "sym-filter", &sym_filter, "pattern", - "only display symbols matchig this pattern"), + OPT_STRING('s', "sym-annotate", &sym_filter, "symbol name", + "symbol to annotate - requires -k option"), OPT_BOOLEAN('z', "zero", &zero, "zero history across updates"), OPT_INTEGER('F', "freq", &freq, @@ -734,6 +1149,7 @@ int cmd_top(int argc, const char **argv, const char *prefix __used) delay_secs = 1; parse_symbols(); + parse_source(sym_filter_entry); /* * Fill in the ones not specifically initialized via -c: From 46ab976443c6c566c8fe6fc72a6733a55ba9fbea Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Fri, 24 Jul 2009 10:09:50 +0200 Subject: [PATCH 02/27] perf_counter tools: Allow perf top top users to switch between weighted and individual counter display Add [w]eighted hotkey. Pressing [w] toggles between displaying weighted total of all counters, and the counter selected via [E]vent select key. ------------------------------------------------------------------------------ PerfTop: 90395 irqs/sec kernel:16.1% [cache-misses/cache-references/instructions], (all, 4 CPUs) ------------------------------------------------------------------------------ weight samples pcnt RIP kernel function ______ _______ _____ ________________ _______________ 1275408.6 10881 - 5.3% - ffffffff81146f70 : copy_page_c 553683.4 43569 - 21.3% - ffffffff81146f20 : clear_page_c 74075.0 6768 - 3.3% - ffffffff81147190 : copy_user_generic_string 40602.9 7538 - 3.7% - ffffffff81284ba2 : _spin_lock 26882.1 965 - 0.5% - ffffffff8109d280 : file_ra_state_init [w] ------------------------------------------------------------------------------ PerfTop: 91221 irqs/sec kernel:14.5% [10000Hz cache-misses], (all, 4 CPUs) ------------------------------------------------------------------------------ weight samples pcnt RIP kernel function ______ _______ _____ ________________ _______________ 47320.00 - 22.3% - ffffffff81146f20 : clear_page_c 14261.00 - 6.7% - ffffffff810992f5 : __rmqueue 11046.00 - 5.2% - ffffffff81146f70 : copy_page_c 7842.00 - 3.7% - ffffffff81284ba2 : _spin_lock 7234.00 - 3.4% - ffffffff810aa1d6 : unmap_vmas Signed-off-by: Mike Galbraith Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- tools/perf/builtin-top.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index d58701346b1..4eef3465e83 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -90,6 +90,7 @@ static char *sym_filter = NULL; struct sym_entry *sym_filter_entry = NULL; static int sym_pcnt_filter = 5; static int sym_counter = 0; +static int display_weighted = -1; /* * Symbols @@ -354,6 +355,9 @@ static double sym_weight(const struct sym_entry *sym) double weight = sym->snap_count; int counter; + if (!display_weighted) + return weight; + for (counter = 1; counter < nr_counters-1; counter++) weight *= sym->count[counter]; @@ -401,7 +405,7 @@ static void rb_insert_active_sym(struct rb_root *tree, struct sym_entry *se) static void print_sym_table(void) { int printed = 0, j; - int counter; + int counter, snap = !display_weighted ? 
sym_counter : 0; float samples_per_sec = samples/delay_secs; float ksamples_per_sec = (samples-userspace_samples)/delay_secs; float sum_ksamples = 0.0; @@ -417,7 +421,7 @@ static void print_sym_table(void) pthread_mutex_unlock(&active_symbols_lock); list_for_each_entry_safe_from(syme, n, &active_symbols, node) { - syme->snap_count = syme->count[0]; + syme->snap_count = syme->count[snap]; if (syme->snap_count != 0) { syme->weight = sym_weight(syme); rb_insert_active_sym(&tmp, syme); @@ -437,7 +441,7 @@ static void print_sym_table(void) samples_per_sec, 100.0 - (100.0*((samples_per_sec-ksamples_per_sec)/samples_per_sec))); - if (nr_counters == 1) { + if (nr_counters == 1 || !display_weighted) { printf("%Ld", (u64)attrs[0].sample_period); if (freq) printf("Hz "); @@ -445,7 +449,9 @@ static void print_sym_table(void) printf(" "); } - for (counter = 0; counter < nr_counters; counter++) { + if (!display_weighted) + printf("%s", event_name(sym_counter)); + else for (counter = 0; counter < nr_counters; counter++) { if (counter) printf("/"); @@ -495,7 +501,7 @@ static void print_sym_table(void) pcnt = 100.0 - (100.0 * ((sum_ksamples - syme->snap_count) / sum_ksamples)); - if (nr_counters == 1) + if (nr_counters == 1 || !display_weighted) printf("%20.2f - ", syme->weight); else printf("%9.1f %10ld - ", syme->weight, syme->snap_count); @@ -594,13 +600,14 @@ static void print_known_keys(void) fprintf(stdout, "\nknown keys:\n"); fprintf(stdout, "\t[d] select display delay.\n"); fprintf(stdout, "\t[e] select display entries (lines).\n"); - fprintf(stdout, "\t[E] select annotation event counter.\n"); + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); fprintf(stdout, "\t[f] select normal display count filter.\n"); fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); fprintf(stdout, "\t[qQ] quit.\n"); fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); - fprintf(stdout, "\t[z] toggle event count zeroing.\n"); + fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); } static void handle_keypress(int c) @@ -656,6 +663,9 @@ repeat: pthread_mutex_unlock(&syme->source_lock); } break; + case 'w': + display_weighted = ~display_weighted; + break; case 'z': zero = ~zero; break; From 7b4b6658e152ed4568cfff48175d93645df081d1 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 22 Jul 2009 09:29:32 +0200 Subject: [PATCH 03/27] perf_counter: Fix software counters for fast moving event sources Reimplement the software counters to deal with fast moving event sources (such as tracepoints). This means being able to generate multiple overflows from a single 'event' as well as support throttling. 
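
For illustration, a minimal single-threaded sketch of the period bookkeeping this change describes; the real code in kernel/perf_counter.c does the same arithmetic with atomic cmpxchg loops and adds throttling via hwc->interrupts, and the struct and function names below are only illustrative:

	#include <stdint.h>

	struct sw_counter {
		uint64_t count;          /* total number of events */
		int64_t  period_left;    /* kept in [-sample_period, 0] */
		uint64_t sample_period;
	};

	/* Returns how many sample periods the increment 'nr' completed. */
	uint64_t sw_counter_add(struct sw_counter *c, uint64_t nr)
	{
		uint64_t overflows = 0;

		c->count += nr;
		c->period_left += nr;

		/* one large increment may cross several period boundaries */
		while (c->period_left >= 0) {
			c->period_left -= (int64_t)c->sample_period;
			overflows++;
		}
		return overflows;
	}

Each completed period corresponds to one perf_counter_overflow() call in the patch below, which is what lets a single event from a fast source generate multiple overflows instead of at most one.
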
Signed-off-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 186 ++++++++++++++++++++++++------------------ 1 file changed, 105 insertions(+), 81 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 868102172aa..615440ab929 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3344,87 +3344,81 @@ int perf_counter_overflow(struct perf_counter *counter, int nmi, * Generic software counter infrastructure */ -static void perf_swcounter_update(struct perf_counter *counter) +/* + * We directly increment counter->count and keep a second value in + * counter->hw.period_left to count intervals. This period counter + * is kept in the range [-sample_period, 0] so that we can use the + * sign as trigger. + */ + +static u64 perf_swcounter_set_period(struct perf_counter *counter) { struct hw_perf_counter *hwc = &counter->hw; - u64 prev, now; - s64 delta; + u64 period = hwc->last_period; + u64 nr, offset; + s64 old, val; + + hwc->last_period = hwc->sample_period; again: - prev = atomic64_read(&hwc->prev_count); - now = atomic64_read(&hwc->count); - if (atomic64_cmpxchg(&hwc->prev_count, prev, now) != prev) + old = val = atomic64_read(&hwc->period_left); + if (val < 0) + return 0; + + nr = div64_u64(period + val, period); + offset = nr * period; + val -= offset; + if (atomic64_cmpxchg(&hwc->period_left, old, val) != old) goto again; - delta = now - prev; - - atomic64_add(delta, &counter->count); - atomic64_sub(delta, &hwc->period_left); -} - -static void perf_swcounter_set_period(struct perf_counter *counter) -{ - struct hw_perf_counter *hwc = &counter->hw; - s64 left = atomic64_read(&hwc->period_left); - s64 period = hwc->sample_period; - - if (unlikely(left <= -period)) { - left = period; - atomic64_set(&hwc->period_left, left); - hwc->last_period = period; - } - - if (unlikely(left <= 0)) { - left += period; - atomic64_add(period, &hwc->period_left); - hwc->last_period = period; - } - - atomic64_set(&hwc->prev_count, -left); - atomic64_set(&hwc->count, -left); -} - -static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) -{ - enum hrtimer_restart ret = HRTIMER_RESTART; - struct perf_sample_data data; - struct perf_counter *counter; - u64 period; - - counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); - counter->pmu->read(counter); - - data.addr = 0; - data.regs = get_irq_regs(); - /* - * In case we exclude kernel IPs or are somehow not in interrupt - * context, provide the next best thing, the user IP. 
- */ - if ((counter->attr.exclude_kernel || !data.regs) && - !counter->attr.exclude_user) - data.regs = task_pt_regs(current); - - if (data.regs) { - if (perf_counter_overflow(counter, 0, &data)) - ret = HRTIMER_NORESTART; - } - - period = max_t(u64, 10000, counter->hw.sample_period); - hrtimer_forward_now(hrtimer, ns_to_ktime(period)); - - return ret; + return nr; } static void perf_swcounter_overflow(struct perf_counter *counter, int nmi, struct perf_sample_data *data) { - data->period = counter->hw.last_period; + struct hw_perf_counter *hwc = &counter->hw; + u64 overflow; - perf_swcounter_update(counter); - perf_swcounter_set_period(counter); - if (perf_counter_overflow(counter, nmi, data)) - /* soft-disable the counter */ - ; + data->period = counter->hw.last_period; + overflow = perf_swcounter_set_period(counter); + + if (hwc->interrupts == MAX_INTERRUPTS) + return; + + for (; overflow; overflow--) { + if (perf_counter_overflow(counter, nmi, data)) { + /* + * We inhibit the overflow from happening when + * hwc->interrupts == MAX_INTERRUPTS. + */ + break; + } + } +} + +static void perf_swcounter_unthrottle(struct perf_counter *counter) +{ + /* + * Nothing to do, we already reset hwc->interrupts. + */ +} + +static void perf_swcounter_add(struct perf_counter *counter, u64 nr, + int nmi, struct perf_sample_data *data) +{ + struct hw_perf_counter *hwc = &counter->hw; + + atomic64_add(nr, &counter->count); + + if (!hwc->sample_period) + return; + + if (!data->regs) + return; + + if (!atomic64_add_negative(nr, &hwc->period_left)) + perf_swcounter_overflow(counter, nmi, data); } static int perf_swcounter_is_counting(struct perf_counter *counter) @@ -3488,15 +3482,6 @@ static int perf_swcounter_match(struct perf_counter *counter, return 1; } -static void perf_swcounter_add(struct perf_counter *counter, u64 nr, - int nmi, struct perf_sample_data *data) -{ - int neg = atomic64_add_negative(nr, &counter->hw.count); - - if (counter->hw.sample_period && !neg && data->regs) - perf_swcounter_overflow(counter, nmi, data); -} - static void perf_swcounter_ctx_event(struct perf_counter_context *ctx, enum perf_type_id type, u32 event, u64 nr, int nmi, @@ -3575,26 +3560,65 @@ void __perf_swcounter_event(u32 event, u64 nr, int nmi, static void perf_swcounter_read(struct perf_counter *counter) { - perf_swcounter_update(counter); } static int perf_swcounter_enable(struct perf_counter *counter) { - perf_swcounter_set_period(counter); + struct hw_perf_counter *hwc = &counter->hw; + + if (hwc->sample_period) { + hwc->last_period = hwc->sample_period; + perf_swcounter_set_period(counter); + } return 0; } static void perf_swcounter_disable(struct perf_counter *counter) { - perf_swcounter_update(counter); } static const struct pmu perf_ops_generic = { .enable = perf_swcounter_enable, .disable = perf_swcounter_disable, .read = perf_swcounter_read, + .unthrottle = perf_swcounter_unthrottle, }; +/* + * hrtimer based swcounter callback + */ + +static enum hrtimer_restart perf_swcounter_hrtimer(struct hrtimer *hrtimer) +{ + enum hrtimer_restart ret = HRTIMER_RESTART; + struct perf_sample_data data; + struct perf_counter *counter; + u64 period; + + counter = container_of(hrtimer, struct perf_counter, hw.hrtimer); + counter->pmu->read(counter); + + data.addr = 0; + data.regs = get_irq_regs(); + /* + * In case we exclude kernel IPs or are somehow not in interrupt + * context, provide the next best thing, the user IP. 
+ */ + if ((counter->attr.exclude_kernel || !data.regs) && + !counter->attr.exclude_user) + data.regs = task_pt_regs(current); + + if (data.regs) { + if (perf_counter_overflow(counter, 0, &data)) + ret = HRTIMER_NORESTART; + } + + period = max_t(u64, 10000, counter->hw.sample_period); + hrtimer_forward_now(hrtimer, ns_to_ktime(period)); + + return ret; +} + /* * Software counter: cpu wall time clock */ From 091bd2e993fcb1094a23e36157285b62bc87afdf Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 4 Aug 2009 10:21:23 +0200 Subject: [PATCH 04/27] perf top: Improve interactive key handling Pressing any key which is not currently mapped to functionality, based on startup command line options, displays currently mapped keys, and prompts for input. Pressing any unmapped key at the prompt returns the user to display mode with variables unchanged. eg, pressing ? etc displays currently available keys, the value of the variable associated with that key, and prompts. Pressing same again aborts input. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- tools/perf/builtin-top.c | 108 ++++++++++++++++++++++++++------------- 1 file changed, 73 insertions(+), 35 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index 4eef3465e83..7de28ce9ca2 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -595,25 +595,84 @@ out_free: free(buf); } -static void print_known_keys(void) +static void print_mapped_keys(void) { - fprintf(stdout, "\nknown keys:\n"); - fprintf(stdout, "\t[d] select display delay.\n"); - fprintf(stdout, "\t[e] select display entries (lines).\n"); - fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); - fprintf(stdout, "\t[f] select normal display count filter.\n"); - fprintf(stdout, "\t[F] select annotation display count filter (percentage).\n"); - fprintf(stdout, "\t[qQ] quit.\n"); - fprintf(stdout, "\t[s] select annotation symbol and start annotation.\n"); - fprintf(stdout, "\t[S] stop annotation, revert to normal display.\n"); - fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + char *name = NULL; + + if (sym_filter_entry) { + struct symbol *sym = (struct symbol *)(sym_filter_entry+1); + name = sym->name; + } + + fprintf(stdout, "\nMapped keys:\n"); + fprintf(stdout, "\t[d] display refresh delay. \t(%d)\n", delay_secs); + fprintf(stdout, "\t[e] display entries (lines). \t(%d)\n", print_entries); + + if (nr_counters > 1) + fprintf(stdout, "\t[E] active event counter. \t(%s)\n", event_name(sym_counter)); + + fprintf(stdout, "\t[f] profile display filter (count). \t(%d)\n", count_filter); + + if (vmlinux) { + fprintf(stdout, "\t[F] annotate display filter (percent). \t(%d%%)\n", sym_pcnt_filter); + fprintf(stdout, "\t[s] annotate symbol. \t(%s)\n", name?: "NULL"); + fprintf(stdout, "\t[S] stop annotation.\n"); + } + + if (nr_counters > 1) + fprintf(stdout, "\t[w] toggle display weighted/count[E]r. \t(%d)\n", display_weighted ? 1 : 0); + fprintf(stdout, "\t[z] toggle sample zeroing. \t(%d)\n", zero ? 1 : 0); + fprintf(stdout, "\t[qQ] quit.\n"); +} + +static int key_mapped(int c) +{ + switch (c) { + case 'd': + case 'e': + case 'f': + case 'z': + case 'q': + case 'Q': + return 1; + case 'E': + case 'w': + return nr_counters > 1 ? 1 : 0; + case 'F': + case 's': + case 'S': + return vmlinux ? 
1 : 0; + } + + return 0; } static void handle_keypress(int c) { - int once = 0; -repeat: + if (!key_mapped(c)) { + struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; + struct termios tc, save; + + print_mapped_keys(); + fprintf(stdout, "\nEnter selection, or unmapped key to continue: "); + fflush(stdout); + + tcgetattr(0, &save); + tc = save; + tc.c_lflag &= ~(ICANON | ECHO); + tc.c_cc[VMIN] = 0; + tc.c_cc[VTIME] = 0; + tcsetattr(0, TCSANOW, &tc); + + poll(&stdin_poll, 1, -1); + c = getc(stdin); + + tcsetattr(0, TCSAFLUSH, &save); + if (!key_mapped(c)) + return; + } + switch (c) { case 'd': prompt_integer(&delay_secs, "Enter display delay"); @@ -669,28 +728,6 @@ repeat: case 'z': zero = ~zero; break; - default: { - struct pollfd stdin_poll = { .fd = 0, .events = POLLIN }; - struct termios tc, save; - - if (!once) { - print_known_keys(); - once++; - } - - tcgetattr(0, &save); - tc = save; - tc.c_lflag &= ~(ICANON | ECHO); - tc.c_cc[VMIN] = 0; - tc.c_cc[VTIME] = 0; - tcsetattr(0, TCSANOW, &tc); - - poll(&stdin_poll, 1, -1); - c = getc(stdin); - - tcsetattr(0, TCSAFLUSH, &save); - goto repeat; - } } } @@ -705,6 +742,7 @@ static void *display_thread(void *arg __used) tc.c_lflag &= ~(ICANON | ECHO); tc.c_cc[VMIN] = 0; tc.c_cc[VTIME] = 0; + repeat: delay_msecs = delay_secs * 1000; tcsetattr(0, TCSANOW, &tc); From 836179834833272f89098c6d1e1b89e8e69797c2 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Tue, 4 Aug 2009 10:24:41 +0200 Subject: [PATCH 05/27] perf top: Update man page perf_counter tools: update perf top manual page to reflect current implementation. Signed-off-by: Mike Galbraith Cc: Peter Zijlstra Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-top.txt | 112 +++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 13 deletions(-) diff --git a/tools/perf/Documentation/perf-top.txt b/tools/perf/Documentation/perf-top.txt index 539d0128972..4a7d558dc30 100644 --- a/tools/perf/Documentation/perf-top.txt +++ b/tools/perf/Documentation/perf-top.txt @@ -3,36 +3,122 @@ perf-top(1) NAME ---- -perf-top - Run a command and profile it +perf-top - System profiling tool. SYNOPSIS -------- [verse] -'perf top' [-e | --event=EVENT] [-l] [-a] +'perf top' [-e | --event=EVENT] [] DESCRIPTION ----------- -This command runs a command and gathers a performance counter profile -from it. +This command generates and displays a performance counter profile in realtime. OPTIONS ------- -...:: - Any command you can specify in a shell. +-a:: +--all-cpus:: + System-wide collection. (default) --e:: ---event=:: +-c :: +--count=:: + Event period to sample. + +-C :: +--CPU=:: + CPU to profile. + +-d :: +--delay=:: + Number of seconds to delay between refreshes. + +-e :: +--event=:: Select the PMU event. Selection can be a symbolic event name (use 'perf list' to list all events) or a raw PMU event (eventsel+umask) in the form of rNNN where NNN is a - hexadecimal event descriptor. + hexadecimal event descriptor. --a:: - system-wide collection +-E :: +--entries=:: + Display this many functions. + +-f :: +--count-filter=:: + Only display functions with more events than this. + +-F :: +--freq=:: + Profile at this frequency. + +-i:: +--inherit:: + Child tasks inherit counters, only makes sens with -p option. + +-k :: +--vmlinux=:: + Path to vmlinux. Required for annotation functionality. + +-m :: +--mmap-pages=:: + Number of mmapped data pages. + +-p :: +--pid=:: + Profile events on existing pid. + +-r :: +--realtime=:: + Collect data with this RT SCHED_FIFO priority. 
+ +-s :: +--sym-annotate=:: + Annotate this symbol. Requires -k option. + +-v:: +--verbose:: + Be more verbose (show counter open errors, etc). + +-z:: +--zero:: + Zero history across display updates. + +INTERACTIVE PROMPTING KEYS +-------------------------- + +[d]:: + Display refresh delay. + +[e]:: + Number of entries to display. + +[E]:: + Event to display when multiple counters are active. + +[f]:: + Profile display filter (>= hit count). + +[F]:: + Annotation display filter (>= % of total). + +[s]:: + Annotate symbol. + +[S]:: + Stop annotation, return to full profile display. + +[w]:: + Toggle between weighted sum and individual count[E]r profile. + +[z]:: + Toggle event count zeroing across display updates. + +[qQ]:: + Quit. + +Pressing any unmapped key displays a menu, and prompts for input. --l:: - scale counter values SEE ALSO -------- From 1953287bfe8afcbbd235bd6c42c9df06d52438dc Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Fri, 7 Aug 2009 07:11:05 +0200 Subject: [PATCH 06/27] perf tools: Fix call-chain cumul hit based sub-total (fractal mode) The callchain fractal mode builds each new total hits in a new branch of profiling by using the parent's hits of the current branch plus the hits of the children. This is wrong, the total hits of a branch should be made of the sum of every children hits, we must ignore the parent hits in this scope. This patch also fixes another mistake with the hit counting. Now the rates are correct. Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Pekka Enberg Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 4 ++-- tools/perf/util/callchain.c | 27 ++++++++++++++++----------- tools/perf/util/callchain.h | 7 ++++++- 3 files changed, 24 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 8cb58d68a00..da402e18656 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -901,7 +901,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, int i; if (callchain_param.mode == CHAIN_GRAPH_REL) - new_total = self->cumul_hit; + new_total = self->children_hit; else new_total = total_samples; @@ -930,7 +930,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, ret += ipchain__fprintf_graph(fp, chain, depth, new_depth_mask, i++, new_total, - child->cumul_hit); + cumul_hits(child)); } ret += callchain__fprintf_graph(fp, child, new_total, depth + 1, diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 9d3c8141b8c..98c5627f327 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -26,10 +26,14 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, struct rb_node **p = &root->rb_node; struct rb_node *parent = NULL; struct callchain_node *rnode; + u64 chain_cumul = cumul_hits(chain); while (*p) { + u64 rnode_cumul; + parent = *p; rnode = rb_entry(parent, struct callchain_node, rb_node); + rnode_cumul = cumul_hits(rnode); switch (mode) { case CHAIN_FLAT: @@ -40,7 +44,7 @@ rb_insert_callchain(struct rb_root *root, struct callchain_node *chain, break; case CHAIN_GRAPH_ABS: /* Falldown */ case CHAIN_GRAPH_REL: - if (rnode->cumul_hit < chain->cumul_hit) + if (rnode_cumul < chain_cumul) p = &(*p)->rb_left; else p = &(*p)->rb_right; @@ -87,7 +91,7 @@ static void __sort_chain_graph_abs(struct callchain_node *node, chain_for_each_child(child, node) { __sort_chain_graph_abs(child, min_hit); - if 
(child->cumul_hit >= min_hit) + if (cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_ABS); } @@ -108,11 +112,11 @@ static void __sort_chain_graph_rel(struct callchain_node *node, u64 min_hit; node->rb_root = RB_ROOT; - min_hit = node->cumul_hit * min_percent / 100.0; + min_hit = node->children_hit * min_percent / 100.0; chain_for_each_child(child, node) { __sort_chain_graph_rel(child, min_percent); - if (child->cumul_hit >= min_hit) + if (cumul_hits(child) >= min_hit) rb_insert_callchain(&node->rb_root, child, CHAIN_GRAPH_REL); } @@ -211,7 +215,8 @@ add_child(struct callchain_node *parent, struct ip_callchain *chain, new = create_child(parent, false); fill_node(new, chain, start, syms); - new->cumul_hit = new->hit = 1; + new->children_hit = 0; + new->hit = 1; } /* @@ -241,7 +246,8 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain, /* split the hits */ new->hit = parent->hit; - new->cumul_hit = parent->cumul_hit; + new->children_hit = parent->children_hit; + parent->children_hit = cumul_hits(new); new->val_nr = parent->val_nr - idx_local; parent->val_nr = idx_local; @@ -249,6 +255,7 @@ split_add_child(struct callchain_node *parent, struct ip_callchain *chain, if (idx_total < chain->nr) { parent->hit = 0; add_child(parent, chain, idx_total, syms); + parent->children_hit++; } else { parent->hit = 1; } @@ -269,13 +276,13 @@ __append_chain_children(struct callchain_node *root, struct ip_callchain *chain, unsigned int ret = __append_chain(rnode, chain, start, syms); if (!ret) - goto cumul; + goto inc_children_hit; } /* nothing in children, add to the current node */ add_child(root, chain, start, syms); -cumul: - root->cumul_hit++; +inc_children_hit: + root->children_hit++; } static int @@ -317,8 +324,6 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain, /* we match 100% of the path, increment the hit */ if (i - start == root->val_nr && i == chain->nr) { root->hit++; - root->cumul_hit++; - return 0; } diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 7812122bea1..b2d128e07c8 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -21,7 +21,7 @@ struct callchain_node { struct rb_root rb_root; /* sorted tree of children */ unsigned int val_nr; u64 hit; - u64 cumul_hit; /* hit + hits of children */ + u64 children_hit; }; struct callchain_param; @@ -48,6 +48,11 @@ static inline void callchain_init(struct callchain_node *node) INIT_LIST_HEAD(&node->val); } +static inline u64 cumul_hits(struct callchain_node *node) +{ + return node->hit + node->children_hit; +} + int register_callchain_param(struct callchain_param *param); void append_chain(struct callchain_node *root, struct ip_callchain *chain, struct symbol **syms); From 1c222bce7dd0cb9578b4c5cd3874a74f1db497c3 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 20:57:41 +0200 Subject: [PATCH 07/27] perf tools: Fix multi-counter stat bug caused by incorrect reading of perf.data file header Brice Goglin reported that only the first result from a multi-counter perf record --stat run is accurate, the rest looks bogus. A silly mistake made us re-read the first attribute for every recorded attribute. 
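
The fix below only moves an lseek() after the read, but the pattern is easier to see in isolation. A minimal sketch with a hypothetical record layout (not the perf.data format): when each fixed-size record points at out-of-line data elsewhere in the file, the offset used to resume the loop has to be taken after reading the record; taking it before the read, as the old code did, makes every iteration seek back to and re-parse the first record.

	#include <stdint.h>
	#include <unistd.h>

	struct file_record {
		uint64_t data_offset;    /* where this record's extra data lives */
		uint64_t data_size;
		char     fixed[48];
	};

	int read_records(int fd, unsigned int nr)
	{
		struct file_record rec;
		unsigned int i;

		for (i = 0; i < nr; i++) {
			if (read(fd, &rec, sizeof(rec)) != sizeof(rec))
				return -1;

			/* position of record i+1, taken *after* the read */
			off_t next = lseek(fd, 0, SEEK_CUR);

			lseek(fd, (off_t)rec.data_offset, SEEK_SET);
			/* ... consume rec.data_size bytes of out-of-line data ... */

			lseek(fd, next, SEEK_SET);   /* resume at the next record */
		}
		return 0;
	}
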
Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin Cc: paulus@samba.org Signed-off-by: Ingo Molnar --- tools/perf/util/header.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 450384b3bbe..95a44bcfc2d 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -213,9 +213,10 @@ struct perf_header *perf_header__read(int fd) for (i = 0; i < nr_attrs; i++) { struct perf_header_attr *attr; - off_t tmp = lseek(fd, 0, SEEK_CUR); + off_t tmp; do_read(fd, &f_attr, sizeof(f_attr)); + tmp = lseek(fd, 0, SEEK_CUR); attr = perf_header_attr__new(&f_attr.attr); From 8f18aec535b5ca513dd13b531730177d35175ffa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 19:40:28 +0200 Subject: [PATCH 08/27] perf report: Fix per task mult-counter stat reporting Brice Goglin reported: > I can easily sort them by thread id, but I don't know how to match > my 4 events with each group of 4 lines. Also report the counter id and the time running/enabled stats (in case the counter got time-shared). Reported-by: Brice Goglin Signed-off-by: Peter Zijlstra Tested-by: Brice Goglin Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 31 +++++++++++++++++++++++++++---- tools/perf/util/parse-events.c | 8 +++++++- tools/perf/util/parse-events.h | 1 + 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index da402e18656..84205462e07 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -112,7 +112,9 @@ struct read_event { struct perf_event_header header; u32 pid,tid; u64 value; - u64 format[3]; + u64 time_enabled; + u64 time_running; + u64 id; }; typedef union event_union { @@ -1690,14 +1692,37 @@ static void trace_event(event_t *event) dprintf(".\n"); } +static struct perf_header *header; + +static struct perf_counter_attr *perf_header__find_attr(u64 id) +{ + int i; + + for (i = 0; i < header->attrs; i++) { + struct perf_header_attr *attr = header->attr[i]; + int j; + + for (j = 0; j < attr->ids; j++) { + if (attr->id[j] == id) + return &attr->attr; + } + } + + return NULL; +} + static int process_read_event(event_t *event, unsigned long offset, unsigned long head) { - dprintf("%p [%p]: PERF_EVENT_READ: %d %d %Lu\n", + struct perf_counter_attr *attr = perf_header__find_attr(event->read.id); + + dprintf("%p [%p]: PERF_EVENT_READ: %d %d %s %Lu\n", (void *)(offset + head), (void *)(long)(event->header.size), event->read.pid, event->read.tid, + attr ? 
__event_name(attr->type, attr->config) + : "FAIL", event->read.value); return 0; @@ -1743,8 +1768,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head) return 0; } -static struct perf_header *header; - static u64 perf_header__sample_type(void) { u64 sample_type = 0; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 7bdad8df22a..f77407b5832 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -223,9 +223,15 @@ char *event_name(int counter) { u64 config = attrs[counter].config; int type = attrs[counter].type; + + return __event_name(type, config); +} + +char *__event_name(int type, u64 config) +{ static char buf[32]; - if (attrs[counter].type == PERF_TYPE_RAW) { + if (type == PERF_TYPE_RAW) { sprintf(buf, "raw 0x%llx", config); return buf; } diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h index 1ea5d09b6eb..192a962e3a0 100644 --- a/tools/perf/util/parse-events.h +++ b/tools/perf/util/parse-events.h @@ -10,6 +10,7 @@ extern int nr_counters; extern struct perf_counter_attr attrs[MAX_COUNTERS]; extern char *event_name(int ctr); +extern char *__event_name(int type, u64 config); extern int parse_events(const struct option *opt, const char *str, int unset); From 94cb9e385d5b4d55a5ae389baa10ad2835ea39bb Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 6 Aug 2009 14:43:17 -0300 Subject: [PATCH 09/27] perf report: Add debug help for the finding of symbol bugs - show the symtab origin (DSO, build-id, kernel, etc) Used with perf report --verbose: [acme@doppio linux-2.6-tip]$ perf report -v | head -16 5.17% firefox /usr/lib64/xulrunner-1.9.1/libxul.so 0x00000000005d8eee f [.] imgContainer::DrawFrameTo(gfxIImageFrame*, gfxIImageFrame*, nsRect&) 2.56% firefox /lib64/libpthread-2.10.1.so 0x0000000000008e02 d [.] __pthread_mutex_lock_internal 1.94% firefox /usr/lib64/xulrunner-1.9.1/libxul.so 0x0000000000d0af8f f [.] SearchTable 1.75% firefox [kernel] 0xffffffffff60013b k [.] vread_hpet 1.63% firefox /lib64/libpthread-2.10.1.so 0x000000000000a404 d [.] __pthread_mutex_unlock 1.47% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000482ea f [.] js_Interpret 1.42% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x000000000003eda3 f [.] JS_CallTracer 1.24% firefox [kernel] 0xffffffff8102ca4a k [k] read_hpet 1.16% firefox [kernel] 0xffffffff810f3dd4 k [k] fget_light 1.11% firefox /usr/lib64/xulrunner-1.9.1/libmozjs.so 0x00000000000567ff f [.] js_TraceObject 0.98% firefox /usr/lib64/firefox-3.5.2/firefox 0x000000000000dd23 b [.] arena_ralloc [acme@doppio linux-2.6-tip]$ The new field is just after the symbol address. To help in figuring out symbol resolution bugs. 
Signed-off-by: Arnaldo Carvalho de Melo Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 3 +- tools/perf/util/symbol.c | 57 ++++++++++++++++++++++++++++++------- tools/perf/util/symbol.h | 2 ++ 3 files changed, 50 insertions(+), 12 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index 84205462e07..a5e2f8df411 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -700,7 +700,8 @@ sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used) size_t ret = 0; if (verbose) - ret += repsep_fprintf(fp, "%#018llx ", (u64)self->ip); + ret += repsep_fprintf(fp, "%#018llx %c ", (u64)self->ip, + dso__symtab_origin(self->dso)); ret += repsep_fprintf(fp, "[%c] ", self->level); if (self->sym) { diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c index 16ddca20294..f1dcede1430 100644 --- a/tools/perf/util/symbol.c +++ b/tools/perf/util/symbol.c @@ -24,6 +24,16 @@ const char *sym_hist_filter; #define DMGL_ANSI (1 << 1) /* Include const, volatile, etc */ #endif +enum dso_origin { + DSO__ORIG_KERNEL = 0, + DSO__ORIG_JAVA_JIT, + DSO__ORIG_FEDORA, + DSO__ORIG_UBUNTU, + DSO__ORIG_BUILDID, + DSO__ORIG_DSO, + DSO__ORIG_NOT_FOUND, +}; + static struct symbol *symbol__new(u64 start, u64 len, const char *name, unsigned int priv_size, u64 obj_start, int verbose) @@ -81,6 +91,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size) self->sym_priv_size = sym_priv_size; self->find_symbol = dso__find_symbol; self->slen_calculated = 0; + self->origin = DSO__ORIG_NOT_FOUND; } return self; @@ -710,7 +721,7 @@ static char *dso__read_build_id(struct dso *self, int verbose) ++raw; bid += 2; } - if (verbose) + if (verbose >= 2) printf("%s(%s): %s\n", __func__, self->name, build_id); out_elf_end: elf_end(elf); @@ -720,11 +731,26 @@ out: return build_id; } +char dso__symtab_origin(const struct dso *self) +{ + static const char origin[] = { + [DSO__ORIG_KERNEL] = 'k', + [DSO__ORIG_JAVA_JIT] = 'j', + [DSO__ORIG_FEDORA] = 'f', + [DSO__ORIG_UBUNTU] = 'u', + [DSO__ORIG_BUILDID] = 'b', + [DSO__ORIG_DSO] = 'd', + }; + + if (self == NULL || self->origin == DSO__ORIG_NOT_FOUND) + return '!'; + return origin[self->origin]; +} + int dso__load(struct dso *self, symbol_filter_t filter, int verbose) { int size = PATH_MAX; char *name = malloc(size), *build_id = NULL; - int variant = 0; int ret = -1; int fd; @@ -733,19 +759,26 @@ int dso__load(struct dso *self, symbol_filter_t filter, int verbose) self->adjust_symbols = 0; - if (strncmp(self->name, "/tmp/perf-", 10) == 0) - return dso__load_perf_map(self, filter, verbose); + if (strncmp(self->name, "/tmp/perf-", 10) == 0) { + ret = dso__load_perf_map(self, filter, verbose); + self->origin = ret > 0 ? 
DSO__ORIG_JAVA_JIT : + DSO__ORIG_NOT_FOUND; + return ret; + } + + self->origin = DSO__ORIG_FEDORA - 1; more: do { - switch (variant) { - case 0: /* Fedora */ + self->origin++; + switch (self->origin) { + case DSO__ORIG_FEDORA: snprintf(name, size, "/usr/lib/debug%s.debug", self->name); break; - case 1: /* Ubuntu */ + case DSO__ORIG_UBUNTU: snprintf(name, size, "/usr/lib/debug%s", self->name); break; - case 2: + case DSO__ORIG_BUILDID: build_id = dso__read_build_id(self, verbose); if (build_id != NULL) { snprintf(name, size, @@ -754,16 +787,15 @@ more: free(build_id); break; } - variant++; + self->origin++; /* Fall thru */ - case 3: /* Sane people */ + case DSO__ORIG_DSO: snprintf(name, size, "%s", self->name); break; default: goto out; } - variant++; fd = open(name, O_RDONLY); } while (fd < 0); @@ -899,6 +931,9 @@ int dso__load_kernel(struct dso *self, const char *vmlinux, if (err <= 0) err = dso__load_kallsyms(self, filter, verbose); + if (err > 0) + self->origin = DSO__ORIG_KERNEL; + return err; } diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h index 2f92b21c712..1e003ec2f4b 100644 --- a/tools/perf/util/symbol.h +++ b/tools/perf/util/symbol.h @@ -26,6 +26,7 @@ struct dso { unsigned int sym_priv_size; unsigned char adjust_symbols; unsigned char slen_calculated; + unsigned char origin; char name[0]; }; @@ -49,6 +50,7 @@ int dso__load_modules(struct dso *self, symbol_filter_t filter, int verbose); int dso__load(struct dso *self, symbol_filter_t filter, int verbose); size_t dso__fprintf(struct dso *self, FILE *fp); +char dso__symtab_origin(const struct dso *self); void symbol__init(void); #endif /* _PERF_SYMBOL_ */ From b26bc5a7f81474937e427b0c855eabee5ad56f89 Mon Sep 17 00:00:00 2001 From: Brice Goglin Date: Fri, 7 Aug 2009 10:18:39 +0200 Subject: [PATCH 10/27] perf stat: Fix tool option consistency: rename -S/--scale to -c/--scale We want to use a coherent flag for -S/--stat across all tools, so free up -S in perf stat. 
Signed-off-by: Brice Goglin Cc: Peter Zijlstra Cc: paulus@samba.org Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-stat.txt | 2 +- tools/perf/builtin-stat.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/Documentation/perf-stat.txt b/tools/perf/Documentation/perf-stat.txt index 0d74346d21a..484080dd5b6 100644 --- a/tools/perf/Documentation/perf-stat.txt +++ b/tools/perf/Documentation/perf-stat.txt @@ -40,7 +40,7 @@ OPTIONS -a:: system-wide collection --S:: +-c:: scale counter values EXAMPLES diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index f9510eeeb6c..b4b06c7903e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -496,7 +496,7 @@ static const struct option options[] = { "stat events on existing pid"), OPT_BOOLEAN('a', "all-cpus", &system_wide, "system-wide collection from all CPUs"), - OPT_BOOLEAN('S', "scale", &scale, + OPT_BOOLEAN('c', "scale", &scale, "scale/normalize counters"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), From f36a1a133a947973efb8e6a1fbdcc23e4a011437 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Fri, 7 Aug 2009 16:59:45 +1000 Subject: [PATCH 11/27] perf_counter/powerpc: Fix oops on cpus without perf_counter hardware support If we have the powerpc perf_counter backend compiled in, but the cpu we are running on is one where we don't support the PMU, we currently oops in hw_perf_group_sched_in if we try to use any counters, because ppmu is NULL in that case, and we unconditionally dereference ppmu. This fixes the problem by adding a check if ppmu is NULL at the beginning of hw_perf_group_sched_in, and also at the beginning of the other functions that get called from the perf_counter core, i.e. hw_perf_disable, hw_perf_enable, and hw_perf_counter_setup. Signed-off-by: Paul Mackerras Cc: Peter Zijlstra Cc: benh@kernel.crashing.org Signed-off-by: Ingo Molnar --- arch/powerpc/kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/arch/powerpc/kernel/perf_counter.c b/arch/powerpc/kernel/perf_counter.c index 809fdf94b95..70e1f57f7dd 100644 --- a/arch/powerpc/kernel/perf_counter.c +++ b/arch/powerpc/kernel/perf_counter.c @@ -518,6 +518,8 @@ void hw_perf_disable(void) struct cpu_hw_counters *cpuhw; unsigned long flags; + if (!ppmu) + return; local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_counters); @@ -572,6 +574,8 @@ void hw_perf_enable(void) int n_lim; int idx; + if (!ppmu) + return; local_irq_save(flags); cpuhw = &__get_cpu_var(cpu_hw_counters); if (!cpuhw->disabled) { @@ -737,6 +741,8 @@ int hw_perf_group_sched_in(struct perf_counter *group_leader, long i, n, n0; struct perf_counter *sub; + if (!ppmu) + return 0; cpuhw = &__get_cpu_var(cpu_hw_counters); n0 = cpuhw->n_counters; n = collect_events(group_leader, ppmu->n_counter - n0, @@ -1281,6 +1287,8 @@ void hw_perf_counter_setup(int cpu) { struct cpu_hw_counters *cpuhw = &per_cpu(cpu_hw_counters, cpu); + if (!ppmu) + return; memset(cpuhw, 0, sizeof(*cpuhw)); cpuhw->mmcr[0] = MMCR0_FC; } From ae07b63f4b6728e1f98aa5c5416cfc1280f59f51 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 6 Aug 2009 16:48:54 +0200 Subject: [PATCH 12/27] perf list: Fix the output to not include tracepoints without an id Stop perf list from displaying tracepoints without an id file, those are special tracepoints that are not interfaced to perfcounters so listing them is erroneous and passing them as events will produce no output. 
Signed-off-by: Peter Zijlstra Acked-by: Jason Baron Cc: Steven Rostedt Cc: Chris Mason Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index f77407b5832..4858d83b3b6 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -121,13 +121,29 @@ static unsigned long hw_cache_stat[C(MAX)] = { (strcmp(sys_dirent.d_name, ".")) && \ (strcmp(sys_dirent.d_name, ".."))) +static int tp_event_has_id(struct dirent *sys_dir, struct dirent *evt_dir) +{ + char evt_path[MAXPATHLEN]; + int fd; + + snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path, + sys_dir->d_name, evt_dir->d_name); + fd = open(evt_path, O_RDONLY); + if (fd < 0) + return -EINVAL; + close(fd); + + return 0; +} + #define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st) \ while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next) \ if (snprintf(file, MAXPATHLEN, "%s/%s/%s", debugfs_path, \ sys_dirent.d_name, evt_dirent.d_name) && \ (!stat(file, &st)) && (S_ISDIR(st.st_mode)) && \ (strcmp(evt_dirent.d_name, ".")) && \ - (strcmp(evt_dirent.d_name, ".."))) + (strcmp(evt_dirent.d_name, "..")) && \ + (!tp_event_has_id(&sys_dirent, &evt_dirent))) #define MAX_EVENT_LENGTH 30 From 7eac7e9e726c1b136bd7e0ad6671ce315f48bb18 Mon Sep 17 00:00:00 2001 From: Pierre Habouzit Date: Fri, 7 Aug 2009 14:16:00 +0200 Subject: [PATCH 13/27] perf util: Fix do_read() to fail on EOF instead of busy-looping While toying with perf, I've noticed that perf record can easily enter a busy loop when doing something as silly as: $ perf record -A ls Yeah, do_read here really wants to read a known size, not being able to should die(), not busy-loop ;) That was the cause for the bug. Signed-off-by: Pierre Habouzit Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- tools/perf/util/header.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 95a44bcfc2d..b92a457ca32 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -185,6 +185,8 @@ static void do_read(int fd, void *buf, size_t size) if (ret < 0) die("failed to read"); + if (ret == 0) + die("failed to read: missing data"); size -= ret; buf += ret; From 266e0e219888420a1a7cafc82e82891cf7b5a979 Mon Sep 17 00:00:00 2001 From: Pierre Habouzit Date: Fri, 7 Aug 2009 14:16:01 +0200 Subject: [PATCH 14/27] perf record: Fix the -A UI for empty or non-existent perf.data 1. Ignore the -A argument if there is no perf.data file 2. Treat an empty file like a non existent file. Else, perf will try to read the perf.data header, and fail with an error. Treating an empty file like a non-existent file makes sense, since an interupted (as in SIGKILLed) perf could leave such files around, and you don't want to annoy the user with errors for files with no data in it. 
Signed-off-by: Pierre Habouzit Acked-by: Peter Zijlstra Cc: Paul Mackerras Signed-off-by: Ingo Molnar --- tools/perf/builtin-record.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c index 90c98082af1..0345aad8eba 100644 --- a/tools/perf/builtin-record.c +++ b/tools/perf/builtin-record.c @@ -525,10 +525,14 @@ static int __cmd_record(int argc, const char **argv) signal(SIGCHLD, sig_handler); signal(SIGINT, sig_handler); - if (!stat(output_name, &st) && !force && !append_file) { - fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", - output_name); - exit(-1); + if (!stat(output_name, &st) && st.st_size) { + if (!force && !append_file) { + fprintf(stderr, "Error, output file %s exists, use -A to append or -f to overwrite.\n", + output_name); + exit(-1); + } + } else { + append_file = 0; } flags = O_CREAT|O_RDWR; From b0efe213f84f7fd5ccfe07053e3d9fb827b7c188 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:23 +0200 Subject: [PATCH 15/27] perf tools: callchain: Fix spurious 'perf report' warnings: ignore empty callchains When the callchain tree comes to insert an empty backtrace, it raises a spurious warning about the fact we are inserting an empty callchain. This is spurious because the radix tree assumes it did something wrong to reach this state. But it didn't, we just met an empty callchain that has to be ignored. This happens occasionally with certain types of call-chain recordings. If it happens it's a big nuisance as perf report output starts with thousands of warning lines. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/util/callchain.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index 98c5627f327..a8e67aa9ef4 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -336,5 +336,7 @@ __append_chain(struct callchain_node *root, struct ip_callchain *chain, void append_chain(struct callchain_node *root, struct ip_callchain *chain, struct symbol **syms) { + if (!chain->nr) + return; __append_chain_children(root, chain, syms, 0); } From b1a88349c37624755b28ac3b3152b48f52c1f487 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:24 +0200 Subject: [PATCH 16/27] perf tools: callchain: Fix 'perf report' display to be callchain by default If we recorded with the -g option to record the callchain, right now we require a -g option to perf report as well - and people reported this as an unnecessary complication: the user already specified -g once, no need to require it a second time. So if the recording includes call-chains, display the callchain by default from perf report. ( The user can override this default using the "-g none" option from perf report.
) Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-3-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 16 +++++++++++++++- tools/perf/util/callchain.h | 1 + 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index a5e2f8df411..c4a8e108e52 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -68,7 +68,7 @@ static int callchain; static struct callchain_param callchain_param = { - .mode = CHAIN_GRAPH_ABS, + .mode = CHAIN_GRAPH_REL, .min_percent = 0.5 }; @@ -1836,6 +1836,13 @@ static int __cmd_report(void) " -g?\n"); exit(-1); } + } else if (callchain_param.mode != CHAIN_NONE && !callchain) { + callchain = 1; + if (register_callchain_param(&callchain_param) < 0) { + fprintf(stderr, "Can't register callchain" + " params\n"); + exit(-1); + } } if (load_kernel() < 0) { @@ -1974,6 +1981,13 @@ parse_callchain_opt(const struct option *opt __used, const char *arg, else if (!strncmp(tok, "fractal", strlen(arg))) callchain_param.mode = CHAIN_GRAPH_REL; + else if (!strncmp(tok, "none", strlen(arg))) { + callchain_param.mode = CHAIN_NONE; + callchain = 0; + + return 0; + } + else return -1; diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index b2d128e07c8..a926ae4f5a1 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -7,6 +7,7 @@ #include "symbol.h" enum chain_mode { + CHAIN_NONE, CHAIN_FLAT, CHAIN_GRAPH_ABS, CHAIN_GRAPH_REL From 25446036cbfc2c89faacdb4fb4603943d2197dc6 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 02:16:25 +0200 Subject: [PATCH 17/27] perf tools: callchain: Fix sum of percentages to be 100% by displaying amount of ignored chains in fractal mode When we filter the callchains below a given percentage, we ignore them and the end result only shows entries that have a higher percentage than the filter threshold. It then seems to users that the percentages are imbalanced, as if the sum inside a profiled branch doesn't reach 100%. Since in the past there have been real perf report bugs that showed the same symptom, it would be nice to assure the user that the data is perfect and trustworthy and it all sums up to 100.00%. So fix this by displaying the remaining hits that have been filtered, but without more detail than their amount in each branch. Example while filtering below 50%: 7.73% [k] delay_tsc | |--98.22%-- __const_udelay | | | |--86.37%-- ath5k_hw_register_timeout | | ath5k_hw_noise_floor_calibration | | ath5k_hw_reset | | ath5k_reset | | ath5k_config | | ieee80211_hw_config | | | | | |--88.53%-- ieee80211_scan_work | | | worker_thread | | | kthread | | | child_rip | | --11.47%-- [...] | --13.63%-- [...] --1.78%-- [...]
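The bookkeeping behind those bracketed "[...]" entries can be sketched in standalone form as follows; the names and numbers are invented for illustration and this is not the perf report implementation itself.

#include <stdio.h>

/* children[] holds the cumulated hits of each child branch, total is the
 * parent's hit count and min_pcnt the fractal filter threshold. */
static void print_branch(const unsigned long long *children, int n,
                         unsigned long long total, double min_pcnt)
{
    unsigned long long remaining = total;
    int i;

    for (i = 0; i < n; i++) {
        double pcnt = 100.0 * children[i] / total;

        if (pcnt < min_pcnt)
            continue;              /* filtered: stays in 'remaining' */
        remaining -= children[i];
        printf("|--%.2f%%-- child %d\n", pcnt, i);
    }
    /* Whatever was filtered out is shown as one anonymous entry, so the
     * percentages inside the branch still sum to 100%. */
    if (remaining && remaining != total)
        printf(" --%.2f%%-- [...]\n", 100.0 * remaining / total);
}

int main(void)
{
    unsigned long long kids[] = { 860, 120, 20 };

    print_branch(kids, 3, 1000, 50.0); /* 86.00% child plus 14.00% [...] */
    return 0;
}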
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249690585-9145-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- tools/perf/builtin-report.c | 47 ++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c index c4a8e108e52..99274cec0ad 100644 --- a/tools/perf/builtin-report.c +++ b/tools/perf/builtin-report.c @@ -891,6 +891,21 @@ ipchain__fprintf_graph(FILE *fp, struct callchain_list *chain, int depth, return ret; } +static struct symbol *rem_sq_bracket; +static struct callchain_list rem_hits; + +static void init_rem_hits(void) +{ + rem_sq_bracket = malloc(sizeof(*rem_sq_bracket) + 6); + if (!rem_sq_bracket) { + fprintf(stderr, "Not enough memory to display remaining hits\n"); + return; + } + + strcpy(rem_sq_bracket->name, "[...]"); + rem_hits.sym = rem_sq_bracket; +} + static size_t callchain__fprintf_graph(FILE *fp, struct callchain_node *self, u64 total_samples, int depth, int depth_mask) @@ -900,6 +915,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, struct callchain_list *chain; int new_depth_mask = depth_mask; u64 new_total; + u64 remaining; size_t ret = 0; int i; @@ -908,17 +924,25 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, else new_total = total_samples; + remaining = new_total; + node = rb_first(&self->rb_root); while (node) { + u64 cumul; + child = rb_entry(node, struct callchain_node, rb_node); + cumul = cumul_hits(child); + remaining -= cumul; /* * The depth mask manages the output of pipes that show * the depth. We don't want to keep the pipes of the current - * level for the last child of this depth + * level for the last child of this depth. + * Except if we have remaining filtered hits. 
They will + * supersede the last child */ next = rb_next(node); - if (!next) + if (!next && (callchain_param.mode != CHAIN_GRAPH_REL || !remaining)) new_depth_mask &= ~(1 << (depth - 1)); /* @@ -933,7 +957,7 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, ret += ipchain__fprintf_graph(fp, chain, depth, new_depth_mask, i++, new_total, - cumul_hits(child)); + cumul); } ret += callchain__fprintf_graph(fp, child, new_total, depth + 1, @@ -941,6 +965,19 @@ callchain__fprintf_graph(FILE *fp, struct callchain_node *self, node = next; } + if (callchain_param.mode == CHAIN_GRAPH_REL && + remaining && remaining != new_total) { + + if (!rem_sq_bracket) + return ret; + + new_depth_mask &= ~(1 << (depth - 1)); + + ret += ipchain__fprintf_graph(fp, &rem_hits, depth, + new_depth_mask, 0, new_total, + remaining); + } + return ret; } @@ -1361,6 +1398,8 @@ static size_t output__fprintf(FILE *fp, u64 total_samples) unsigned int width; char *col_width = col_width_list_str; + init_rem_hits(); + fprintf(fp, "# Samples: %Ld\n", (u64)total_samples); fprintf(fp, "#\n"); @@ -1432,6 +1471,8 @@ print_entries: } fprintf(fp, "\n"); + free(rem_sq_bracket); + return ret; } From 10b8e3066066708f304e0fc5cfe658e05abf943d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:35 +0200 Subject: [PATCH 18/27] perf_counter: Work around gcc warning by initializing tracepoint record unconditionally MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Although the tracepoint record is always present when the PERF_SAMPLE_TP_RECORD flag is set, gcc raises a warning, thinking it might not be initialized: kernel/perf_counter.c: In function ‘perf_counter_output’: kernel/perf_counter.c:2650: warning: ‘tp’ may be used uninitialized in this function So initialize it to NULL and always check that it's not NULL before dereferencing it. Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-2-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 615440ab929..117622cb73a 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp; + struct perf_tracepoint_record *tp = NULL; int callchain_size = 0; u64 time; struct { @@ -2717,7 +2717,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, if (sample_type & PERF_SAMPLE_TP_RECORD) { tp = data->private; - header.size += tp->size; + if (tp) + header.size += tp->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2783,7 +2784,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if (sample_type & PERF_SAMPLE_TP_RECORD) + if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) perf_output_copy(&handle, tp->record, tp->size); perf_output_end(&handle); From 3a43ce68ae1758fa6a839386025ef45acb6baa22 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sat, 8 Aug 2009 04:26:37 +0200 Subject: [PATCH 19/27] perf_counter: Fix tracepoint sampling to be part of generic sampling Based on Peter's comments, make tracepoint sampling generic just like all the other sampling bits are.
This is a rename with no code changes: - PERF_SAMPLE_TP_RECORD to PERF_SAMPLE_RAW - struct perf_tracepoint_record to perf_raw_record We want the system in place that transports raw tracepoint sample events into the perf ring buffer to be generalized and usable by any type of counter. Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249698400-5441-4-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 10 +++++----- kernel/perf_counter.c | 20 ++++++++++---------- 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index a67dd5c5b6d..2aabe43c1d0 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -121,7 +121,7 @@ enum perf_counter_sample_format { PERF_SAMPLE_CPU = 1U << 7, PERF_SAMPLE_PERIOD = 1U << 8, PERF_SAMPLE_STREAM_ID = 1U << 9, - PERF_SAMPLE_TP_RECORD = 1U << 10, + PERF_SAMPLE_RAW = 1U << 10, PERF_SAMPLE_MAX = 1U << 11, /* non-ABI */ }; @@ -414,9 +414,9 @@ struct perf_callchain_entry { __u64 ip[PERF_MAX_STACK_DEPTH]; }; -struct perf_tracepoint_record { - int size; - char *record; +struct perf_raw_record { + u32 size; + void *data; }; struct task_struct; @@ -687,7 +687,7 @@ struct perf_sample_data { struct pt_regs *regs; u64 addr; u64 period; - void *private; + struct perf_raw_record *raw; }; extern int perf_counter_overflow(struct perf_counter *counter, int nmi, diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 117622cb73a..00231054041 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_tracepoint_record *tp = NULL; + struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2715,10 +2715,10 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, header.size += sizeof(u64); } - if (sample_type & PERF_SAMPLE_TP_RECORD) { - tp = data->private; - if (tp) - header.size += tp->size; + if (sample_type & PERF_SAMPLE_RAW) { + raw = data->raw; + if (raw) + header.size += raw->size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2784,8 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_TP_RECORD) && tp) - perf_output_copy(&handle, tp->record, tp->size); + if ((sample_type & PERF_SAMPLE_RAW) && raw) + perf_output_copy(&handle, raw->data, raw->size); perf_output_end(&handle); } @@ -3740,15 +3740,15 @@ static const struct pmu perf_ops_task_clock = { void perf_tpcounter_event(int event_id, u64 addr, u64 count, void *record, int entry_size) { - struct perf_tracepoint_record tp = { + struct perf_raw_record raw = { .size = entry_size, - .record = record, + .data = record, }; struct perf_sample_data data = { .regs = get_irq_regs(), .addr = addr, - .private = &tp, + .raw = &raw, }; if (!data.regs) From 3a80b4a3539696f4b0574876326860323035a302 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 7 Aug 2009 19:49:01 +0200 Subject: [PATCH 20/27] perf_counter: Fix a race on perf_counter_ctx While extending perfcounters with BTS hw-tracing, Markus Metzger managed to trigger this warning: [ 995.557128] WARNING: at kernel/perf_counter.c:1191 __perf_counter_task_sched_out+0x48/0x6b() triggers because commit
9f498cc5be7e013d8d6e4c616980ed0ffc8680d2 (perf_counter: Full task tracing) removed clearing of tsk->perf_counter_ctxp out from under ctx->lock which introduced a race (against perf_lock_task_context). Move it back and deal with the exit notification by explicitly passing along the former task context. Reported-by: Markus T Metzger Signed-off-by: Peter Zijlstra Cc: Paul Mackerras LKML-Reference: <1249667341.17467.5.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 00231054041..546e62d6294 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2850,7 +2850,8 @@ perf_counter_read_event(struct perf_counter *counter, */ struct perf_task_event { - struct task_struct *task; + struct task_struct *task; + struct perf_counter_context *task_ctx; struct { struct perf_event_header header; @@ -2910,24 +2911,23 @@ static void perf_counter_task_ctx(struct perf_counter_context *ctx, static void perf_counter_task_event(struct perf_task_event *task_event) { struct perf_cpu_context *cpuctx; - struct perf_counter_context *ctx; + struct perf_counter_context *ctx = task_event->task_ctx; cpuctx = &get_cpu_var(perf_cpu_context); perf_counter_task_ctx(&cpuctx->ctx, task_event); put_cpu_var(perf_cpu_context); rcu_read_lock(); - /* - * doesn't really matter which of the child contexts the - * events ends up in. - */ - ctx = rcu_dereference(current->perf_counter_ctxp); + if (!ctx) + ctx = rcu_dereference(task_event->task->perf_counter_ctxp); if (ctx) perf_counter_task_ctx(ctx, task_event); rcu_read_unlock(); } -static void perf_counter_task(struct task_struct *task, int new) +static void perf_counter_task(struct task_struct *task, + struct perf_counter_context *task_ctx, + int new) { struct perf_task_event task_event; @@ -2937,8 +2937,9 @@ static void perf_counter_task(struct task_struct *task, int new) return; task_event = (struct perf_task_event){ - .task = task, - .event = { + .task = task, + .task_ctx = task_ctx, + .event = { .header = { .type = new ? PERF_EVENT_FORK : PERF_EVENT_EXIT, .misc = 0, @@ -2956,7 +2957,7 @@ static void perf_counter_task(struct task_struct *task, int new) void perf_counter_fork(struct task_struct *task) { - perf_counter_task(task, 1); + perf_counter_task(task, NULL, 1); } /* @@ -4310,7 +4311,7 @@ void perf_counter_exit_task(struct task_struct *child) unsigned long flags; if (likely(!child->perf_counter_ctxp)) { - perf_counter_task(child, 0); + perf_counter_task(child, NULL, 0); return; } @@ -4330,6 +4331,7 @@ void perf_counter_exit_task(struct task_struct *child) * incremented the context's refcount before we do put_ctx below. */ spin_lock(&child_ctx->lock); + child->perf_counter_ctxp = NULL; /* * If this context is a clone; unclone it so it can't get * swapped to another process while we're removing all @@ -4343,9 +4345,7 @@ void perf_counter_exit_task(struct task_struct *child) * won't get any samples after PERF_EVENT_EXIT. We can however still * get a few PERF_EVENT_READ events. */ - perf_counter_task(child, 0); - - child->perf_counter_ctxp = NULL; + perf_counter_task(child, child_ctx, 0); /* * We can recurse on the same lock type through: From c24b513337f06712b8c81eb4f1413d44591dfee7 Mon Sep 17 00:00:00 2001 From: "Carlos R. 
Mafra" Date: Wed, 5 Aug 2009 20:53:34 +0200 Subject: [PATCH 21/27] perf: "Longum est iter per praecepta, breve et efficax per exempla" A few examples of how 'perf' can be used, from an e-mail by Ingo Molnar http://lkml.org/lkml/2009/8/4/346. Signed-off-by: Carlos R. Mafra Cc: Peter Zijlstra Cc: Valdis.Kletnieks@vt.edu LKML-Reference: <20090805185334.GA4535@Pilar.aei.mpg.de> Signed-off-by: Ingo Molnar --- tools/perf/Documentation/perf-examples.txt | 225 +++++++++++++++++++++ 1 file changed, 225 insertions(+) create mode 100644 tools/perf/Documentation/perf-examples.txt diff --git a/tools/perf/Documentation/perf-examples.txt b/tools/perf/Documentation/perf-examples.txt new file mode 100644 index 00000000000..8eb6c489fb1 --- /dev/null +++ b/tools/perf/Documentation/perf-examples.txt @@ -0,0 +1,225 @@ + + ------------------------------ + ****** perf by examples ****** + ------------------------------ + +[ From an e-mail by Ingo Molnar, http://lkml.org/lkml/2009/8/4/346 ] + + +First, discovery/enumeration of available counters can be done via +'perf list': + +titan:~> perf list + [...] + kmem:kmalloc [Tracepoint event] + kmem:kmem_cache_alloc [Tracepoint event] + kmem:kmalloc_node [Tracepoint event] + kmem:kmem_cache_alloc_node [Tracepoint event] + kmem:kfree [Tracepoint event] + kmem:kmem_cache_free [Tracepoint event] + kmem:mm_page_free_direct [Tracepoint event] + kmem:mm_pagevec_free [Tracepoint event] + kmem:mm_page_alloc [Tracepoint event] + kmem:mm_page_alloc_zone_locked [Tracepoint event] + kmem:mm_page_pcpu_drain [Tracepoint event] + kmem:mm_page_alloc_extfrag [Tracepoint event] + +Then any (or all) of the above event sources can be activated and +measured. For example the page alloc/free properties of a 'hackbench +run' are: + + titan:~> perf stat -e kmem:mm_page_pcpu_drain -e kmem:mm_page_alloc + -e kmem:mm_pagevec_free -e kmem:mm_page_free_direct ./hackbench 10 + Time: 0.575 + + Performance counter stats for './hackbench 10': + + 13857 kmem:mm_page_pcpu_drain + 27576 kmem:mm_page_alloc + 6025 kmem:mm_pagevec_free + 20934 kmem:mm_page_free_direct + + 0.613972165 seconds time elapsed + +You can observe the statistical properties as well, by using the +'repeat the workload N times' feature of perf stat: + + titan:~> perf stat --repeat 5 -e kmem:mm_page_pcpu_drain -e + kmem:mm_page_alloc -e kmem:mm_pagevec_free -e + kmem:mm_page_free_direct ./hackbench 10 + Time: 0.627 + Time: 0.644 + Time: 0.564 + Time: 0.559 + Time: 0.626 + + Performance counter stats for './hackbench 10' (5 runs): + + 12920 kmem:mm_page_pcpu_drain ( +- 3.359% ) + 25035 kmem:mm_page_alloc ( +- 3.783% ) + 6104 kmem:mm_pagevec_free ( +- 0.934% ) + 18376 kmem:mm_page_free_direct ( +- 4.941% ) + + 0.643954516 seconds time elapsed ( +- 2.363% ) + +Furthermore, these tracepoints can be used to sample the workload as +well. For example the page allocations done by a 'git gc' can be +captured the following way: + + titan:~/git> perf record -f -e kmem:mm_page_alloc -c 1 ./git gc + Counting objects: 1148, done. + Delta compression using up to 2 threads. + Compressing objects: 100% (450/450), done. + Writing objects: 100% (1148/1148), done. + Total 1148 (delta 690), reused 1148 (delta 690) + [ perf record: Captured and wrote 0.267 MB perf.data (~11679 samples) ] + +To check which functions generated page allocations: + + titan:~/git> perf report + # Samples: 10646 + # + # Overhead Command Shared Object + # ........ ............... .......................... 
+ # + 23.57% git-repack /lib64/libc-2.5.so + 21.81% git /lib64/libc-2.5.so + 14.59% git ./git + 11.79% git-repack ./git + 7.12% git /lib64/ld-2.5.so + 3.16% git-repack /lib64/libpthread-2.5.so + 2.09% git-repack /bin/bash + 1.97% rm /lib64/libc-2.5.so + 1.39% mv /lib64/ld-2.5.so + 1.37% mv /lib64/libc-2.5.so + 1.12% git-repack /lib64/ld-2.5.so + 0.95% rm /lib64/ld-2.5.so + 0.90% git-update-serv /lib64/libc-2.5.so + 0.73% git-update-serv /lib64/ld-2.5.so + 0.68% perf /lib64/libpthread-2.5.so + 0.64% git-repack /usr/lib64/libz.so.1.2.3 + +Or to see it on a more finegrained level: + +titan:~/git> perf report --sort comm,dso,symbol +# Samples: 10646 +# +# Overhead Command Shared Object Symbol +# ........ ............... .......................... ...... +# + 9.35% git-repack ./git [.] insert_obj_hash + 9.12% git ./git [.] insert_obj_hash + 7.31% git /lib64/libc-2.5.so [.] memcpy + 6.34% git-repack /lib64/libc-2.5.so [.] _int_malloc + 6.24% git-repack /lib64/libc-2.5.so [.] memcpy + 5.82% git-repack /lib64/libc-2.5.so [.] __GI___fork + 5.47% git /lib64/libc-2.5.so [.] _int_malloc + 2.99% git /lib64/libc-2.5.so [.] memset + +Furthermore, call-graph sampling can be done too, of page +allocations - to see precisely what kind of page allocations there +are: + + titan:~/git> perf record -f -g -e kmem:mm_page_alloc -c 1 ./git gc + Counting objects: 1148, done. + Delta compression using up to 2 threads. + Compressing objects: 100% (450/450), done. + Writing objects: 100% (1148/1148), done. + Total 1148 (delta 690), reused 1148 (delta 690) + [ perf record: Captured and wrote 0.963 MB perf.data (~42069 samples) ] + + titan:~/git> perf report -g + # Samples: 10686 + # + # Overhead Command Shared Object + # ........ ............... .......................... + # + 23.25% git-repack /lib64/libc-2.5.so + | + |--50.00%-- _int_free + | + |--37.50%-- __GI___fork + | make_child + | + |--12.50%-- ptmalloc_unlock_all2 + | make_child + | + --6.25%-- __GI_strcpy + 21.61% git /lib64/libc-2.5.so + | + |--30.00%-- __GI_read + | | + | --83.33%-- git_config_from_file + | git_config + | | + [...] + +Or you can observe the whole system's page allocations for 10 +seconds: + +titan:~/git> perf stat -a -e kmem:mm_page_pcpu_drain -e +kmem:mm_page_alloc -e kmem:mm_pagevec_free -e +kmem:mm_page_free_direct sleep 10 + + Performance counter stats for 'sleep 10': + + 171585 kmem:mm_page_pcpu_drain + 322114 kmem:mm_page_alloc + 73623 kmem:mm_pagevec_free + 254115 kmem:mm_page_free_direct + + 10.000591410 seconds time elapsed + +Or observe how fluctuating the page allocations are, via statistical +analysis done over ten 1-second intervals: + + titan:~/git> perf stat --repeat 10 -a -e kmem:mm_page_pcpu_drain -e + kmem:mm_page_alloc -e kmem:mm_pagevec_free -e + kmem:mm_page_free_direct sleep 1 + + Performance counter stats for 'sleep 1' (10 runs): + + 17254 kmem:mm_page_pcpu_drain ( +- 3.709% ) + 34394 kmem:mm_page_alloc ( +- 4.617% ) + 7509 kmem:mm_pagevec_free ( +- 4.820% ) + 25653 kmem:mm_page_free_direct ( +- 3.672% ) + + 1.058135029 seconds time elapsed ( +- 3.089% ) + +Or you can annotate the recorded 'git gc' run on a per symbol basis +and check which instructions/source-code generated page allocations: + + titan:~/git> perf annotate __GI___fork + ------------------------------------------------ + Percent | Source code & Disassembly of libc-2.5.so + ------------------------------------------------ + : + : + : Disassembly of section .plt: + : Disassembly of section .text: + : + : 00000031a2e95560 <__fork>: + [...] 
+ 0.00 : 31a2e95602: b8 38 00 00 00 mov $0x38,%eax + 0.00 : 31a2e95607: 0f 05 syscall + 83.42 : 31a2e95609: 48 3d 00 f0 ff ff cmp $0xfffffffffffff000,%rax + 0.00 : 31a2e9560f: 0f 87 4d 01 00 00 ja 31a2e95762 <__fork+0x202> + 0.00 : 31a2e95615: 85 c0 test %eax,%eax + +( this shows that 83.42% of __GI___fork's page allocations come from + the 0x38 system call it performs. ) + +etc. etc. - a lot more is possible. I could list a dozen of +other different usecases straight away - neither of which is +possible via /proc/vmstat. + +/proc/vmstat is not in the same league really, in terms of +expressive power of system analysis and performance +analysis. + +All that the above results needed were those new tracepoints +in include/tracing/events/kmem.h. + + Ingo + + From 183f3b0887083d36c8a25cd5e3518906415d1889 Mon Sep 17 00:00:00 2001 From: Mike Galbraith Date: Sat, 8 Aug 2009 14:14:15 +0200 Subject: [PATCH 22/27] perf_counter tools: Fix libbfd detection for systems with libz dependency Due to a libz dependency in some distro's binutils package, C++ demangle support isn't compiled in despite the necessary libraries being available. Fix this by adding a -lz link test to the dependency detection rules. Signed-off-by: Mike Galbraith Acked-by: Peter Zijlstra LKML-Reference: <1249733655.6929.5.camel@marge.simson.net> Signed-off-by: Ingo Molnar --- tools/perf/Makefile | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/perf/Makefile b/tools/perf/Makefile index 1916e44b9bb..60411e94113 100644 --- a/tools/perf/Makefile +++ b/tools/perf/Makefile @@ -387,10 +387,14 @@ else has_bfd_iberty := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty > /dev/null 2>&1 && echo y") + has_bfd_iberty_z := $(shell sh -c "(echo '\#include '; echo 'int main(void) { bfd_demangle(0, 0, 0); return 0; }') | $(CC) -x c - $(ALL_CFLAGS) -o /dev/null $(ALL_LDFLAGS) -lbfd -liberty -lz > /dev/null 2>&1 && echo y") + ifeq ($(has_bfd),y) EXTLIBS += -lbfd else ifeq ($(has_bfd_iberty),y) EXTLIBS += -lbfd -liberty + else ifeq ($(has_bfd_iberty_z),y) + EXTLIBS += -lbfd -liberty -lz else msg := $(warning No bfd.h/libbfd found, install binutils-dev[el] to gain symbol demangling) BASIC_CFLAGS += -DNO_DEMANGLE From c0a8865e32c8d1a562db38e06ef31ef23282f646 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Sun, 9 Aug 2009 04:19:15 +0200 Subject: [PATCH 23/27] perf tools: callchain: Fix bad rounding of minimum rate Sometimes we get callchain branches that have a rate under the limit given by the user. Say you launched: perf record -f -g -a ./hackbench 10 perf report -g fractal,10.0 And you got: 2.33% hackbench [kernel] [k] _spin_lock_irqsave | |--78.57%-- remove_wait_queue | poll_freewait | do_sys_poll | sys_poll | sysenter_dispatch | 0xf7ffa430 | 0x1ffadea3c | |--7.14%-- __up_read | up_read | do_page_fault | page_fault | 0xf7ffa430 | 0xa0df710000000a ... It is abnormal to get a 7.14% branch whereas we passed a 10% filter. The problem is that we round down the minimum threshold. This happens mostly when we have a very low number of events. If the total for your branch is 4 events and you have a sub-branch of 3 events, filtering at 90% will be computed as follows: limit = 4 * 0.9; The result is 3.6, but the cast to integer truncates it down to 3. It means that our filter is actually 75%. We must then explicitly round up the minimum threshold.
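A minimal standalone illustration of the truncation problem and of the ceil() fix described above; the numbers mirror the example, and this is not the perf code itself (build with -lm for ceil()).

#include <math.h>
#include <stdio.h>

int main(void)
{
    unsigned long long children_hit = 4;   /* total events in the branch */
    double min_percent = 0.9;              /* the requested 90% filter   */

    /* Truncating cast: 4 * 0.9 = 3.6 becomes 3, i.e. only a 75% filter. */
    unsigned long long truncated = children_hit * min_percent;

    /* Rounding up keeps the filter at least as strict as requested. */
    unsigned long long rounded_up = ceil(children_hit * min_percent);

    printf("truncated limit: %llu, rounded-up limit: %llu\n",
           truncated, rounded_up);         /* prints 3 and 4 */
    return 0;
}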
Reported-by: Ingo Molnar Signed-off-by: Frederic Weisbecker Cc: acme@redhat.com Cc: peterz@infradead.org Cc: efault@gmx.de LKML-Reference: <20090809024235.GA10146@nowhere> Signed-off-by: Ingo Molnar --- tools/perf/util/callchain.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/callchain.c b/tools/perf/util/callchain.c index a8e67aa9ef4..01147341164 100644 --- a/tools/perf/util/callchain.c +++ b/tools/perf/util/callchain.c @@ -13,6 +13,7 @@ #include #include #include +#include #include "callchain.h" @@ -112,7 +113,7 @@ static void __sort_chain_graph_rel(struct callchain_node *node, u64 min_hit; node->rb_root = RB_ROOT; - min_hit = node->children_hit * min_percent / 100.0; + min_hit = ceil(node->children_hit * min_percent); chain_for_each_child(child, node) { __sort_chain_graph_rel(child, min_percent); @@ -126,7 +127,7 @@ static void sort_chain_graph_rel(struct rb_root *rb_root, struct callchain_node *chain_root, u64 min_hit __used, struct callchain_param *param) { - __sort_chain_graph_rel(chain_root, param->min_percent); + __sort_chain_graph_rel(chain_root, param->min_percent / 100.0); rb_root->rb_node = chain_root->rb_root.rb_node; } From a044560c3a1f0ad75ce685c1ed7604820b9ed319 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:16:52 +0200 Subject: [PATCH 24/27] perf_counter: Correct PERF_SAMPLE_RAW output PERF_SAMPLE_* output switches should unconditionally output the correct format, as they are the only way to unambiguously parse the PERF_EVENT_SAMPLE data. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896447.17467.74.camel@twins> Signed-off-by: Ingo Molnar --- include/linux/perf_counter.h | 2 ++ include/trace/ftrace.h | 3 ++- kernel/perf_counter.c | 30 ++++++++++++++++++++++++------ 3 files changed, 28 insertions(+), 7 deletions(-) diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h index 2aabe43c1d0..a9d823a93fe 100644 --- a/include/linux/perf_counter.h +++ b/include/linux/perf_counter.h @@ -369,6 +369,8 @@ enum perf_event_type { * * { u64 nr, * u64 ips[nr]; } && PERF_SAMPLE_CALLCHAIN + * { u32 size; + * char data[size];}&& PERF_SAMPLE_RAW * }; */ PERF_EVENT_SAMPLE = 9, diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7fb16d90e7b..7167b9b97da 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -685,7 +685,8 @@ static void ftrace_profile_##call(proto) \ pc = preempt_count(); \ \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ - __entry_size = ALIGN(__data_size + sizeof(*entry), sizeof(u64));\ + __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ + sizeof(u64)); \ \ do { \ char raw_data[__entry_size]; \ diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 546e62d6294..5229d1666fa 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -2646,7 +2646,6 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, u64 counter; } group_entry; struct perf_callchain_entry *callchain = NULL; - struct perf_raw_record *raw = NULL; int callchain_size = 0; u64 time; struct { @@ -2716,9 +2715,15 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } if (sample_type & PERF_SAMPLE_RAW) { - raw = data->raw; - if (raw) - header.size += raw->size; + int size = sizeof(u32); + + if (data->raw) + size += data->raw->size; + else + size += sizeof(u32); + + WARN_ON_ONCE(size & (sizeof(u64)-1)); + header.size 
+= size; } ret = perf_output_begin(&handle, counter, header.size, nmi, 1); @@ -2784,8 +2789,21 @@ static void perf_counter_output(struct perf_counter *counter, int nmi, } } - if ((sample_type & PERF_SAMPLE_RAW) && raw) - perf_output_copy(&handle, raw->data, raw->size); + if (sample_type & PERF_SAMPLE_RAW) { + if (data->raw) { + perf_output_put(&handle, data->raw->size); + perf_output_copy(&handle, data->raw->data, data->raw->size); + } else { + struct { + u32 size; + u32 data; + } raw = { + .size = sizeof(u32), + .data = 0, + }; + perf_output_put(&handle, raw); + } + } perf_output_end(&handle); } From a4e95fc2cbb31d70a65beffeaf8773f881328c34 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 10 Aug 2009 11:20:12 +0200 Subject: [PATCH 25/27] perf_counter: Require CAP_SYS_ADMIN for raw tracepoint data Raw tracepoint data contains various kernel internals and data from other users, so restrict this to CAP_SYS_ADMIN. Signed-off-by: Peter Zijlstra Acked-by: Frederic Weisbecker Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <1249896452.17467.75.camel@twins> Signed-off-by: Ingo Molnar --- kernel/perf_counter.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c index 5229d1666fa..b0b20a07f39 100644 --- a/kernel/perf_counter.c +++ b/kernel/perf_counter.c @@ -3787,6 +3787,14 @@ static void tp_perf_counter_destroy(struct perf_counter *counter) static const struct pmu *tp_perf_counter_init(struct perf_counter *counter) { + /* + * Raw tracepoint data is a severe data leak, only allow root to + * have these. + */ + if ((counter->attr.sample_type & PERF_SAMPLE_RAW) && + !capable(CAP_SYS_ADMIN)) + return ERR_PTR(-EPERM); + if (ftrace_profile_enable(counter->attr.config)) return NULL; From 304703aba31a87903b8c0db8f5e6890cac2d596d Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:11:32 +0200 Subject: [PATCH 26/27] perf_counter: Subtract the buffer size field from the event record size We compute the perf raw sample size by aligning the raw ftrace event size plus the buffer size field itself. We do that instead of aligning only the perf raw sample size, so that we might save some space in some cases. But this buffer size field is not stored in the perf raw sample, so we must subtract its size from the buffer once we have computed the alignment; otherwise we get a useless u32 field in the buffer.
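The size computation being described can be sketched with assumed numbers; ALIGN below mirrors the usual power-of-two rounding idiom, and the 42-byte entry size is invented for illustration only.

#include <stdio.h>

/* Round x up to the next multiple of the power-of-two a. */
#define ALIGN(x, a)  (((x) + (a) - 1) & ~((unsigned long)(a) - 1))

int main(void)
{
    unsigned long data_plus_entry = 42;   /* assumed ftrace entry size */
    unsigned long buf_size;

    /* Add the u32 size field before aligning to u64, then subtract it
     * again because it is not stored in the sample buffer itself. */
    buf_size = ALIGN(data_plus_entry + sizeof(unsigned int),
                     sizeof(unsigned long long));
    buf_size -= sizeof(unsigned int);

    /* 42 -> 46 -> 48 aligned -> 44; the u32 written separately by the
     * output code brings the record back to a u64-aligned 48 bytes. */
    printf("raw sample buffer: %lu bytes\n", buf_size);
    return 0;
}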
Signed-off-by: Frederic Weisbecker Acked-by: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith Cc: Paul Mackerras LKML-Reference: <20090810141129.GA5124@nowhere> Signed-off-by: Ingo Molnar --- include/trace/ftrace.h | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index 7167b9b97da..a05524fa245 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -637,7 +637,12 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * pc = preempt_count(); * * __data_size = ftrace_get_offsets_(&__data_offsets, args); - * __entry_size = __data_size + sizeof(*entry); + * + * // Below we want to get the aligned size by taking into account + * // the u32 field that will later store the buffer size + * __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32), + * sizeof(u64)); + * __entry_size -= sizeof(u32); * * do { * char raw_data[__entry_size]; <- allocate our sample in the stack @@ -687,6 +692,7 @@ static void ftrace_profile_##call(proto) \ __data_size = ftrace_get_offsets_##call(&__data_offsets, args); \ __entry_size = ALIGN(__data_size + sizeof(*entry) + sizeof(u32),\ sizeof(u64)); \ + __entry_size -= sizeof(u32); \ \ do { \ char raw_data[__entry_size]; \ From 1853db0e02ae4088f102b0d8e59e83dc98f93f03 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Mon, 10 Aug 2009 16:38:36 +0200 Subject: [PATCH 27/27] perf_counter: Zero dead bytes from ftrace raw samples size alignment After aligning the ftrace raw samples, there are dead bytes storing random data from the stack. We don't want to leak these to userspace, then zero these out. Before: 0x2de88 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 2c 00 00 00 00 00 00 00 h...h...,...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 40 7f 46 81 ff ff ff ff 00 10 1b 7f h...@.F........ ^ ^ ^ ^ Leak After: 0x2d318 [0x50]: event: 9 . . ... raw event: size 80 bytes . 0000: 09 00 00 00 01 00 50 00 d0 c7 00 81 ff ff ff ff ......P........ . 0010: 68 01 00 00 68 01 00 00 68 14 00 00 00 00 00 00 h...h...h...... . 0020: 2c 00 00 00 2b 00 01 02 68 01 00 00 68 01 00 00 ,...+...h...h.. . 0030: 6b 6f 6e 64 65 6d 61 6e 64 2f 30 00 00 00 00 00 kondemand/0.... . 0040: 68 01 00 00 a0 80 46 81 ff ff ff ff 00 00 00 00 h.....F........ 
^ ^ ^ ^ Fixed Reported-by: Peter Zijlstra Signed-off-by: Frederic Weisbecker Cc: Frederic Weisbecker Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith LKML-Reference: <1249915116-5210-1-git-send-email-fweisbec@gmail.com> Signed-off-by: Ingo Molnar Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Mike Galbraith --- include/trace/ftrace.h | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/include/trace/ftrace.h b/include/trace/ftrace.h index a05524fa245..f64fbaae781 100644 --- a/include/trace/ftrace.h +++ b/include/trace/ftrace.h @@ -648,6 +648,9 @@ __attribute__((section("_ftrace_events"))) event_##call = { \ * char raw_data[__entry_size]; <- allocate our sample in the stack * struct trace_entry *ent; * + * zero dead bytes from alignment to avoid stack leak to userspace: + * + * *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; * entry = (struct ftrace_raw_ *)raw_data; * ent = &entry->ent; * tracing_generic_entry_update(ent, irq_flags, pc); @@ -698,6 +701,7 @@ static void ftrace_profile_##call(proto) \ char raw_data[__entry_size]; \ struct trace_entry *ent; \ \ + *(u64 *)(&raw_data[__entry_size - sizeof(u64)]) = 0ULL; \ entry = (struct ftrace_raw_##call *)raw_data; \ ent = &entry->ent; \ tracing_generic_entry_update(ent, irq_flags, pc); \
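As a standalone sketch of the same zeroing trick on a userspace buffer (sizes and payload are invented for illustration; the kernel diff above remains the authoritative version): clear the trailing u64 before filling the buffer, so alignment padding never carries stack garbage.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN(x, a)  (((x) + (a) - 1) & ~((size_t)(a) - 1))

int main(void)
{
    size_t data_size = 42;   /* assumed raw payload size */
    size_t entry_size = ALIGN(data_size + sizeof(uint32_t),
                              sizeof(uint64_t)) - sizeof(uint32_t);
    char raw_data[entry_size];

    /* Zero the trailing u64 first: the alignment above may leave dead
     * bytes at the end of the buffer, and without this they would hold
     * whatever the stack happened to contain. */
    memset(&raw_data[entry_size - sizeof(uint64_t)], 0, sizeof(uint64_t));

    memcpy(raw_data, "payload", 7);   /* then fill in the real event data */
    printf("prepared a %zu byte raw sample with zeroed padding\n",
           entry_size);
    return 0;
}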