From 42202dd56c717f173cd0bf2390249e1bf5cf210b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 13 Jun 2009 14:57:28 +0200 Subject: [PATCH] perf stat: Add feature to run and measure a command multiple times Add the --repeat feature to perf stat, which repeats a given command up to a 100 times, collects the stats and calculates an average and a stddev. For example, the following oneliner 'perf stat' command runs hackbench 5 times and prints a tabulated result of all metrics, with averages and noise levels (in percentage) printed: aldebaran:~/linux/linux/tools/perf> ./perf stat --repeat 5 ~/hackbench 10 Time: 0.117 Time: 0.108 Time: 0.089 Time: 0.088 Time: 0.100 Performance counter stats for '/home/mingo/hackbench 10' (5 runs): 1243.989586 task-clock-msecs # 10.460 CPUs ( +- 4.720% ) 47706 context-switches # 0.038 M/sec ( +- 19.706% ) 387 CPU-migrations # 0.000 M/sec ( +- 3.608% ) 17793 page-faults # 0.014 M/sec ( +- 0.354% ) 3770941606 cycles # 3031.329 M/sec ( +- 4.621% ) 1566372416 instructions # 0.415 IPC ( +- 2.703% ) 16783421 cache-references # 13.492 M/sec ( +- 5.202% ) 7128590 cache-misses # 5.730 M/sec ( +- 7.420% ) 0.118924455 seconds time elapsed. The goal of this feature is to allow the reliance on these accurate statistics and to know how many times a command has to be repeated for the noise to go down to an acceptable level. (The -v option can be used to see a line printed out as each run progresses.) Cc: Peter Zijlstra Cc: Mike Galbraith Cc: Paul Mackerras Cc: Arnaldo Carvalho de Melo LKML-Reference: Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 305 +++++++++++++++++++++++++++----------- 1 file changed, 217 insertions(+), 88 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c12804853ea..9eb42b1ae78 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -43,6 +43,7 @@ #include "util/parse-events.h" #include +#include static struct perf_counter_attr default_attrs[MAX_COUNTERS] = { @@ -79,12 +80,34 @@ static const unsigned int default_count[] = { 10000, }; -static __u64 event_res[MAX_COUNTERS][3]; -static __u64 event_scaled[MAX_COUNTERS]; +#define MAX_RUN 100 -static __u64 runtime_nsecs; -static __u64 walltime_nsecs; -static __u64 runtime_cycles; +static int run_count = 1; +static int run_idx = 0; + +static __u64 event_res[MAX_RUN][MAX_COUNTERS][3]; +static __u64 event_scaled[MAX_RUN][MAX_COUNTERS]; + +//static __u64 event_hist[MAX_RUN][MAX_COUNTERS][3]; + + +static __u64 runtime_nsecs[MAX_RUN]; +static __u64 walltime_nsecs[MAX_RUN]; +static __u64 runtime_cycles[MAX_RUN]; + +static __u64 event_res_avg[MAX_COUNTERS][3]; +static __u64 event_res_noise[MAX_COUNTERS][3]; + +static __u64 event_scaled_avg[MAX_COUNTERS]; + +static __u64 runtime_nsecs_avg; +static __u64 runtime_nsecs_noise; + +static __u64 walltime_nsecs_avg; +static __u64 walltime_nsecs_noise; + +static __u64 runtime_cycles_avg; +static __u64 runtime_cycles_noise; static void create_perf_stat_counter(int counter) { @@ -140,7 +163,7 @@ static void read_counter(int counter) int cpu, nv; int scaled; - count = event_res[counter]; + count = event_res[run_idx][counter]; count[0] = count[1] = count[2] = 0; @@ -151,6 +174,8 @@ static void read_counter(int counter) res = read(fd[cpu][counter], single_count, nv * sizeof(__u64)); assert(res == nv * sizeof(__u64)); + close(fd[cpu][counter]); + fd[cpu][counter] = -1; count[0] += single_count[0]; if (scale) { @@ -162,13 +187,13 @@ static void read_counter(int counter) scaled = 0; if (scale) { if (count[2] == 0) { - event_scaled[counter] = -1; + event_scaled[run_idx][counter] = -1; count[0] = 0; return; } if (count[2] < count[1]) { - event_scaled[counter] = 1; + event_scaled[run_idx][counter] = 1; count[0] = (unsigned long long) ((double)count[0] * count[1] / count[2] + 0.5); } @@ -178,82 +203,18 @@ static void read_counter(int counter) */ if (attrs[counter].type == PERF_TYPE_SOFTWARE && attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) - runtime_nsecs = count[0]; + runtime_nsecs[run_idx] = count[0]; if (attrs[counter].type == PERF_TYPE_HARDWARE && attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES) - runtime_cycles = count[0]; + runtime_cycles[run_idx] = count[0]; } -static void nsec_printout(int counter, __u64 *count) -{ - double msecs = (double)count[0] / 1000000; - - fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); - - if (attrs[counter].type == PERF_TYPE_SOFTWARE && - attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { - - if (walltime_nsecs) - fprintf(stderr, " # %10.3f CPUs", - (double)count[0] / (double)walltime_nsecs); - } -} - -static void abs_printout(int counter, __u64 *count) -{ - fprintf(stderr, " %14Ld %-20s", count[0], event_name(counter)); - - if (runtime_cycles && - attrs[counter].type == PERF_TYPE_HARDWARE && - attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { - - fprintf(stderr, " # %10.3f IPC", - (double)count[0] / (double)runtime_cycles); - - return; - } - - if (runtime_nsecs) - fprintf(stderr, " # %10.3f M/sec", - (double)count[0]/runtime_nsecs*1000.0); -} - -/* - * Print out the results of a single counter: - */ -static void print_counter(int counter) -{ - __u64 *count; - int scaled; - - count = event_res[counter]; - scaled = event_scaled[counter]; - - if (scaled == -1) { - fprintf(stderr, " %14s %-20s\n", - "", event_name(counter)); - return; - } - - if (nsec_counter(counter)) - nsec_printout(counter, count); - else - abs_printout(counter, count); - - if (scaled) - fprintf(stderr, " (scaled from %.2f%%)", - (double) count[2] / count[1] * 100); - - fprintf(stderr, "\n"); -} - -static int do_perf_stat(int argc, const char **argv) +static int run_perf_stat(int argc, const char **argv) { unsigned long long t0, t1; + int status = 0; int counter; - int status; int pid; - int i; if (!system_wide) nr_cpus = 1; @@ -277,13 +238,168 @@ static int do_perf_stat(int argc, const char **argv) } } - while (wait(&status) >= 0) - ; + wait(&status); prctl(PR_TASK_PERF_COUNTERS_DISABLE); t1 = rdclock(); - walltime_nsecs = t1 - t0; + walltime_nsecs[run_idx] = t1 - t0; + + for (counter = 0; counter < nr_counters; counter++) + read_counter(counter); + + return WEXITSTATUS(status); +} + +static void print_noise(__u64 *count, __u64 *noise) +{ + if (run_count > 1) + fprintf(stderr, " ( +- %7.3f%% )", + (double)noise[0]/(count[0]+1)*100.0); +} + +static void nsec_printout(int counter, __u64 *count, __u64 *noise) +{ + double msecs = (double)count[0] / 1000000; + + fprintf(stderr, " %14.6f %-20s", msecs, event_name(counter)); + + if (attrs[counter].type == PERF_TYPE_SOFTWARE && + attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) { + + if (walltime_nsecs_avg) + fprintf(stderr, " # %10.3f CPUs ", + (double)count[0] / (double)walltime_nsecs_avg); + } + print_noise(count, noise); +} + +static void abs_printout(int counter, __u64 *count, __u64 *noise) +{ + fprintf(stderr, " %14Ld %-20s", count[0], event_name(counter)); + + if (runtime_cycles_avg && + attrs[counter].type == PERF_TYPE_HARDWARE && + attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) { + + fprintf(stderr, " # %10.3f IPC ", + (double)count[0] / (double)runtime_cycles_avg); + } else { + if (runtime_nsecs_avg) { + fprintf(stderr, " # %10.3f M/sec", + (double)count[0]/runtime_nsecs_avg*1000.0); + } + } + print_noise(count, noise); +} + +/* + * Print out the results of a single counter: + */ +static void print_counter(int counter) +{ + __u64 *count, *noise; + int scaled; + + count = event_res_avg[counter]; + noise = event_res_noise[counter]; + scaled = event_scaled_avg[counter]; + + if (scaled == -1) { + fprintf(stderr, " %14s %-20s\n", + "", event_name(counter)); + return; + } + + if (nsec_counter(counter)) + nsec_printout(counter, count, noise); + else + abs_printout(counter, count, noise); + + if (scaled) + fprintf(stderr, " (scaled from %.2f%%)", + (double) count[2] / count[1] * 100); + + fprintf(stderr, "\n"); +} + +/* + * Normalize noise values down to stddev: + */ +static void normalize(__u64 *val) +{ + double res; + + res = (double)*val / (run_count * sqrt((double)run_count)); + + *val = (__u64)res; +} + +/* + * Calculate the averages and noises: + */ +static void calc_avg(void) +{ + int i, j; + + for (i = 0; i < run_count; i++) { + runtime_nsecs_avg += runtime_nsecs[i]; + walltime_nsecs_avg += walltime_nsecs[i]; + runtime_cycles_avg += runtime_cycles[i]; + + for (j = 0; j < nr_counters; j++) { + event_res_avg[j][0] += event_res[i][j][0]; + event_res_avg[j][1] += event_res[i][j][1]; + event_res_avg[j][2] += event_res[i][j][2]; + event_scaled_avg[j] += event_scaled[i][j]; + } + } + runtime_nsecs_avg /= run_count; + walltime_nsecs_avg /= run_count; + runtime_cycles_avg /= run_count; + + for (j = 0; j < nr_counters; j++) { + event_res_avg[j][0] /= run_count; + event_res_avg[j][1] /= run_count; + event_res_avg[j][2] /= run_count; + } + + for (i = 0; i < run_count; i++) { + runtime_nsecs_noise += + abs((__s64)(runtime_nsecs[i] - runtime_nsecs_avg)); + walltime_nsecs_noise += + abs((__s64)(walltime_nsecs[i] - walltime_nsecs_avg)); + runtime_cycles_noise += + abs((__s64)(runtime_cycles[i] - runtime_cycles_avg)); + + for (j = 0; j < nr_counters; j++) { + event_res_noise[j][0] += + abs((__s64)(event_res[i][j][0] - event_res_avg[j][0])); + event_res_noise[j][1] += + abs((__s64)(event_res[i][j][1] - event_res_avg[j][1])); + event_res_noise[j][2] += + abs((__s64)(event_res[i][j][2] - event_res_avg[j][2])); + } + } + + normalize(&runtime_nsecs_noise); + normalize(&walltime_nsecs_noise); + normalize(&runtime_cycles_noise); + + for (j = 0; j < nr_counters; j++) { + normalize(&event_res_noise[j][0]); + normalize(&event_res_noise[j][1]); + normalize(&event_res_noise[j][2]); + } +} + +static void print_stat(int argc, const char **argv) +{ + int i, counter; + + calc_avg(); + + run_idx = 0; fflush(stdout); @@ -293,21 +409,19 @@ static int do_perf_stat(int argc, const char **argv) for (i = 1; i < argc; i++) fprintf(stderr, " %s", argv[i]); - fprintf(stderr, "\':\n"); - fprintf(stderr, "\n"); - - for (counter = 0; counter < nr_counters; counter++) - read_counter(counter); + fprintf(stderr, "\'"); + if (run_count > 1) + fprintf(stderr, " (%d runs)", run_count); + fprintf(stderr, ":\n\n"); for (counter = 0; counter < nr_counters; counter++) print_counter(counter); fprintf(stderr, "\n"); - fprintf(stderr, " %14.9f seconds time elapsed.\n", (double)(t1-t0)/1e9); + fprintf(stderr, " %14.9f seconds time elapsed.\n", + (double)walltime_nsecs_avg/1e9); fprintf(stderr, "\n"); - - return 0; } static volatile int signr = -1; @@ -345,11 +459,15 @@ static const struct option options[] = { "scale/normalize counters"), OPT_BOOLEAN('v', "verbose", &verbose, "be more verbose (show counter open errors, etc)"), + OPT_INTEGER('r', "repeat", &run_count, + "repeat command and print average + stddev (max: 100)"), OPT_END() }; int cmd_stat(int argc, const char **argv, const char *prefix) { + int status; + page_size = sysconf(_SC_PAGE_SIZE); memcpy(attrs, default_attrs, sizeof(attrs)); @@ -357,6 +475,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix) argc = parse_options(argc, argv, options, stat_usage, 0); if (!argc) usage_with_options(stat_usage, options); + if (run_count <= 0 || run_count > MAX_RUN) + usage_with_options(stat_usage, options); if (!nr_counters) nr_counters = 8; @@ -376,5 +496,14 @@ int cmd_stat(int argc, const char **argv, const char *prefix) signal(SIGALRM, skip_signal); signal(SIGABRT, skip_signal); - return do_perf_stat(argc, argv); + status = 0; + for (run_idx = 0; run_idx < run_count; run_idx++) { + if (run_count != 1 && verbose) + fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx+1); + status = run_perf_stat(argc, argv); + } + + print_stat(argc, argv); + + return status; }