051ae7f734
Vince Weaver reported a 'perf stat' measurement overhead in the count of retired instructions, which can amount to a +6000 instructions inflated count in the reported count. At present, perf stat creates its counters on the perf process. Thus the counters count the fork and various other activity in both the parent and child, such as the resolver overhead for resolving PLT entries for any libc functions that haven't been called before, such as execvp. This reduces the overhead by creating the counters on the child process after the fork, using a couple of pipes to synchronize so that the child process waits until the parent has created the counters before doing the exec. To eliminate the PLT resolution overhead on calling execvp, this does a dummy execvp first which will always fail. With this, the overhead of executing a program goes down from over 4800 instructions to about 90 instructions on powerpc (32-bit). This was measured with a statically-linked program written in assembler which only does the 3 instructions needed to call _exit(0). Before: $ perf stat -e 0:1:u ./three Performance counter stats for './three': 4858 instructions 0.001274523 seconds time elapsed After: $ perf stat -e 0:1:u ./three Performance counter stats for './three': 92 instructions 0.000468153 seconds time elapsed Reported-by: Vince Weaver <vince@deater.net> Signed-off-by: Paul Mackerras <paulus@samba.org> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <19016.41425.814043.870352@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
555 lines
14 KiB
C
555 lines
14 KiB
C
/*
|
|
* builtin-stat.c
|
|
*
|
|
* Builtin stat command: Give a precise performance counters summary
|
|
* overview about any workload, CPU or specific PID.
|
|
*
|
|
* Sample output:
|
|
|
|
$ perf stat ~/hackbench 10
|
|
Time: 0.104
|
|
|
|
Performance counter stats for '/home/mingo/hackbench':
|
|
|
|
1255.538611 task clock ticks # 10.143 CPU utilization factor
|
|
54011 context switches # 0.043 M/sec
|
|
385 CPU migrations # 0.000 M/sec
|
|
17755 pagefaults # 0.014 M/sec
|
|
3808323185 CPU cycles # 3033.219 M/sec
|
|
1575111190 instructions # 1254.530 M/sec
|
|
17367895 cache references # 13.833 M/sec
|
|
7674421 cache misses # 6.112 M/sec
|
|
|
|
Wall-clock time elapsed: 123.786620 msecs
|
|
|
|
*
|
|
* Copyright (C) 2008, Red Hat Inc, Ingo Molnar <mingo@redhat.com>
|
|
*
|
|
* Improvements and fixes by:
|
|
*
|
|
* Arjan van de Ven <arjan@linux.intel.com>
|
|
* Yanmin Zhang <yanmin.zhang@intel.com>
|
|
* Wu Fengguang <fengguang.wu@intel.com>
|
|
* Mike Galbraith <efault@gmx.de>
|
|
* Paul Mackerras <paulus@samba.org>
|
|
* Jaswinder Singh Rajput <jaswinder@kernel.org>
|
|
*
|
|
* Released under the GPL v2. (and only v2, not any later version)
|
|
*/
|
|
|
|
#include "perf.h"
|
|
#include "builtin.h"
|
|
#include "util/util.h"
|
|
#include "util/parse-options.h"
|
|
#include "util/parse-events.h"
|
|
|
|
#include <sys/prctl.h>
|
|
#include <math.h>
|
|
|
|
static struct perf_counter_attr default_attrs[] = {
|
|
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK },
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES},
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS },
|
|
{ .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS },
|
|
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES },
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS },
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES},
|
|
{ .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES },
|
|
|
|
};
|
|
|
|
#define MAX_RUN 100
|
|
|
|
static int system_wide = 0;
|
|
static int verbose = 0;
|
|
static int nr_cpus = 0;
|
|
static int run_idx = 0;
|
|
|
|
static int run_count = 1;
|
|
static int inherit = 1;
|
|
static int scale = 1;
|
|
static int target_pid = -1;
|
|
static int null_run = 0;
|
|
|
|
static int fd[MAX_NR_CPUS][MAX_COUNTERS];
|
|
|
|
static u64 runtime_nsecs[MAX_RUN];
|
|
static u64 walltime_nsecs[MAX_RUN];
|
|
static u64 runtime_cycles[MAX_RUN];
|
|
|
|
static u64 event_res[MAX_RUN][MAX_COUNTERS][3];
|
|
static u64 event_scaled[MAX_RUN][MAX_COUNTERS];
|
|
|
|
static u64 event_res_avg[MAX_COUNTERS][3];
|
|
static u64 event_res_noise[MAX_COUNTERS][3];
|
|
|
|
static u64 event_scaled_avg[MAX_COUNTERS];
|
|
|
|
static u64 runtime_nsecs_avg;
|
|
static u64 runtime_nsecs_noise;
|
|
|
|
static u64 walltime_nsecs_avg;
|
|
static u64 walltime_nsecs_noise;
|
|
|
|
static u64 runtime_cycles_avg;
|
|
static u64 runtime_cycles_noise;
|
|
|
|
#define ERR_PERF_OPEN \
|
|
"Error: counter %d, sys_perf_counter_open() syscall returned with %d (%s)\n"
|
|
|
|
static void create_perf_stat_counter(int counter, int pid)
|
|
{
|
|
struct perf_counter_attr *attr = attrs + counter;
|
|
|
|
if (scale)
|
|
attr->read_format = PERF_FORMAT_TOTAL_TIME_ENABLED |
|
|
PERF_FORMAT_TOTAL_TIME_RUNNING;
|
|
|
|
if (system_wide) {
|
|
int cpu;
|
|
for (cpu = 0; cpu < nr_cpus; cpu++) {
|
|
fd[cpu][counter] = sys_perf_counter_open(attr, -1, cpu, -1, 0);
|
|
if (fd[cpu][counter] < 0 && verbose)
|
|
fprintf(stderr, ERR_PERF_OPEN, counter,
|
|
fd[cpu][counter], strerror(errno));
|
|
}
|
|
} else {
|
|
attr->inherit = inherit;
|
|
attr->disabled = 1;
|
|
|
|
fd[0][counter] = sys_perf_counter_open(attr, pid, -1, -1, 0);
|
|
if (fd[0][counter] < 0 && verbose)
|
|
fprintf(stderr, ERR_PERF_OPEN, counter,
|
|
fd[0][counter], strerror(errno));
|
|
}
|
|
}
|
|
|
|
/*
|
|
* Does the counter have nsecs as a unit?
|
|
*/
|
|
static inline int nsec_counter(int counter)
|
|
{
|
|
if (attrs[counter].type != PERF_TYPE_SOFTWARE)
|
|
return 0;
|
|
|
|
if (attrs[counter].config == PERF_COUNT_SW_CPU_CLOCK)
|
|
return 1;
|
|
|
|
if (attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
|
|
return 1;
|
|
|
|
return 0;
|
|
}
|
|
|
|
/*
|
|
* Read out the results of a single counter:
|
|
*/
|
|
static void read_counter(int counter)
|
|
{
|
|
u64 *count, single_count[3];
|
|
ssize_t res;
|
|
int cpu, nv;
|
|
int scaled;
|
|
|
|
count = event_res[run_idx][counter];
|
|
|
|
count[0] = count[1] = count[2] = 0;
|
|
|
|
nv = scale ? 3 : 1;
|
|
for (cpu = 0; cpu < nr_cpus; cpu++) {
|
|
if (fd[cpu][counter] < 0)
|
|
continue;
|
|
|
|
res = read(fd[cpu][counter], single_count, nv * sizeof(u64));
|
|
assert(res == nv * sizeof(u64));
|
|
close(fd[cpu][counter]);
|
|
fd[cpu][counter] = -1;
|
|
|
|
count[0] += single_count[0];
|
|
if (scale) {
|
|
count[1] += single_count[1];
|
|
count[2] += single_count[2];
|
|
}
|
|
}
|
|
|
|
scaled = 0;
|
|
if (scale) {
|
|
if (count[2] == 0) {
|
|
event_scaled[run_idx][counter] = -1;
|
|
count[0] = 0;
|
|
return;
|
|
}
|
|
|
|
if (count[2] < count[1]) {
|
|
event_scaled[run_idx][counter] = 1;
|
|
count[0] = (unsigned long long)
|
|
((double)count[0] * count[1] / count[2] + 0.5);
|
|
}
|
|
}
|
|
/*
|
|
* Save the full runtime - to allow normalization during printout:
|
|
*/
|
|
if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
|
|
attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK)
|
|
runtime_nsecs[run_idx] = count[0];
|
|
if (attrs[counter].type == PERF_TYPE_HARDWARE &&
|
|
attrs[counter].config == PERF_COUNT_HW_CPU_CYCLES)
|
|
runtime_cycles[run_idx] = count[0];
|
|
}
|
|
|
|
static int run_perf_stat(int argc, const char **argv)
|
|
{
|
|
unsigned long long t0, t1;
|
|
int status = 0;
|
|
int counter;
|
|
int pid;
|
|
int child_ready_pipe[2], go_pipe[2];
|
|
char buf;
|
|
|
|
if (!system_wide)
|
|
nr_cpus = 1;
|
|
|
|
if (pipe(child_ready_pipe) < 0 || pipe(go_pipe) < 0) {
|
|
perror("failed to create pipes");
|
|
exit(1);
|
|
}
|
|
|
|
if ((pid = fork()) < 0)
|
|
perror("failed to fork");
|
|
|
|
if (!pid) {
|
|
close(child_ready_pipe[0]);
|
|
close(go_pipe[1]);
|
|
fcntl(go_pipe[0], F_SETFD, FD_CLOEXEC);
|
|
|
|
/*
|
|
* Do a dummy execvp to get the PLT entry resolved,
|
|
* so we avoid the resolver overhead on the real
|
|
* execvp call.
|
|
*/
|
|
execvp("", (char **)argv);
|
|
|
|
/*
|
|
* Tell the parent we're ready to go
|
|
*/
|
|
close(child_ready_pipe[1]);
|
|
|
|
/*
|
|
* Wait until the parent tells us to go.
|
|
*/
|
|
read(go_pipe[0], &buf, 1);
|
|
|
|
execvp(argv[0], (char **)argv);
|
|
|
|
perror(argv[0]);
|
|
exit(-1);
|
|
}
|
|
|
|
/*
|
|
* Wait for the child to be ready to exec.
|
|
*/
|
|
close(child_ready_pipe[1]);
|
|
close(go_pipe[0]);
|
|
read(child_ready_pipe[0], &buf, 1);
|
|
close(child_ready_pipe[0]);
|
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
|
create_perf_stat_counter(counter, pid);
|
|
|
|
/*
|
|
* Enable counters and exec the command:
|
|
*/
|
|
t0 = rdclock();
|
|
prctl(PR_TASK_PERF_COUNTERS_ENABLE);
|
|
|
|
close(go_pipe[1]);
|
|
wait(&status);
|
|
|
|
t1 = rdclock();
|
|
|
|
walltime_nsecs[run_idx] = t1 - t0;
|
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
|
read_counter(counter);
|
|
|
|
return WEXITSTATUS(status);
|
|
}
|
|
|
|
static void print_noise(u64 *count, u64 *noise)
|
|
{
|
|
if (run_count > 1)
|
|
fprintf(stderr, " ( +- %7.3f%% )",
|
|
(double)noise[0]/(count[0]+1)*100.0);
|
|
}
|
|
|
|
static void nsec_printout(int counter, u64 *count, u64 *noise)
|
|
{
|
|
double msecs = (double)count[0] / 1000000;
|
|
|
|
fprintf(stderr, " %14.6f %-24s", msecs, event_name(counter));
|
|
|
|
if (attrs[counter].type == PERF_TYPE_SOFTWARE &&
|
|
attrs[counter].config == PERF_COUNT_SW_TASK_CLOCK) {
|
|
|
|
if (walltime_nsecs_avg)
|
|
fprintf(stderr, " # %10.3f CPUs ",
|
|
(double)count[0] / (double)walltime_nsecs_avg);
|
|
}
|
|
print_noise(count, noise);
|
|
}
|
|
|
|
static void abs_printout(int counter, u64 *count, u64 *noise)
|
|
{
|
|
fprintf(stderr, " %14Ld %-24s", count[0], event_name(counter));
|
|
|
|
if (runtime_cycles_avg &&
|
|
attrs[counter].type == PERF_TYPE_HARDWARE &&
|
|
attrs[counter].config == PERF_COUNT_HW_INSTRUCTIONS) {
|
|
|
|
fprintf(stderr, " # %10.3f IPC ",
|
|
(double)count[0] / (double)runtime_cycles_avg);
|
|
} else {
|
|
if (runtime_nsecs_avg) {
|
|
fprintf(stderr, " # %10.3f M/sec",
|
|
(double)count[0]/runtime_nsecs_avg*1000.0);
|
|
}
|
|
}
|
|
print_noise(count, noise);
|
|
}
|
|
|
|
/*
|
|
* Print out the results of a single counter:
|
|
*/
|
|
static void print_counter(int counter)
|
|
{
|
|
u64 *count, *noise;
|
|
int scaled;
|
|
|
|
count = event_res_avg[counter];
|
|
noise = event_res_noise[counter];
|
|
scaled = event_scaled_avg[counter];
|
|
|
|
if (scaled == -1) {
|
|
fprintf(stderr, " %14s %-24s\n",
|
|
"<not counted>", event_name(counter));
|
|
return;
|
|
}
|
|
|
|
if (nsec_counter(counter))
|
|
nsec_printout(counter, count, noise);
|
|
else
|
|
abs_printout(counter, count, noise);
|
|
|
|
if (scaled)
|
|
fprintf(stderr, " (scaled from %.2f%%)",
|
|
(double) count[2] / count[1] * 100);
|
|
|
|
fprintf(stderr, "\n");
|
|
}
|
|
|
|
/*
|
|
* normalize_noise noise values down to stddev:
|
|
*/
|
|
static void normalize_noise(u64 *val)
|
|
{
|
|
double res;
|
|
|
|
res = (double)*val / (run_count * sqrt((double)run_count));
|
|
|
|
*val = (u64)res;
|
|
}
|
|
|
|
static void update_avg(const char *name, int idx, u64 *avg, u64 *val)
|
|
{
|
|
*avg += *val;
|
|
|
|
if (verbose > 1)
|
|
fprintf(stderr, "debug: %20s[%d]: %Ld\n", name, idx, *val);
|
|
}
|
|
/*
|
|
* Calculate the averages and noises:
|
|
*/
|
|
static void calc_avg(void)
|
|
{
|
|
int i, j;
|
|
|
|
if (verbose > 1)
|
|
fprintf(stderr, "\n");
|
|
|
|
for (i = 0; i < run_count; i++) {
|
|
update_avg("runtime", 0, &runtime_nsecs_avg, runtime_nsecs + i);
|
|
update_avg("walltime", 0, &walltime_nsecs_avg, walltime_nsecs + i);
|
|
update_avg("runtime_cycles", 0, &runtime_cycles_avg, runtime_cycles + i);
|
|
|
|
for (j = 0; j < nr_counters; j++) {
|
|
update_avg("counter/0", j,
|
|
event_res_avg[j]+0, event_res[i][j]+0);
|
|
update_avg("counter/1", j,
|
|
event_res_avg[j]+1, event_res[i][j]+1);
|
|
update_avg("counter/2", j,
|
|
event_res_avg[j]+2, event_res[i][j]+2);
|
|
if (event_scaled[i][j] != -1)
|
|
update_avg("scaled", j,
|
|
event_scaled_avg + j, event_scaled[i]+j);
|
|
else
|
|
event_scaled_avg[j] = -1;
|
|
}
|
|
}
|
|
runtime_nsecs_avg /= run_count;
|
|
walltime_nsecs_avg /= run_count;
|
|
runtime_cycles_avg /= run_count;
|
|
|
|
for (j = 0; j < nr_counters; j++) {
|
|
event_res_avg[j][0] /= run_count;
|
|
event_res_avg[j][1] /= run_count;
|
|
event_res_avg[j][2] /= run_count;
|
|
}
|
|
|
|
for (i = 0; i < run_count; i++) {
|
|
runtime_nsecs_noise +=
|
|
abs((s64)(runtime_nsecs[i] - runtime_nsecs_avg));
|
|
walltime_nsecs_noise +=
|
|
abs((s64)(walltime_nsecs[i] - walltime_nsecs_avg));
|
|
runtime_cycles_noise +=
|
|
abs((s64)(runtime_cycles[i] - runtime_cycles_avg));
|
|
|
|
for (j = 0; j < nr_counters; j++) {
|
|
event_res_noise[j][0] +=
|
|
abs((s64)(event_res[i][j][0] - event_res_avg[j][0]));
|
|
event_res_noise[j][1] +=
|
|
abs((s64)(event_res[i][j][1] - event_res_avg[j][1]));
|
|
event_res_noise[j][2] +=
|
|
abs((s64)(event_res[i][j][2] - event_res_avg[j][2]));
|
|
}
|
|
}
|
|
|
|
normalize_noise(&runtime_nsecs_noise);
|
|
normalize_noise(&walltime_nsecs_noise);
|
|
normalize_noise(&runtime_cycles_noise);
|
|
|
|
for (j = 0; j < nr_counters; j++) {
|
|
normalize_noise(&event_res_noise[j][0]);
|
|
normalize_noise(&event_res_noise[j][1]);
|
|
normalize_noise(&event_res_noise[j][2]);
|
|
}
|
|
}
|
|
|
|
static void print_stat(int argc, const char **argv)
|
|
{
|
|
int i, counter;
|
|
|
|
calc_avg();
|
|
|
|
fflush(stdout);
|
|
|
|
fprintf(stderr, "\n");
|
|
fprintf(stderr, " Performance counter stats for \'%s", argv[0]);
|
|
|
|
for (i = 1; i < argc; i++)
|
|
fprintf(stderr, " %s", argv[i]);
|
|
|
|
fprintf(stderr, "\'");
|
|
if (run_count > 1)
|
|
fprintf(stderr, " (%d runs)", run_count);
|
|
fprintf(stderr, ":\n\n");
|
|
|
|
for (counter = 0; counter < nr_counters; counter++)
|
|
print_counter(counter);
|
|
|
|
fprintf(stderr, "\n");
|
|
fprintf(stderr, " %14.9f seconds time elapsed",
|
|
(double)walltime_nsecs_avg/1e9);
|
|
if (run_count > 1) {
|
|
fprintf(stderr, " ( +- %7.3f%% )",
|
|
100.0*(double)walltime_nsecs_noise/(double)walltime_nsecs_avg);
|
|
}
|
|
fprintf(stderr, "\n\n");
|
|
}
|
|
|
|
static volatile int signr = -1;
|
|
|
|
static void skip_signal(int signo)
|
|
{
|
|
signr = signo;
|
|
}
|
|
|
|
static void sig_atexit(void)
|
|
{
|
|
if (signr == -1)
|
|
return;
|
|
|
|
signal(signr, SIG_DFL);
|
|
kill(getpid(), signr);
|
|
}
|
|
|
|
static const char * const stat_usage[] = {
|
|
"perf stat [<options>] <command>",
|
|
NULL
|
|
};
|
|
|
|
static const struct option options[] = {
|
|
OPT_CALLBACK('e', "event", NULL, "event",
|
|
"event selector. use 'perf list' to list available events",
|
|
parse_events),
|
|
OPT_BOOLEAN('i', "inherit", &inherit,
|
|
"child tasks inherit counters"),
|
|
OPT_INTEGER('p', "pid", &target_pid,
|
|
"stat events on existing pid"),
|
|
OPT_BOOLEAN('a', "all-cpus", &system_wide,
|
|
"system-wide collection from all CPUs"),
|
|
OPT_BOOLEAN('S', "scale", &scale,
|
|
"scale/normalize counters"),
|
|
OPT_BOOLEAN('v', "verbose", &verbose,
|
|
"be more verbose (show counter open errors, etc)"),
|
|
OPT_INTEGER('r', "repeat", &run_count,
|
|
"repeat command and print average + stddev (max: 100)"),
|
|
OPT_BOOLEAN('n', "null", &null_run,
|
|
"null run - dont start any counters"),
|
|
OPT_END()
|
|
};
|
|
|
|
int cmd_stat(int argc, const char **argv, const char *prefix)
|
|
{
|
|
int status;
|
|
|
|
argc = parse_options(argc, argv, options, stat_usage, 0);
|
|
if (!argc)
|
|
usage_with_options(stat_usage, options);
|
|
if (run_count <= 0 || run_count > MAX_RUN)
|
|
usage_with_options(stat_usage, options);
|
|
|
|
/* Set attrs and nr_counters if no event is selected and !null_run */
|
|
if (!null_run && !nr_counters) {
|
|
memcpy(attrs, default_attrs, sizeof(default_attrs));
|
|
nr_counters = ARRAY_SIZE(default_attrs);
|
|
}
|
|
|
|
nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
|
|
assert(nr_cpus <= MAX_NR_CPUS);
|
|
assert(nr_cpus >= 0);
|
|
|
|
/*
|
|
* We dont want to block the signals - that would cause
|
|
* child tasks to inherit that and Ctrl-C would not work.
|
|
* What we want is for Ctrl-C to work in the exec()-ed
|
|
* task, but being ignored by perf stat itself:
|
|
*/
|
|
atexit(sig_atexit);
|
|
signal(SIGINT, skip_signal);
|
|
signal(SIGALRM, skip_signal);
|
|
signal(SIGABRT, skip_signal);
|
|
|
|
status = 0;
|
|
for (run_idx = 0; run_idx < run_count; run_idx++) {
|
|
if (run_count != 1 && verbose)
|
|
fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1);
|
|
status = run_perf_stat(argc, argv);
|
|
}
|
|
|
|
print_stat(argc, argv);
|
|
|
|
return status;
|
|
}
|