From 94403f8863d0d1d2005291b2ef0719c2534aa303 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 08:18:31 +0200 Subject: [PATCH 01/29] perf events: Add stalled cycles generic event - PERF_COUNT_HW_STALLED_CYCLES The new PERF_COUNT_HW_STALLED_CYCLES event tries to approximate cycles the CPU does nothing useful, because it is stalled on a cache-miss or some other condition. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-fue11vymwqsoo5to72jxxjyl@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 3 +++ include/linux/perf_event.h | 1 + tools/perf/util/parse-events.c | 1 + tools/perf/util/python.c | 1 + 4 files changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 9ae4a2aa7398..efa2704c9dfd 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,6 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; + /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + if (ebx & 0x40) { /* * Erratum AAJ80 detected, we work it around by using diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ee9f1e782800..ac636dd20a0c 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,6 +52,7 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, + PERF_COUNT_HW_STALLED_CYCLES = 7, PERF_COUNT_HW_MAX, /* non-ABI */ }; diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 952b4ae3d954..1869e4c646db 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -38,6 +38,7 @@ static struct event_symbol event_symbols[] = { { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, + { CHW(STALLED_CYCLES), "stalled-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index f5e38451fdc5..406f613ee619 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -798,6 +798,7 @@ static struct { { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, + { "COUNT_HW_STALLED_CYCLES", PERF_COUNT_HW_STALLED_CYCLES }, { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, From 5c543e3c442d6382db127152c7096ca6a55283de Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:02:04 +0200 Subject: [PATCH 02/29] perf events, x86: Mark constrant tables read mostly Various constraint tables were not marked read-mostly. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-wpqwwvmhxucy5e718wnamjiv@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index efa2704c9dfd..067a48b13a76 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -36,7 +36,7 @@ static u64 intel_perfmon_event_map[PERF_COUNT_HW_MAX] __read_mostly = [PERF_COUNT_HW_BUS_CYCLES] = 0x013c, }; -static struct event_constraint intel_core_event_constraints[] = +static struct event_constraint intel_core_event_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0x11, 0x2), /* FP_ASSIST */ INTEL_EVENT_CONSTRAINT(0x12, 0x2), /* MUL */ @@ -47,7 +47,7 @@ static struct event_constraint intel_core_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_core2_event_constraints[] = +static struct event_constraint intel_core2_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -70,7 +70,7 @@ static struct event_constraint intel_core2_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_nehalem_event_constraints[] = +static struct event_constraint intel_nehalem_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -86,19 +86,19 @@ static struct event_constraint intel_nehalem_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_nehalem_extra_regs[] = +static struct extra_reg intel_nehalem_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_nehalem_percore_constraints[] = +static struct event_constraint intel_nehalem_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_westmere_event_constraints[] = +static struct event_constraint intel_westmere_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -110,7 +110,7 @@ static struct event_constraint intel_westmere_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct event_constraint intel_snb_event_constraints[] = +static struct event_constraint intel_snb_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ @@ -123,21 +123,21 @@ static struct event_constraint intel_snb_event_constraints[] = EVENT_CONSTRAINT_END }; -static struct extra_reg intel_westmere_extra_regs[] = +static struct extra_reg intel_westmere_extra_regs[] __read_mostly = { INTEL_EVENT_EXTRA_REG(0xb7, MSR_OFFCORE_RSP_0, 0xffff), INTEL_EVENT_EXTRA_REG(0xbb, MSR_OFFCORE_RSP_1, 0xffff), EVENT_EXTRA_END }; -static struct event_constraint intel_westmere_percore_constraints[] = +static struct event_constraint intel_westmere_percore_constraints[] __read_mostly = { INTEL_EVENT_CONSTRAINT(0xb7, 0), INTEL_EVENT_CONSTRAINT(0xbb, 0), EVENT_CONSTRAINT_END }; -static struct event_constraint intel_gen_event_constraints[] = +static struct event_constraint intel_gen_event_constraints[] __read_mostly = { FIXED_EVENT_CONSTRAINT(0x00c0, 0), /* INST_RETIRED.ANY */ FIXED_EVENT_CONSTRAINT(0x003c, 1), /* CPU_CLK_UNHALTED.CORE */ From 11ba2b85f506306c8dfc9fe144aa4ddc43242855 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 24 Apr 2011 15:05:10 +0200 Subject: [PATCH 03/29] perf stat: Print stalled cycles percentage Print: 611,527 cycles 400,553 instructions # ( 0.71 instructions per cycle ) 77,809 stalled-cycles # ( 12.71% of all cycles ) 0.000610987 seconds time elapsed Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Signed-off-by: Ingo Molnar Link: http://lkml.kernel.org/n/tip-fd6x8r1cpyb6zhlrc4ix8m45@git.kernel.org --- tools/perf/builtin-stat.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03f0e45f1479..3a29041f8575 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -442,7 +442,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %10.3f IPC ", ratio); + fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -450,7 +450,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %% ", ratio); + fprintf(stderr, " # %10.3f %%", ratio); } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -459,6 +459,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %10.3f M/sec", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); } } From d58f4c82fed69fdd4a79fa54fe17fd14d98c27aa Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 03:42:18 +0200 Subject: [PATCH 04/29] perf stat: Print cache misses as percentage Before: 113,393,041 cache-references # 83.636 M/sec 7,052,454 cache-misses # 5.202 M/sec After: 112,589,441 cache-references # 87.925 M/sec 6,556,354 cache-misses # 5.823 % misses/hits percentages are more expressive than absolute numbers or rates. (Also prettify the CPUs printout line to not have a trailing whitespace.) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-axm28f43x439bl41zkvfzd63@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 3a29041f8575..0de3a2002f49 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -157,6 +157,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; +struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -219,10 +220,12 @@ static int read_counter_aggr(struct perf_evsel *counter) */ if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) update_stats(&runtime_nsecs_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); return 0; } @@ -404,7 +407,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs ", + fprintf(stderr, " # %10.3f CPUs", avg / avg_stats(&walltime_nsecs_stats)); } @@ -452,6 +455,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %10.3f %%", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && + runtime_cacherefs_stats[cpu].n != 0) { + total = avg_stats(&runtime_cacherefs_stats[cpu]); + + if (total) + ratio = avg * 100 / total; + + fprintf(stderr, " # %10.3f %%", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { total = avg_stats(&runtime_nsecs_stats[cpu]); From b908debd4eef91471016138569f7a9e292be682e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 03:55:40 +0200 Subject: [PATCH 05/29] perf tools: Accept case-insensitive symbolic event variants We currently fail on something like '-e CPU-migrations', with: invalid or unsupported event: 'CPU-migrations' While 'CPU-migrations' is how we actually print out the event in the default perf stat output: Performance counter stats for 'true': 0.202204 task-clock-msecs # 0.282 CPUs 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec So change the matching to be case-insensitive. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-omcm3edjjtx83a4kh2e244se@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 1869e4c646db..8e54bdb553c3 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -649,13 +649,15 @@ static int check_events(const char *str, unsigned int i) int n; n = strlen(event_symbols[i].symbol); - if (!strncmp(str, event_symbols[i].symbol, n)) + if (!strncasecmp(str, event_symbols[i].symbol, n)) return n; n = strlen(event_symbols[i].alias); - if (n) - if (!strncmp(str, event_symbols[i].alias, n)) + if (n) { + if (!strncasecmp(str, event_symbols[i].alias, n)) return n; + } + return 0; } From ceb53fbf6dbb1df26d38379a262c6981fe73dd36 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:06:33 +0200 Subject: [PATCH 06/29] perf stat: Fail more clearly when an invalid modifier is specified Currently we fail without printing any error message on "perf stat -e task-clock-msecs". The reason is that the task-clock event is matched and the "-msecs" postfix is assumed to be an event modifier - but is not recognized. This patch changes the code to be more informative: $ perf stat -e task-clock-msecs true invalid event modifier: '-msecs' Run 'perf list' for a list of valid events and modifiers And restructures the return value of parse_event_modifier() to allow the printing of all variants of invalid event modifiers. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-wlaw3dvz1ly6wple8l52cfca@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8e54bdb553c3..b686269427ea 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -721,15 +721,19 @@ parse_numeric_event(const char **strp, struct perf_event_attr *attr) return EVT_FAILED; } -static enum event_result +static int parse_event_modifier(const char **strp, struct perf_event_attr *attr) { const char *str = *strp; int exclude = 0; int eu = 0, ek = 0, eh = 0, precise = 0; - if (*str++ != ':') + if (!*str) return 0; + + if (*str++ != ':') + return -1; + while (*str) { if (*str == 'u') { if (!exclude) @@ -750,14 +754,16 @@ parse_event_modifier(const char **strp, struct perf_event_attr *attr) ++str; } - if (str >= *strp + 2) { - *strp = str; - attr->exclude_user = eu; - attr->exclude_kernel = ek; - attr->exclude_hv = eh; - attr->precise_ip = precise; - return 1; - } + if (str < *strp + 2) + return -1; + + *strp = str; + + attr->exclude_user = eu; + attr->exclude_kernel = ek; + attr->exclude_hv = eh; + attr->precise_ip = precise; + return 0; } @@ -800,7 +806,12 @@ parse_event_symbols(const struct option *opt, const char **str, return EVT_FAILED; modifier: - parse_event_modifier(str, attr); + if (parse_event_modifier(str, attr) < 0) { + fprintf(stderr, "invalid event modifier: '%s'\n", *str); + fprintf(stderr, "Run 'perf list' for a list of valid events and modifiers\n"); + + return EVT_FAILED; + } return ret; } From 749141d926faf23ef811686a8050e7cf13dc223f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:24:57 +0200 Subject: [PATCH 07/29] perf stat: Make all displayed event names parseable as well Right now we display this by default: 0.202204 task-clock-msecs # 0.282 CPUs 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 85 page-faults # 0.420 M/sec The task-clock-msecs event cannot actually be passed back as an event name, the event name we recognize is 'task-clock'. So change the output of the cpu-clock and task-clock events to be idempotent. ( Units should be printed out in the right-side column, if needed. ) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-lexrnbzy09asscgd4f7oac4i@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b686269427ea..b5bfef12f399 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -70,8 +70,8 @@ static const char *hw_event_names[] = { }; static const char *sw_event_names[] = { - "cpu-clock-msecs", - "task-clock-msecs", + "cpu-clock", + "task-clock", "page-faults", "context-switches", "CPU-migrations", From dcd9936a5a6d89512b5323c1145647f2dbe0236f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:36:37 +0200 Subject: [PATCH 08/29] perf stat: Factor our shadow stats Create update_shadow_stats() which is then used in both read_counter_aggr() and read_counter(). This not only simplifies the code but also fixes a bug: HW_CACHE_REFERENCES was not updated in read_counter(). Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-9uc55z3g88r47exde7zxjm6p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 0de3a2002f49..e5e82f62c784 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -193,6 +193,23 @@ static inline int nsec_counter(struct perf_evsel *evsel) return 0; } +/* + * Update various tracking values we maintain to print + * more semantic information such as miss/hit ratios, + * instruction rates, etc: + */ +static void update_shadow_stats(struct perf_evsel *counter, u64 *count) +{ + if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) + update_stats(&runtime_nsecs_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) + update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) + update_stats(&runtime_branches_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) + update_stats(&runtime_cacherefs_stats[0], count[0]); +} + /* * Read out the results of a single counter: * aggregate counts across CPUs in system-wide mode @@ -218,14 +235,7 @@ static int read_counter_aggr(struct perf_evsel *counter) /* * Save the full runtime - to allow normalization during printout: */ - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) - update_stats(&runtime_cacherefs_stats[0], count[0]); + update_shadow_stats(counter, count); return 0; } @@ -245,12 +255,7 @@ static int read_counter(struct perf_evsel *counter) count = counter->counts->cpu[cpu].values; - if (perf_evsel__match(counter, SOFTWARE, SW_TASK_CLOCK)) - update_stats(&runtime_nsecs_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) - update_stats(&runtime_cycles_stats[cpu], count[0]); - if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) - update_stats(&runtime_branches_stats[cpu], count[0]); + update_shadow_stats(counter, count); } return 0; From 481f988a016f7a0327a5537bde4794349fc4625c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 04:34:16 +0200 Subject: [PATCH 09/29] perf stat: Add stalled cycles accounting, prettify the resulting output Add stalled cycles accounting and use it to print the "cycles stalled per instruction" value. Also change the unit of the cycles output from M/sec to GHz - this is more intuitive. Prettify the output to: Performance counter stats for './loop_1b_instructions': 239.775036 task-clock # 0.997 CPUs utilized 761,903,912 cycles # 3.178 GHz 356,620,620 stalled-cycles # 46.81% of all cycles are idle 1,001,578,351 instructions # 1.31 insns per cycle # 0.36 stalled cycles per insn 14,782 cache-references # 0.062 M/sec 5,694 cache-misses # 38.520 % of all cache refs 0.240493656 seconds time elapsed Also adjust the --repeat output to make the percentages align vertically: Performance counter stats for './loop_1b_instructions' (10 runs): 236.096793 task-clock # 0.997 CPUs utilized ( +- 0.011% ) 756,553,086 cycles # 3.204 GHz ( +- 0.002% ) 354,942,692 stalled-cycles # 46.92% of all cycles are idle ( +- 0.008% ) 1,001,389,700 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn ( +- 0.000% ) 10,166 cache-references # 0.043 M/sec ( +- 0.742% ) 468 cache-misses # 4.608 % of all cache refs ( +- 13.385% ) 0.236874136 seconds time elapsed ( +- 0.01% ) Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-uapziqny39601apdmmhoz7hk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 45 +++++++++++++++++++++++++++------------ 1 file changed, 31 insertions(+), 14 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e5e82f62c784..e881c2061381 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -156,6 +156,7 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; @@ -204,6 +205,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -412,8 +415,7 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) return; if (perf_evsel__match(evsel, SOFTWARE, SW_TASK_CLOCK)) - fprintf(stderr, " # %10.3f CPUs", - avg / avg_stats(&walltime_nsecs_stats)); + fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -450,7 +452,15 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # ( %4.2f instructions per cycle )", ratio); + fprintf(stderr, " # %4.2f insns per cycle", ratio); + + total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + + if (total && avg) { + ratio = total / avg; + fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + } + } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { total = avg_stats(&runtime_branches_stats[cpu]); @@ -458,7 +468,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all branches", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -467,22 +477,29 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %10.3f %%", ratio); + fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (runtime_nsecs_stats[cpu].n != 0) { - total = avg_stats(&runtime_nsecs_stats[cpu]); - - if (total) - ratio = 1000.0 * avg / total; - - fprintf(stderr, " # %10.3f M/sec", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { total = avg_stats(&runtime_cycles_stats[cpu]); if (total) ratio = avg / total * 100.0; - fprintf(stderr, " # (%5.2f%% of all cycles )", ratio); + fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { + total = avg_stats(&runtime_nsecs_stats[cpu]); + + if (total) + ratio = 1.0 * avg / total; + + fprintf(stderr, " # %8.3f GHz ", ratio); + } else if (runtime_nsecs_stats[cpu].n != 0) { + total = avg_stats(&runtime_nsecs_stats[cpu]); + + if (total) + ratio = 1000.0 * avg / total; + + fprintf(stderr, " # %8.3f M/sec ", ratio); } } @@ -619,7 +636,7 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +- %7.3f%% )", + fprintf(stderr, " ( +-%5.2f%% )", 100*stddev_stats(&walltime_nsecs_stats) / avg_stats(&walltime_nsecs_stats)); } From 1fc570ad89e55dc32dfa4dda1311948b38f26524 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:20:22 +0200 Subject: [PATCH 10/29] perf stat: Add stalled cycles to the default output The new default output looks like this: Performance counter stats for './loop_1b_instructions': 236.010686 task-clock # 0.996 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 99 page-faults # 0.000 M/sec 756,487,646 cycles # 3.205 GHz 354,938,996 stalled-cycles # 46.92% of all cycles are idle 1,001,403,797 instructions # 1.32 insns per cycle # 0.35 stalled cycles per insn 100,279,773 branches # 424.895 M/sec 12,646 branch-misses # 0.013 % of all branches 0.236902540 seconds time elapsed We dropped cache-refs and cache-misses and added stalled-cycles - this is a more generic "how well utilized is the CPU" metric. If the stalled-cycles ratio is too high then more specific measurements can be taken to figure out the source of the inefficiency. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-pbpl2l4mn797s69bclfpwkwn@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++--- tools/perf/util/parse-events.c | 11 ++++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e881c2061381..924d18c407b8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -65,11 +65,10 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_REFERENCES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CACHE_MISSES }, }; @@ -468,7 +467,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg * 100 / total; - fprintf(stderr, " # %8.3f %% of all branches", ratio); + fprintf(stderr, " # %5.2f %% of all branches ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index b5bfef12f399..bbbb735268ef 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -32,13 +32,13 @@ char debugfs_path[MAXPATHLEN]; static struct event_symbol event_symbols[] = { { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, + { CHW(STALLED_CYCLES), "stalled-cycles", "idle-cycles" }, { CHW(INSTRUCTIONS), "instructions", "" }, { CHW(CACHE_REFERENCES), "cache-references", "" }, { CHW(CACHE_MISSES), "cache-misses", "" }, { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, { CHW(BRANCH_MISSES), "branch-misses", "" }, { CHW(BUS_CYCLES), "bus-cycles", "" }, - { CHW(STALLED_CYCLES), "stalled-cycles", "" }, { CSW(CPU_CLOCK), "cpu-clock", "" }, { CSW(TASK_CLOCK), "task-clock", "" }, @@ -54,9 +54,9 @@ static struct event_symbol event_symbols[] = { #define __PERF_EVENT_FIELD(config, name) \ ((config & PERF_EVENT_##name##_MASK) >> PERF_EVENT_##name##_SHIFT) -#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) +#define PERF_EVENT_RAW(config) __PERF_EVENT_FIELD(config, RAW) #define PERF_EVENT_CONFIG(config) __PERF_EVENT_FIELD(config, CONFIG) -#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) +#define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) static const char *hw_event_names[] = { @@ -67,6 +67,7 @@ static const char *hw_event_names[] = { "branches", "branch-misses", "bus-cycles", + "stalled-cycles", }; static const char *sw_event_names[] = { @@ -308,7 +309,7 @@ const char *__event_name(int type, u64 config) switch (type) { case PERF_TYPE_HARDWARE: - if (config < PERF_COUNT_HW_MAX) + if (config < PERF_COUNT_HW_MAX && hw_event_names[config]) return hw_event_names[config]; return "unknown-hardware"; @@ -334,7 +335,7 @@ const char *__event_name(int type, u64 config) } case PERF_TYPE_SOFTWARE: - if (config < PERF_COUNT_SW_MAX) + if (config < PERF_COUNT_SW_MAX && sw_event_names[config]) return sw_event_names[config]; return "unknown-software"; From f99844cb76b7d347711c22cdcb94266b7214141f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:35:39 +0200 Subject: [PATCH 11/29] perf stat: Fix -nan% output in perf stat noise printouts Before: 0 CPU-migrations # 0.000 M/sec ( +- -nan% ) After: 0 CPU-migrations # 0.000 M/sec ( +- 0.00% ) Also factor out the noise printing function. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-z89h2v1bk1mikcbsf7e6v34q@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 18 +++++++++++++----- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 924d18c407b8..845ded8f15a2 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -382,6 +382,16 @@ static int run_perf_stat(int argc __used, const char **argv) return WEXITSTATUS(status); } +static void print_noise_pct(double total, double avg) +{ + double pct = 0.0; + + if (avg) + pct = 100.0*total/avg; + + fprintf(stderr, " ( +-%6.2f%% )", pct); +} + static void print_noise(struct perf_evsel *evsel, double avg) { struct perf_stat *ps; @@ -390,8 +400,7 @@ static void print_noise(struct perf_evsel *evsel, double avg) return; ps = evsel->priv; - fprintf(stderr, " ( +- %7.3f%% )", - 100 * stddev_stats(&ps->res_stats[0]) / avg); + print_noise_pct(stddev_stats(&ps->res_stats[0]), avg); } static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) @@ -635,9 +644,8 @@ static void print_stat(int argc, const char **argv) fprintf(stderr, " %18.9f seconds time elapsed", avg_stats(&walltime_nsecs_stats)/1e9); if (run_count > 1) { - fprintf(stderr, " ( +-%5.2f%% )", - 100*stddev_stats(&walltime_nsecs_stats) / - avg_stats(&walltime_nsecs_stats)); + print_noise_pct(stddev_stats(&walltime_nsecs_stats), + avg_stats(&walltime_nsecs_stats)); } fprintf(stderr, "\n\n"); } From a5d243d04a150acbfa79d641154f49e5d920f64f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 05:39:24 +0200 Subject: [PATCH 12/29] perf stat: Print stalled cycles warning colors Print the stalled-cycles percentage with different warning level ASCII colors, as the percentage passes the 25%/50%/75% thresholds. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-e25zz44rcms7mu9az4fu5zp0@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 845ded8f15a2..e7d91f9cf789 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -46,6 +46,7 @@ #include "util/evlist.h" #include "util/evsel.h" #include "util/debug.h" +#include "util/color.h" #include "util/header.h" #include "util/cpumap.h" #include "util/thread.h" @@ -426,6 +427,29 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } +static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 25.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all cycles are idle "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -488,12 +512,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { - total = avg_stats(&runtime_cycles_stats[cpu]); - - if (total) - ratio = avg / total * 100.0; - - fprintf(stderr, " # %5.2f%% of all cycles are idle ", ratio); + print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); @@ -508,6 +527,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) ratio = 1000.0 * avg / total; fprintf(stderr, " # %8.3f M/sec ", ratio); + } else { + fprintf(stderr, " "); } } From c78df6c1d49b5d798f1579141e3a12be7c325d1e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 12:16:10 +0200 Subject: [PATCH 13/29] perf stat: Print branch misses warning colors Print the missed-branches percentage with different warning level ASCII colors, as the percentage passes the 5%/10%/20% thresholds. These thresholds are set to relatively low levels, because on most CPUs even a moderate percentage of branch-misses already shows up as a slowdown. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-ybqukg7p86leiup7gl03ecgk@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e7d91f9cf789..5d4e1b9b2d89 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -450,6 +450,29 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " of all cycles are idle "); } +static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_branches_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all branches "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -495,13 +518,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { - total = avg_stats(&runtime_branches_stats[cpu]); - - if (total) - ratio = avg * 100 / total; - - fprintf(stderr, " # %5.2f %% of all branches ", ratio); - + print_branch_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); From 8bb6c79f24e66538f606076915e918242c02ec7c Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:25:24 +0200 Subject: [PATCH 14/29] perf stat: Print out miss/hit ratio for L1 data-cache events Print out this kind of l1-dcache-misses percentage: Performance counter stats for './bw_tcp localhost': 29,956,262,201 cycles # 3.002 GHz (scaled from 85.14%) 8,255,209,558 stalled-cycles # 27.56% of all cycles are idle (scaled from 86.56%) 1,206,130,308 l1-dcache-misses # 40.49% of all L1-dcache hits (scaled from 86.30%) 2,978,756,779 l1-dcache-refs # 298.512 M/sec (scaled from 70.02%) 8,861,956,159 instructions # 0.30 insns per cycle # 0.93 stalled cycles per insn (scaled from 84.27%) 1,644,306,068 branches # 164.782 M/sec (scaled from 86.43%) 74,778,443 branch-misses # 4.55% of all branches (scaled from 70.69%) 9978.695711 task-clock # 0.693 CPUs utilized 14.404347983 seconds time elapsed And color the result depending on the severity of cache-trashing. Acked-by: Peter Zijlstra Acked-by Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-54gmz0zymaid84zcs7joq02p@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 33 +++++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5d4e1b9b2d89..03bac6aa014b 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -159,6 +159,7 @@ struct stats runtime_cycles_stats[MAX_NR_CPUS]; struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; +struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; struct stats walltime_nsecs_stats; static int create_perf_stat_counter(struct perf_evsel *evsel) @@ -211,6 +212,8 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) update_stats(&runtime_cacherefs_stats[0], count[0]); + else if (perf_evsel__match(counter, HW_CACHE, HW_CACHE_L1D)) + update_stats(&runtime_l1_dcache_stats[0], count[0]); } /* @@ -473,6 +476,29 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double fprintf(stderr, " of all branches "); } +static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_l1_dcache_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 20.0) + color = PERF_COLOR_RED; + else if (ratio > 10.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 5.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " of all L1-dcache hits "); +} + static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) { double total, ratio = 0.0; @@ -519,6 +545,13 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && runtime_branches_stats[cpu].n != 0) { print_branch_misses(cpu, evsel, avg); + } else if ( + evsel->attr.type == PERF_TYPE_HW_CACHE && + evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | + ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | + ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && + runtime_branches_stats[cpu].n != 0) { + print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { total = avg_stats(&runtime_cacherefs_stats[cpu]); From c6264deff7ea6125492b442edad885e5429679af Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 27 Apr 2011 13:50:47 +0200 Subject: [PATCH 15/29] perf stat: Add -d/--detailed flag to run with a lot of events Add the new -d/--detailed flag, which generates a pretty detailed event list: Performance counter stats for './hackbench 10' (10 runs): 1514.287888 task-clock # 10.897 CPUs utilized ( +- 3.05% ) 39,698 context-switches # 0.026 M/sec ( +- 12.19% ) 8,147 CPU-migrations # 0.005 M/sec ( +- 16.55% ) 17,918 page-faults # 0.012 M/sec ( +- 0.37% ) 2,944,504,050 cycles # 1.944 GHz ( +- 3.89% ) (32.60%) 1,043,971,283 stalled-cycles # 35.45% of all cycles are idle ( +- 5.22% ) (44.48%) 1,655,906,768 instructions # 0.56 insns per cycle # 0.63 stalled cycles per insn ( +- 1.95% ) (55.09%) 338,832,373 branches # 223.757 M/sec ( +- 1.96% ) (64.47%) 3,892,416 branch-misses # 1.15% of all branches ( +- 5.49% ) (73.12%) 606,410,482 L1-dcache-loads # 400.459 M/sec ( +- 1.29% ) (71.21%) 31,204,395 L1-dcache-load-misses # 5.15% of all L1-dcache hits ( +- 3.04% ) (60.43%) 3,922,751 LLC-loads # 2.590 M/sec ( +- 6.80% ) (46.87%) 5,037,288 LLC-load-misses # 3.327 M/sec ( +- 3.56% ) (13.00%) 0.138966828 seconds time elapsed ( +- 4.11% ) This can be used "at a glance" for narrower analysis. -d can also be used in addition to other -e events, to further expand an event list. Acked-by: Peter Zijlstra Acked-by: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-cxs98quixs3qyvdqx3goojc4@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 68 ++++++++++++++++++++++++++++++++++----- 1 file changed, 60 insertions(+), 8 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 03bac6aa014b..6959fdecb203 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -73,6 +73,47 @@ static struct perf_event_attr default_attrs[] = { }; +/* + * Detailed stats: + */ +static struct perf_event_attr detailed_attrs[] = { + + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_TASK_CLOCK }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CONTEXT_SWITCHES }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_CPU_MIGRATIONS }, + { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, + + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_L1D << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_ACCESS << 16) }, + + { .type = PERF_TYPE_HW_CACHE, + .config = + PERF_COUNT_HW_CACHE_LL << 0 | + (PERF_COUNT_HW_CACHE_OP_READ << 8) | + (PERF_COUNT_HW_CACHE_RESULT_MISS << 16) }, +}; + struct perf_evlist *evsel_list; static bool system_wide = false; @@ -86,6 +127,7 @@ static pid_t target_pid = -1; static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; +static bool detailed_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -550,7 +592,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) evsel->attr.config == ( PERF_COUNT_HW_CACHE_L1D | ((PERF_COUNT_HW_CACHE_OP_READ) << 8) | ((PERF_COUNT_HW_CACHE_RESULT_MISS) << 16)) && - runtime_branches_stats[cpu].n != 0) { + runtime_l1_dcache_stats[cpu].n != 0) { print_l1_dcache_misses(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CACHE_MISSES) && runtime_cacherefs_stats[cpu].n != 0) { @@ -625,8 +667,7 @@ static void print_counter_aggr(struct perf_evsel *counter) avg_enabled = avg_stats(&ps->res_stats[1]); avg_running = avg_stats(&ps->res_stats[2]); - fprintf(stderr, " (scaled from %.2f%%)", - 100 * avg_running / avg_enabled); + fprintf(stderr, " (%.2f%%)", 100 * avg_running / avg_enabled); } fprintf(stderr, "\n"); } @@ -668,10 +709,8 @@ static void print_counter(struct perf_evsel *counter) if (!csv_output) { print_noise(counter, 1.0); - if (run != ena) { - fprintf(stderr, " (scaled from %.2f%%)", - 100.0 * run / ena); - } + if (run != ena) + fprintf(stderr, " (%.2f%%)", 100.0 * run / ena); } fputc('\n', stderr); } @@ -778,6 +817,8 @@ static const struct option options[] = { "repeat command and print average + stddev (max: 100)"), OPT_BOOLEAN('n', "null", &null_run, "null run - dont start any counters"), + OPT_BOOLEAN('d', "detailed", &detailed_run, + "detailed run - start a lot of events"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -839,7 +880,18 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) } /* Set attrs and nr_counters if no event is selected and !null_run */ - if (!null_run && !evsel_list->nr_entries) { + if (detailed_run) { + size_t c; + + for (c = 0; c < ARRAY_SIZE(detailed_attrs); ++c) { + pos = perf_evsel__new(&detailed_attrs[c], c); + if (pos == NULL) + goto out; + perf_evlist__add(evsel_list, pos); + } + } + /* Set attrs and nr_counters if no event is selected and !null_run */ + if (!detailed_run && !null_run && !evsel_list->nr_entries) { size_t c; for (c = 0; c < ARRAY_SIZE(default_attrs); ++c) { From 9ceb1c3d1fe15c2f9b55eaa8978019ef0e0a06ac Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 02:57:53 +0200 Subject: [PATCH 16/29] perf stat: Fix printout vertical alignment Before: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | After: | | Performance counter stats for '/home/mingo/hackbench 20' (5 runs): | | 71,321,607 instructions:u # 0.42 insns per cycle ( +- 0.00% ) | 168,040,009 cycles:u # 0.000 GHz ( +- 0.81% ) | | 1.468002368 seconds time elapsed ( +- 1.33% ) | The last column (stddev noise) is properly aligned, vertically. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn0dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6959fdecb203..003caa857a44 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -575,7 +575,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle", ratio); + fprintf(stderr, " # %4.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_stats[cpu]); From 8a850cadca0e387c87a0911a61e99fd66aeb57ec Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 11:16:44 +0200 Subject: [PATCH 17/29] perf event, x86: Use better stalled cycles metric Use the UOPS_EXECUTED.*,c=1,i=1 event on Intel CPUs - it is a rather good indicator of CPU execution stalls, more sensitive and more inclusive than the 0xa2 resource stalls event (which does not count nearly as many stall types). Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn2dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 067a48b13a76..1ea94224f62e 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,8 +1413,8 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: 0xff: All reasons, 0xa2: Resource stalls */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0xffa2; + /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; if (ebx & 0x40) { /* From f9cef0a90c4e7637f1ec98474a1a099aec45eb65 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 18:17:11 +0200 Subject: [PATCH 18/29] perf stat: Add --sync/-S option --sync will tell perf stat to run sync() before starting a command. This allows IO-heavy tests to be used with --repeat, without one iteration impacting the other. Elapsed time will stabilize for example: before: 3.971525714 seconds time elapsed ( +- 8.56% ) after: 3.211098537 seconds time elapsed ( +- 1.52% ) So measurements will be more accurate. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 003caa857a44..5658a770dbd7 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -128,6 +128,7 @@ static pid_t target_tid = -1; static pid_t child_pid = -1; static bool null_run = false; static bool detailed_run = false; +static bool sync_run = false; static bool big_num = true; static int big_num_opt = -1; static const char *cpu_list; @@ -819,6 +820,8 @@ static const struct option options[] = { "null run - dont start any counters"), OPT_BOOLEAN('d', "detailed", &detailed_run, "detailed run - start a lot of events"), + OPT_BOOLEAN('S', "sync", &sync_run, + "call sync() before starting a run"), OPT_CALLBACK_NOOPT('B', "big-num", NULL, NULL, "print large numbers with thousands\' separators", stat__set_big_num), @@ -944,6 +947,10 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used) for (run_idx = 0; run_idx < run_count; run_idx++) { if (run_count != 1 && verbose) fprintf(stderr, "[ perf stat: executing run #%d ... ]\n", run_idx + 1); + + if (sync_run) + sync(); + status = run_perf_stat(argc, argv); } From ede70290046043b2638204cab55e26ea1d0c6cd9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 28 Apr 2011 08:48:42 +0200 Subject: [PATCH 19/29] perf stat: Fix compatibility behavior Instead of failing on an unknown event, when new perf stat is run on older kernels: $ ./perf stat true Error: open_counter returned with 22 (Invalid argument). /bin/dmesg may provide additional information. Fatal: Not all events could be opened. Just ignore EINVAL and ENOSYS, we'll print the results as not counted: Performance counter stats for 'true': 0.239483 task-clock # 0.493 CPUs utilized 0 context-switches # 0.000 M/sec 0 CPU-migrations # 0.000 M/sec 86 page-faults # 0.359 M/sec 704,766 cycles # 2.943 GHz stalled-cycles 381,961 instructions # 0.54 insns per cycle 69,626 branches # 290.735 M/sec 4,594 branch-misses # 6.60% of all branches 0.000485883 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n1eqio5hjpn3dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 5658a770dbd7..da77077450cf 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -372,7 +372,10 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == -EPERM || errno == -EACCES) { + if (errno == EINVAL || errno == ENOSYS) + continue; + + if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n" "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", From 8f62242246351b5a4bc0c1f00c0c7003edea128a Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:19:47 +0200 Subject: [PATCH 20/29] perf events: Add generic front-end and back-end stalled cycle event definitions Add two generic hardware events: front-end and back-end stalled cycles. These events measure conditions when the CPU is executing code but its capabilities are not fully utilized. Understanding such situations and analyzing them is an important sub-task of code optimization workflows. Both events limit performance: most front end stalls tend to be caused by branch misprediction or instruction fetch cachemisses, backend stalls can be caused by various resource shortages or inefficient instruction scheduling. Front-end stalls are the more important ones: code cannot run fast if the instruction stream is not being kept up. An over-utilized back-end can cause front-end stalls and thus has to be kept an eye on as well. The exact composition is very program logic and instruction mix dependent. We use the terms 'stall', 'front-end' and 'back-end' loosely and try to use the best available events from specific CPUs that approximate these concepts. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n000io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 2 +- include/linux/perf_event.h | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 1ea94224f62e..393085b87a2c 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1414,7 +1414,7 @@ static __init int intel_pmu_init(void) x86_pmu.extra_regs = intel_nehalem_extra_regs; /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ - intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES] = 0x1803fb1; + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { /* diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h index ac636dd20a0c..4e2d7ae71499 100644 --- a/include/linux/perf_event.h +++ b/include/linux/perf_event.h @@ -52,7 +52,8 @@ enum perf_hw_id { PERF_COUNT_HW_BRANCH_INSTRUCTIONS = 4, PERF_COUNT_HW_BRANCH_MISSES = 5, PERF_COUNT_HW_BUS_CYCLES = 6, - PERF_COUNT_HW_STALLED_CYCLES = 7, + PERF_COUNT_HW_STALLED_CYCLES_FRONTEND = 7, + PERF_COUNT_HW_STALLED_CYCLES_BACKEND = 8, PERF_COUNT_HW_MAX, /* non-ABI */ }; From 91fc4cc00099986bc1ba50e1f421c3548cffae42 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:17:19 +0200 Subject: [PATCH 21/29] perf, x86: Add new stalled cycles events for Intel and AMD CPUs Extend the Intel and AMD event definitions with generic front-end and back-end stall events. ( These are only approximations - suggestions are welcome for better events. ) Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n001io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_amd.c | 14 ++++++++------ arch/x86/kernel/cpu/perf_event_intel.c | 4 +++- 2 files changed, 11 insertions(+), 7 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_amd.c b/arch/x86/kernel/cpu/perf_event_amd.c index cf4e369cea67..fe29c1d2219e 100644 --- a/arch/x86/kernel/cpu/perf_event_amd.c +++ b/arch/x86/kernel/cpu/perf_event_amd.c @@ -96,12 +96,14 @@ static __initconst const u64 amd_hw_cache_event_ids */ static const u64 amd_perfmon_event_map[] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, - [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x0080, + [PERF_COUNT_HW_CACHE_MISSES] = 0x0081, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ }; static u64 amd_pmu_event_map(int hw_event) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 393085b87a2c..7983b9a9533b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1413,7 +1413,9 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.extra_regs = intel_nehalem_extra_regs; - /* Install the stalled-cycles event: UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; if (ebx & 0x40) { From 129c04cb8ce2e4bf3f17223f58ef16aa8a2cb3b8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:41:28 +0200 Subject: [PATCH 22/29] perf tools: Add front-end and back-end stalled cycles support Update perf tooling to deal with front-end and back-end stalled cycles events. Add both the default 'perf stat' output. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n002io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 10 ++++++---- tools/perf/util/parse-events.c | 35 +++++++++++++++++----------------- tools/perf/util/python.c | 4 +++- 3 files changed, 27 insertions(+), 22 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index da77077450cf..6a4a8a399d95 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -66,7 +66,8 @@ static struct perf_event_attr default_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -84,7 +85,8 @@ static struct perf_event_attr detailed_attrs[] = { { .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_PAGE_FAULTS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_CPU_CYCLES }, - { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_BRANCH_MISSES }, @@ -249,7 +251,7 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); - else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES)) + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) update_stats(&runtime_stalled_cycles_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); @@ -607,7 +609,7 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); - } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES)) { + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { print_stalled_cycles(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index bbbb735268ef..04d2f0a96674 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -31,24 +31,25 @@ char debugfs_path[MAXPATHLEN]; #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x static struct event_symbol event_symbols[] = { - { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, - { CHW(STALLED_CYCLES), "stalled-cycles", "idle-cycles" }, - { CHW(INSTRUCTIONS), "instructions", "" }, - { CHW(CACHE_REFERENCES), "cache-references", "" }, - { CHW(CACHE_MISSES), "cache-misses", "" }, - { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, - { CHW(BRANCH_MISSES), "branch-misses", "" }, - { CHW(BUS_CYCLES), "bus-cycles", "" }, + { CHW(CPU_CYCLES), "cpu-cycles", "cycles" }, + { CHW(STALLED_CYCLES_FRONTEND), "stalled-cycles-frontend", "idle-cycles-frontend" }, + { CHW(STALLED_CYCLES_BACKEND), "stalled-cycles-backend", "idle-cycles-backend" }, + { CHW(INSTRUCTIONS), "instructions", "" }, + { CHW(CACHE_REFERENCES), "cache-references", "" }, + { CHW(CACHE_MISSES), "cache-misses", "" }, + { CHW(BRANCH_INSTRUCTIONS), "branch-instructions", "branches" }, + { CHW(BRANCH_MISSES), "branch-misses", "" }, + { CHW(BUS_CYCLES), "bus-cycles", "" }, - { CSW(CPU_CLOCK), "cpu-clock", "" }, - { CSW(TASK_CLOCK), "task-clock", "" }, - { CSW(PAGE_FAULTS), "page-faults", "faults" }, - { CSW(PAGE_FAULTS_MIN), "minor-faults", "" }, - { CSW(PAGE_FAULTS_MAJ), "major-faults", "" }, - { CSW(CONTEXT_SWITCHES), "context-switches", "cs" }, - { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, - { CSW(ALIGNMENT_FAULTS), "alignment-faults", "" }, - { CSW(EMULATION_FAULTS), "emulation-faults", "" }, + { CSW(CPU_CLOCK), "cpu-clock", "" }, + { CSW(TASK_CLOCK), "task-clock", "" }, + { CSW(PAGE_FAULTS), "page-faults", "faults" }, + { CSW(PAGE_FAULTS_MIN), "minor-faults", "" }, + { CSW(PAGE_FAULTS_MAJ), "major-faults", "" }, + { CSW(CONTEXT_SWITCHES), "context-switches", "cs" }, + { CSW(CPU_MIGRATIONS), "cpu-migrations", "migrations" }, + { CSW(ALIGNMENT_FAULTS), "alignment-faults", "" }, + { CSW(EMULATION_FAULTS), "emulation-faults", "" }, }; #define __PERF_EVENT_FIELD(config, name) \ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 406f613ee619..8b0eff8b8283 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -798,7 +798,6 @@ static struct { { "COUNT_HW_BRANCH_INSTRUCTIONS", PERF_COUNT_HW_BRANCH_INSTRUCTIONS }, { "COUNT_HW_BRANCH_MISSES", PERF_COUNT_HW_BRANCH_MISSES }, { "COUNT_HW_BUS_CYCLES", PERF_COUNT_HW_BUS_CYCLES }, - { "COUNT_HW_STALLED_CYCLES", PERF_COUNT_HW_STALLED_CYCLES }, { "COUNT_HW_CACHE_L1D", PERF_COUNT_HW_CACHE_L1D }, { "COUNT_HW_CACHE_L1I", PERF_COUNT_HW_CACHE_L1I }, { "COUNT_HW_CACHE_LL", PERF_COUNT_HW_CACHE_LL }, @@ -811,6 +810,9 @@ static struct { { "COUNT_HW_CACHE_RESULT_ACCESS", PERF_COUNT_HW_CACHE_RESULT_ACCESS }, { "COUNT_HW_CACHE_RESULT_MISS", PERF_COUNT_HW_CACHE_RESULT_MISS }, + { "COUNT_HW_STALLED_CYCLES_FRONTEND", PERF_COUNT_HW_STALLED_CYCLES_FRONTEND }, + { "COUNT_HW_STALLED_CYCLES_BACKEND", PERF_COUNT_HW_STALLED_CYCLES_BACKEND }, + { "COUNT_SW_CPU_CLOCK", PERF_COUNT_SW_CPU_CLOCK }, { "COUNT_SW_TASK_CLOCK", PERF_COUNT_SW_TASK_CLOCK }, { "COUNT_SW_PAGE_FAULTS", PERF_COUNT_SW_PAGE_FAULTS }, From d3d1e86da07b4565815e3dbcd082f53017d215f8 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 13:49:08 +0200 Subject: [PATCH 23/29] perf stat: Analyze front-end and back-end stall counts Sample output: Performance counter stats for './loop_1b': 873.691065 task-clock # 1.000 CPUs utilized 1 context-switches # 0.000 M/sec 1 CPU-migrations # 0.000 M/sec 96 page-faults # 0.000 M/sec 2,012,637,222 cycles # 2.304 GHz (66.58%) 1,001,397,911 stalled-cycles-frontend # 49.76% frontend cycles idle (66.58%) 7,523,398 stalled-cycles-backend # 0.37% backend cycles idle (66.76%) 2,004,551,046 instructions # 1.00 insns per cycle # 0.50 stalled cycles per insn (66.80%) 1,001,304,992 branches # 1146.063 M/sec (66.76%) 39,453 branch-misses # 0.00% of all branches (66.64%) 0.874046121 seconds time elapsed Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n003io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 41 +++++++++++++++++++++++++++++----- tools/perf/util/parse-events.c | 7 +++--- 2 files changed, 39 insertions(+), 9 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 6a4a8a399d95..e45449938b80 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -201,7 +201,8 @@ static double stddev_stats(struct stats *stats) struct stats runtime_nsecs_stats[MAX_NR_CPUS]; struct stats runtime_cycles_stats[MAX_NR_CPUS]; -struct stats runtime_stalled_cycles_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_front_stats[MAX_NR_CPUS]; +struct stats runtime_stalled_cycles_back_stats[MAX_NR_CPUS]; struct stats runtime_branches_stats[MAX_NR_CPUS]; struct stats runtime_cacherefs_stats[MAX_NR_CPUS]; struct stats runtime_l1_dcache_stats[MAX_NR_CPUS]; @@ -251,8 +252,10 @@ static void update_shadow_stats(struct perf_evsel *counter, u64 *count) update_stats(&runtime_nsecs_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CPU_CYCLES)) update_stats(&runtime_cycles_stats[0], count[0]); + else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) + update_stats(&runtime_stalled_cycles_front_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_STALLED_CYCLES_BACKEND)) - update_stats(&runtime_stalled_cycles_stats[0], count[0]); + update_stats(&runtime_stalled_cycles_back_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_BRANCH_INSTRUCTIONS)) update_stats(&runtime_branches_stats[0], count[0]); else if (perf_evsel__match(counter, HARDWARE, HW_CACHE_REFERENCES)) @@ -478,7 +481,30 @@ static void nsec_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f CPUs utilized ", avg / avg_stats(&walltime_nsecs_stats)); } -static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, double avg) +static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __used, double avg) +{ + double total, ratio = 0.0; + const char *color; + + total = avg_stats(&runtime_cycles_stats[cpu]); + + if (total) + ratio = avg / total * 100.0; + + color = PERF_COLOR_NORMAL; + if (ratio > 75.0) + color = PERF_COLOR_RED; + else if (ratio > 50.0) + color = PERF_COLOR_MAGENTA; + else if (ratio > 20.0) + color = PERF_COLOR_YELLOW; + + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " frontend cycles idle "); +} + +static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __used, double avg) { double total, ratio = 0.0; const char *color; @@ -498,7 +524,7 @@ static void print_stalled_cycles(int cpu, struct perf_evsel *evsel __used, doubl fprintf(stderr, " # "); color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " of all cycles are idle "); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -583,7 +609,8 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %4.2f insns per cycle ", ratio); - total = avg_stats(&runtime_stalled_cycles_stats[cpu]); + total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); + total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; @@ -609,8 +636,10 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) fprintf(stderr, " # %8.3f %% of all cache refs ", ratio); + } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_FRONTEND)) { + print_stalled_cycles_frontend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_STALLED_CYCLES_BACKEND)) { - print_stalled_cycles(cpu, evsel, avg); + print_stalled_cycles_backend(cpu, evsel, avg); } else if (perf_evsel__match(evsel, HARDWARE, HW_CPU_CYCLES)) { total = avg_stats(&runtime_nsecs_stats[cpu]); diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 04d2f0a96674..8a407f3e286f 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -60,7 +60,7 @@ static struct event_symbol event_symbols[] = { #define PERF_EVENT_TYPE(config) __PERF_EVENT_FIELD(config, TYPE) #define PERF_EVENT_ID(config) __PERF_EVENT_FIELD(config, EVENT) -static const char *hw_event_names[] = { +static const char *hw_event_names[PERF_COUNT_HW_MAX] = { "cycles", "instructions", "cache-references", @@ -68,10 +68,11 @@ static const char *hw_event_names[] = { "branches", "branch-misses", "bus-cycles", - "stalled-cycles", + "stalled-cycles-frontend", + "stalled-cycles-backend", }; -static const char *sw_event_names[] = { +static const char *sw_event_names[PERF_COUNT_SW_MAX] = { "cpu-clock", "task-clock", "page-faults", From 2b427e14b77dbf3e05f1bd0785f1d07ea5fe924e Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 14:16:18 +0200 Subject: [PATCH 24/29] perf stat: Adjust stall cycles warning percentages Adjust to color thresholds to better match the percentages seen in real workloads. Both are now a bit more sensitive. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n004io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index e45449938b80..2492a0efa4d8 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -492,11 +492,11 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us ratio = avg / total * 100.0; color = PERF_COLOR_NORMAL; - if (ratio > 75.0) + if (ratio > 50.0) color = PERF_COLOR_RED; - else if (ratio > 50.0) + else if (ratio > 30.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 20.0) + else if (ratio > 10.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); @@ -519,7 +519,7 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use color = PERF_COLOR_RED; else if (ratio > 50.0) color = PERF_COLOR_MAGENTA; - else if (ratio > 25.0) + else if (ratio > 20.0) color = PERF_COLOR_YELLOW; fprintf(stderr, " # "); From fce3c786d3a49eff397583b4b62fa38df90db937 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:03:15 +0200 Subject: [PATCH 25/29] perf stat: Leave more room for percentages Triple digit percentages do not fit otherwise. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n005io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 2492a0efa4d8..9e596ab98d05 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -499,8 +499,8 @@ static void print_stalled_cycles_frontend(int cpu, struct perf_evsel *evsel __us else if (ratio > 10.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " frontend cycles idle "); } @@ -522,9 +522,9 @@ static void print_stalled_cycles_backend(int cpu, struct perf_evsel *evsel __use else if (ratio > 20.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); - fprintf(stderr, " backend cycles idle "); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); + fprintf(stderr, " backend cycles idle "); } static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double avg) @@ -545,8 +545,8 @@ static void print_branch_misses(int cpu, struct perf_evsel *evsel __used, double else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all branches "); } @@ -568,8 +568,8 @@ static void print_l1_dcache_misses(int cpu, struct perf_evsel *evsel __used, dou else if (ratio > 5.0) color = PERF_COLOR_YELLOW; - fprintf(stderr, " # "); - color_fprintf(stderr, color, "%5.2f%%", ratio); + fprintf(stderr, " # "); + color_fprintf(stderr, color, "%6.2f%%", ratio); fprintf(stderr, " of all L1-dcache hits "); } @@ -607,14 +607,14 @@ static void abs_printout(int cpu, struct perf_evsel *evsel, double avg) if (total) ratio = avg / total; - fprintf(stderr, " # %4.2f insns per cycle ", ratio); + fprintf(stderr, " # %5.2f insns per cycle ", ratio); total = avg_stats(&runtime_stalled_cycles_front_stats[cpu]); total = max(total, avg_stats(&runtime_stalled_cycles_back_stats[cpu])); if (total && avg) { ratio = total / avg; - fprintf(stderr, "\n # %4.2f stalled cycles per insn", ratio); + fprintf(stderr, "\n # %5.2f stalled cycles per insn", ratio); } } else if (perf_evsel__match(evsel, HARDWARE, HW_BRANCH_MISSES) && From 370faf1dd0461ad811852c8abbbcd3d73b1e4fc4 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 16:11:03 +0200 Subject: [PATCH 26/29] perf stat: Fail softly on unsupported events David Ahern reported this perf stat failure: > # /tmp/build-perf/perf stat -- sleep 1 > Error: stalled-cycles-frontend event is not supported. > Fatal: Not all events could be opened. > > This is a Dell R410 with an E5620 processor. Fail in a softer fashion on unknown/unsupported events. Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n006io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index 9e596ab98d05..c8b535bc27bd 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,7 +377,7 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) continue; if (errno == EPERM || errno == EACCES) { @@ -385,8 +385,6 @@ static int run_perf_stat(int argc __used, const char **argv) "\t Consider tweaking" " /proc/sys/kernel/perf_event_paranoid or running as root.", system_wide ? "system-wide " : ""); - } else if (errno == ENOENT) { - error("%s event is not supported. ", event_name(counter)); } else { error("open_counter returned with %d (%s). " "/bin/dmesg may provide additional information.\n", From 301120396b766ae4480e52ece220516a1707822b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sat, 30 Apr 2011 09:14:54 +0200 Subject: [PATCH 27/29] perf events, x86: Add Westmere stalled-cycles-frontend/backend events Extend the Intel Westmere PMU driver with definitions for generic front-end and back-end stall events. ( These are only approximations. ) Reported-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n008io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/perf_event_intel.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index 7983b9a9533b..be8363adf4e2 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -1458,6 +1458,12 @@ static __init int intel_pmu_init(void) x86_pmu.enable_all = intel_pmu_nhm_enable_all; x86_pmu.pebs_constraints = intel_westmere_pebs_event_constraints; x86_pmu.extra_regs = intel_westmere_extra_regs; + + /* UOPS_ISSUED.STALLED_CYCLES */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x180010e; + /* UOPS_EXECUTED.CORE_ACTIVE_CYCLES,c=1,i=1 */ + intel_perfmon_event_map[PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x1803fb1; + pr_cont("Westmere events, "); break; From 947b4ad1d198b7303ecc961f4939a331be0c48f0 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 29 Apr 2011 22:52:42 +0200 Subject: [PATCH 28/29] perf list: Fix max event string size Recent stalled-cycles event names were larger than the 40 chars printout used by perf list. Extend that, make it robust for future extensions and also adjust alignments in face of wider event names. Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/n/tip-7y40wib8n009io7hjpn1dsrm@git.kernel.org Signed-off-by: Ingo Molnar --- tools/perf/util/parse-events.c | 25 +++++++++++++------------ 1 file changed, 13 insertions(+), 12 deletions(-) diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index 8a407f3e286f..ffa493a24333 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -929,7 +929,7 @@ void print_tracepoint_events(const char *subsys_glob, const char *event_glob) snprintf(evt_path, MAXPATHLEN, "%s:%s", sys_dirent.d_name, evt_dirent.d_name); - printf(" %-42s [%s]\n", evt_path, + printf(" %-50s [%s]\n", evt_path, event_type_descriptors[PERF_TYPE_TRACEPOINT]); } closedir(evt_dir); @@ -994,7 +994,7 @@ void print_events_type(u8 type) else snprintf(name, sizeof(name), "%s", syms->symbol); - printf(" %-42s [%s]\n", name, + printf(" %-50s [%s]\n", name, event_type_descriptors[type]); } } @@ -1012,11 +1012,10 @@ int print_hwcache_events(const char *event_glob) for (i = 0; i < PERF_COUNT_HW_CACHE_RESULT_MAX; i++) { char *name = event_cache_name(type, op, i); - if (event_glob != NULL && - !strglobmatch(name, event_glob)) + if (event_glob != NULL && !strglobmatch(name, event_glob)) continue; - printf(" %-42s [%s]\n", name, + printf(" %-50s [%s]\n", name, event_type_descriptors[PERF_TYPE_HW_CACHE]); ++printed; } @@ -1026,14 +1025,16 @@ int print_hwcache_events(const char *event_glob) return printed; } +#define MAX_NAME_LEN 100 + /* * Print the help text for the event symbols: */ void print_events(const char *event_glob) { - struct event_symbol *syms = event_symbols; unsigned int i, type, prev_type = -1, printed = 0, ntypes_printed = 0; - char name[40]; + struct event_symbol *syms = event_symbols; + char name[MAX_NAME_LEN]; printf("\n"); printf("List of pre-defined events (to be used in -e):\n"); @@ -1053,10 +1054,10 @@ void print_events(const char *event_glob) continue; if (strlen(syms->alias)) - sprintf(name, "%s OR %s", syms->symbol, syms->alias); + snprintf(name, MAX_NAME_LEN, "%s OR %s", syms->symbol, syms->alias); else - strcpy(name, syms->symbol); - printf(" %-42s [%s]\n", name, + strncpy(name, syms->symbol, MAX_NAME_LEN); + printf(" %-50s [%s]\n", name, event_type_descriptors[type]); prev_type = type; @@ -1073,12 +1074,12 @@ void print_events(const char *event_glob) return; printf("\n"); - printf(" %-42s [%s]\n", + printf(" %-50s [%s]\n", "rNNN (see 'perf list --help' on how to encode it)", event_type_descriptors[PERF_TYPE_RAW]); printf("\n"); - printf(" %-42s [%s]\n", + printf(" %-50s [%s]\n", "mem:[:access]", event_type_descriptors[PERF_TYPE_BREAKPOINT]); printf("\n"); From c63ca0c01d73563d4e2ab174bb3dd1e5efb907e6 Mon Sep 17 00:00:00 2001 From: David Ahern Date: Fri, 29 Apr 2011 16:04:15 -0600 Subject: [PATCH 29/29] perf stat: Tell user about unsupported events in the list Similar to perf-record, tell user about unsupported events that will not be counted if invoked in verbose mode. e.g., $ perf stat -e dTLB-prefetch-misses -v -- sleep 1 dTLB-prefetch-misses event is not supported by the kernel. dTLB-prefetch-misses: 0 0 0 Performance counter stats for 'sleep 1': dTLB-prefetch-misses 1.001884783 seconds time elapsed Signed-off-by: David Ahern Cc: Peter Zijlstra Cc: Arnaldo Carvalho de Melo Cc: Frederic Weisbecker Link: http://lkml.kernel.org/r/1304114655-10600-1-git-send-email-dsahern@gmail.com Signed-off-by: Ingo Molnar --- tools/perf/builtin-stat.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c index c8b535bc27bd..602c3c96fa1e 100644 --- a/tools/perf/builtin-stat.c +++ b/tools/perf/builtin-stat.c @@ -377,8 +377,12 @@ static int run_perf_stat(int argc __used, const char **argv) list_for_each_entry(counter, &evsel_list->entries, node) { if (create_perf_stat_counter(counter) < 0) { - if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) + if (errno == EINVAL || errno == ENOSYS || errno == ENOENT) { + if (verbose) + ui__warning("%s event is not supported by the kernel.\n", + event_name(counter)); continue; + } if (errno == EPERM || errno == EACCES) { error("You may not have permission to collect %sstats.\n"