From 9590b7ba3fefdfe0c7741f5e2f61faf2ffcea19c Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Mon, 6 Jul 2009 22:01:31 +1000
Subject: [PATCH 01/31] perf_counter tools: Rename cache events to remove $

The cache events contain '$' which will hit shell variable
expansion. To avoid confusion change this to 'cache', ie
L1-d$-loads becomes L1-dcache-loads.

Signed-off-by: Anton Blanchard <anton@samba.org>
Cc: Roland Dreier <rdreier@cisco.com>
Cc: Jaswinder Singh Rajput <jaswinder@kernel.org>
Cc: Peter Zijlstra <peterz@infradead.org>
LKML-Reference: <20090706120131.GB4391@kryten>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5184959e061..518a33affe1 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -71,8 +71,8 @@ static char *sw_event_names[] = {
 #define MAX_ALIASES 8
 
 static char *hw_cache[][MAX_ALIASES] = {
- { "L1-d$",	"l1-d",		"l1d",		"L1-data",		},
- { "L1-i$",	"l1-i",		"l1i",		"L1-instruction",	},
+ { "L1-dcache",	"l1-d",		"l1d",		"L1-data",		},
+ { "L1-icache",	"l1-i",		"l1i",		"L1-instruction",	},
  { "LLC",	"L2"							},
  { "dTLB",	"d-tlb",	"Data-TLB",				},
  { "iTLB",	"i-tlb",	"Instruction-TLB",			},

From 11d1578f9454159c43499d1d8fe8a7d728c176a3 Mon Sep 17 00:00:00 2001
From: Vince Weaver <vince@deater.net>
Date: Wed, 8 Jul 2009 17:46:14 -0400
Subject: [PATCH 02/31] perf_counter: Add P6 PMU support

Add basic P6 PMU support. The P6 uses the EVNTSEL0 EN bit to
enable/disable both its counters. We use this for the
global enable/disable, and clear all config bits (except EN)
to disable individual counters.

Actual ia32 hardware doesn't support lfence, so use a locked
op without side-effect to implement a full barrier.

perf stat and perf record seem to function correctly.

[a.p.zijlstra@chello.nl: cleanups and complete the enable/disable code]

Signed-off-by: Vince Weaver <vince@deater.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <Pine.LNX.4.64.0907081718450.2715@pianoman.cluster.toy>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 227 +++++++++++++++++++++++++++--
 tools/perf/perf.h                  |   8 +-
 2 files changed, 220 insertions(+), 15 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 36c3dc7b899..1910f39ff19 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -65,6 +65,44 @@ static DEFINE_PER_CPU(struct cpu_hw_counters, cpu_hw_counters) = {
 	.enabled = 1,
 };
 
+/*
+ * Not sure about some of these
+ */
+static const u64 p6_perfmon_event_map[] =
+{
+  [PERF_COUNT_HW_CPU_CYCLES]		= 0x0079,
+  [PERF_COUNT_HW_INSTRUCTIONS]		= 0x00c0,
+  [PERF_COUNT_HW_CACHE_REFERENCES]	= 0x0000,
+  [PERF_COUNT_HW_CACHE_MISSES]		= 0x0000,
+  [PERF_COUNT_HW_BRANCH_INSTRUCTIONS]	= 0x00c4,
+  [PERF_COUNT_HW_BRANCH_MISSES]		= 0x00c5,
+  [PERF_COUNT_HW_BUS_CYCLES]		= 0x0062,
+};
+
+static u64 p6_pmu_event_map(int event)
+{
+	return p6_perfmon_event_map[event];
+}
+
+static u64 p6_pmu_raw_event(u64 event)
+{
+#define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
+#define P6_EVNTSEL_UNIT_MASK		0x0000FF00ULL
+#define P6_EVNTSEL_EDGE_MASK		0x00040000ULL
+#define P6_EVNTSEL_INV_MASK		0x00800000ULL
+#define P6_EVNTSEL_COUNTER_MASK		0xFF000000ULL
+
+#define P6_EVNTSEL_MASK			\
+	(P6_EVNTSEL_EVENT_MASK |	\
+	 P6_EVNTSEL_UNIT_MASK  |	\
+	 P6_EVNTSEL_EDGE_MASK  |	\
+	 P6_EVNTSEL_INV_MASK   |	\
+	 P6_EVNTSEL_COUNTER_MASK)
+
+	return event & P6_EVNTSEL_MASK;
+}
+
+
 /*
  * Intel PerfMon v3. Used on Core2 and later.
  */
@@ -726,6 +764,23 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 	return 0;
 }
 
+static void p6_pmu_disable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	unsigned long val;
+
+	if (!cpuc->enabled)
+		return;
+
+	cpuc->enabled = 0;
+	barrier();
+
+	/* p6 only has one enable register */
+	rdmsrl(MSR_P6_EVNTSEL0, val);
+	val &= ~ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
 static void intel_pmu_disable_all(void)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, 0);
@@ -767,6 +822,23 @@ void hw_perf_disable(void)
 	return x86_pmu.disable_all();
 }
 
+static void p6_pmu_enable_all(void)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	unsigned long val;
+
+	if (cpuc->enabled)
+		return;
+
+	cpuc->enabled = 1;
+	barrier();
+
+	/* p6 only has one enable register */
+	rdmsrl(MSR_P6_EVNTSEL0, val);
+	val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+	wrmsrl(MSR_P6_EVNTSEL0, val);
+}
+
 static void intel_pmu_enable_all(void)
 {
 	wrmsrl(MSR_CORE_PERF_GLOBAL_CTRL, x86_pmu.intel_ctrl);
@@ -819,16 +891,13 @@ static inline void intel_pmu_ack_status(u64 ack)
 
 static inline void x86_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	int err;
-	err = checking_wrmsrl(hwc->config_base + idx,
+	(void)checking_wrmsrl(hwc->config_base + idx,
 			      hwc->config | ARCH_PERFMON_EVENTSEL0_ENABLE);
 }
 
 static inline void x86_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
-	int err;
-	err = checking_wrmsrl(hwc->config_base + idx,
-			      hwc->config);
+	(void)checking_wrmsrl(hwc->config_base + idx, hwc->config);
 }
 
 static inline void
@@ -836,13 +905,24 @@ intel_pmu_disable_fixed(struct hw_perf_counter *hwc, int __idx)
 {
 	int idx = __idx - X86_PMC_IDX_FIXED;
 	u64 ctrl_val, mask;
-	int err;
 
 	mask = 0xfULL << (idx * 4);
 
 	rdmsrl(hwc->config_base, ctrl_val);
 	ctrl_val &= ~mask;
-	err = checking_wrmsrl(hwc->config_base, ctrl_val);
+	(void)checking_wrmsrl(hwc->config_base, ctrl_val);
+}
+
+static inline void
+p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	if (!cpuc->enabled)
+		val = 0;
+
+	(void)checking_wrmsrl(hwc->config_base + idx, val);
 }
 
 static inline void
@@ -943,6 +1023,17 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 	err = checking_wrmsrl(hwc->config_base, ctrl_val);
 }
 
+static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
+{
+	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	if (cpuc->enabled)
+		x86_pmu_enable_counter(hwc, idx);
+	else
+		x86_pmu_disable_counter(hwc, idx);
+}
+
+
 static void intel_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	if (unlikely(hwc->config_base == MSR_ARCH_PERFMON_FIXED_CTR_CTRL)) {
@@ -1176,6 +1267,49 @@ static void intel_pmu_reset(void)
 	local_irq_restore(flags);
 }
 
+static int p6_pmu_handle_irq(struct pt_regs *regs)
+{
+	struct perf_sample_data data;
+	struct cpu_hw_counters *cpuc;
+	struct perf_counter *counter;
+	struct hw_perf_counter *hwc;
+	int idx, handled = 0;
+	u64 val;
+
+	data.regs = regs;
+	data.addr = 0;
+
+	cpuc = &__get_cpu_var(cpu_hw_counters);
+
+	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		if (!test_bit(idx, cpuc->active_mask))
+			continue;
+
+		counter = cpuc->counters[idx];
+		hwc = &counter->hw;
+
+		val = x86_perf_counter_update(counter, hwc, idx);
+		if (val & (1ULL << (x86_pmu.counter_bits - 1)))
+			continue;
+
+		/*
+		 * counter overflow
+		 */
+		handled		= 1;
+		data.period	= counter->hw.last_period;
+
+		if (!x86_perf_counter_set_period(counter, hwc, idx))
+			continue;
+
+		if (perf_counter_overflow(counter, 1, &data))
+			p6_pmu_disable_counter(hwc, idx);
+	}
+
+	if (handled)
+		inc_irq_stat(apic_perf_irqs);
+
+	return handled;
+}
 
 /*
  * This handler is triggered by the local APIC, so the APIC IRQ handling
@@ -1185,14 +1319,13 @@ static int intel_pmu_handle_irq(struct pt_regs *regs)
 {
 	struct perf_sample_data data;
 	struct cpu_hw_counters *cpuc;
-	int bit, cpu, loops;
+	int bit, loops;
 	u64 ack, status;
 
 	data.regs = regs;
 	data.addr = 0;
 
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
+	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	perf_disable();
 	status = intel_pmu_get_status();
@@ -1249,14 +1382,13 @@ static int amd_pmu_handle_irq(struct pt_regs *regs)
 	struct cpu_hw_counters *cpuc;
 	struct perf_counter *counter;
 	struct hw_perf_counter *hwc;
-	int cpu, idx, handled = 0;
+	int idx, handled = 0;
 	u64 val;
 
 	data.regs = regs;
 	data.addr = 0;
 
-	cpu = smp_processor_id();
-	cpuc = &per_cpu(cpu_hw_counters, cpu);
+	cpuc = &__get_cpu_var(cpu_hw_counters);
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
 		if (!test_bit(idx, cpuc->active_mask))
@@ -1353,6 +1485,32 @@ static __read_mostly struct notifier_block perf_counter_nmi_notifier = {
 	.priority		= 1
 };
 
+static struct x86_pmu p6_pmu = {
+	.name			= "p6",
+	.handle_irq		= p6_pmu_handle_irq,
+	.disable_all		= p6_pmu_disable_all,
+	.enable_all		= p6_pmu_enable_all,
+	.enable			= p6_pmu_enable_counter,
+	.disable		= p6_pmu_disable_counter,
+	.eventsel		= MSR_P6_EVNTSEL0,
+	.perfctr		= MSR_P6_PERFCTR0,
+	.event_map		= p6_pmu_event_map,
+	.raw_event		= p6_pmu_raw_event,
+	.max_events		= ARRAY_SIZE(p6_perfmon_event_map),
+	.max_period		= (1ULL << 31) - 1,
+	.version		= 0,
+	.num_counters		= 2,
+	/*
+	 * Counters have 40 bits implemented. However they are designed such
+	 * that bits [32-39] are sign extensions of bit 31. As such the
+	 * effective width of a counter for P6-like PMU is 32 bits only.
+	 *
+	 * See IA-32 Intel Architecture Software developer manual Vol 3B
+	 */
+	.counter_bits		= 32,
+	.counter_mask		= (1ULL << 32) - 1,
+};
+
 static struct x86_pmu intel_pmu = {
 	.name			= "Intel",
 	.handle_irq		= intel_pmu_handle_irq,
@@ -1392,6 +1550,41 @@ static struct x86_pmu amd_pmu = {
 	.max_period		= (1ULL << 47) - 1,
 };
 
+static int p6_pmu_init(void)
+{
+	int high, low;
+
+	switch (boot_cpu_data.x86_model) {
+	case 1:
+	case 3:  /* Pentium Pro */
+	case 5:
+	case 6:  /* Pentium II */
+	case 7:
+	case 8:
+	case 11: /* Pentium III */
+		break;
+	case 9:
+	case 13:
+		/* for Pentium M, we need to check if PMU exist */
+		rdmsr(MSR_IA32_MISC_ENABLE, low, high);
+		if (low & MSR_IA32_MISC_ENABLE_EMON)
+			break;
+	default:
+		pr_cont("unsupported p6 CPU model %d ",
+			boot_cpu_data.x86_model);
+		return -ENODEV;
+	}
+
+	if (!cpu_has_apic) {
+		pr_info("no Local APIC, try rebooting with lapic");
+		return -ENODEV;
+	}
+
+	x86_pmu				= p6_pmu;
+
+	return 0;
+}
+
 static int intel_pmu_init(void)
 {
 	union cpuid10_edx edx;
@@ -1400,8 +1593,14 @@ static int intel_pmu_init(void)
 	unsigned int ebx;
 	int version;
 
-	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON))
+	if (!cpu_has(&boot_cpu_data, X86_FEATURE_ARCH_PERFMON)) {
+		/* check for P6 processor family */
+	   if (boot_cpu_data.x86 == 6) {
+		return p6_pmu_init();
+	   } else {
 		return -ENODEV;
+	   }
+	}
 
 	/*
 	 * Check whether the Architectural PerfMon supports
diff --git a/tools/perf/perf.h b/tools/perf/perf.h
index 27887c91643..53bb9550def 100644
--- a/tools/perf/perf.h
+++ b/tools/perf/perf.h
@@ -1,7 +1,13 @@
 #ifndef _PERF_PERF_H
 #define _PERF_PERF_H
 
-#if defined(__x86_64__) || defined(__i386__)
+#if defined(__i386__)
+#include "../../arch/x86/include/asm/unistd.h"
+#define rmb()		asm volatile("lock; addl $0,0(%%esp)" ::: "memory")
+#define cpu_relax()	asm volatile("rep; nop" ::: "memory");
+#endif
+
+#if defined(__x86_64__)
 #include "../../arch/x86/include/asm/unistd.h"
 #define rmb()		asm volatile("lfence" ::: "memory")
 #define cpu_relax()	asm volatile("rep; nop" ::: "memory");

From 9c74fb50867e8fb5f3be3be06716492c0f79309e Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 8 Jul 2009 10:21:41 +0200
Subject: [PATCH 03/31] perf_counter: Fix up P6 PMU details

The P6 doesn't seem to support cache ref/hit/miss counts, so
we extend the generic hardware event codes to have 0 and -1
mean the same thing as for the generic cache events.

Furthermore, it turns out the 0 event does not count
(that is, its reported that on PPro it actually does count
something), therefore use a event configuration that's
specified not to count to disable the counters.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 28 +++++++++++++++++++++++-----
 1 file changed, 23 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 1910f39ff19..c7cc6eac14e 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -84,6 +84,14 @@ static u64 p6_pmu_event_map(int event)
 	return p6_perfmon_event_map[event];
 }
 
+/*
+ * Counter setting that is specified not to count anything.
+ * We use this to effectively disable a counter.
+ *
+ * L2_RQSTS with 0 MESI unit mask.
+ */
+#define P6_NOP_COUNTER			0x0000002EULL
+
 static u64 p6_pmu_raw_event(u64 event)
 {
 #define P6_EVNTSEL_EVENT_MASK		0x000000FFULL
@@ -704,6 +712,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 {
 	struct perf_counter_attr *attr = &counter->attr;
 	struct hw_perf_counter *hwc = &counter->hw;
+	u64 config;
 	int err;
 
 	if (!x86_pmu_initialized())
@@ -756,10 +765,19 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 
 	if (attr->config >= x86_pmu.max_events)
 		return -EINVAL;
+
 	/*
 	 * The generic map:
 	 */
-	hwc->config |= x86_pmu.event_map(attr->config);
+	config = x86_pmu.event_map(attr->config);
+
+	if (config == 0)
+		return -ENOENT;
+
+	if (config == -1LL)
+		return -EINVAL;
+
+	hwc->config |= config;
 
 	return 0;
 }
@@ -767,7 +785,7 @@ static int __hw_perf_counter_init(struct perf_counter *counter)
 static void p6_pmu_disable_all(void)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	unsigned long val;
+	u64 val;
 
 	if (!cpuc->enabled)
 		return;
@@ -917,10 +935,10 @@ static inline void
 p6_pmu_disable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
-	unsigned long val = ARCH_PERFMON_EVENTSEL0_ENABLE;
+	u64 val = P6_NOP_COUNTER;
 
-	if (!cpuc->enabled)
-		val = 0;
+	if (cpuc->enabled)
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 
 	(void)checking_wrmsrl(hwc->config_base + idx, val);
 }

From 984b838ce69c063a91b87550598ab7f3439dd94a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 10 Jul 2009 09:59:56 +0200
Subject: [PATCH 04/31] perf_counter: Clean up global vs counter enable

Ingo noticed that both AMD and P6 call
x86_pmu_disable_counter() on *_pmu_enable_counter(). This is
because we rely on the side effect of that call to program
the event config but not touch the EN bit.

We change that for AMD by having enable_all() simply write
the full config in, and for P6 by explicitly coding it.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index c7cc6eac14e..bed1c4c2f25 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -874,13 +874,13 @@ static void amd_pmu_enable_all(void)
 	barrier();
 
 	for (idx = 0; idx < x86_pmu.num_counters; idx++) {
+		struct perf_counter *counter = cpuc->counters[idx];
 		u64 val;
 
 		if (!test_bit(idx, cpuc->active_mask))
 			continue;
-		rdmsrl(MSR_K7_EVNTSEL0 + idx, val);
-		if (val & ARCH_PERFMON_EVENTSEL0_ENABLE)
-			continue;
+
+		val = counter->hw.config;
 		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
 		wrmsrl(MSR_K7_EVNTSEL0 + idx, val);
 	}
@@ -1044,11 +1044,13 @@ intel_pmu_enable_fixed(struct hw_perf_counter *hwc, int __idx)
 static void p6_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 {
 	struct cpu_hw_counters *cpuc = &__get_cpu_var(cpu_hw_counters);
+	u64 val;
 
+	val = hwc->config;
 	if (cpuc->enabled)
-		x86_pmu_enable_counter(hwc, idx);
-	else
-		x86_pmu_disable_counter(hwc, idx);
+		val |= ARCH_PERFMON_EVENTSEL0_ENABLE;
+
+	(void)checking_wrmsrl(hwc->config_base + idx, val);
 }
 
 
@@ -1068,8 +1070,6 @@ static void amd_pmu_enable_counter(struct hw_perf_counter *hwc, int idx)
 
 	if (cpuc->enabled)
 		x86_pmu_enable_counter(hwc, idx);
-	else
-		x86_pmu_disable_counter(hwc, idx);
 }
 
 static int

From 71a851b4d2a815adcfac09c1adda7ef6811fde66 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Fri, 10 Jul 2009 09:06:56 +0200
Subject: [PATCH 05/31] perf_counter: Stop open coding unclone_ctx

Instead of open coding the unclone context thingy, put it in
a common function.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 36 +++++++++++++++++-------------------
 1 file changed, 17 insertions(+), 19 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50da234..8bf997d86bf 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -146,6 +146,14 @@ static void put_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+static void unclone_ctx(struct perf_counter_context *ctx)
+{
+	if (ctx->parent_ctx) {
+		put_ctx(ctx->parent_ctx);
+		ctx->parent_ctx = NULL;
+	}
+}
+
 /*
  * Get the perf_counter_context for a task and lock it.
  * This has to cope with with the fact that until it is locked,
@@ -1463,10 +1471,8 @@ static void perf_counter_enable_on_exec(struct task_struct *task)
 	/*
 	 * Unclone this context if we enabled any counter.
 	 */
-	if (enabled && ctx->parent_ctx) {
-		put_ctx(ctx->parent_ctx);
-		ctx->parent_ctx = NULL;
-	}
+	if (enabled)
+		unclone_ctx(ctx);
 
 	spin_unlock(&ctx->lock);
 
@@ -1526,7 +1532,6 @@ __perf_counter_init_context(struct perf_counter_context *ctx,
 
 static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
 {
-	struct perf_counter_context *parent_ctx;
 	struct perf_counter_context *ctx;
 	struct perf_cpu_context *cpuctx;
 	struct task_struct *task;
@@ -1586,11 +1591,7 @@ static struct perf_counter_context *find_get_context(pid_t pid, int cpu)
  retry:
 	ctx = perf_lock_task_context(task, &flags);
 	if (ctx) {
-		parent_ctx = ctx->parent_ctx;
-		if (parent_ctx) {
-			put_ctx(parent_ctx);
-			ctx->parent_ctx = NULL;		/* no longer a clone */
-		}
+		unclone_ctx(ctx);
 		spin_unlock_irqrestore(&ctx->lock, flags);
 	}
 
@@ -4255,15 +4256,12 @@ void perf_counter_exit_task(struct task_struct *child)
 	 */
 	spin_lock(&child_ctx->lock);
 	child->perf_counter_ctxp = NULL;
-	if (child_ctx->parent_ctx) {
-		/*
-		 * This context is a clone; unclone it so it can't get
-		 * swapped to another process while we're removing all
-		 * the counters from it.
-		 */
-		put_ctx(child_ctx->parent_ctx);
-		child_ctx->parent_ctx = NULL;
-	}
+	/*
+	 * If this context is a clone; unclone it so it can't get
+	 * swapped to another process while we're removing all
+	 * the counters from it.
+	 */
+	unclone_ctx(child_ctx);
 	spin_unlock(&child_ctx->lock);
 	local_irq_restore(flags);
 

From 52d422de22b26d96bbdfbc605eb31c2994df6d0b Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Fri, 10 Jul 2009 22:47:28 -0300
Subject: [PATCH 06/31] perf report: Adjust column width to the values sampled

Auto-adjust column width of perf report output to the
longest occuring string length.

Example:

[acme@doppio pahole]$  perf report --sort comm,dso,symbol | head -13

    12.79%   pahole  /usr/lib64/libdw-0.141.so    [.] __libdw_find_attr
     8.90%   pahole  /lib64/libc-2.10.1.so        [.] _int_malloc
     8.68%   pahole  /usr/lib64/libdw-0.141.so    [.] __libdw_form_val_len
     8.15%   pahole  /lib64/libc-2.10.1.so        [.] __GI_strcmp
     6.80%   pahole  /lib64/libc-2.10.1.so        [.] __tsearch
     5.54%   pahole  ./build/libdwarves.so.1.0.0  [.] tag__recode_dwarf_type
[acme@doppio pahole]$

[acme@doppio pahole]$  perf report --sort comm,dso,symbol -d /lib64/libc-2.10.1.so | head -10

    21.92%   pahole  /lib64/libc-2.10.1.so  [.] _int_malloc
    20.08%   pahole  /lib64/libc-2.10.1.so  [.] __GI_strcmp
    16.75%   pahole  /lib64/libc-2.10.1.so  [.] __tsearch
[acme@doppio pahole]$

Also add these extra options to control the new behaviour:

  -w, --field-width

Force each column width to the provided list, for large terminal
readability.

  -t, --field-separator:

Use a special separator character and don't pad with spaces, replacing
all occurances of this separator in symbol names (and other output) with
a '.' character, that thus it's the only non valid separator.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
LKML-Reference: <20090711014728.GH3452@ghostprotocols.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-report.txt |  12 ++
 tools/perf/builtin-report.c              | 174 ++++++++++++++++++-----
 tools/perf/util/include/linux/kernel.h   |   8 ++
 tools/perf/util/symbol.c                 |   1 +
 tools/perf/util/symbol.h                 |   1 +
 5 files changed, 164 insertions(+), 32 deletions(-)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 8aa3f8c8870..05774dfbd14 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -33,6 +33,18 @@ OPTIONS
 	Only consider these symbols. CSV that understands
 	file://filename entries.
 
+-w::
+--field-width=::
+	Force each column width to the provided list, for large terminal
+	readability.
+
+-t::
+--field-separator=::
+
+	Use a special separator character and don't pad with spaces, replacing
+	all occurances of this separator in symbol names (and other output)
+	with a '.' character, that thus it's the only non valid separator.
+
 SEE ALSO
 --------
 linkperf:perf-stat[1]
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 4e5cc266311..740da43f313 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -33,8 +33,10 @@ static char		*vmlinux = NULL;
 
 static char		default_sort_order[] = "comm,dso";
 static char		*sort_order = default_sort_order;
-static char		*dso_list_str, *comm_list_str, *sym_list_str;
+static char		*dso_list_str, *comm_list_str, *sym_list_str,
+			*col_width_list_str;
 static struct strlist	*dso_list, *comm_list, *sym_list;
+static char		*field_sep;
 
 static int		input;
 static int		show_mask = SHOW_KERNEL | SHOW_USER | SHOW_HV;
@@ -129,6 +131,33 @@ typedef union event_union {
 	struct read_event		read;
 } event_t;
 
+static int repsep_fprintf(FILE *fp, const char *fmt, ...)
+{
+	int n;
+	va_list ap;
+
+	va_start(ap, fmt);
+	if (!field_sep)
+		n = vfprintf(fp, fmt, ap);
+	else {
+		char *bf = NULL;
+		n = vasprintf(&bf, fmt, ap);
+		if (n > 0) {
+			char *sep = bf;
+			while (1) {
+				sep = strchr(sep, *field_sep);
+				if (sep == NULL)
+					break;
+				*sep = '.';
+			}
+		}
+		fputs(bf, fp);
+		free(bf);
+	}
+	va_end(ap);
+	return n;
+}
+
 static LIST_HEAD(dsos);
 static struct dso *kernel_dso;
 static struct dso *vdso;
@@ -360,12 +389,28 @@ static struct thread *thread__new(pid_t pid)
 	return self;
 }
 
+static unsigned int dsos__col_width,
+		    comms__col_width,
+		    threads__col_width;
+
 static int thread__set_comm(struct thread *self, const char *comm)
 {
 	if (self->comm)
 		free(self->comm);
 	self->comm = strdup(comm);
-	return self->comm ? 0 : -ENOMEM;
+	if (!self->comm)
+		return -ENOMEM;
+
+	if (!col_width_list_str && !field_sep &&
+	    (!comm_list || strlist__has_entry(comm_list, comm))) {
+		unsigned int slen = strlen(comm);
+		if (slen > comms__col_width) {
+			comms__col_width = slen;
+			threads__col_width = slen + 6;
+		}
+	}
+
+	return 0;
 }
 
 static size_t thread__fprintf(struct thread *self, FILE *fp)
@@ -536,7 +581,8 @@ struct sort_entry {
 
 	int64_t (*cmp)(struct hist_entry *, struct hist_entry *);
 	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
-	size_t	(*print)(FILE *fp, struct hist_entry *);
+	size_t	(*print)(FILE *fp, struct hist_entry *, unsigned int width);
+	unsigned int *width;
 };
 
 static int64_t cmp_null(void *l, void *r)
@@ -558,15 +604,17 @@ sort__thread_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__thread_print(FILE *fp, struct hist_entry *self)
+sort__thread_print(FILE *fp, struct hist_entry *self, unsigned int width)
 {
-	return fprintf(fp, "%16s:%5d", self->thread->comm ?: "", self->thread->pid);
+	return repsep_fprintf(fp, "%*s:%5d", width - 6,
+			      self->thread->comm ?: "", self->thread->pid);
 }
 
 static struct sort_entry sort_thread = {
-	.header = "         Command:  Pid",
+	.header = "Command:  Pid",
 	.cmp	= sort__thread_cmp,
 	.print	= sort__thread_print,
+	.width	= &threads__col_width,
 };
 
 /* --sort comm */
@@ -590,16 +638,17 @@ sort__comm_collapse(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__comm_print(FILE *fp, struct hist_entry *self)
+sort__comm_print(FILE *fp, struct hist_entry *self, unsigned int width)
 {
-	return fprintf(fp, "%16s", self->thread->comm);
+	return repsep_fprintf(fp, "%*s", width, self->thread->comm);
 }
 
 static struct sort_entry sort_comm = {
-	.header		= "         Command",
+	.header		= "Command",
 	.cmp		= sort__comm_cmp,
 	.collapse	= sort__comm_collapse,
 	.print		= sort__comm_print,
+	.width		= &comms__col_width,
 };
 
 /* --sort dso */
@@ -617,18 +666,19 @@ sort__dso_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__dso_print(FILE *fp, struct hist_entry *self)
+sort__dso_print(FILE *fp, struct hist_entry *self, unsigned int width)
 {
 	if (self->dso)
-		return fprintf(fp, "%-25s", self->dso->name);
+		return repsep_fprintf(fp, "%-*s", width, self->dso->name);
 
-	return fprintf(fp, "%016llx         ", (u64)self->ip);
+	return repsep_fprintf(fp, "%*llx", width, (u64)self->ip);
 }
 
 static struct sort_entry sort_dso = {
-	.header = "Shared Object            ",
+	.header = "Shared Object",
 	.cmp	= sort__dso_cmp,
 	.print	= sort__dso_print,
+	.width	= &dsos__col_width,
 };
 
 /* --sort symbol */
@@ -648,22 +698,23 @@ sort__sym_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__sym_print(FILE *fp, struct hist_entry *self)
+sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
 {
 	size_t ret = 0;
 
 	if (verbose)
-		ret += fprintf(fp, "%#018llx  ", (u64)self->ip);
+		ret += repsep_fprintf(fp, "%#018llx  ", (u64)self->ip);
 
 	if (self->sym) {
-		ret += fprintf(fp, "[%c] %s",
+		ret += repsep_fprintf(fp, "[%c] %s",
 			self->dso == kernel_dso ? 'k' :
 			self->dso == hypervisor_dso ? 'h' : '.', self->sym->name);
 
 		if (self->sym->module)
-			ret += fprintf(fp, "\t[%s]", self->sym->module->name);
+			ret += repsep_fprintf(fp, "\t[%s]",
+					     self->sym->module->name);
 	} else {
-		ret += fprintf(fp, "%#016llx", (u64)self->ip);
+		ret += repsep_fprintf(fp, "%#016llx", (u64)self->ip);
 	}
 
 	return ret;
@@ -690,19 +741,19 @@ sort__parent_cmp(struct hist_entry *left, struct hist_entry *right)
 }
 
 static size_t
-sort__parent_print(FILE *fp, struct hist_entry *self)
+sort__parent_print(FILE *fp, struct hist_entry *self, unsigned int width)
 {
-	size_t ret = 0;
-
-	ret += fprintf(fp, "%-20s", self->parent ? self->parent->name : "[other]");
-
-	return ret;
+	return repsep_fprintf(fp, "%-*s", width,
+			      self->parent ? self->parent->name : "[other]");
 }
 
+static unsigned int parent_symbol__col_width;
+
 static struct sort_entry sort_parent = {
-	.header = "Parent symbol       ",
+	.header = "Parent symbol",
 	.cmp	= sort__parent_cmp,
 	.print	= sort__parent_print,
+	.width	= &parent_symbol__col_width,
 };
 
 static int sort__need_collapse = 0;
@@ -967,17 +1018,18 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 		return 0;
 
 	if (total_samples)
-		ret = percent_color_fprintf(fp, "   %6.2f%%",
-				(self->count * 100.0) / total_samples);
+		ret = percent_color_fprintf(fp,
+					    field_sep ? "%.2f" : "   %6.2f%%",
+					(self->count * 100.0) / total_samples);
 	else
-		ret = fprintf(fp, "%12Ld ", self->count);
+		ret = fprintf(fp, field_sep ? "%lld" : "%12lld ", self->count);
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (exclude_other && (se == &sort_parent))
 			continue;
 
-		fprintf(fp, "  ");
-		ret += se->print(fp, self);
+		fprintf(fp, "%s", field_sep ?: "  ");
+		ret += se->print(fp, self, se->width ? *se->width : 0);
 	}
 
 	ret += fprintf(fp, "\n");
@@ -992,6 +1044,18 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
  *
  */
 
+static void dso__calc_col_width(struct dso *self)
+{
+	if (!col_width_list_str && !field_sep &&
+	    (!dso_list || strlist__has_entry(dso_list, self->name))) {
+		unsigned int slen = strlen(self->name);
+		if (slen > dsos__col_width)
+			dsos__col_width = slen;
+	}
+
+	self->slen_calculated = 1;
+}
+
 static struct symbol *
 resolve_symbol(struct thread *thread, struct map **mapp,
 	       struct dso **dsop, u64 *ipp)
@@ -1011,6 +1075,14 @@ resolve_symbol(struct thread *thread, struct map **mapp,
 
 	map = thread__find_map(thread, ip);
 	if (map != NULL) {
+		/*
+		 * We have to do this here as we may have a dso
+		 * with no symbol hit that has a name longer than
+		 * the ones with symbols sampled.
+		 */
+		if (!map->dso->slen_calculated)
+			dso__calc_col_width(map->dso);
+
 		if (mapp)
 			*mapp = map;
 got_map:
@@ -1282,6 +1354,8 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	struct sort_entry *se;
 	struct rb_node *nd;
 	size_t ret = 0;
+	unsigned int width;
+	char *col_width = col_width_list_str;
 
 	fprintf(fp, "\n");
 	fprintf(fp, "#\n");
@@ -1292,10 +1366,29 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (exclude_other && (se == &sort_parent))
 			continue;
-		fprintf(fp, "  %s", se->header);
+		if (field_sep) {
+			fprintf(fp, "%c%s", *field_sep, se->header);
+			continue;
+		}
+		width = strlen(se->header);
+		if (se->width) {
+			if (col_width_list_str) {
+				if (col_width) {
+					*se->width = atoi(col_width);
+					col_width = strchr(col_width, ',');
+					if (col_width)
+						++col_width;
+				}
+			}
+			width = *se->width = max(*se->width, width);
+		}
+		fprintf(fp, "  %*s", width, se->header);
 	}
 	fprintf(fp, "\n");
 
+	if (field_sep)
+		goto print_entries;
+
 	fprintf(fp, "# ........");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		unsigned int i;
@@ -1304,13 +1397,18 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 			continue;
 
 		fprintf(fp, "  ");
-		for (i = 0; i < strlen(se->header); i++)
+		if (se->width)
+			width = *se->width;
+		else
+			width = strlen(se->header);
+		for (i = 0; i < width; i++)
 			fprintf(fp, ".");
 	}
 	fprintf(fp, "\n");
 
 	fprintf(fp, "#\n");
 
+print_entries:
 	for (nd = rb_first(&output_hists); nd; nd = rb_next(nd)) {
 		pos = rb_entry(nd, struct hist_entry, rb_node);
 		ret += hist_entry__fprintf(fp, pos, total_samples);
@@ -1900,6 +1998,12 @@ static const struct option options[] = {
 		   "only consider symbols in these comms"),
 	OPT_STRING('S', "symbols", &sym_list_str, "symbol[,symbol...]",
 		   "only consider these symbols"),
+	OPT_STRING('w', "column-widths", &col_width_list_str,
+		   "width[,width...]",
+		   "don't try to adjust column width, use these fixed values"),
+	OPT_STRING('t', "field-separator", &field_sep, "separator",
+		   "separator for columns, no spaces will be added between "
+		   "columns '.' is reserved."),
 	OPT_END()
 };
 
@@ -1956,6 +2060,12 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	setup_list(&comm_list, comm_list_str, "comm");
 	setup_list(&sym_list, sym_list_str, "symbol");
 
+	if (field_sep && *field_sep == '.') {
+		fputs("'.' is the only non valid --field-separator argument\n",
+		      stderr);
+		exit(129);
+	}
+
 	setup_pager();
 
 	return __cmd_report();
diff --git a/tools/perf/util/include/linux/kernel.h b/tools/perf/util/include/linux/kernel.h
index 99c1b3d1edd..a6b87390cb5 100644
--- a/tools/perf/util/include/linux/kernel.h
+++ b/tools/perf/util/include/linux/kernel.h
@@ -18,4 +18,12 @@
 	(type *)((char *)__mptr - offsetof(type, member)); })
 #endif
 
+#ifndef max
+#define max(x, y) ({				\
+	typeof(x) _max1 = (x);			\
+	typeof(y) _max2 = (y);			\
+	(void) (&_max1 == &_max2);		\
+	_max1 > _max2 ? _max1 : _max2; })
+#endif
+
 #endif
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 4683b67b5ee..8efe7e41109 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -65,6 +65,7 @@ struct dso *dso__new(const char *name, unsigned int sym_priv_size)
 		self->syms = RB_ROOT;
 		self->sym_priv_size = sym_priv_size;
 		self->find_symbol = dso__find_symbol;
+		self->slen_calculated = 0;
 	}
 
 	return self;
diff --git a/tools/perf/util/symbol.h b/tools/perf/util/symbol.h
index 7918cffb23c..2f92b21c712 100644
--- a/tools/perf/util/symbol.h
+++ b/tools/perf/util/symbol.h
@@ -25,6 +25,7 @@ struct dso {
 	struct symbol    *(*find_symbol)(struct dso *, u64 ip);
 	unsigned int	 sym_priv_size;
 	unsigned char	 adjust_symbols;
+	unsigned char	 slen_calculated;
 	char		 name[0];
 };
 

From 60c1baf1248e00d423604f018c8af1cf750ad885 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 11 Jul 2009 12:18:33 -0300
Subject: [PATCH 07/31] perf report: Tidy up reporting of symbols not found

Always printing the level info about if it is in the kernel,
hypervisor or userspace as that is in the hist_entry.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1247325517-12272-1-git-send-email-acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 740da43f313..617f4cb7f16 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -705,10 +705,9 @@ sort__sym_print(FILE *fp, struct hist_entry *self, unsigned int width __used)
 	if (verbose)
 		ret += repsep_fprintf(fp, "%#018llx  ", (u64)self->ip);
 
+	ret += repsep_fprintf(fp, "[%c] ", self->level);
 	if (self->sym) {
-		ret += repsep_fprintf(fp, "[%c] %s",
-			self->dso == kernel_dso ? 'k' :
-			self->dso == hypervisor_dso ? 'h' : '.', self->sym->name);
+		ret += repsep_fprintf(fp, "%s", self->sym->name);
 
 		if (self->sym->module)
 			ret += repsep_fprintf(fp, "\t[%s]",

From 27d0fd410c3cee00ece2e55f4354a7a9ec1a6a6a Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 11 Jul 2009 12:18:34 -0300
Subject: [PATCH 08/31] strlist: Introduce strlist__entry and
 strlist__nr_entries methods

The strlist__entry method allows accessing strlists like an
array, will be used in the 'perf report' to access the first
entry.

We now keep the nr_entries so that we can check if we have just
one entry, will be used in 'perf report' to improve the output
by showing just at the top when we have just, say, one DSO.

While at it use nr_entries to optimize strlist__is_empty by not
using the far more costly rb_first based implementation.

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1247325517-12272-2-git-send-email-acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/strlist.c | 20 ++++++++++++++++++--
 tools/perf/util/strlist.h | 11 +++++++++--
 2 files changed, 27 insertions(+), 4 deletions(-)

diff --git a/tools/perf/util/strlist.c b/tools/perf/util/strlist.c
index 025a78edfff..7ad38171dc2 100644
--- a/tools/perf/util/strlist.c
+++ b/tools/perf/util/strlist.c
@@ -64,6 +64,7 @@ int strlist__add(struct strlist *self, const char *new_entry)
 
 	rb_link_node(&sn->rb_node, parent, p);
 	rb_insert_color(&sn->rb_node, &self->entries);
+	++self->nr_entries;
 
 	return 0;
 }
@@ -155,8 +156,9 @@ struct strlist *strlist__new(bool dupstr, const char *slist)
 	struct strlist *self = malloc(sizeof(*self));
 
 	if (self != NULL) {
-		self->entries = RB_ROOT;
-		self->dupstr = dupstr;
+		self->entries	 = RB_ROOT;
+		self->dupstr	 = dupstr;
+		self->nr_entries = 0;
 		if (slist && strlist__parse_list(self, slist) != 0)
 			goto out_error;
 	}
@@ -182,3 +184,17 @@ void strlist__delete(struct strlist *self)
 		free(self);
 	}
 }
+
+struct str_node *strlist__entry(const struct strlist *self, unsigned int idx)
+{
+	struct rb_node *nd;
+
+	for (nd = rb_first(&self->entries); nd; nd = rb_next(nd)) {
+		struct str_node *pos = rb_entry(nd, struct str_node, rb_node);
+
+		if (!idx--)
+			return pos;
+	}
+
+	return NULL;
+}
diff --git a/tools/perf/util/strlist.h b/tools/perf/util/strlist.h
index 2fdcfee8758..921818e44a5 100644
--- a/tools/perf/util/strlist.h
+++ b/tools/perf/util/strlist.h
@@ -11,7 +11,8 @@ struct str_node {
 
 struct strlist {
 	struct rb_root entries;
-	bool dupstr;
+	unsigned int   nr_entries;
+	bool	       dupstr;
 };
 
 struct strlist *strlist__new(bool dupstr, const char *slist);
@@ -21,11 +22,17 @@ void strlist__remove(struct strlist *self, struct str_node *sn);
 int strlist__load(struct strlist *self, const char *filename);
 int strlist__add(struct strlist *self, const char *str);
 
+struct str_node *strlist__entry(const struct strlist *self, unsigned int idx);
 bool strlist__has_entry(struct strlist *self, const char *entry);
 
 static inline bool strlist__empty(const struct strlist *self)
 {
-	return rb_first(&self->entries) == NULL;
+	return self->nr_entries == 0;
+}
+
+static inline unsigned int strlist__nr_entries(const struct strlist *self)
+{
+	return self->nr_entries;
 }
 
 int strlist__parse_list(struct strlist *self, const char *s);

From 021191b35cdfb1b5ee6e78ed5ae010114a40902c Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 11 Jul 2009 12:18:35 -0300
Subject: [PATCH 09/31] perf report: Make the output more compact

When we filter by column content we may end up with a column
that has the same value for all the lines. So remove that
column and tell its unique value on the top, as a comment.

Example:

  [acme@doppio pahole]$  perf report --sort comm,dso,symbol -d ./build/libdwarves.so.1.0.0 -C pahole | head -15
  # dso: ./build/libdwarves.so.1.0.0
  # comm: pahole
  # Samples: 58409
  #
  # Overhead  Symbol
  # ........  ......
  #
      20.93%  [.] tag__recode_dwarf_type
      14.94%  [.] namespace__recode_dwarf_types
      10.38%  [.] cu__table_add_tag
       6.69%  [.] __die__process_tag
       5.05%  [.] die__process_function
       4.70%  [.] list__for_all_tags
       3.68%  [.] tag__init
       3.48%  [.] die__create_new_parameter
  [acme@doppio pahole]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1247325517-12272-3-git-send-email-acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 36 +++++++++++++++++++++---------------
 1 file changed, 21 insertions(+), 15 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 617f4cb7f16..f3422121d85 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -583,6 +583,7 @@ struct sort_entry {
 	int64_t (*collapse)(struct hist_entry *, struct hist_entry *);
 	size_t	(*print)(FILE *fp, struct hist_entry *, unsigned int width);
 	unsigned int *width;
+	bool	elide;
 };
 
 static int64_t cmp_null(void *l, void *r)
@@ -1024,7 +1025,7 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 		ret = fprintf(fp, field_sep ? "%lld" : "%12lld ", self->count);
 
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		if (exclude_other && (se == &sort_parent))
+		if (se->elide)
 			continue;
 
 		fprintf(fp, "%s", field_sep ?: "  ");
@@ -1079,7 +1080,7 @@ resolve_symbol(struct thread *thread, struct map **mapp,
 		 * with no symbol hit that has a name longer than
 		 * the ones with symbols sampled.
 		 */
-		if (!map->dso->slen_calculated)
+		if (!sort_dso.elide && !map->dso->slen_calculated)
 			dso__calc_col_width(map->dso);
 
 		if (mapp)
@@ -1356,14 +1357,12 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	unsigned int width;
 	char *col_width = col_width_list_str;
 
-	fprintf(fp, "\n");
-	fprintf(fp, "#\n");
-	fprintf(fp, "# (%Ld samples)\n", (u64)total_samples);
+	fprintf(fp, "# Samples: %Ld\n", (u64)total_samples);
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
-		if (exclude_other && (se == &sort_parent))
+		if (se->elide)
 			continue;
 		if (field_sep) {
 			fprintf(fp, "%c%s", *field_sep, se->header);
@@ -1392,7 +1391,7 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		unsigned int i;
 
-		if (exclude_other && (se == &sort_parent))
+		if (se->elide)
 			continue;
 
 		fprintf(fp, "  ");
@@ -2022,7 +2021,8 @@ static void setup_sorting(void)
 }
 
 static void setup_list(struct strlist **list, const char *list_str,
-		       const char *list_name)
+		       struct sort_entry *se, const char *list_name,
+		       FILE *fp)
 {
 	if (list_str) {
 		*list = strlist__new(true, list_str);
@@ -2031,6 +2031,11 @@ static void setup_list(struct strlist **list, const char *list_str,
 				list_name);
 			exit(129);
 		}
+		if (strlist__nr_entries(*list) == 1) {
+			fprintf(fp, "# %s: %s\n", list_name,
+				strlist__entry(*list, 0)->s);
+			se->elide = true;
+		}
 	}
 }
 
@@ -2044,9 +2049,10 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 
 	setup_sorting();
 
-	if (parent_pattern != default_parent_pattern)
+	if (parent_pattern != default_parent_pattern) {
 		sort_dimension__add("parent");
-	else
+		sort_parent.elide = 1;
+	} else
 		exclude_other = 0;
 
 	/*
@@ -2055,9 +2061,11 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 	if (argc)
 		usage_with_options(report_usage, options);
 
-	setup_list(&dso_list, dso_list_str, "dso");
-	setup_list(&comm_list, comm_list_str, "comm");
-	setup_list(&sym_list, sym_list_str, "symbol");
+	setup_pager();
+
+	setup_list(&dso_list, dso_list_str, &sort_dso, "dso", stdout);
+	setup_list(&comm_list, comm_list_str, &sort_comm, "comm", stdout);
+	setup_list(&sym_list, sym_list_str, &sort_sym, "symbol", stdout);
 
 	if (field_sep && *field_sep == '.') {
 		fputs("'.' is the only non valid --field-separator argument\n",
@@ -2065,7 +2073,5 @@ int cmd_report(int argc, const char **argv, const char *prefix __used)
 		exit(129);
 	}
 
-	setup_pager();
-
 	return __cmd_report();
 }

From a25e46c46311316cd1b3f27f8bb036df1693c032 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 11 Jul 2009 12:18:36 -0300
Subject: [PATCH 10/31] perf_counter tools: PLT info is stripped in -debuginfo
 packages

So we need to get the richer .symtab from the debuginfo
packages but the PLT info from the original DSO where we have
just the leaner .dynsym symtab.

Example:

| [acme@doppio pahole]$ perf report --sort comm,dso,symbol > before
| [acme@doppio pahole]$ perf report --sort comm,dso,symbol > after
| [acme@doppio pahole]$ diff -U1 before after
| --- before	2009-07-11 11:04:22.688595741 -0300
| +++ after	2009-07-11 11:04:33.380595676 -0300
| @@ -80,3 +80,2 @@
|       0.07%  pahole ./build/pahole              [.] pahole_stealer
| -     0.06%  pahole /usr/lib64/libdw-0.141.so   [.] 0x00000000007140
|       0.06%  pahole /usr/lib64/libdw-0.141.so   [.] __libdw_getabbrev
| @@ -91,2 +90,3 @@
|       0.06%  pahole [kernel]                    [k] free_hot_cold_page
| +     0.06%  pahole /usr/lib64/libdw-0.141.so   [.] tfind@plt
|       0.05%  pahole ./build/libdwarves.so.1.0.0 [.] ftype__add_parameter
| @@ -242,2 +242,3 @@
|       0.01%  pahole [kernel]                    [k] account_group_user_time
| +     0.01%  pahole /usr/lib64/libdw-0.141.so   [.] strlen@plt
|       0.01%  pahole ./build/pahole              [.] strcmp@plt
| [acme@doppio pahole]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1247325517-12272-4-git-send-email-acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/symbol.c | 118 ++++++++++++++++++++++-----------------
 1 file changed, 66 insertions(+), 52 deletions(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 8efe7e41109..f40266b4845 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -374,36 +374,61 @@ static Elf_Scn *elf_section_by_name(Elf *elf, GElf_Ehdr *ep,
 	     idx < nr_entries; \
 	     ++idx, pos = gelf_getrela(reldata, idx, &pos_mem))
 
-static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
-				       GElf_Ehdr *ehdr, Elf_Scn *scn_dynsym,
-				       GElf_Shdr *shdr_dynsym,
-				       size_t dynsym_idx, int verbose)
+/*
+ * We need to check if we have a .dynsym, so that we can handle the
+ * .plt, synthesizing its symbols, that aren't on the symtabs (be it
+ * .dynsym or .symtab).
+ * And always look at the original dso, not at debuginfo packages, that
+ * have the PLT data stripped out (shdr_rel_plt.sh_type == SHT_NOBITS).
+ */
+static int dso__synthesize_plt_symbols(struct  dso *self, int verbose)
 {
 	uint32_t nr_rel_entries, idx;
 	GElf_Sym sym;
 	u64 plt_offset;
 	GElf_Shdr shdr_plt;
 	struct symbol *f;
-	GElf_Shdr shdr_rel_plt;
+	GElf_Shdr shdr_rel_plt, shdr_dynsym;
 	Elf_Data *reldata, *syms, *symstrs;
-	Elf_Scn *scn_plt_rel, *scn_symstrs;
+	Elf_Scn *scn_plt_rel, *scn_symstrs, *scn_dynsym;
+	size_t dynsym_idx;
+	GElf_Ehdr ehdr;
 	char sympltname[1024];
-	int nr = 0, symidx;
+	Elf *elf;
+	int nr = 0, symidx, fd, err = 0;
 
-	scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+	fd = open(self->name, O_RDONLY);
+	if (fd < 0)
+		goto out;
+
+	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
+	if (elf == NULL)
+		goto out_close;
+
+	if (gelf_getehdr(elf, &ehdr) == NULL)
+		goto out_elf_end;
+
+	scn_dynsym = elf_section_by_name(elf, &ehdr, &shdr_dynsym,
+					 ".dynsym", &dynsym_idx);
+	if (scn_dynsym == NULL)
+		goto out_elf_end;
+
+	scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
 					  ".rela.plt", NULL);
 	if (scn_plt_rel == NULL) {
-		scn_plt_rel = elf_section_by_name(elf, ehdr, &shdr_rel_plt,
+		scn_plt_rel = elf_section_by_name(elf, &ehdr, &shdr_rel_plt,
 						  ".rel.plt", NULL);
 		if (scn_plt_rel == NULL)
-			return 0;
+			goto out_elf_end;
 	}
 
-	if (shdr_rel_plt.sh_link != dynsym_idx)
-		return 0;
+	err = -1;
 
-	if (elf_section_by_name(elf, ehdr, &shdr_plt, ".plt", NULL) == NULL)
-		return 0;
+	if (shdr_rel_plt.sh_link != dynsym_idx)
+		goto out_elf_end;
+
+	if (elf_section_by_name(elf, &ehdr, &shdr_plt, ".plt", NULL) == NULL)
+		goto out_elf_end;
 
 	/*
 	 * Fetch the relocation section to find the indexes to the GOT
@@ -411,19 +436,19 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 	 */
 	reldata = elf_getdata(scn_plt_rel, NULL);
 	if (reldata == NULL)
-		return -1;
+		goto out_elf_end;
 
 	syms = elf_getdata(scn_dynsym, NULL);
 	if (syms == NULL)
-		return -1;
+		goto out_elf_end;
 
-	scn_symstrs = elf_getscn(elf, shdr_dynsym->sh_link);
+	scn_symstrs = elf_getscn(elf, shdr_dynsym.sh_link);
 	if (scn_symstrs == NULL)
-		return -1;
+		goto out_elf_end;
 
 	symstrs = elf_getdata(scn_symstrs, NULL);
 	if (symstrs == NULL)
-		return -1;
+		goto out_elf_end;
 
 	nr_rel_entries = shdr_rel_plt.sh_size / shdr_rel_plt.sh_entsize;
 	plt_offset = shdr_plt.sh_offset;
@@ -442,7 +467,7 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
 					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
-				return -1;
+				goto out_elf_end;
 
 			dso__insert_symbol(self, f);
 			++nr;
@@ -460,19 +485,25 @@ static int dso__synthesize_plt_symbols(struct  dso *self, Elf *elf,
 			f = symbol__new(plt_offset, shdr_plt.sh_entsize,
 					sympltname, self->sym_priv_size, 0, verbose);
 			if (!f)
-				return -1;
+				goto out_elf_end;
 
 			dso__insert_symbol(self, f);
 			++nr;
 		}
-	} else {
-		/*
-		 * TODO: There are still one more shdr_rel_plt.sh_type
-		 * I have to investigate, but probably should be ignored.
-		 */
 	}
 
-	return nr;
+	err = 0;
+out_elf_end:
+	elf_end(elf);
+out_close:
+	close(fd);
+
+	if (err == 0)
+		return nr;
+out:
+	fprintf(stderr, "%s: problems reading %s PLT info.\n",
+		__func__, self->name);
+	return 0;
 }
 
 static int dso__load_sym(struct dso *self, int fd, const char *name,
@@ -486,9 +517,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	GElf_Shdr shdr;
 	Elf_Data *syms;
 	GElf_Sym sym;
-	Elf_Scn *sec, *sec_dynsym, *sec_strndx;
+	Elf_Scn *sec, *sec_strndx;
 	Elf *elf;
-	size_t dynsym_idx;
 	int nr = 0;
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
@@ -505,32 +535,11 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 		goto out_elf_end;
 	}
 
-	/*
-	 * We need to check if we have a .dynsym, so that we can handle the
-	 * .plt, synthesizing its symbols, that aren't on the symtabs (be it
-	 * .dynsym or .symtab)
-	 */
-	sec_dynsym = elf_section_by_name(elf, &ehdr, &shdr,
-					 ".dynsym", &dynsym_idx);
-	if (sec_dynsym != NULL) {
-		nr = dso__synthesize_plt_symbols(self, elf, &ehdr,
-						 sec_dynsym, &shdr,
-						 dynsym_idx, verbose);
-		if (nr < 0)
-			goto out_elf_end;
-	}
-
-	/*
-	 * But if we have a full .symtab (that is a superset of .dynsym) we
-	 * should add the symbols not in the .dynsyn
-	 */
 	sec = elf_section_by_name(elf, &ehdr, &shdr, ".symtab", NULL);
 	if (sec == NULL) {
-		if (sec_dynsym == NULL)
+		sec = elf_section_by_name(elf, &ehdr, &shdr, ".dynsym", NULL);
+		if (sec == NULL)
 			goto out_elf_end;
-
-		sec = sec_dynsym;
-		gelf_getshdr(sec, &shdr);
 	}
 
 	syms = elf_getdata(sec, NULL);
@@ -669,6 +678,11 @@ more:
 	if (!ret)
 		goto more;
 
+	if (ret > 0) {
+		int nr_plt = dso__synthesize_plt_symbols(self, verbose);
+		if (nr_plt > 0)
+			ret += nr_plt;
+	}
 out:
 	free(name);
 	return ret;

From e3d7e183dc276df2fcaf02af173a49ad119ba9f9 Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Sat, 11 Jul 2009 12:18:37 -0300
Subject: [PATCH 11/31] perf report: Introduce -n/--show-nr-samples

[acme@doppio pahole]$ perf report -ns comm,dso,symbol -d /lib64/libc-2.10.1.so -C pahole | head -17
    21.94%      32101  [.] _int_malloc
    20.10%      29402  [.] __GI_strcmp
    16.77%      24533  [.] __tsearch
    12.61%      18450  [.] malloc_consolidate
     6.42%       9394  [.] _int_free
     6.28%       9191  [.] __tfind
     4.56%       6678  [.] __GI___libc_free
     4.46%       6520  [.] _IO_vfprintf_internal
     2.59%       3786  [.] __malloc
     1.17%       1716  [.] __GI_memcpy
[acme@doppio pahole]$

Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <1247325517-12272-5-git-send-email-acme@redhat.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/Documentation/perf-report.txt |  3 +++
 tools/perf/builtin-report.c              | 18 ++++++++++++++++++
 2 files changed, 21 insertions(+)

diff --git a/tools/perf/Documentation/perf-report.txt b/tools/perf/Documentation/perf-report.txt
index 05774dfbd14..e72e9311078 100644
--- a/tools/perf/Documentation/perf-report.txt
+++ b/tools/perf/Documentation/perf-report.txt
@@ -24,6 +24,9 @@ OPTIONS
 --dsos=::
 	Only consider symbols in these dsos. CSV that understands
 	file://filename entries.
+-n
+--show-nr-samples
+	Show the number of samples for each symbol
 -C::
 --comms=::
 	Only consider symbols in these comms. CSV that understands
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index f3422121d85..430a195b858 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -51,6 +51,7 @@ static int		verbose;
 static int		modules;
 
 static int		full_paths;
+static int		show_nr_samples;
 
 static unsigned long	page_size;
 static unsigned long	mmap_window = 32;
@@ -1024,6 +1025,13 @@ hist_entry__fprintf(FILE *fp, struct hist_entry *self, u64 total_samples)
 	else
 		ret = fprintf(fp, field_sep ? "%lld" : "%12lld ", self->count);
 
+	if (show_nr_samples) {
+		if (field_sep)
+			fprintf(fp, "%c%lld", *field_sep, self->count);
+		else
+			fprintf(fp, "%11lld", self->count);
+	}
+
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
 			continue;
@@ -1361,6 +1369,12 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 	fprintf(fp, "#\n");
 
 	fprintf(fp, "# Overhead");
+	if (show_nr_samples) {
+		if (field_sep)
+			fprintf(fp, "%cSamples", *field_sep);
+		else
+			fputs("  Samples  ", fp);
+	}
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		if (se->elide)
 			continue;
@@ -1388,6 +1402,8 @@ static size_t output__fprintf(FILE *fp, u64 total_samples)
 		goto print_entries;
 
 	fprintf(fp, "# ........");
+	if (show_nr_samples)
+		fprintf(fp, " ..........");
 	list_for_each_entry(se, &hist_entry__sort_list, list) {
 		unsigned int i;
 
@@ -1979,6 +1995,8 @@ static const struct option options[] = {
 	OPT_STRING('k', "vmlinux", &vmlinux, "file", "vmlinux pathname"),
 	OPT_BOOLEAN('m', "modules", &modules,
 		    "load module symbols - WARNING: use only with -k and LIVE kernel"),
+	OPT_BOOLEAN('n', "show-nr-samples", &show_nr_samples,
+		    "Show a column with the number of samples"),
 	OPT_STRING('s', "sort", &sort_order, "key[,key2...]",
 		   "sort by key(s): pid, comm, dso, symbol, parent"),
 	OPT_BOOLEAN('P', "full-paths", &full_paths,

From f1c6a58121f9846ac665b0fbd3cbab90ce8bcbac Mon Sep 17 00:00:00 2001
From: Daniel Qarras <dqarras@yahoo.com>
Date: Sun, 12 Jul 2009 04:32:40 -0700
Subject: [PATCH 12/31] perf_counter, x86: Extend perf_counter Pentium M
 support

I've attached a patch to remove the Pentium M special casing of
EMON and as noticed at least with my Pentium M the hardware PMU
now works:

 Performance counter stats for '/bin/ls /var/tmp':

       1.809988  task-clock-msecs         #      0.125 CPUs
              1  context-switches         #      0.001 M/sec
              0  CPU-migrations           #	 0.000 M/sec
            224  page-faults              #	 0.124 M/sec
        1425648  cycles                   #    787.656 M/sec
         912755  instructions             #	 0.640 IPC

Vince suggested that this code was trying to address erratum
Y17 in Pentium-M's:

  http://download.intel.com/support/processors/mobile/pm/sb/25266532.pdf

But that erratum (related to IA32_MISC_ENABLES.7) does not
affect perfcounters as we dont use this toggle to disable RDPMC
and WRMSR/RDMSR access to performance counters. We keep cr4's
bit 8 (X86_CR4_PCE) clear so unprivileged RDPMC access is not
allowed anyway.

Cc: Vince Weaver <vince@deater.net>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Stephane Eranian <eranian@googlemail.com>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/cpu/perf_counter.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index bed1c4c2f25..7e346d4bc0f 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1583,10 +1583,8 @@ static int p6_pmu_init(void)
 		break;
 	case 9:
 	case 13:
-		/* for Pentium M, we need to check if PMU exist */
-		rdmsr(MSR_IA32_MISC_ENABLE, low, high);
-		if (low & MSR_IA32_MISC_ENABLE_EMON)
-			break;
+		/* Pentium M */
+		break;
 	default:
 		pr_cont("unsupported p6 CPU model %d ",
 			boot_cpu_data.x86_model);

From d4d7d0b9545721d3cabb19d15163bbc66b797707 Mon Sep 17 00:00:00 2001
From: Chris Wilson <chris@chris-wilson.co.uk>
Date: Mon, 6 Jul 2009 09:31:33 +0100
Subject: [PATCH 13/31] perf_counter: Fix the tracepoint channel to
 perfcounters

Fix a missed rename in EVENT_PROFILE support so that it gets
built and allows tracepoint tracing from the 'perf' tool.

Fix a typo in the (never before built & enabled) portion in
perf_counter.c as well, and update that code to the
attr.config changes as well.

Signed-off-by: Chris Wilson <chris@chris-wilson.co.uk>
Cc: Ben Gamari <bgamari.foss@gmail.com>
Cc: Jason Baron <jbaron@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
LKML-Reference: <1246869094-21237-1-git-send-email-chris@chris-wilson.co.uk>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 init/Kconfig          |  2 +-
 kernel/perf_counter.c | 10 +++-------
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/init/Kconfig b/init/Kconfig
index 1ce05a4cb5f..cb2c0927022 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -962,7 +962,7 @@ config PERF_COUNTERS
 
 config EVENT_PROFILE
 	bool "Tracepoint profile sources"
-	depends on PERF_COUNTERS && EVENT_TRACER
+	depends on PERF_COUNTERS && EVENT_TRACING
 	default y
 
 endmenu
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index d55a50da234..c6c38fb7766 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3671,7 +3671,7 @@ static const struct pmu perf_ops_task_clock = {
 void perf_tpcounter_event(int event_id)
 {
 	struct perf_sample_data data = {
-		.regs = get_irq_regs();
+		.regs = get_irq_regs(),
 		.addr = 0,
 	};
 
@@ -3687,16 +3687,12 @@ extern void ftrace_profile_disable(int);
 
 static void tp_perf_counter_destroy(struct perf_counter *counter)
 {
-	ftrace_profile_disable(perf_event_id(&counter->attr));
+	ftrace_profile_disable(counter->attr.config);
 }
 
 static const struct pmu *tp_perf_counter_init(struct perf_counter *counter)
 {
-	int event_id = perf_event_id(&counter->attr);
-	int ret;
-
-	ret = ftrace_profile_enable(event_id);
-	if (ret)
+	if (ftrace_profile_enable(counter->attr.config))
 		return NULL;
 
 	counter->destroy = tp_perf_counter_destroy;

From 23cdb5d5171d591ec911aada682e09d53c14a810 Mon Sep 17 00:00:00 2001
From: Roel Kluin <roel.kluin@gmail.com>
Date: Mon, 13 Jul 2009 02:25:47 +0200
Subject: [PATCH 14/31] perf_counter tools: Fix index boundary check

Keep index within event_type_descriptors[]

Signed-off-by: Roel Kluin <roel.kluin@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
LKML-Reference: <4A5A7F0B.4070106@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/util/parse-events.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 518a33affe1..d18c98edd00 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -436,7 +436,7 @@ void print_events(void)
 
 	for (i = 0; i < ARRAY_SIZE(event_symbols); i++, syms++) {
 		type = syms->type + 1;
-		if (type > ARRAY_SIZE(event_type_descriptors))
+		if (type >= ARRAY_SIZE(event_type_descriptors))
 			type = 0;
 
 		if (type != prev_type)

From 413ee3b48ab582ffea33e7e140c7a2c5ea657e9a Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:15:52 +0200
Subject: [PATCH 15/31] perf_counter: Make sure we dont leak kernel memory to
 userspace

There are a few places we are leaking tiny amounts of kernel
memory to userspace. This happens when writing out strings
because we always align the end to 64 bits.

To avoid this we should always use an appropriately sized
temporary buffer and ensure it is zeroed.

Since d_path assembles the string from the end of the buffer
backwards, we need to add 64 bits after the buffer to allow for
alignment.

We also need to copy arch_vma_name to the temporary buffer,
because if we use it directly we may end up copying to
userspace a number of bytes after the end of the string
constant.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.273972048@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/perf_counter.c | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index c6c38fb7766..f7a8ab9576e 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2968,8 +2968,10 @@ static void perf_counter_comm_event(struct perf_comm_event *comm_event)
 	struct perf_cpu_context *cpuctx;
 	struct perf_counter_context *ctx;
 	unsigned int size;
-	char *comm = comm_event->task->comm;
+	char comm[TASK_COMM_LEN];
 
+	memset(comm, 0, sizeof(comm));
+	strncpy(comm, comm_event->task->comm, sizeof(comm));
 	size = ALIGN(strlen(comm)+1, sizeof(u64));
 
 	comm_event->comm = comm;
@@ -3088,8 +3090,15 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 	char *buf = NULL;
 	const char *name;
 
+	memset(tmp, 0, sizeof(tmp));
+
 	if (file) {
-		buf = kzalloc(PATH_MAX, GFP_KERNEL);
+		/*
+		 * d_path works from the end of the buffer backwards, so we
+		 * need to add enough zero bytes after the string to handle
+		 * the 64bit alignment we do later.
+		 */
+		buf = kzalloc(PATH_MAX + sizeof(u64), GFP_KERNEL);
 		if (!buf) {
 			name = strncpy(tmp, "//enomem", sizeof(tmp));
 			goto got_name;
@@ -3100,9 +3109,11 @@ static void perf_counter_mmap_event(struct perf_mmap_event *mmap_event)
 			goto got_name;
 		}
 	} else {
-		name = arch_vma_name(mmap_event->vma);
-		if (name)
+		if (arch_vma_name(mmap_event->vma)) {
+			name = strncpy(tmp, arch_vma_name(mmap_event->vma),
+				       sizeof(tmp));
 			goto got_name;
+		}
 
 		if (!vma->vm_mm) {
 			name = strncpy(tmp, "[vdso]", sizeof(tmp));

From 11b5f81e1b0ea0bc84fe32f0a27054e052b2bf84 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: [PATCH 16/31] perf_counter: Synthesize VDSO mmap event

perf record synthesizes mmap events for the running process.
Right now it just catches file mappings, but we can check for
the vdso symbol and add that too.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.517264409@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 4ef78a5e6f3..072aaf0369f 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -313,6 +313,10 @@ static void pid_synthesize_mmap_samples(pid_t pid)
 		if (*pbf == 'x') { /* vm_exec */
 			char *execname = strchr(bf, '/');
 
+			/* Catch VDSO */
+			if (execname == NULL)
+				execname = strstr(bf, "[vdso]");
+
 			if (execname == NULL)
 				continue;
 

From ed900c054b541254f0ce5cedaf75206e29bd614e Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: [PATCH 17/31] perf_counter: Log vfork as a fork event

Right now we don't output vfork events. Even though we should
always see an exec after a vfork, we may get perfcounter
samples between the vfork and exec. These samples can lead to
some confusion when parsing perfcounter data.

To keep things consistent we should always log a fork event. It
will result in a little more log data, but is less confusing to
trace parsing tools.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.589309391@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/fork.c | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/kernel/fork.c b/kernel/fork.c
index 467746b3f0a..4812d60b29f 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1408,14 +1408,11 @@ long do_fork(unsigned long clone_flags,
 		if (clone_flags & CLONE_VFORK) {
 			p->vfork_done = &vfork;
 			init_completion(&vfork);
-		} else if (!(clone_flags & CLONE_VM)) {
-			/*
-			 * vfork will do an exec which will call
-			 * set_task_comm()
-			 */
-			perf_counter_fork(p);
 		}
 
+		if (!(clone_flags & CLONE_THREAD))
+			perf_counter_fork(p);
+
 		audit_finish_fork(p);
 		tracehook_report_clone(regs, clone_flags, nr, p);
 

From 4bba828dd9bb950ad1fe340ef148a5436a10f131 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: [PATCH 18/31] perf_counter: Add perf record option to log addresses

Add the -d or --data option to log event addresses (eg page
faults).

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.697698033@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-record.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 072aaf0369f..68a9be0d151 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -43,6 +43,7 @@ static int			call_graph			= 0;
 static int			verbose				= 0;
 static int			inherit_stat			= 0;
 static int			no_samples			= 0;
+static int			sample_address			= 0;
 
 static long			samples;
 static struct timeval		last_read;
@@ -405,6 +406,9 @@ static void create_counter(int counter, int cpu, pid_t pid)
 	if (inherit_stat)
 		attr->inherit_stat = 1;
 
+	if (sample_address)
+		attr->sample_type	|= PERF_SAMPLE_ADDR;
+
 	if (call_graph)
 		attr->sample_type	|= PERF_SAMPLE_CALLCHAIN;
 
@@ -649,6 +653,8 @@ static const struct option options[] = {
 		    "be more verbose (show counter open errors, etc)"),
 	OPT_BOOLEAN('s', "stat", &inherit_stat,
 		    "per thread counts"),
+	OPT_BOOLEAN('d', "data", &sample_address,
+		    "Sample addresses"),
 	OPT_BOOLEAN('n', "no-samples", &no_samples,
 		    "don't sample"),
 	OPT_END()

From 1483b19f8f5e8ad0c8816de368b099322dad4db5 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Thu, 16 Jul 2009 15:44:29 +0200
Subject: [PATCH 19/31] perf_counter: Make call graph option consistent

perf record uses -g for logging call graph data but perf report
uses -c to print call graph data. Be consistent and use -g
everywhere for call graph data.

Also update the help text to reflect the current default -
fractal,0.5

Signed-off-by: Anton Blanchard <anton@samba.org>
Acked-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090716104817.803604373@samba.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 tools/perf/builtin-report.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index 4e5cc266311..4b980cce705 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -1891,9 +1891,9 @@ static const struct option options[] = {
 		   "regex filter to identify parent, see: '--sort parent'"),
 	OPT_BOOLEAN('x', "exclude-other", &exclude_other,
 		    "Only display entries with parent-match"),
-	OPT_CALLBACK_DEFAULT('c', "callchain", NULL, "output_type,min_percent",
+	OPT_CALLBACK_DEFAULT('g', "call-graph", NULL, "output_type,min_percent",
 		     "Display callchains using output_type and min percent threshold. "
-		     "Default: flat,0", &parse_callchain_opt, callchain_default_opt),
+		     "Default: fractal,0.5", &parse_callchain_opt, callchain_default_opt),
 	OPT_STRING('d', "dsos", &dso_list_str, "dso[,dso...]",
 		   "only consider symbols in these dsos"),
 	OPT_STRING('C', "comms", &comm_list_str, "comm[,comm...]",

From 9b7019ae6a542a801a80df6694c66e240e2c3e39 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 16:31:36 +0200
Subject: [PATCH 20/31] perf_counter: Remove unused variables

Fix a gcc unused variables warning.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 arch/x86/kernel/cpu/perf_counter.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/arch/x86/kernel/cpu/perf_counter.c b/arch/x86/kernel/cpu/perf_counter.c
index 7e346d4bc0f..a7aa8f90095 100644
--- a/arch/x86/kernel/cpu/perf_counter.c
+++ b/arch/x86/kernel/cpu/perf_counter.c
@@ -1570,8 +1570,6 @@ static struct x86_pmu amd_pmu = {
 
 static int p6_pmu_init(void)
 {
-	int high, low;
-
 	switch (boot_cpu_data.x86_model) {
 	case 1:
 	case 3:  /* Pentium Pro */

From c9f73a3dd27e03411f18a58c0814d51392d2b17a Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Tue, 21 Jul 2009 00:55:05 -0700
Subject: [PATCH 21/31] perf: Fix stack data leak

the "reserved" field was not initialized to zero, resulting in 4 bytes
of stack data leaking to userspace....

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 5c6fae4f43d..ff854fd89a8 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2666,6 +2666,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 		header.size += sizeof(cpu_entry);
 
 		cpu_entry.cpu = raw_smp_processor_id();
+		cpu_entry.reserved = 0;
 	}
 
 	if (sample_type & PERF_SAMPLE_PERIOD)

From 573402db02746179b3f95f83a11a787501f52d0a Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Wed, 22 Jul 2009 11:13:50 +0200
Subject: [PATCH 22/31] perf_counter: Plug more stack leaks

Per example of Arjan's patch, I went through and found a few more.

Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
---
 kernel/perf_counter.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index ff854fd89a8..e1d6a3aa133 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -2897,8 +2897,11 @@ void perf_counter_fork(struct task_struct *task)
 		.event  = {
 			.header = {
 				.type = PERF_EVENT_FORK,
+				.misc = 0,
 				.size = sizeof(fork_event.event),
 			},
+			/* .pid  */
+			/* .ppid */
 		},
 	};
 
@@ -3008,8 +3011,16 @@ void perf_counter_comm(struct task_struct *task)
 
 	comm_event = (struct perf_comm_event){
 		.task	= task,
+		/* .comm      */
+		/* .comm_size */
 		.event  = {
-			.header = { .type = PERF_EVENT_COMM, },
+			.header = {
+				.type = PERF_EVENT_COMM,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
 		},
 	};
 
@@ -3160,8 +3171,16 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
 
 	mmap_event = (struct perf_mmap_event){
 		.vma	= vma,
+		/* .file_name */
+		/* .file_size */
 		.event  = {
-			.header = { .type = PERF_EVENT_MMAP, },
+			.header = {
+				.type = PERF_EVENT_MMAP,
+				.misc = 0,
+				/* .size */
+			},
+			/* .pid */
+			/* .tid */
 			.start  = vma->vm_start,
 			.len    = vma->vm_end - vma->vm_start,
 			.pgoff  = vma->vm_pgoff,

From 7f453c24b95a085fc7bd35d53b33abc4dc5a048b Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Tue, 21 Jul 2009 13:19:40 +0200
Subject: [PATCH 23/31] perf_counter: PERF_SAMPLE_ID and inherited counters

Anton noted that for inherited counters the counter-id as provided by
PERF_SAMPLE_ID isn't mappable to the id found through PERF_RECORD_ID
because each inherited counter gets its own id.

His suggestion was to always return the parent counter id, since that
is the primary counter id as exposed. However, these inherited
counters have a unique identifier so that events like
PERF_EVENT_PERIOD and PERF_EVENT_THROTTLE can be specific about which
counter gets modified, which is important when trying to normalize the
sample streams.

This patch removes PERF_EVENT_PERIOD in favour of PERF_SAMPLE_PERIOD,
which is more useful anyway, since changing periods became a lot more
common than initially thought -- rendering PERF_EVENT_PERIOD the less
useful solution (also, PERF_SAMPLE_PERIOD reports the more accurate
value, since it reports the value used to trigger the overflow,
whereas PERF_EVENT_PERIOD simply reports the requested period changed,
which might only take effect on the next cycle).

This still leaves us PERF_EVENT_THROTTLE to consider, but since that
_should_ be a rare occurrence, and linking it to a primary id is the
most useful bit to diagnose the problem, we introduce a
PERF_SAMPLE_STREAM_ID, for those few cases where the full
reconstruction is important.

[Does change the ABI a little, but I see no other way out]

Suggested-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248095846.15751.8781.camel@twins>
---
 include/linux/perf_counter.h  | 15 ++----
 kernel/perf_counter.c         | 92 ++++++++++++-----------------------
 tools/perf/builtin-annotate.c | 24 ---------
 tools/perf/builtin-report.c   | 24 ---------
 4 files changed, 35 insertions(+), 120 deletions(-)

diff --git a/include/linux/perf_counter.h b/include/linux/perf_counter.h
index 5e970c7d3fd..bd15d7a5f5c 100644
--- a/include/linux/perf_counter.h
+++ b/include/linux/perf_counter.h
@@ -120,8 +120,9 @@ enum perf_counter_sample_format {
 	PERF_SAMPLE_ID				= 1U << 6,
 	PERF_SAMPLE_CPU				= 1U << 7,
 	PERF_SAMPLE_PERIOD			= 1U << 8,
+	PERF_SAMPLE_STREAM_ID			= 1U << 9,
 
-	PERF_SAMPLE_MAX = 1U << 9,		/* non-ABI */
+	PERF_SAMPLE_MAX = 1U << 10,		/* non-ABI */
 };
 
 /*
@@ -312,16 +313,7 @@ enum perf_event_type {
 	 *	struct perf_event_header	header;
 	 *	u64				time;
 	 *	u64				id;
-	 *	u64				sample_period;
-	 * };
-	 */
-	PERF_EVENT_PERIOD		= 4,
-
-	/*
-	 * struct {
-	 *	struct perf_event_header	header;
-	 *	u64				time;
-	 *	u64				id;
+	 *	u64				stream_id;
 	 * };
 	 */
 	PERF_EVENT_THROTTLE		= 5,
@@ -356,6 +348,7 @@ enum perf_event_type {
 	 *	{ u64			time;     } && PERF_SAMPLE_TIME
 	 *	{ u64			addr;     } && PERF_SAMPLE_ADDR
 	 *	{ u64			id;	  } && PERF_SAMPLE_ID
+	 *	{ u64			stream_id;} && PERF_SAMPLE_STREAM_ID
 	 *	{ u32			cpu, res; } && PERF_SAMPLE_CPU
 	 * 	{ u64			period;   } && PERF_SAMPLE_PERIOD
 	 *
diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index e1d6a3aa133..7530588fa5c 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -154,6 +154,20 @@ static void unclone_ctx(struct perf_counter_context *ctx)
 	}
 }
 
+/*
+ * If we inherit counters we want to return the parent counter id
+ * to userspace.
+ */
+static u64 primary_counter_id(struct perf_counter *counter)
+{
+	u64 id = counter->id;
+
+	if (counter->parent)
+		id = counter->parent->id;
+
+	return id;
+}
+
 /*
  * Get the perf_counter_context for a task and lock it.
  * This has to cope with with the fact that until it is locked,
@@ -1296,7 +1310,6 @@ static void perf_counter_cpu_sched_in(struct perf_cpu_context *cpuctx, int cpu)
 #define MAX_INTERRUPTS (~0ULL)
 
 static void perf_log_throttle(struct perf_counter *counter, int enable);
-static void perf_log_period(struct perf_counter *counter, u64 period);
 
 static void perf_adjust_period(struct perf_counter *counter, u64 events)
 {
@@ -1315,8 +1328,6 @@ static void perf_adjust_period(struct perf_counter *counter, u64 events)
 	if (!sample_period)
 		sample_period = 1;
 
-	perf_log_period(counter, sample_period);
-
 	hwc->sample_period = sample_period;
 }
 
@@ -1705,7 +1716,7 @@ perf_read_hw(struct perf_counter *counter, char __user *buf, size_t count)
 		values[n++] = counter->total_time_running +
 			atomic64_read(&counter->child_total_time_running);
 	if (counter->attr.read_format & PERF_FORMAT_ID)
-		values[n++] = counter->id;
+		values[n++] = primary_counter_id(counter);
 	mutex_unlock(&counter->child_mutex);
 
 	if (count < n * sizeof(u64))
@@ -1812,8 +1823,6 @@ static int perf_counter_period(struct perf_counter *counter, u64 __user *arg)
 
 		counter->attr.sample_freq = value;
 	} else {
-		perf_log_period(counter, value);
-
 		counter->attr.sample_period = value;
 		counter->hw.sample_period = value;
 	}
@@ -2662,6 +2671,9 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_ID)
 		header.size += sizeof(u64);
 
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
+		header.size += sizeof(u64);
+
 	if (sample_type & PERF_SAMPLE_CPU) {
 		header.size += sizeof(cpu_entry);
 
@@ -2705,7 +2717,13 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 	if (sample_type & PERF_SAMPLE_ADDR)
 		perf_output_put(&handle, data->addr);
 
-	if (sample_type & PERF_SAMPLE_ID)
+	if (sample_type & PERF_SAMPLE_ID) {
+		u64 id = primary_counter_id(counter);
+
+		perf_output_put(&handle, id);
+	}
+
+	if (sample_type & PERF_SAMPLE_STREAM_ID)
 		perf_output_put(&handle, counter->id);
 
 	if (sample_type & PERF_SAMPLE_CPU)
@@ -2728,7 +2746,7 @@ static void perf_counter_output(struct perf_counter *counter, int nmi,
 			if (sub != counter)
 				sub->pmu->read(sub);
 
-			group_entry.id = sub->id;
+			group_entry.id = primary_counter_id(sub);
 			group_entry.counter = atomic64_read(&sub->count);
 
 			perf_output_put(&handle, group_entry);
@@ -2788,15 +2806,8 @@ perf_counter_read_event(struct perf_counter *counter,
 	}
 
 	if (counter->attr.read_format & PERF_FORMAT_ID) {
-		u64 id;
-
 		event.header.size += sizeof(u64);
-		if (counter->parent)
-			id = counter->parent->id;
-		else
-			id = counter->id;
-
-		event.format[i++] = id;
+		event.format[i++] = primary_counter_id(counter);
 	}
 
 	ret = perf_output_begin(&handle, counter, event.header.size, 0, 0);
@@ -3190,49 +3201,6 @@ void __perf_counter_mmap(struct vm_area_struct *vma)
 	perf_counter_mmap_event(&mmap_event);
 }
 
-/*
- * Log sample_period changes so that analyzing tools can re-normalize the
- * event flow.
- */
-
-struct freq_event {
-	struct perf_event_header	header;
-	u64				time;
-	u64				id;
-	u64				period;
-};
-
-static void perf_log_period(struct perf_counter *counter, u64 period)
-{
-	struct perf_output_handle handle;
-	struct freq_event event;
-	int ret;
-
-	if (counter->hw.sample_period == period)
-		return;
-
-	if (counter->attr.sample_type & PERF_SAMPLE_PERIOD)
-		return;
-
-	event = (struct freq_event) {
-		.header = {
-			.type = PERF_EVENT_PERIOD,
-			.misc = 0,
-			.size = sizeof(event),
-		},
-		.time = sched_clock(),
-		.id = counter->id,
-		.period = period,
-	};
-
-	ret = perf_output_begin(&handle, counter, sizeof(event), 1, 0);
-	if (ret)
-		return;
-
-	perf_output_put(&handle, event);
-	perf_output_end(&handle);
-}
-
 /*
  * IRQ throttle logging
  */
@@ -3246,14 +3214,16 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		struct perf_event_header	header;
 		u64				time;
 		u64				id;
+		u64				stream_id;
 	} throttle_event = {
 		.header = {
 			.type = PERF_EVENT_THROTTLE + 1,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
-		.time	= sched_clock(),
-		.id	= counter->id,
+		.time		= sched_clock(),
+		.id		= primary_counter_id(counter),
+		.stream_id	= counter->id,
 	};
 
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
diff --git a/tools/perf/builtin-annotate.c b/tools/perf/builtin-annotate.c
index 5f9eefecc57..1dba568e194 100644
--- a/tools/perf/builtin-annotate.c
+++ b/tools/perf/builtin-annotate.c
@@ -74,20 +74,12 @@ struct fork_event {
 	u32 pid, ppid;
 };
 
-struct period_event {
-	struct perf_event_header header;
-	u64 time;
-	u64 id;
-	u64 sample_period;
-};
-
 typedef union event_union {
 	struct perf_event_header	header;
 	struct ip_event			ip;
 	struct mmap_event		mmap;
 	struct comm_event		comm;
 	struct fork_event		fork;
-	struct period_event		period;
 } event_t;
 
 
@@ -997,19 +989,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static int
-process_period_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->period.time,
-		event->period.id,
-		event->period.sample_period);
-
-	return 0;
-}
-
 static int
 process_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1025,9 +1004,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 
 	case PERF_EVENT_FORK:
 		return process_fork_event(event, offset, head);
-
-	case PERF_EVENT_PERIOD:
-		return process_period_event(event, offset, head);
 	/*
 	 * We dont process them right now but they are fine:
 	 */
diff --git a/tools/perf/builtin-report.c b/tools/perf/builtin-report.c
index a118bc77286..b20a4b6e31b 100644
--- a/tools/perf/builtin-report.c
+++ b/tools/perf/builtin-report.c
@@ -101,13 +101,6 @@ struct fork_event {
 	u32 pid, ppid;
 };
 
-struct period_event {
-	struct perf_event_header header;
-	u64 time;
-	u64 id;
-	u64 sample_period;
-};
-
 struct lost_event {
 	struct perf_event_header header;
 	u64 id;
@@ -127,7 +120,6 @@ typedef union event_union {
 	struct mmap_event		mmap;
 	struct comm_event		comm;
 	struct fork_event		fork;
-	struct period_event		period;
 	struct lost_event		lost;
 	struct read_event		read;
 } event_t;
@@ -1635,19 +1627,6 @@ process_fork_event(event_t *event, unsigned long offset, unsigned long head)
 	return 0;
 }
 
-static int
-process_period_event(event_t *event, unsigned long offset, unsigned long head)
-{
-	dprintf("%p [%p]: PERF_EVENT_PERIOD: time:%Ld, id:%Ld: period:%Ld\n",
-		(void *)(offset + head),
-		(void *)(long)(event->header.size),
-		event->period.time,
-		event->period.id,
-		event->period.sample_period);
-
-	return 0;
-}
-
 static int
 process_lost_event(event_t *event, unsigned long offset, unsigned long head)
 {
@@ -1729,9 +1708,6 @@ process_event(event_t *event, unsigned long offset, unsigned long head)
 	case PERF_EVENT_FORK:
 		return process_fork_event(event, offset, head);
 
-	case PERF_EVENT_PERIOD:
-		return process_period_event(event, offset, head);
-
 	case PERF_EVENT_LOST:
 		return process_lost_event(event, offset, head);
 

From a0541234f89c93f313961ce7b28676e11488a5f0 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 22 Jul 2009 23:04:12 +1000
Subject: [PATCH 24/31] perf_counter: Improve perf stat and perf record option
 parsing

perf stat and perf record currently look for all options on the command
line. This can lead to some confusion:

# perf stat ls -l
  Error: unknown switch `l'

While we can work around this by adding '--' before the command, the git
option parsing code can stop at the first non option:

# perf stat ls -l
 Performance counter stats for 'ls -l':
....

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090722130412.GD9029@kryten>
---
 tools/perf/builtin-record.c | 3 ++-
 tools/perf/builtin-stat.c   | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/tools/perf/builtin-record.c b/tools/perf/builtin-record.c
index 68a9be0d151..6da09928130 100644
--- a/tools/perf/builtin-record.c
+++ b/tools/perf/builtin-record.c
@@ -664,7 +664,8 @@ int cmd_record(int argc, const char **argv, const char *prefix __used)
 {
 	int counter;
 
-	argc = parse_options(argc, argv, options, record_usage, 0);
+	argc = parse_options(argc, argv, options, record_usage,
+		PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc && target_pid == -1 && !system_wide)
 		usage_with_options(record_usage, options);
 
diff --git a/tools/perf/builtin-stat.c b/tools/perf/builtin-stat.c
index 27921a8ce1a..f9510eeeb6c 100644
--- a/tools/perf/builtin-stat.c
+++ b/tools/perf/builtin-stat.c
@@ -511,7 +511,8 @@ int cmd_stat(int argc, const char **argv, const char *prefix __used)
 {
 	int status;
 
-	argc = parse_options(argc, argv, options, stat_usage, 0);
+	argc = parse_options(argc, argv, options, stat_usage,
+		PARSE_OPT_STOP_AT_NON_OPTION);
 	if (!argc)
 		usage_with_options(stat_usage, options);
 	if (run_count <= 0 || run_count > MAX_RUN)

From 966ee4d6b887c14159043ac80b8c3661d2bbe5e2 Mon Sep 17 00:00:00 2001
From: Anton Blanchard <anton@samba.org>
Date: Wed, 22 Jul 2009 23:05:46 +1000
Subject: [PATCH 25/31] perf_counter: Fix throttle/unthrottle event logging

Right now we only print PERF_EVENT_THROTTLE + 1 (ie PERF_EVENT_UNTHROTTLE).
Fix this to print both a throttle and unthrottle event.

Signed-off-by: Anton Blanchard <anton@samba.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090722130546.GE9029@kryten>
---
 kernel/perf_counter.c | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/kernel/perf_counter.c b/kernel/perf_counter.c
index 7530588fa5c..787d4daef18 100644
--- a/kernel/perf_counter.c
+++ b/kernel/perf_counter.c
@@ -3217,7 +3217,7 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		u64				stream_id;
 	} throttle_event = {
 		.header = {
-			.type = PERF_EVENT_THROTTLE + 1,
+			.type = PERF_EVENT_THROTTLE,
 			.misc = 0,
 			.size = sizeof(throttle_event),
 		},
@@ -3226,6 +3226,9 @@ static void perf_log_throttle(struct perf_counter *counter, int enable)
 		.stream_id	= counter->id,
 	};
 
+	if (enable)
+		throttle_event.header.type = PERF_EVENT_UNTHROTTLE;
+
 	ret = perf_output_begin(&handle, counter, sizeof(throttle_event), 1, 0);
 	if (ret)
 		return;

From dfe5a50461db90fab901cb697eff0d3d2e9fd229 Mon Sep 17 00:00:00 2001
From: Arjan van de Ven <arjan@linux.intel.com>
Date: Mon, 20 Jul 2009 22:54:26 -0700
Subject: [PATCH 26/31] perf: avoid structure size confusion by using a fixed
 size

for some reason, this structure gets compiled as 36 bytes in some files
(the ones that alloacte it) but 40 bytes in others (the ones that use it).
The cause is an off_t type that gets a different size in different
compilation units for some yet-to-be-explained reason.

But the effect is disasterous; the size/offset members of the struct
are at different offsets, and result in mostly complete garbage.
The parser in perf is so robust that this all gets hidden, and after
skipping an certain amount of samples, it recovers.... so this bug
is not normally noticed.

.... except when you want every sample to be exact.

Fix this by just using an explicitly sized type.

Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <4A655917.9080504@linux.intel.com>
---
 tools/perf/util/header.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/perf/util/header.h b/tools/perf/util/header.h
index b5ef53ad4c7..bf280449fcf 100644
--- a/tools/perf/util/header.h
+++ b/tools/perf/util/header.h
@@ -16,7 +16,7 @@ struct perf_header {
 	int frozen;
 	int attrs, size;
 	struct perf_header_attr **attr;
-	off_t attr_offset;
+	s64 attr_offset;
 	u64 data_offset;
 	u64 data_size;
 };

From 28ac909b49a155856c957d080f8a796b3c1d1f3e Mon Sep 17 00:00:00 2001
From: Arnaldo Carvalho de Melo <acme@redhat.com>
Date: Mon, 20 Jul 2009 14:14:12 -0300
Subject: [PATCH 27/31] perf symbol: C++ demangling
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

[acme@doppio ~]$ perf report -s comm,dso,symbol -C firefox -d /usr/lib64/xulrunner-1.9.1/libxul.so | grep :: | head
     2.21%  [.] nsDeque::Push(void*)
     1.78%  [.] GraphWalker::DoWalk(nsDeque&)
     1.30%  [.] GCGraphBuilder::AddNode(void*, nsCycleCollectionParticipant*)
     1.27%  [.] XPCWrappedNative::CallMethod(XPCCallContext&, XPCWrappedNative::CallMode)
     1.18%  [.] imgContainer::DrawFrameTo(gfxIImageFrame*, gfxIImageFrame*, nsRect&)
     1.13%  [.] nsDeque::PopFront()
     1.11%  [.] nsGlobalWindow::RunTimeout(nsTimeout*)
     0.97%  [.] nsXPConnect::Traverse(void*, nsCycleCollectionTraversalCallback&)
     0.95%  [.] nsJSEventListener::cycleCollection::Traverse(void*, nsCycleCollectionTraversalCallback&)
     0.95%  [.] nsCOMPtr_base::~nsCOMPtr_base()
[acme@doppio ~]$

Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: Vegard Nossum <vegard.nossum@gmail.com>
Cc: Paul Mackerras <paulus@samba.org>
Cc: Frédéric Weisbecker <fweisbec@gmail.com>
Suggested-by: Clark Williams <williams@redhat.com>
Signed-off-by: Arnaldo Carvalho de Melo <acme@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090720171412.GB10410@ghostprotocols.net>
---
 tools/perf/Makefile      |  2 +-
 tools/perf/util/symbol.c | 21 +++++++++++++++++++--
 2 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/tools/perf/Makefile b/tools/perf/Makefile
index 7822b3d6bac..a5e9b876ca0 100644
--- a/tools/perf/Makefile
+++ b/tools/perf/Makefile
@@ -345,7 +345,7 @@ BUILTIN_OBJS += builtin-stat.o
 BUILTIN_OBJS += builtin-top.o
 
 PERFLIBS = $(LIB_FILE)
-EXTLIBS =
+EXTLIBS = -lbfd
 
 #
 # Platform specific tweaks
diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index f40266b4845..98aee922acc 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -6,9 +6,15 @@
 #include <libelf.h>
 #include <gelf.h>
 #include <elf.h>
+#include <bfd.h>
 
 const char *sym_hist_filter;
 
+#ifndef DMGL_PARAMS
+#define DMGL_PARAMS      (1 << 0)       /* Include function args */
+#define DMGL_ANSI        (1 << 1)       /* Include const, volatile, etc */
+#endif
+
 static struct symbol *symbol__new(u64 start, u64 len,
 				  const char *name, unsigned int priv_size,
 				  u64 obj_start, int verbose)
@@ -571,6 +577,8 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 						     NULL) != NULL);
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
+		const char *name;
+		char *demangled;
 		u64 obj_start;
 		struct section *section = NULL;
 		int is_label = elf_sym__is_label(&sym);
@@ -609,10 +617,19 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 				goto out_elf_end;
 			}
 		}
+		/*
+		 * We need to figure out if the object was created from C++ sources
+		 * DWARF DW_compile_unit has this, but we don't always have access
+		 * to it...
+		 */
+		name = elf_sym__name(&sym, symstrs);
+		demangled = bfd_demangle(NULL, name, DMGL_PARAMS | DMGL_ANSI);
+		if (demangled != NULL)
+			name = demangled;
 
-		f = symbol__new(sym.st_value, sym.st_size,
-				elf_sym__name(&sym, symstrs),
+		f = symbol__new(sym.st_value, sym.st_size, name,
 				self->sym_priv_size, obj_start, verbose);
+		free(demangled);
 		if (!f)
 			goto out_elf_end;
 

From f6bdafef2ab911f03321fa83d8da1df99878009e Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 21 Jul 2009 12:20:22 -0400
Subject: [PATCH 28/31] perf_counter: Add tracepoint support to perf list, perf
 stat

Add support to 'perf list' and 'perf stat' for kernel tracepoints. The
implementation creates a 'for_each_subsystem' and 'for_each_event' for
easy iteration over the tracepoints.

Signed-off-by: Jason Baron <jbaron@redhat.com>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <426129bf9fcc8ee63bb094cf736e7316a7dcd77a.1248190728.git.jbaron@redhat.com>
---
 tools/perf/util/parse-events.c | 175 ++++++++++++++++++++++++++++++++-
 tools/perf/util/util.h         |   2 +
 2 files changed, 176 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index d18c98edd00..5a3cd3a34af 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -12,6 +12,8 @@ int					nr_counters;
 
 struct perf_counter_attr		attrs[MAX_COUNTERS];
 
+static char default_debugfs_path[] = "/sys/kernel/debug/tracing/events";
+
 struct event_symbol {
 	u8	type;
 	u64	config;
@@ -110,6 +112,88 @@ static unsigned long hw_cache_stat[C(MAX)] = {
  [C(BPU)]	= (CACHE_READ),
 };
 
+#define for_each_subsystem(sys_dir, sys_dirent, sys_next, file, st)	       \
+	while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next)	       \
+	if (snprintf(file, MAXPATHLEN, "%s/%s", default_debugfs_path,	       \
+			sys_dirent.d_name) &&				       \
+	   (!stat(file, &st)) && (S_ISDIR(st.st_mode)) &&		       \
+	   (strcmp(sys_dirent.d_name, ".")) &&				       \
+	   (strcmp(sys_dirent.d_name, "..")))
+
+#define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st)    \
+	while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next)        \
+	if (snprintf(file, MAXPATHLEN, "%s/%s/%s", default_debugfs_path,       \
+			sys_dirent.d_name, evt_dirent.d_name) &&	       \
+	   (!stat(file, &st)) && (S_ISDIR(st.st_mode)) &&		       \
+	   (strcmp(evt_dirent.d_name, ".")) &&				       \
+	   (strcmp(evt_dirent.d_name, "..")))
+
+#define MAX_EVENT_LENGTH 30
+
+static int valid_debugfs_mount(void)
+{
+	struct statfs st_fs;
+
+	if (statfs(default_debugfs_path, &st_fs) < 0)
+		return -ENOENT;
+	else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
+		return -ENOENT;
+	return 0;
+}
+
+static char *tracepoint_id_to_name(u64 config)
+{
+	static char tracepoint_name[2 * MAX_EVENT_LENGTH];
+	DIR *sys_dir, *evt_dir;
+	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+	struct stat st;
+	char id_buf[4];
+	int fd;
+	u64 id;
+	char evt_path[MAXPATHLEN];
+
+	if (valid_debugfs_mount())
+		return "unkown";
+
+	sys_dir = opendir(default_debugfs_path);
+	if (!sys_dir)
+		goto cleanup;
+
+	for_each_subsystem(sys_dir, sys_dirent, sys_next, evt_path, st) {
+		evt_dir = opendir(evt_path);
+		if (!evt_dir)
+			goto cleanup;
+		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next,
+								evt_path, st) {
+			snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id",
+				 default_debugfs_path, sys_dirent.d_name,
+				 evt_dirent.d_name);
+			fd = open(evt_path, O_RDONLY);
+			if (fd < 0)
+				continue;
+			if (read(fd, id_buf, sizeof(id_buf)) < 0) {
+				close(fd);
+				continue;
+			}
+			close(fd);
+			id = atoll(id_buf);
+			if (id == config) {
+				closedir(evt_dir);
+				closedir(sys_dir);
+				snprintf(tracepoint_name, 2 * MAX_EVENT_LENGTH,
+					"%s:%s", sys_dirent.d_name,
+					evt_dirent.d_name);
+				return tracepoint_name;
+			}
+		}
+		closedir(evt_dir);
+	}
+
+cleanup:
+	closedir(sys_dir);
+	return "unkown";
+}
+
 static int is_cache_op_valid(u8 cache_type, u8 cache_op)
 {
 	if (hw_cache_stat[cache_type] & COP(cache_op))
@@ -177,6 +261,9 @@ char *event_name(int counter)
 			return sw_event_names[config];
 		return "unknown-software";
 
+	case PERF_TYPE_TRACEPOINT:
+		return tracepoint_id_to_name(config);
+
 	default:
 		break;
 	}
@@ -265,6 +352,53 @@ parse_generic_hw_event(const char **str, struct perf_counter_attr *attr)
 	return 1;
 }
 
+static int parse_tracepoint_event(const char **strp,
+				    struct perf_counter_attr *attr)
+{
+	const char *evt_name;
+	char sys_name[MAX_EVENT_LENGTH];
+	char id_buf[4];
+	int fd;
+	unsigned int sys_length, evt_length;
+	u64 id;
+	char evt_path[MAXPATHLEN];
+
+	if (valid_debugfs_mount())
+		return 0;
+
+	evt_name = strchr(*strp, ':');
+	if (!evt_name)
+		return 0;
+
+	sys_length = evt_name - *strp;
+	if (sys_length >= MAX_EVENT_LENGTH)
+		return 0;
+
+	strncpy(sys_name, *strp, sys_length);
+	sys_name[sys_length] = '\0';
+	evt_name = evt_name + 1;
+	evt_length = strlen(evt_name);
+	if (evt_length >= MAX_EVENT_LENGTH)
+		return 0;
+
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", default_debugfs_path,
+				sys_name, evt_name);
+	fd = open(evt_path, O_RDONLY);
+	if (fd < 0)
+		return 0;
+
+	if (read(fd, id_buf, sizeof(id_buf)) < 0) {
+		close(fd);
+		return 0;
+	}
+	close(fd);
+	id = atoll(id_buf);
+	attr->config = id;
+	attr->type = PERF_TYPE_TRACEPOINT;
+	*strp = evt_name + evt_length;
+	return 1;
+}
+
 static int check_events(const char *str, unsigned int i)
 {
 	int n;
@@ -374,7 +508,8 @@ parse_event_modifier(const char **strp, struct perf_counter_attr *attr)
  */
 static int parse_event_symbols(const char **str, struct perf_counter_attr *attr)
 {
-	if (!(parse_raw_event(str, attr) ||
+	if (!(parse_tracepoint_event(str, attr) ||
+	      parse_raw_event(str, attr) ||
 	      parse_numeric_event(str, attr) ||
 	      parse_symbolic_event(str, attr) ||
 	      parse_generic_hw_event(str, attr)))
@@ -422,6 +557,42 @@ static const char * const event_type_descriptors[] = {
 	"Hardware cache event",
 };
 
+/*
+ * Print the events from <debugfs_mount_point>/tracing/events
+ */
+
+static void print_tracepoint_events(void)
+{
+	DIR *sys_dir, *evt_dir;
+	struct dirent *sys_next, *evt_next, sys_dirent, evt_dirent;
+	struct stat st;
+	char evt_path[MAXPATHLEN];
+
+	if (valid_debugfs_mount())
+		return;
+
+	sys_dir = opendir(default_debugfs_path);
+	if (!sys_dir)
+		goto cleanup;
+
+	for_each_subsystem(sys_dir, sys_dirent, sys_next, evt_path, st) {
+		evt_dir = opendir(evt_path);
+		if (!evt_dir)
+			goto cleanup;
+		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next,
+								evt_path, st) {
+			snprintf(evt_path, MAXPATHLEN, "%s:%s",
+				 sys_dirent.d_name, evt_dirent.d_name);
+			fprintf(stderr, "  %-40s [%s]\n", evt_path,
+				event_type_descriptors[PERF_TYPE_TRACEPOINT+1]);
+		}
+		closedir(evt_dir);
+	}
+
+cleanup:
+	closedir(sys_dir);
+}
+
 /*
  * Print the help text for the event symbols:
  */
@@ -472,5 +643,7 @@ void print_events(void)
 		"rNNN");
 	fprintf(stderr, "\n");
 
+	print_tracepoint_events();
+
 	exit(129);
 }
diff --git a/tools/perf/util/util.h b/tools/perf/util/util.h
index b4be6071c10..68fe157d72f 100644
--- a/tools/perf/util/util.h
+++ b/tools/perf/util/util.h
@@ -50,6 +50,7 @@
 #include <unistd.h>
 #include <stdio.h>
 #include <sys/stat.h>
+#include <sys/statfs.h>
 #include <fcntl.h>
 #include <stddef.h>
 #include <stdlib.h>
@@ -80,6 +81,7 @@
 #include <netdb.h>
 #include <pwd.h>
 #include <inttypes.h>
+#include "../../../include/linux/magic.h"
 
 #ifndef NO_ICONV
 #include <iconv.h>

From 5beeded123c5befa21f1c6e16219f2a3eb7dd197 Mon Sep 17 00:00:00 2001
From: Jason Baron <jbaron@redhat.com>
Date: Tue, 21 Jul 2009 14:16:29 -0400
Subject: [PATCH 29/31] perf_counter: Detect debugfs location

If "/sys/kernel/debug" is not a debugfs mount point, search for the debugfs
filesystem in /proc/mounts, but also allows the user to specify
'--debugfs-dir=blah' or set the environment variable: 'PERF_DEBUGFS_DIR'

Signed-off-by: Jason Baron <jbaron@redhat.com>
[ also made it probe "/debug" by default ]
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <20090721181629.GA3094@redhat.com>
---
 tools/perf/perf.c              | 77 +++++++++++++++++++++++++++++++++-
 tools/perf/util/cache.h        |  1 +
 tools/perf/util/parse-events.c | 33 ++++++++-------
 tools/perf/util/parse-events.h |  5 +++
 tools/perf/util/string.h       |  3 ++
 5 files changed, 102 insertions(+), 17 deletions(-)

diff --git a/tools/perf/perf.c b/tools/perf/perf.c
index c5656784c61..31982ad064b 100644
--- a/tools/perf/perf.c
+++ b/tools/perf/perf.c
@@ -12,6 +12,8 @@
 #include "util/cache.h"
 #include "util/quote.h"
 #include "util/run-command.h"
+#include "util/parse-events.h"
+#include "util/string.h"
 
 const char perf_usage_string[] =
 	"perf [--version] [--help] COMMAND [ARGS]";
@@ -25,6 +27,8 @@ struct pager_config {
 	int val;
 };
 
+static char debugfs_mntpt[MAXPATHLEN];
+
 static int pager_command_config(const char *var, const char *value, void *data)
 {
 	struct pager_config *c = data;
@@ -56,6 +60,15 @@ static void commit_pager_choice(void) {
 	}
 }
 
+static void set_debugfs_path(void)
+{
+	char *path;
+
+	path = getenv(PERF_DEBUGFS_ENVIRONMENT);
+	snprintf(debugfs_path, MAXPATHLEN, "%s/%s", path ?: debugfs_mntpt,
+		 "tracing/events");
+}
+
 static int handle_options(const char*** argv, int* argc, int* envchanged)
 {
 	int handled = 0;
@@ -122,6 +135,22 @@ static int handle_options(const char*** argv, int* argc, int* envchanged)
 			setenv(PERF_WORK_TREE_ENVIRONMENT, cmd + 12, 1);
 			if (envchanged)
 				*envchanged = 1;
+		} else if (!strcmp(cmd, "--debugfs-dir")) {
+			if (*argc < 2) {
+				fprintf(stderr, "No directory given for --debugfs-dir.\n");
+				usage(perf_usage_string);
+			}
+			strncpy(debugfs_mntpt, (*argv)[1], MAXPATHLEN);
+			debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+			if (envchanged)
+				*envchanged = 1;
+			(*argv)++;
+			(*argc)--;
+		} else if (!prefixcmp(cmd, "--debugfs-dir=")) {
+			strncpy(debugfs_mntpt, cmd + 14, MAXPATHLEN);
+			debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+			if (envchanged)
+				*envchanged = 1;
 		} else {
 			fprintf(stderr, "Unknown option: %s\n", cmd);
 			usage(perf_usage_string);
@@ -228,6 +257,7 @@ static int run_builtin(struct cmd_struct *p, int argc, const char **argv)
 	if (use_pager == -1 && p->option & USE_PAGER)
 		use_pager = 1;
 	commit_pager_choice();
+	set_debugfs_path();
 
 	status = p->fn(argc, argv, prefix);
 	if (status)
@@ -346,6 +376,49 @@ static int run_argv(int *argcp, const char ***argv)
 	return done_alias;
 }
 
+/* mini /proc/mounts parser: searching for "^blah /mount/point debugfs" */
+static void get_debugfs_mntpt(void)
+{
+	FILE *file;
+	char fs_type[100];
+	char debugfs[MAXPATHLEN];
+
+	/*
+	 * try the standard location
+	 */
+	if (valid_debugfs_mount("/sys/kernel/debug/") == 0) {
+		strcpy(debugfs_mntpt, "/sys/kernel/debug/");
+		return;
+	}
+
+	/*
+	 * try the sane location
+	 */
+	if (valid_debugfs_mount("/debug/") == 0) {
+		strcpy(debugfs_mntpt, "/debug/");
+		return;
+	}
+
+	/*
+	 * give up and parse /proc/mounts
+	 */
+	file = fopen("/proc/mounts", "r");
+	if (file == NULL)
+		return;
+
+	while (fscanf(file, "%*s %"
+		      STR(MAXPATHLEN)
+		      "s %99s %*s %*d %*d\n",
+		      debugfs, fs_type) == 2) {
+		if (strcmp(fs_type, "debugfs") == 0)
+			break;
+	}
+	fclose(file);
+	if (strcmp(fs_type, "debugfs") == 0) {
+		strncpy(debugfs_mntpt, debugfs, MAXPATHLEN);
+		debugfs_mntpt[MAXPATHLEN - 1] = '\0';
+	}
+}
 
 int main(int argc, const char **argv)
 {
@@ -354,7 +427,8 @@ int main(int argc, const char **argv)
 	cmd = perf_extract_argv0_path(argv[0]);
 	if (!cmd)
 		cmd = "perf-help";
-
+	/* get debugfs mount point from /proc/mounts */
+	get_debugfs_mntpt();
 	/*
 	 * "perf-xxxx" is the same as "perf xxxx", but we obviously:
 	 *
@@ -377,6 +451,7 @@ int main(int argc, const char **argv)
 	argc--;
 	handle_options(&argv, &argc, NULL);
 	commit_pager_choice();
+	set_debugfs_path();
 	if (argc > 0) {
 		if (!prefixcmp(argv[0], "--"))
 			argv[0] += 2;
diff --git a/tools/perf/util/cache.h b/tools/perf/util/cache.h
index 161d5f413e2..4b50c412b9c 100644
--- a/tools/perf/util/cache.h
+++ b/tools/perf/util/cache.h
@@ -18,6 +18,7 @@
 #define PERFATTRIBUTES_FILE ".perfattributes"
 #define INFOATTRIBUTES_FILE "info/attributes"
 #define ATTRIBUTE_MACRO_PREFIX "[attr]"
+#define PERF_DEBUGFS_ENVIRONMENT "PERF_DEBUGFS_DIR"
 
 typedef int (*config_fn_t)(const char *, const char *, void *);
 extern int perf_default_config(const char *, const char *, void *);
diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c
index 5a3cd3a34af..7bdad8df22a 100644
--- a/tools/perf/util/parse-events.c
+++ b/tools/perf/util/parse-events.c
@@ -5,6 +5,7 @@
 #include "parse-events.h"
 #include "exec_cmd.h"
 #include "string.h"
+#include "cache.h"
 
 extern char *strcasestr(const char *haystack, const char *needle);
 
@@ -12,8 +13,6 @@ int					nr_counters;
 
 struct perf_counter_attr		attrs[MAX_COUNTERS];
 
-static char default_debugfs_path[] = "/sys/kernel/debug/tracing/events";
-
 struct event_symbol {
 	u8	type;
 	u64	config;
@@ -21,6 +20,8 @@ struct event_symbol {
 	char	*alias;
 };
 
+char debugfs_path[MAXPATHLEN];
+
 #define CHW(x) .type = PERF_TYPE_HARDWARE, .config = PERF_COUNT_HW_##x
 #define CSW(x) .type = PERF_TYPE_SOFTWARE, .config = PERF_COUNT_SW_##x
 
@@ -114,27 +115,27 @@ static unsigned long hw_cache_stat[C(MAX)] = {
 
 #define for_each_subsystem(sys_dir, sys_dirent, sys_next, file, st)	       \
 	while (!readdir_r(sys_dir, &sys_dirent, &sys_next) && sys_next)	       \
-	if (snprintf(file, MAXPATHLEN, "%s/%s", default_debugfs_path,	       \
-			sys_dirent.d_name) &&				       \
+	if (snprintf(file, MAXPATHLEN, "%s/%s", debugfs_path,	       	       \
+			sys_dirent.d_name) &&		       		       \
 	   (!stat(file, &st)) && (S_ISDIR(st.st_mode)) &&		       \
 	   (strcmp(sys_dirent.d_name, ".")) &&				       \
 	   (strcmp(sys_dirent.d_name, "..")))
 
 #define for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next, file, st)    \
 	while (!readdir_r(evt_dir, &evt_dirent, &evt_next) && evt_next)        \
-	if (snprintf(file, MAXPATHLEN, "%s/%s/%s", default_debugfs_path,       \
-			sys_dirent.d_name, evt_dirent.d_name) &&	       \
+	if (snprintf(file, MAXPATHLEN, "%s/%s/%s", debugfs_path,	       \
+		     sys_dirent.d_name, evt_dirent.d_name) &&		       \
 	   (!stat(file, &st)) && (S_ISDIR(st.st_mode)) &&		       \
 	   (strcmp(evt_dirent.d_name, ".")) &&				       \
 	   (strcmp(evt_dirent.d_name, "..")))
 
 #define MAX_EVENT_LENGTH 30
 
-static int valid_debugfs_mount(void)
+int valid_debugfs_mount(const char *debugfs)
 {
 	struct statfs st_fs;
 
-	if (statfs(default_debugfs_path, &st_fs) < 0)
+	if (statfs(debugfs, &st_fs) < 0)
 		return -ENOENT;
 	else if (st_fs.f_type != (long) DEBUGFS_MAGIC)
 		return -ENOENT;
@@ -152,10 +153,10 @@ static char *tracepoint_id_to_name(u64 config)
 	u64 id;
 	char evt_path[MAXPATHLEN];
 
-	if (valid_debugfs_mount())
+	if (valid_debugfs_mount(debugfs_path))
 		return "unkown";
 
-	sys_dir = opendir(default_debugfs_path);
+	sys_dir = opendir(debugfs_path);
 	if (!sys_dir)
 		goto cleanup;
 
@@ -166,7 +167,7 @@ static char *tracepoint_id_to_name(u64 config)
 		for_each_event(sys_dirent, evt_dir, evt_dirent, evt_next,
 								evt_path, st) {
 			snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id",
-				 default_debugfs_path, sys_dirent.d_name,
+				 debugfs_path, sys_dirent.d_name,
 				 evt_dirent.d_name);
 			fd = open(evt_path, O_RDONLY);
 			if (fd < 0)
@@ -363,7 +364,7 @@ static int parse_tracepoint_event(const char **strp,
 	u64 id;
 	char evt_path[MAXPATHLEN];
 
-	if (valid_debugfs_mount())
+	if (valid_debugfs_mount(debugfs_path))
 		return 0;
 
 	evt_name = strchr(*strp, ':');
@@ -381,8 +382,8 @@ static int parse_tracepoint_event(const char **strp,
 	if (evt_length >= MAX_EVENT_LENGTH)
 		return 0;
 
-	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", default_debugfs_path,
-				sys_name, evt_name);
+	snprintf(evt_path, MAXPATHLEN, "%s/%s/%s/id", debugfs_path,
+		 sys_name, evt_name);
 	fd = open(evt_path, O_RDONLY);
 	if (fd < 0)
 		return 0;
@@ -568,10 +569,10 @@ static void print_tracepoint_events(void)
 	struct stat st;
 	char evt_path[MAXPATHLEN];
 
-	if (valid_debugfs_mount())
+	if (valid_debugfs_mount(debugfs_path))
 		return;
 
-	sys_dir = opendir(default_debugfs_path);
+	sys_dir = opendir(debugfs_path);
 	if (!sys_dir)
 		goto cleanup;
 
diff --git a/tools/perf/util/parse-events.h b/tools/perf/util/parse-events.h
index e3d552908e6..1ea5d09b6eb 100644
--- a/tools/perf/util/parse-events.h
+++ b/tools/perf/util/parse-events.h
@@ -3,6 +3,8 @@
  * Parse symbolic events/counts passed in as options:
  */
 
+struct option;
+
 extern int			nr_counters;
 
 extern struct perf_counter_attr attrs[MAX_COUNTERS];
@@ -15,3 +17,6 @@ extern int parse_events(const struct option *opt, const char *str, int unset);
 
 extern void print_events(void);
 
+extern char debugfs_path[];
+extern int valid_debugfs_mount(const char *debugfs);
+
diff --git a/tools/perf/util/string.h b/tools/perf/util/string.h
index 3dca2f654cd..bf39dfadfd2 100644
--- a/tools/perf/util/string.h
+++ b/tools/perf/util/string.h
@@ -5,4 +5,7 @@
 
 int hex2u64(const char *ptr, u64 *val);
 
+#define _STR(x) #x
+#define STR(x) _STR(x)
+
 #endif

From d20ff6bd6bba2e7e6681fa17565347b410c46ab3 Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Mon, 20 Jul 2009 14:01:38 +0200
Subject: [PATCH 30/31] perf_counter tools: Fix vmlinux symbol generation
 breakage

vmlinux meets the criteria for symbol adjustment, which breaks vmlinux generated symbols.
Fix this by exempting vmlinux.  This is a bit fragile in that someone could change the
kernel dso's name, but currently that name is also hardwired.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248091298.18702.18.camel@marge.simson.net>
---
 tools/perf/util/symbol.c | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/tools/perf/util/symbol.c b/tools/perf/util/symbol.c
index 98aee922acc..28106059bf1 100644
--- a/tools/perf/util/symbol.c
+++ b/tools/perf/util/symbol.c
@@ -525,7 +525,7 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	GElf_Sym sym;
 	Elf_Scn *sec, *sec_strndx;
 	Elf *elf;
-	int nr = 0;
+	int nr = 0, kernel = !strcmp("[kernel]", self->name);
 
 	elf = elf_begin(fd, ELF_C_READ_MMAP, NULL);
 	if (elf == NULL) {
@@ -571,10 +571,13 @@ static int dso__load_sym(struct dso *self, int fd, const char *name,
 	nr_syms = shdr.sh_size / shdr.sh_entsize;
 
 	memset(&sym, 0, sizeof(sym));
-	self->adjust_symbols = (ehdr.e_type == ET_EXEC ||
+	if (!kernel) {
+		self->adjust_symbols = (ehdr.e_type == ET_EXEC ||
 				elf_section_by_name(elf, &ehdr, &shdr,
 						     ".gnu.prelink_undo",
 						     NULL) != NULL);
+	} else self->adjust_symbols = 0;
+
 	elf_symtab__for_each_symbol(syms, nr_syms, index, sym) {
 		struct symbol *f;
 		const char *name;

From 0fdc7e67dd312986e30b861adff48732bd33eb3f Mon Sep 17 00:00:00 2001
From: Mike Galbraith <efault@gmx.de>
Date: Tue, 21 Jul 2009 10:30:36 +0200
Subject: [PATCH 31/31] perf_counter tools: Give perf top inherit option

Currently, perf top -p only tracks the pid provided, which isn't very useful
for watching forky loads, so give it an inherit option.

Signed-off-by: Mike Galbraith <efault@gmx.de>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <1248165036.9795.10.camel@marge.simson.net>
---
 tools/perf/builtin-top.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index 95d5c0ae375..c0a423004e1 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -58,6 +58,7 @@ static u64			count_filter			=  5;
 static int			print_entries			= 15;
 
 static int			target_pid			= -1;
+static int			inherit				=  0;
 static int			profile_cpu			= -1;
 static int			nr_cpus				=  0;
 static unsigned int		realtime_prio			=  0;
@@ -549,7 +550,7 @@ int group_fd;
 static void start_counter(int i, int counter)
 {
 	struct perf_counter_attr *attr;
-	unsigned int cpu;
+	int cpu;
 
 	cpu = profile_cpu;
 	if (target_pid == -1 && profile_cpu == -1)
@@ -559,6 +560,7 @@ static void start_counter(int i, int counter)
 
 	attr->sample_type	= PERF_SAMPLE_IP | PERF_SAMPLE_TID;
 	attr->freq		= freq;
+	attr->inherit		= (cpu < 0) && inherit;
 
 try_again:
 	fd[i][counter] = sys_perf_counter_open(attr, target_pid, cpu, group_fd, 0);
@@ -685,6 +687,8 @@ static const struct option options[] = {
 		    "only display functions with more events than this"),
 	OPT_BOOLEAN('g', "group", &group,
 			    "put the counters into a counter group"),
+	OPT_BOOLEAN('i', "inherit", &inherit,
+		    "child tasks inherit counters"),
 	OPT_STRING('s', "sym-filter", &sym_filter, "pattern",
 		    "only display symbols matchig this pattern"),
 	OPT_BOOLEAN('z', "zero", &zero,