x86/perf/cqm: Wipe out perf based cqm

'perf cqm' never worked due to the incompatibility between perf infrastructure and cqm hardware support. The hardware uses RMIDs to track the llc occupancy of tasks and these RMIDs are per package. This makes monitoring a hierarchy like cgroup along with monitoring of tasks separately difficult and several patches sent to lkml to fix them were NACKed. Further more, the following issues in the current perf cqm make it almost unusable: 1. No support to monitor the same group of tasks for which we do allocation using resctrl. 2. It gives random and inaccurate data (mostly 0s) once we run out of RMIDs due to issues in Recycling. 3. Recycling results in inaccuracy of data because we cannot guarantee that the RMID was stolen from a task when it was not pulling data into cache or even when it pulled the least data. Also for monitoring llc_occupancy, if we stop using an RMID_x and then start using an RMID_y after we reclaim an RMID from an other event, we miss accounting all the occupancy that was tagged to RMID_x at a later perf_count. 2. Recycling code makes the monitoring code complex including scheduling because the event can lose RMID any time. Since MBM counters count bandwidth for a period of time by taking snap shot of total bytes at two different times, recycling complicates the way we count MBM in a hierarchy. Also we need a spin lock while we do the processing to account for MBM counter overflow. We also currently use a spin lock in scheduling to prevent the RMID from being taken away. 4. Lack of support when we run different kind of event like task, system-wide and cgroup events together. Data mostly prints 0s. This is also because we can have only one RMID tied to a cpu as defined by the cqm hardware but a perf can at the same time tie multiple events during one sched_in. 5. No support of monitoring a group of tasks. There is partial support for cgroup but it does not work once there is a hierarchy of cgroups or if we want to monitor a task in a cgroup and the cgroup itself. 6. No support for monitoring tasks for the lifetime without perf overhead. 7. It reported the aggregate cache occupancy or memory bandwidth over all sockets. But most cloud and VMM based use cases want to know the individual per-socket usage. Signed-off-by: Vikas Shivappa <vikas.shivappa@linux.intel.com> Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Cc: ravi.v.shankar@intel.com Cc: tony.luck@intel.com Cc: fenghua.yu@intel.com Cc: peterz@infradead.org Cc: eranian@google.com Cc: vikas.shivappa@intel.com Cc: ak@linux.intel.com Cc: davidcc@google.com Cc: reinette.chatre@intel.com Link: http://lkml.kernel.org/r/1501017287-28083-2-git-send-email-vikas.shivappa@linux.intel.com
2017-07-25 14:14:20 -07:00 · 2017-07-25 14:14:20 -07:00 · c39a0e2c88
commit c39a0e2c88
parent 16f73eb02d
6 changed files with 10 additions and 1800 deletions
--- a/arch/x86/events/intel/Makefile
+++ b/arch/x86/events/intel/Makefile
@ -1,4 +1,4 @@
-obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o cqm.o
+obj-$(CONFIG_CPU_SUP_INTEL)		+= core.o bts.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= ds.o knc.o
 obj-$(CONFIG_CPU_SUP_INTEL)		+= lbr.o p4.o p6.o pt.o
 obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL)	+= intel-rapl-perf.o
--- a/arch/x86/events/intel/cqm.c
+++ b/arch/x86/events/intel/cqm.c
--- a/arch/x86/include/asm/intel_rdt_common.h
+++ b/arch/x86/include/asm/intel_rdt_common.h
@ -7,7 +7,6 @@
 * struct intel_pqr_state - State cache for the PQR MSR
 * @rmid:		The cached Resource Monitoring ID
 * @closid:		The cached Class Of Service ID
- * @rmid_usecnt:	The usage counter for rmid
 *
 * The upper 32 bits of MSR_IA32_PQR_ASSOC contain closid and the
 * lower 10 bits rmid. The update to MSR_IA32_PQR_ASSOC always
@ -19,7 +18,6 @@
 struct intel_pqr_state {
 	u32			rmid;
 	u32			closid;
-	int			rmid_usecnt;
 };

 DECLARE_PER_CPU(struct intel_pqr_state, pqr_state);
--- a/arch/x86/kernel/cpu/intel_rdt.c
+++ b/arch/x86/kernel/cpu/intel_rdt.c
@ -40,6 +40,14 @@ DEFINE_MUTEX(rdtgroup_mutex);

 DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid);

+/*
+ * The cached intel_pqr_state is strictly per CPU and can never be
+ * updated from a remote CPU. Functions which modify the state
+ * are called with interrupts disabled and no preemption, which
+ * is sufficient for the protection.
+ */
+DEFINE_PER_CPU(struct intel_pqr_state, pqr_state);
+
 /*
 * Used to store the max resource name width and max resource data width
 * to display the schemata in a tabular format
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@ -139,14 +139,6 @@ struct hw_perf_event {
 			/* for tp_event->class */
 			struct list_head	tp_list;
 		};
-		struct { /* intel_cqm */
-			int			cqm_state;
-			u32			cqm_rmid;
-			int			is_group_event;
-			struct list_head	cqm_events_entry;
-			struct list_head	cqm_groups_entry;
-			struct list_head	cqm_group_entry;
-		};
 		struct { /* itrace */
 			int			itrace_started;
 		};
@ -416,11 +408,6 @@ struct pmu {
 	size_t				task_ctx_size;


-	/*
-	 * Return the count value for a counter.
-	 */
-	u64 (*count)			(struct perf_event *event); /*optional*/
-
 	/*
 	 * Set up pmu-private data structures for an AUX area
 	 */
@ -1111,11 +1098,6 @@ static inline void perf_event_task_sched_out(struct task_struct *prev,
 		__perf_event_task_sched_out(prev, next);
 }

-static inline u64 __perf_event_count(struct perf_event *event)
-{
-	return local64_read(&event->count) + atomic64_read(&event->child_count);
-}
-
 extern void perf_event_mmap(struct vm_area_struct *vma);
 extern struct perf_guest_info_callbacks *perf_guest_cbs;
 extern int perf_register_guest_info_callbacks(struct perf_guest_info_callbacks *callbacks);
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@ -3625,10 +3625,7 @@ static void __perf_event_read(void *info)

 static inline u64 perf_event_count(struct perf_event *event)
 {
-	if (event->pmu->count)
-		return event->pmu->count(event);
-
-	return __perf_event_count(event);
+	return local64_read(&event->count) + atomic64_read(&event->child_count);
 }

 /*
@ -3659,15 +3656,6 @@ int perf_event_read_local(struct perf_event *event, u64 *value)
 		goto out;
 	}

-	/*
-	 * It must not have a pmu::count method, those are not
-	 * NMI safe.
-	 */
-	if (event->pmu->count) {
-		ret = -EOPNOTSUPP;
-		goto out;
-	}
-
 	/* If this is a per-task event, it must be for current */
 	if ((event->attach_state & PERF_ATTACH_TASK) &&
 	    event->hw.target != current) {