All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC] perf: Allow fine-grained PMU access control
@ 2018-05-21  9:25 Tvrtko Ursulin
  2018-05-22  9:05 ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-05-21  9:25 UTC (permalink / raw)
  To: linux-kernel
  Cc: Tvrtko Ursulin, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

For situations where sysadmins might want to allow different levels
of access control for different PMUs, we start creating per-PMU
perf_event_paranoid controls in sysfs.

These work in the same fashion as the existing perf_event_paranoid
sysctl, which now becomes the parent control for each PMU.

On PMU registration the global/parent value will be inherited by each PMU,
and it will likewise be propagated to all registered PMUs whenever the
sysctl is updated.

At any later point individual PMU access controls, located in
<sysfs>/device/<pmu-name>/perf_event_paranoid, can be adjusted to achieve
fine grained access control.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/events/intel/bts.c     |  2 +-
 arch/x86/events/intel/core.c    |  2 +-
 arch/x86/events/intel/p4.c      |  2 +-
 include/linux/perf_event.h      | 18 ++++--
 kernel/events/core.c            | 99 +++++++++++++++++++++++++++------
 kernel/sysctl.c                 |  4 +-
 kernel/trace/trace_event_perf.c |  6 +-
 7 files changed, 105 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 24ffa1e88cf9..e416c9e2400a 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -555,7 +555,7 @@ static int bts_event_init(struct perf_event *event)
 	 * Note that the default paranoia setting permits unprivileged
 	 * users to profile the kernel.
 	 */
-	if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+	if (event->attr.exclude_kernel && perf_paranoid_kernel(event->pmu) &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 707b2a96e516..6b126bdbd16c 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3025,7 +3025,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (x86_pmu.version < 3)
 		return -EINVAL;
 
-	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+	if (perf_paranoid_cpu(event->pmu) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
 	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index d32c0eed38ca..878451ef1ace 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,7 +776,7 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 * the user needs special permissions to be able to use it
 	 */
 	if (p4_ht_active() && p4_event_bind_map[v].shared) {
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu(event->pmu) && !capable(CAP_SYS_ADMIN))
 			return -EACCES;
 	}
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index e71e99eb9a4e..2d9e7b4bcfac 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,6 +271,9 @@ struct pmu {
 	/* number of address filters this PMU can do */
 	unsigned int			nr_addr_filters;
 
+	/* fine grained access control */
+	int				perf_event_paranoid;
+
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
@@ -1159,6 +1162,9 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
+extern int perf_proc_paranoid_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
 extern int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
@@ -1169,19 +1175,19 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
-static inline bool perf_paranoid_tracepoint_raw(void)
+static inline bool perf_paranoid_tracepoint_raw(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > -1;
+	return pmu->perf_event_paranoid > -1;
 }
 
-static inline bool perf_paranoid_cpu(void)
+static inline bool perf_paranoid_cpu(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > 0;
+	return pmu->perf_event_paranoid > 0;
 }
 
-static inline bool perf_paranoid_kernel(void)
+static inline bool perf_paranoid_kernel(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > 1;
+	return pmu->perf_event_paranoid > 1;
 }
 
 extern void perf_event_init(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index 725d37d6e386..f20c41ff9c4b 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -432,6 +432,24 @@ static void update_perf_cpu_limits(void)
 
 static bool perf_rotate_context(struct perf_cpu_context *cpuctx);
 
+int perf_proc_paranoid_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	struct pmu *pmu;
+
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&pmus_lock);
+	list_for_each_entry(pmu, &pmus, entry)
+		pmu->perf_event_paranoid = sysctl_perf_event_paranoid;
+	mutex_unlock(&pmus_lock);
+
+	return 0;
+}
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -4113,7 +4131,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 
 	if (!task) {
 		/* Must be root to operate on a CPU event: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu(pmu) && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -5679,7 +5697,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->pinned_vm + extra;
 
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw(event->pmu) &&
 		!capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
@@ -9426,6 +9444,41 @@ static void free_pmu_context(struct pmu *pmu)
 	mutex_unlock(&pmus_lock);
 }
 
+/*
+ * Fine-grained access control:
+ */
+static ssize_t
+perf_event_paranoid_show(struct device *dev,
+			 struct device_attribute *attr,
+			 char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->perf_event_paranoid);
+}
+
+static ssize_t
+perf_event_paranoid_store(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int ret, val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < -1 || val > 2)
+		return -EINVAL;
+
+	pmu->perf_event_paranoid = val;
+
+	return count;
+}
+
+DEVICE_ATTR_RW(perf_event_paranoid);
+
 /*
  * Let userspace know that this PMU supports address range filtering:
  */
@@ -9540,6 +9593,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto free_dev;
 
+	/* Add fine-grained access control attribute. */
+	ret = device_create_file(pmu->dev, &dev_attr_perf_event_paranoid);
+	if (ret)
+		goto del_dev;
+
 	/* For PMUs with address filters, throw in an extra attribute: */
 	if (pmu->nr_addr_filters)
 		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
@@ -9571,6 +9629,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->perf_event_paranoid = sysctl_perf_event_paranoid;
 	pmu->type = -1;
 	if (!name)
 		goto skip_type;
@@ -10190,10 +10249,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			 */
 			attr->branch_sample_type = mask;
 		}
-		/* privileged levels capture (kernel, hv): check permissions */
-		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
 	}
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -10410,11 +10465,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (err)
 		return err;
 
-	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-	}
-
 	if (attr.namespaces) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
@@ -10428,11 +10478,6 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
-	/* Only privileged users can get physical addresses */
-	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
-	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
 	/*
 	 * In cgroup mode, the pid argument is used to pass the fd
 	 * opened to the cgroup directory in cgroupfs. The cpu argument
@@ -10502,6 +10547,28 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_cred;
 	}
 
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel(event->pmu) &&
+		    !capable(CAP_SYS_ADMIN)) {
+			err = -EACCES;
+			goto err_alloc;
+		}
+	}
+
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel(event->pmu) && !capable(CAP_SYS_ADMIN)) {
+		err = -EACCES;
+		goto err_alloc;
+	}
+
+	/* privileged levels capture (kernel, hv): check permissions */
+	if ((attr.branch_sample_type & PERF_SAMPLE_BRANCH_PERM_PLM) &&
+	    perf_paranoid_kernel(event->pmu) && !capable(CAP_SYS_ADMIN)) {
+		err = -EACCES;
+		goto err_alloc;
+	}
+
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -EOPNOTSUPP;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 6a78cf70761d..aeec3ac5405e 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1142,7 +1142,9 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_paranoid,
 		.maxlen		= sizeof(sysctl_perf_event_paranoid),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= perf_proc_paranoid_handler,
+		.extra1		= &neg_one,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "perf_event_mlock_kb",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index c79193e598f5..545a7ef9bfe1 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -45,7 +45,8 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 
 	/* The ftrace function trace is allowed only for root. */
 	if (ftrace_event_is_function(tp_event)) {
-		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_tracepoint_raw(p_event->pmu) &&
+		    !capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
 		if (!is_sampling_event(p_event))
@@ -81,7 +82,8 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 	 * ...otherwise raw tracepoint data can be a severe data leak,
 	 * only allow root to have these.
 	 */
-	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+	if (perf_paranoid_tracepoint_raw(p_event->pmu) &&
+	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	return 0;
-- 
2.17.0

^ permalink raw reply related	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-21  9:25 [RFC] perf: Allow fine-grained PMU access control Tvrtko Ursulin
@ 2018-05-22  9:05 ` Peter Zijlstra
  2018-05-22  9:29   ` Tvrtko Ursulin
  0 siblings, 1 reply; 14+ messages in thread
From: Peter Zijlstra @ 2018-05-22  9:05 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: linux-kernel, Tvrtko Ursulin, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Mark Rutland

On Mon, May 21, 2018 at 10:25:49AM +0100, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> For situations where sysadmins might want to allow different level of
> of access control for different PMUs, we start creating per-PMU
> perf_event_paranoid controls in sysfs.

Could you explain how exactly this makes sense?

For example, how does it make sense for one PMU to reveal kernel data
while another PMU is not allowed.

Once you allow one PMU to do so, the secret is out.

So please explain, in excruciating detail, how you want to use this and
how exactly that makes sense from a security pov.

> These work in equivalent fashion as the existing perf_event_paranoid
> sysctl, which now becomes the parent control for each PMU.
> 
> On PMU registration the global/parent value will be inherited by each PMU,
> as it will be propagated to all registered PMUs when the sysctl is
> updated.
> 
> At any later point individual PMU access controls, located in
> <sysfs>/device/<pmu-name>/perf_event_paranoid, can be adjusted to achieve
> fine grained access control.
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22  9:05 ` Peter Zijlstra
@ 2018-05-22  9:29   ` Tvrtko Ursulin
  2018-05-22 12:32     ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-05-22  9:29 UTC (permalink / raw)
  To: Peter Zijlstra
  Cc: linux-kernel, Tvrtko Ursulin, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Mark Rutland, Tvrtko Ursulin


On 22/05/18 10:05, Peter Zijlstra wrote:
> On Mon, May 21, 2018 at 10:25:49AM +0100, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> For situations where sysadmins might want to allow different level of
>> of access control for different PMUs, we start creating per-PMU
>> perf_event_paranoid controls in sysfs.
> 
> Could you explain how exactly this makes sense?
> 
> For example, how does it make sense for one PMU to reveal kernel data
> while another PMU is not allowed.
> 
> Once you allow one PMU to do so, the secret is out.
> 
> So please explain, in excruciating detail, how you want to use this and
> how exactly that makes sense from a security pov.

Not sure it will be excruciating but will try to explain once again.

There are two things:

1. i915 PMU which exports data such as different engine busyness levels. 
(Perhaps you remember, you helped us implement this from the perf API 
angle.)

2. Customers who want to look at those stats in production.

They want to use it to answer questions such as:

a) How loaded is my server and can it take one more of X type of job?
b) What is the least utilised video engine to submit the next packet of 
work to?
c) What is the least utilised server to schedule the next transcoding 
job on?

Current option for them is to turn off the global paranoid setting which 
then enables unprivileged access to _all_ PMU providers.

To me it sounded quite logical that it would be better for the paranoid 
knob to be more fine-grained, so that they can configure their servers 
so only access to needed data is possible.

I am not sure what you mean by "Once you allow one PMU to do so, the 
secret is out."? What secret? Are you implying that enabling 
unprivileged access to i915 engine busyness data opens up access to CPU 
PMUs as well via some side channel?

Regards,

Tvrtko

>> These work in equivalent fashion as the existing perf_event_paranoid
>> sysctl, which now becomes the parent control for each PMU.
>>
>> On PMU registration the global/parent value will be inherited by each PMU,
>> as it will be propagated to all registered PMUs when the sysctl is
>> updated.
>>
>> At any later point individual PMU access controls, located in
>> <sysfs>/device/<pmu-name>/perf_event_paranoid, can be adjusted to achieve
>> fine grained access control.
>>
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22  9:29   ` Tvrtko Ursulin
@ 2018-05-22 12:32     ` Peter Zijlstra
  2018-05-22 13:01       ` Alexey Budankov
  2018-05-22 16:15       ` Tvrtko Ursulin
  0 siblings, 2 replies; 14+ messages in thread
From: Peter Zijlstra @ 2018-05-22 12:32 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: linux-kernel, Tvrtko Ursulin, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Mark Rutland, Tvrtko Ursulin

On Tue, May 22, 2018 at 10:29:29AM +0100, Tvrtko Ursulin wrote:
> 
> On 22/05/18 10:05, Peter Zijlstra wrote:
> > On Mon, May 21, 2018 at 10:25:49AM +0100, Tvrtko Ursulin wrote:
> > > From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> > > 
> > > For situations where sysadmins might want to allow different level of
> > > of access control for different PMUs, we start creating per-PMU
> > > perf_event_paranoid controls in sysfs.
> > 
> > Could you explain how exactly this makes sense?
> > 
> > For example, how does it make sense for one PMU to reveal kernel data
> > while another PMU is not allowed.
> > 
> > Once you allow one PMU to do so, the secret is out.
> > 
> > So please explain, in excruciating detail, how you want to use this and
> > how exactly that makes sense from a security pov.
> 
> Not sure it will be excruciating but will try to explain once again.
> 
> There are two things:
> 
> 1. i915 PMU which exports data such as different engine busyness levels.
> (Perhaps you remember, you helped us implement this from the perf API
> angle.)

Right, but I completely forgot everything again.. So thanks for
reminding.

> 2. Customers who want to look at those stats in production.
> 
> They want to use it to answer questions such as:
> 
> a) How loaded is my server and can it take one more of X type of job?
> b) What is the least utilised video engine to submit the next packet of work
> to?
> c) What is the least utilised server to schedule the next transcoding job
> on?

On the other hand, do those counters provide enough information for a
side-channel (timing) attack on GPGPU workloads? Because, as you say, it
is a shared resource. So if user A is doing GPGPU crypto, and user B is
observing, might he infer things from the counters?

> Current option for them is to turn off the global paranoid setting which
> then enables unprivileged access to _all_ PMU providers.

Right.

> To me it sounded quite logical that it would be better for the paranoid knob
> to be more fine-grained, so that they can configure their servers so only
> access to needed data is possible.

The proposed semantics are a tad awkward though, the moment you prod at
the sysctl you lose all individual PMU settings. Ideally the per-pmu
would have a special setting that says follow-global in addition to the
existing ones.

> I am not sure what do you mean by "Once you allow one PMU to do so, the
> secret is out."? What secret? Are you implying that enabling unprivileged
> access to i915 engine busyness data opens up access to CPU PMU's as well via
> some side channel?

It was not i915 specific; but if you look at the descriptions:

 * perf event paranoia level:
 *  -1 - not paranoid at all
 *   0 - disallow raw tracepoint access for unpriv
 *   1 - disallow cpu events for unpriv
 *   2 - disallow kernel profiling for unpriv

Then the moment you allow some data to escape, it cannot be put back.
i915 is fairly special in that (afaict) it doesn't leak kernel specific
data

In general I think allowing access to uncore PMUs will leak kernel data.
Thus in general I'm fairly wary of all this.

Is there no other way to expose this information? Can't we do a
traditional load-avg like thing for the GPU?

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22 12:32     ` Peter Zijlstra
@ 2018-05-22 13:01       ` Alexey Budankov
  2018-05-22 17:19         ` Andi Kleen
  2018-05-22 16:15       ` Tvrtko Ursulin
  1 sibling, 1 reply; 14+ messages in thread
From: Alexey Budankov @ 2018-05-22 13:01 UTC (permalink / raw)
  To: Peter Zijlstra, Tvrtko Ursulin
  Cc: linux-kernel, Tvrtko Ursulin, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Mark Rutland, Tvrtko Ursulin, Andi Kleen

Hi,
On 22.05.2018 15:32, Peter Zijlstra wrote:
> On Tue, May 22, 2018 at 10:29:29AM +0100, Tvrtko Ursulin wrote:
>>
>> On 22/05/18 10:05, Peter Zijlstra wrote:
>>> On Mon, May 21, 2018 at 10:25:49AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> For situations where sysadmins might want to allow different level of
>>>> of access control for different PMUs, we start creating per-PMU
>>>> perf_event_paranoid controls in sysfs.
>>>
>>> Could you explain how exactly this makes sense?
>>>
>>> For example, how does it make sense for one PMU to reveal kernel data
>>> while another PMU is not allowed.
>>>
>>> Once you allow one PMU to do so, the secret is out.
>>>
>>> So please explain, in excruciating detail, how you want to use this and
>>> how exactly that makes sense from a security pov.
>>
>> Not sure it will be excruciating but will try to explain once again.
>>
>> There are two things:
>>
>> 1. i915 PMU which exports data such as different engine busyness levels.
>> (Perhaps you remember, you helped us implement this from the perf API
>> angle.)
> 
> Right, but I completely forgot everything again.. So thanks for
> reminding.
> 
>> 2. Customers who want to look at those stats in production.
>>
>> They want to use it to answer questions such as:
>>
>> a) How loaded is my server and can it take one more of X type of job?
>> b) What is the least utilised video engine to submit the next packet of work
>> to?
>> c) What is the least utilised server to schedule the next transcoding job
>> on?
> 
> On the other hand, do those counters provide enough information for a
> side-channel (timing) attack on GPGPU workloads? Because, as you say, it
> is a shared resource. So if user A is doing GPGPU crypto, and user B is
> observing, might he infer things from the counters?
> 
>> Current option for them is to turn off the global paranoid setting which
>> then enables unprivileged access to _all_ PMU providers.
> 
> Right.
> 
>> To me it sounded quite logical that it would be better for the paranoid knob
>> to be more fine-grained, so that they can configure their servers so only
>> access to needed data is possible.
> 
> The proposed semantics are a tad awkward though, the moment you prod at
> the sysctl you loose all individual PMU settings. Ideally the per-pmu
> would have a special setting that says follow-global in addition to the
> existing ones.
> 
>> I am not sure what do you mean by "Once you allow one PMU to do so, the
>> secret is out."? What secret? Are you implying that enabling unprivileged
>> access to i915 engine busyness data opens up access to CPU PMU's as well via
>> some side channel?
> 
> It was not i915 specific; but if you look at the descriptions:
> 
>  * perf event paranoia level:
>  *  -1 - not paranoid at all
>  *   0 - disallow raw tracepoint access for unpriv
>  *   1 - disallow cpu events for unpriv
>  *   2 - disallow kernel profiling for unpriv
> 
> Then the moment you allow some data to escape, it cannot be put back.
> i915 is fairly special in that (afaict) it doesn't leak kernel specific
> data
> 
> In general I think allowing access to uncore PMUs will leak kernel data.

IMHO, it is unsafe for CBOX pmu but could IMC, UPI pmus be an exception here?
Because currently perf stat -I from IMC, UPI counters is only allowed when 
system wide monitoring is permitted and this prevents joint perf record and 
perf stat -I in cluster environments where users usually lack ability to 
modify paranoid. Adding Andi who may have more ideas regarding all that.

> Thus in general I'm fairly wary of all this.

Second this. Extra care is required here so some security-related folks 
need to be involved in the discussion.

> 
> Is there no other way to expose this information? Can't we do a
> traditional load-avg like thing for the GPU?
> 

Thanks,
Alexey

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22 12:32     ` Peter Zijlstra
  2018-05-22 13:01       ` Alexey Budankov
@ 2018-05-22 16:15       ` Tvrtko Ursulin
  1 sibling, 0 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-05-22 16:15 UTC (permalink / raw)
  To: Peter Zijlstra, Tvrtko Ursulin
  Cc: linux-kernel, Tvrtko Ursulin, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, Mark Rutland


On 22/05/2018 13:32, Peter Zijlstra wrote:
> On Tue, May 22, 2018 at 10:29:29AM +0100, Tvrtko Ursulin wrote:
>>
>> On 22/05/18 10:05, Peter Zijlstra wrote:
>>> On Mon, May 21, 2018 at 10:25:49AM +0100, Tvrtko Ursulin wrote:
>>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>>
>>>> For situations where sysadmins might want to allow different level of
>>>> of access control for different PMUs, we start creating per-PMU
>>>> perf_event_paranoid controls in sysfs.
>>>
>>> Could you explain how exactly this makes sense?
>>>
>>> For example, how does it make sense for one PMU to reveal kernel data
>>> while another PMU is not allowed.
>>>
>>> Once you allow one PMU to do so, the secret is out.
>>>
>>> So please explain, in excruciating detail, how you want to use this and
>>> how exactly that makes sense from a security pov.
>>
>> Not sure it will be excruciating but will try to explain once again.
>>
>> There are two things:
>>
>> 1. i915 PMU which exports data such as different engine busyness levels.
>> (Perhaps you remember, you helped us implement this from the perf API
>> angle.)
> 
> Right, but I completely forgot everything again.. So thanks for
> reminding.
> 
>> 2. Customers who want to look at those stats in production.
>>
>> They want to use it to answer questions such as:
>>
>> a) How loaded is my server and can it take one more of X type of job?
>> b) What is the least utilised video engine to submit the next packet of work
>> to?
>> c) What is the least utilised server to schedule the next transcoding job
>> on?
> 
> On the other hand, do those counters provide enough information for a
> side-channel (timing) attack on GPGPU workloads? Because, as you say, it
> is a shared resource. So if user A is doing GPGPU crypto, and user B is
> observing, might he infer things from the counters?

This question would need to be looked at by security experts. And maybe 
it would be best to spawn off that effort separately. Because for me the 
most important question here is whether adding per PMU access control 
makes security worse, better, or is neutral? At the moment I cannot see 
that it makes anything worse, since the real-world alternative is to 
turn all security off. Enabling sysadmins to only relax access to a 
subset of PMU's I think can at worst be neutral. And if it is not 
possible to side-channel everything from anything, then it should be 
better overall security.

In terms of what metrics i915 PMU exposes the current list is this:

1. GPU global counters
1.1 Driver requested frequency and actual GPU frequency
1.2 Time spent in RC6 state
1.3 Interrupt count

2. Per GPU engine counters
2.1 Time spent engine was executing something
2.2 Time spent engine was waiting on semaphores
2.3 Time spent engine was waiting on sync events

In the future we are also considering:

2.4 Number of requests queued / runnable / running

>> Current option for them is to turn off the global paranoid setting which
>> then enables unprivileged access to _all_ PMU providers.
> 
> Right.
> 
>> To me it sounded quite logical that it would be better for the paranoid knob
>> to be more fine-grained, so that they can configure their servers so only
>> access to needed data is possible.
> 
> The proposed semantics are a tad awkward though, the moment you prod at
> the sysctl you loose all individual PMU settings. Ideally the per-pmu
> would have a special setting that says follow-global in addition to the
> existing ones.

Hmm.. possibly follow global makes sense for some use cases, but also I 
do not at the moment see awkwardness in the proposed semantics. The 
master knob should be only touched by sysadmins so any override of 
individual settings is a top-level decision, together with all the 
sub-controls, which is as it should be. If we had follow global, I 
suspect we would still need to have top-level override so it is 
basically a discussion on the richness of the controls.

>> I am not sure what do you mean by "Once you allow one PMU to do so, the
>> secret is out."? What secret? Are you implying that enabling unprivileged
>> access to i915 engine busyness data opens up access to CPU PMU's as well via
>> some side channel?
> 
> It was not i915 specific; but if you look at the descriptions:
> 
>   * perf event paranoia level:
>   *  -1 - not paranoid at all
>   *   0 - disallow raw tracepoint access for unpriv
>   *   1 - disallow cpu events for unpriv
>   *   2 - disallow kernel profiling for unpriv
> 
> Then the moment you allow some data to escape, it cannot be put back.
> i915 is fairly special in that (afaict) it doesn't leak kernel specific
> data
> 
> In general I think allowing access to uncore PMUs will leak kernel data.
> Thus in general I'm fairly wary of all this.

Yeah, I guess I don't follow this argument since I am not relaxing any 
security criteria. Just adding ability to apply the existing scale per 
individual PMU provider.

> Is there no other way to expose this information? Can't we do a
> traditional load-avg like thing for the GPU?

We of course could expose the same data in sysfs, or somewhere, and then 
control access to it via the filesystem, but we wanted to avoid 
duplication. Since we picked to export via PMU, ideally we would like to 
maintain only one mechanism to export the same set of data.

Also perf uAPI is pretty handy to use from userspace, where you can read 
all the interesting counters in one go together with a matching timestamp.

Furthermore I do not see how that would make a difference security wise? 
If the concern is exposing i915 PMU data to unprivileged users (via 
explicit sysadmin action!), then the mechanism of exposure shouldn't be 
important.

The argument may be that the proposed fine-grained controls are 
uninteresting for all other PMU providers, so it is undesirable to 
burden the perf core with extra code, which I would understand.

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22 13:01       ` Alexey Budankov
@ 2018-05-22 17:19         ` Andi Kleen
  2018-06-11  8:08           ` Tvrtko Ursulin
  0 siblings, 1 reply; 14+ messages in thread
From: Andi Kleen @ 2018-05-22 17:19 UTC (permalink / raw)
  To: Alexey Budankov
  Cc: Peter Zijlstra, Tvrtko Ursulin, linux-kernel, Tvrtko Ursulin,
	Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Mark Rutland, Tvrtko Ursulin

> IMHO, it is unsafe for CBOX pmu but could IMC, UPI pmus be an exception here?
> Because currently perf stat -I from IMC, UPI counters is only allowed when 
> system wide monitoring is permitted and this prevents joint perf record and 
> perf stat -I in cluster environments where users usually lack ability to 
> modify paranoid. Adding Andi who may have more ideas regarding all that.

PMU isolation is about not making side channels worse. There are normally
already side channels from timing, but it has a degree of noise.

PMU isolation is just to prevent opening side channels with less noise.
But reducing noise is always a trade off, it can never be perfect
and at some point there are diminishing returns.

In general the farther you are from the origin of the noise there 
is already more noise. The PMU can reduce the noise, but if it's far
enough away it may not make much difference.

So there are always trade offs with shades of grey, not a black 
and white situation. Depending on your security requirements
it may be totally reasonable e.g. to allow the PMU 
on the memory controller (which is already very noisy in any case), 
but not on the caches.

Or allow it only on the graphics which is already fairly isolated.

So per pmu paranoid settings are a useful concept.

-Andi

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-05-22 17:19         ` Andi Kleen
@ 2018-06-11  8:08           ` Tvrtko Ursulin
  2018-06-18  8:06             ` Alexey Budankov
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-06-11  8:08 UTC (permalink / raw)
  To: Andi Kleen, Alexey Budankov
  Cc: Peter Zijlstra, Tvrtko Ursulin, linux-kernel, Tvrtko Ursulin,
	Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Mark Rutland, Rogozhkin, Dmitry V


Hi,

On 22/05/2018 18:19, Andi Kleen wrote:
>> IMHO, it is unsafe for CBOX pmu but could IMC, UPI pmus be an exception here?
>> Because currently perf stat -I from IMC, UPI counters is only allowed when
>> system wide monitoring is permitted and this prevents joint perf record and
>> perf stat -I in cluster environments where users usually lack ability to
>> modify paranoid. Adding Andi who may have more ideas regarding all that.
> 
> PMU isolation is about not making side channels worse. There are normally
> already side channels from timing, but it has a degree of noise.
> 
> PMU isolation is just to prevent opening side channels with less noise.
> But reducing noise is always a trade off, it can never be perfect
> and at some point there are diminishing returns.
> 
> In general the farther you are from the origin of the noise there
> is already more noise. The PMU can reduce the noise, but if it's far
> enough away it may not make much difference.
> 
> So there are always trade offs with shades of grey, not a black
> and white situation. Depending on your security requirements
> it may be totally reasonable e.g. to allow the PMU
> on the memory controller (which is already very noisy in any case),
> but not on the caches.
> 
> Or allow it only on the graphics which is already fairly isolated.
> 
> So per pmu paranoid settings are a useful concept.

So it seems there is some positive feedback and fine-grained controls 
would be useful for other PMU's in cluster environments.

If we have agreement on that, question is how to drive this forward? 
Would someone be able to review the patch I've sent, or suggest more 
people to look at it before it could be queued up for merge?

Regards,

Tvrtko

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-06-11  8:08           ` Tvrtko Ursulin
@ 2018-06-18  8:06             ` Alexey Budankov
  0 siblings, 0 replies; 14+ messages in thread
From: Alexey Budankov @ 2018-06-18  8:06 UTC (permalink / raw)
  To: Tvrtko Ursulin, Andi Kleen
  Cc: Peter Zijlstra, Tvrtko Ursulin, linux-kernel, Tvrtko Ursulin,
	Ingo Molnar, Arnaldo Carvalho de Melo, Alexander Shishkin,
	Jiri Olsa, Namhyung Kim, Mark Rutland, Rogozhkin, Dmitry V

Hi Tvrtko,

On 11.06.2018 11:08, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 22/05/2018 18:19, Andi Kleen wrote:
>>> IMHO, it is unsafe for CBOX pmu but could IMC, UPI pmus be an exception here?
>>> Because currently perf stat -I from IMC, UPI counters is only allowed when
>>> system wide monitoring is permitted and this prevents joint perf record and
>>> perf stat -I in cluster environments where users usually lack ability to
>>> modify paranoid. Adding Andi who may have more ideas regarding all that.
>>
>> PMU isolation is about not making side channels worse. There are normally
>> already side channels from timing, but it has a degree of noise.
>>
>> PMU isolation is just to prevent opening side channels with less noise.
>> But reducing noise is always a trade off, it can never be perfect
>> and at some point there are diminishing returns.
>>
>> In general the farther you are from the origin of the noise there
>> is already more noise. The PMU can reduce the noise, but if it's far
>> enough away it may not make much difference.
>>
>> So there are always trade offs with shades of grey, not a black
>> and white situation. Depending on your security requirements
>> it may be totally reasonable e.g. to allow the PMU
>> on the memory controller (which is already very noisy in any case),
>> but not on the caches.
>>
>> Or allow it only on the graphics which is already fairly isolated.
>>
>> So per pmu paranoid settings are a useful concept.
> 
> So it seems there is some positive feedback and fine-grained controls would be useful for other PMU's in cluster environments.
> 
> If we have agreement on that, question is how to drive this forward? Would someone be able to review the patch I've sent, or suggest more people to look at it before it could be queued up for merge?

It makes sense to split this RFC into series of patches and resend. 
The series could be shaped up something similar to this:

[PATCH v1 0/4]: perf: enable per-pmu paranoid setting for Intel GPU pmu

    [PATCH v1 1/4]: perf/core: introduce pmu specific paranoid settings
        - extend pmu kernel object in the headers with the new settings
	- adjust code to adopt these new settings

    [PATCH v1 2/4]: perf/core: enable pmu specific paranoid setting thru fs
	- introduce code interfacing the setting thru fs from userspace
	- maybe introduce code applying some policies around 
          global/per-pmu relationship

    [PATCH v1 3/4]: perf/core: enable i915 GPU pmu specific features
	- implement your specific task related to GPU pmu on top of 
          this new whole concept

    [PATCH v1 4/4]: perf/docs: document Intel GPU pmu paranoid specific changes
	- maybe some regression testing and README or other docs 
          updates related to the changes

Also, when sending the patches, the Linux kernel security team (security@kernel.org)
needs to be in TO or CC to let the folks know of the changes and possibly 
explicitly ask for support from them.

Regards,
Alexey

> 
> Regards,
> 
> Tvrtko
> 

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-01-19 17:10   ` Tvrtko Ursulin
@ 2018-02-23 15:58     ` Tvrtko Ursulin
  0 siblings, 0 replies; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-02-23 15:58 UTC (permalink / raw)
  To: Peter Zijlstra, Tvrtko Ursulin
  Cc: Alexander Shishkin, Intel-gfx, linux-kernel,
	Arnaldo Carvalho de Melo, Ingo Molnar, Namhyung Kim, Jiri Olsa


Hi,

On 19/01/2018 17:10, Tvrtko Ursulin wrote:
> 
> Hi,
> 
> On 19/01/2018 16:45, Peter Zijlstra wrote:
>> On Thu, Jan 18, 2018 at 06:40:07PM +0000, Tvrtko Ursulin wrote:
>>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>>
>>> For situations where sysadmins might want to allow different levels
>>> of access control for different PMUs, we start creating per-PMU
>>> perf_event_paranoid controls in sysfs.
>>
>> You've completely and utterly failed to explain why.
> 
> On an abstract level, if there is a desire to decrease the security knob 
> on one particular PMU provider, it is better to be able to do it just 
> for the one, rather than for the whole system.
> 
> On a more concrete level, we have customers who want to look at certain 
> i915 metrics, most probably engine utilization or queue depth, in order 
> to make load-balancing decisions. (The two would be roughly analogous to 
> CPU usage and load.)
> 
> This data needs to be available to their userspaces dynamically and 
> would be used to pick a best GPU engine (mostly analogous to a CPU core) 
> to run a particular packet of work.
> 
> It would be impossible to run their product as root, and while one 
> option could be to write a proxy daemon which would allow unprivileged 
> queries, it is also a significant complication which introduces a time 
> shift problem on the PMU data as well.
> 
> So my thinking was that a per-PMU paranoid control should not be a 
> problematic concept in general. And my gut feeling anyway was that not 
> all PMU providers are the same class of data, security wise, which was 
> another reason I thought per-PMU controls would be fine.
> 
> There is one more way of thinking about it, and that is that the access 
> control could even be extended to be per-event, and not just per-PMU. 
> That would allow registered PMUs to let the core know which counters are 
> potentially security sensitive, and which are not.
> 
> I've sent another RFC along those lines some time ago, but afterwards 
> I've changed my mind and thought the approach from this patch should be 
> less controversial since it retains all control fully in the perf core 
> and in the hands of sysadmins.

Any thoughts on this one? Is the approach acceptable?

Regards,

Tvrtko
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-01-19 16:45   ` Peter Zijlstra
  (?)
@ 2018-01-19 17:10   ` Tvrtko Ursulin
  2018-02-23 15:58     ` Tvrtko Ursulin
  -1 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-01-19 17:10 UTC (permalink / raw)
  To: Peter Zijlstra, Tvrtko Ursulin
  Cc: Alexander Shishkin, Intel-gfx, linux-kernel,
	Arnaldo Carvalho de Melo, Ingo Molnar, Namhyung Kim, Jiri Olsa


Hi,

On 19/01/2018 16:45, Peter Zijlstra wrote:
> On Thu, Jan 18, 2018 at 06:40:07PM +0000, Tvrtko Ursulin wrote:
>> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
>>
>> For situations where sysadmins might want to allow different levels
>> of access control for different PMUs, we start creating per-PMU
>> perf_event_paranoid controls in sysfs.
> 
> You've completely and utterly failed to explain why.

On an abstract level, if there is a desire to decrease the security knob 
on one particular PMU provider, it is better to be able to do it just 
for the one, rather than for the whole system.

On a more concrete level, we have customers who want to look at certain 
i915 metrics, most probably engine utilization or queue depth, in order 
to make load-balancing decisions. (The two would be roughly analogous to 
CPU usage and load.)

This data needs to be available to their userspaces dynamically and 
would be used to pick a best GPU engine (mostly analogous to a CPU core) 
to run a particular packet of work.

It would be impossible to run their product as root, and while one 
option could be to write a proxy daemon which would allow unprivileged 
queries, it is also a significant complication which introduces a time 
shift problem on the PMU data as well.

So my thinking was that a per-PMU paranoid control should not be a 
problematic concept in general. And my gut feeling anyway was that not 
all PMU providers are the same class of data, security wise, which was 
another reason I thought per-PMU controls would be fine.

There is one more way of thinking about it, and that is that the access 
control could even be extended to be per-event, and not just per-PMU. 
That would allow registered PMUs to let the core know which counters are 
potentially security sensitive, and which are not.

I've sent another RFC along those lines some time ago, but afterwards 
I've changed my mind and thought the approach from this patch should be 
less controversial since it retains all control fully in the perf core 
and in the hands of sysadmins.

Regards,

Tvrtko


_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
  2018-01-18 18:40 Tvrtko Ursulin
@ 2018-01-19 16:45   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2018-01-19 16:45 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Intel-gfx, Tvrtko Ursulin, Ingo Molnar, Arnaldo Carvalho de Melo,
	Alexander Shishkin, Jiri Olsa, Namhyung Kim, linux-kernel

On Thu, Jan 18, 2018 at 06:40:07PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> For situations where sysadmins might want to allow different levels
> of access control for different PMUs, we start creating per-PMU
> perf_event_paranoid controls in sysfs.

You've completely and utterly failed to explain why.

^ permalink raw reply	[flat|nested] 14+ messages in thread

* Re: [RFC] perf: Allow fine-grained PMU access control
@ 2018-01-19 16:45   ` Peter Zijlstra
  0 siblings, 0 replies; 14+ messages in thread
From: Peter Zijlstra @ 2018-01-19 16:45 UTC (permalink / raw)
  To: Tvrtko Ursulin
  Cc: Alexander Shishkin, Intel-gfx, linux-kernel,
	Arnaldo Carvalho de Melo, Ingo Molnar, Namhyung Kim, Jiri Olsa

On Thu, Jan 18, 2018 at 06:40:07PM +0000, Tvrtko Ursulin wrote:
> From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
> 
> For situations where sysadmins might want to allow different levels
> of access control for different PMUs, we start creating per-PMU
> perf_event_paranoid controls in sysfs.

You've completely and utterly failed to explain why.
_______________________________________________
Intel-gfx mailing list
Intel-gfx@lists.freedesktop.org
https://lists.freedesktop.org/mailman/listinfo/intel-gfx

^ permalink raw reply	[flat|nested] 14+ messages in thread

* [RFC] perf: Allow fine-grained PMU access control
@ 2018-01-18 18:40 Tvrtko Ursulin
  2018-01-19 16:45   ` Peter Zijlstra
  0 siblings, 1 reply; 14+ messages in thread
From: Tvrtko Ursulin @ 2018-01-18 18:40 UTC (permalink / raw)
  To: Intel-gfx
  Cc: tursulin, Tvrtko Ursulin, Peter Zijlstra, Ingo Molnar,
	Arnaldo Carvalho de Melo, Alexander Shishkin, Jiri Olsa,
	Namhyung Kim, linux-kernel

From: Tvrtko Ursulin <tvrtko.ursulin@intel.com>

For situations where sysadmins might want to allow different levels
of access control for different PMUs, we start creating per-PMU
perf_event_paranoid controls in sysfs.

These work in equivalent fashion as the existing perf_event_paranoid
sysctl, which now becomes the parent control for each PMU.

On PMU registration the global/parent value will be inherited by each PMU,
as it will be propagated to all registered PMUs when the sysctl is
updated.

At any later point individual PMU access controls, located in
<sysfs>/device/<pmu-name>/perf_event_paranoid, can be adjusted to achieve
fine grained access control.

Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Alexander Shishkin <alexander.shishkin@linux.intel.com>
Cc: Jiri Olsa <jolsa@redhat.com>
Cc: Namhyung Kim <namhyung@kernel.org>
Cc: linux-kernel@vger.kernel.org
---
 arch/x86/events/intel/bts.c     |  2 +-
 arch/x86/events/intel/core.c    |  2 +-
 arch/x86/events/intel/p4.c      |  2 +-
 include/linux/perf_event.h      | 18 +++++---
 kernel/events/core.c            | 99 ++++++++++++++++++++++++++++++++++-------
 kernel/sysctl.c                 |  4 +-
 kernel/trace/trace_event_perf.c |  6 ++-
 7 files changed, 105 insertions(+), 28 deletions(-)

diff --git a/arch/x86/events/intel/bts.c b/arch/x86/events/intel/bts.c
index 24ffa1e88cf9..e416c9e2400a 100644
--- a/arch/x86/events/intel/bts.c
+++ b/arch/x86/events/intel/bts.c
@@ -555,7 +555,7 @@ static int bts_event_init(struct perf_event *event)
 	 * Note that the default paranoia setting permits unprivileged
 	 * users to profile the kernel.
 	 */
-	if (event->attr.exclude_kernel && perf_paranoid_kernel() &&
+	if (event->attr.exclude_kernel && perf_paranoid_kernel(event->pmu) &&
 	    !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 731153a4681e..d623db13f212 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3009,7 +3009,7 @@ static int intel_pmu_hw_config(struct perf_event *event)
 	if (x86_pmu.version < 3)
 		return -EINVAL;
 
-	if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+	if (perf_paranoid_cpu(event->pmu) && !capable(CAP_SYS_ADMIN))
 		return -EACCES;
 
 	event->hw.config |= ARCH_PERFMON_EVENTSEL_ANY;
diff --git a/arch/x86/events/intel/p4.c b/arch/x86/events/intel/p4.c
index d32c0eed38ca..878451ef1ace 100644
--- a/arch/x86/events/intel/p4.c
+++ b/arch/x86/events/intel/p4.c
@@ -776,7 +776,7 @@ static int p4_validate_raw_event(struct perf_event *event)
 	 * the user needs special permissions to be able to use it
 	 */
 	if (p4_ht_active() && p4_event_bind_map[v].shared) {
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu(event->pmu) && !capable(CAP_SYS_ADMIN))
 			return -EACCES;
 	}
 
diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h
index 7546822a1d74..1cb4e00d7f96 100644
--- a/include/linux/perf_event.h
+++ b/include/linux/perf_event.h
@@ -271,6 +271,9 @@ struct pmu {
 	/* number of address filters this PMU can do */
 	unsigned int			nr_addr_filters;
 
+	/* fine grained access control */
+	int				perf_event_paranoid;
+
 	/*
 	 * Fully disable/enable this PMU, can be used to protect from the PMI
 	 * as well as for lazy/batch writing of the MSRs.
@@ -1141,6 +1144,9 @@ extern int sysctl_perf_cpu_time_max_percent;
 
 extern void perf_sample_event_took(u64 sample_len_ns);
 
+extern int perf_proc_paranoid_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos);
 extern int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos);
@@ -1151,19 +1157,19 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write,
 int perf_event_max_stack_handler(struct ctl_table *table, int write,
 				 void __user *buffer, size_t *lenp, loff_t *ppos);
 
-static inline bool perf_paranoid_tracepoint_raw(void)
+static inline bool perf_paranoid_tracepoint_raw(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > -1;
+	return pmu->perf_event_paranoid > -1;
 }
 
-static inline bool perf_paranoid_cpu(void)
+static inline bool perf_paranoid_cpu(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > 0;
+	return pmu->perf_event_paranoid > 0;
 }
 
-static inline bool perf_paranoid_kernel(void)
+static inline bool perf_paranoid_kernel(const struct pmu *pmu)
 {
-	return sysctl_perf_event_paranoid > 1;
+	return pmu->perf_event_paranoid > 1;
 }
 
 extern void perf_event_init(void);
diff --git a/kernel/events/core.c b/kernel/events/core.c
index b4152da656fa..21fd4430df66 100644
--- a/kernel/events/core.c
+++ b/kernel/events/core.c
@@ -432,6 +432,24 @@ static void update_perf_cpu_limits(void)
 
 static int perf_rotate_context(struct perf_cpu_context *cpuctx);
 
+int perf_proc_paranoid_handler(struct ctl_table *table, int write,
+		void __user *buffer, size_t *lenp,
+		loff_t *ppos)
+{
+	int ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos);
+	struct pmu *pmu;
+
+	if (ret || !write)
+		return ret;
+
+	mutex_lock(&pmus_lock);
+	list_for_each_entry(pmu, &pmus, entry)
+		pmu->perf_event_paranoid = sysctl_perf_event_paranoid;
+	mutex_unlock(&pmus_lock);
+
+	return 0;
+}
+
 int perf_proc_update_handler(struct ctl_table *table, int write,
 		void __user *buffer, size_t *lenp,
 		loff_t *ppos)
@@ -3772,7 +3790,7 @@ find_get_context(struct pmu *pmu, struct task_struct *task,
 
 	if (!task) {
 		/* Must be root to operate on a CPU event: */
-		if (perf_paranoid_cpu() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_cpu(pmu) && !capable(CAP_SYS_ADMIN))
 			return ERR_PTR(-EACCES);
 
 		cpuctx = per_cpu_ptr(pmu->pmu_cpu_context, cpu);
@@ -5313,7 +5331,7 @@ static int perf_mmap(struct file *file, struct vm_area_struct *vma)
 	lock_limit >>= PAGE_SHIFT;
 	locked = vma->vm_mm->pinned_vm + extra;
 
-	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw() &&
+	if ((locked > lock_limit) && perf_paranoid_tracepoint_raw(event->pmu) &&
 		!capable(CAP_IPC_LOCK)) {
 		ret = -EPERM;
 		goto unlock;
@@ -8880,6 +8898,41 @@ static void free_pmu_context(struct pmu *pmu)
 	mutex_unlock(&pmus_lock);
 }
 
+/*
+ * Fine-grained access control:
+ */
+static ssize_t
+perf_event_paranoid_show(struct device *dev,
+			 struct device_attribute *attr,
+			 char *page)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+
+	return snprintf(page, PAGE_SIZE - 1, "%d\n", pmu->perf_event_paranoid);
+}
+
+static ssize_t
+perf_event_paranoid_store(struct device *dev,
+			  struct device_attribute *attr,
+			  const char *buf, size_t count)
+{
+	struct pmu *pmu = dev_get_drvdata(dev);
+	int ret, val;
+
+	ret = kstrtoint(buf, 0, &val);
+	if (ret)
+		return ret;
+
+	if (val < -1 || val > 2)
+		return -EINVAL;
+
+	pmu->perf_event_paranoid = val;
+
+	return count;
+}
+
+DEVICE_ATTR_RW(perf_event_paranoid);
+
 /*
  * Let userspace know that this PMU supports address range filtering:
  */
@@ -8994,6 +9047,11 @@ static int pmu_dev_alloc(struct pmu *pmu)
 	if (ret)
 		goto free_dev;
 
+	/* Add fine-grained access control attribute. */
+	ret = device_create_file(pmu->dev, &dev_attr_perf_event_paranoid);
+	if (ret)
+		goto del_dev;
+
 	/* For PMUs with address filters, throw in an extra attribute: */
 	if (pmu->nr_addr_filters)
 		ret = device_create_file(pmu->dev, &dev_attr_nr_addr_filters);
@@ -9025,6 +9083,7 @@ int perf_pmu_register(struct pmu *pmu, const char *name, int type)
 	if (!pmu->pmu_disable_count)
 		goto unlock;
 
+	pmu->perf_event_paranoid = sysctl_perf_event_paranoid;
 	pmu->type = -1;
 	if (!name)
 		goto skip_type;
@@ -9634,10 +9693,6 @@ static int perf_copy_attr(struct perf_event_attr __user *uattr,
 			 */
 			attr->branch_sample_type = mask;
 		}
-		/* privileged levels capture (kernel, hv): check permissions */
-		if ((mask & PERF_SAMPLE_BRANCH_PERM_PLM)
-		    && perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
 	}
 
 	if (attr->sample_type & PERF_SAMPLE_REGS_USER) {
@@ -9851,11 +9906,6 @@ SYSCALL_DEFINE5(perf_event_open,
 	if (err)
 		return err;
 
-	if (!attr.exclude_kernel) {
-		if (perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-			return -EACCES;
-	}
-
 	if (attr.namespaces) {
 		if (!capable(CAP_SYS_ADMIN))
 			return -EACCES;
@@ -9869,11 +9919,6 @@ SYSCALL_DEFINE5(perf_event_open,
 			return -EINVAL;
 	}
 
-	/* Only privileged users can get physical addresses */
-	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
-	    perf_paranoid_kernel() && !capable(CAP_SYS_ADMIN))
-		return -EACCES;
-
 	if (!attr.sample_max_stack)
 		attr.sample_max_stack = sysctl_perf_event_max_stack;
 
@@ -9946,6 +9991,28 @@ SYSCALL_DEFINE5(perf_event_open,
 		goto err_cred;
 	}
 
+	if (!attr.exclude_kernel) {
+		if (perf_paranoid_kernel(event->pmu) &&
+		    !capable(CAP_SYS_ADMIN)) {
+			err = -EACCES;
+			goto err_alloc;
+		}
+	}
+
+	/* Only privileged users can get physical addresses */
+	if ((attr.sample_type & PERF_SAMPLE_PHYS_ADDR) &&
+	    perf_paranoid_kernel(event->pmu) && !capable(CAP_SYS_ADMIN)) {
+		err = -EACCES;
+		goto err_alloc;
+	}
+
+	/* privileged levels capture (kernel, hv): check permissions */
+	if ((attr.branch_sample_type & PERF_SAMPLE_BRANCH_PERM_PLM) &&
+	    perf_paranoid_kernel(event->pmu) && !capable(CAP_SYS_ADMIN)) {
+		err = -EACCES;
+		goto err_alloc;
+	}
+
 	if (is_sampling_event(event)) {
 		if (event->pmu->capabilities & PERF_PMU_CAP_NO_INTERRUPT) {
 			err = -EOPNOTSUPP;
diff --git a/kernel/sysctl.c b/kernel/sysctl.c
index 557d46728577..2f724b951e92 100644
--- a/kernel/sysctl.c
+++ b/kernel/sysctl.c
@@ -1129,7 +1129,9 @@ static struct ctl_table kern_table[] = {
 		.data		= &sysctl_perf_event_paranoid,
 		.maxlen		= sizeof(sysctl_perf_event_paranoid),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= perf_proc_paranoid_handler,
+		.extra1		= &neg_one,
+		.extra2		= &two,
 	},
 	{
 		.procname	= "perf_event_mlock_kb",
diff --git a/kernel/trace/trace_event_perf.c b/kernel/trace/trace_event_perf.c
index 55d6dff37daf..f23e3560237e 100644
--- a/kernel/trace/trace_event_perf.c
+++ b/kernel/trace/trace_event_perf.c
@@ -44,7 +44,8 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 
 	/* The ftrace function trace is allowed only for root. */
 	if (ftrace_event_is_function(tp_event)) {
-		if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+		if (perf_paranoid_tracepoint_raw(p_event->pmu) &&
+		    !capable(CAP_SYS_ADMIN))
 			return -EPERM;
 
 		if (!is_sampling_event(p_event))
@@ -80,7 +81,8 @@ static int perf_trace_event_perm(struct trace_event_call *tp_event,
 	 * ...otherwise raw tracepoint data can be a severe data leak,
 	 * only allow root to have these.
 	 */
-	if (perf_paranoid_tracepoint_raw() && !capable(CAP_SYS_ADMIN))
+	if (perf_paranoid_tracepoint_raw(p_event->pmu) &&
+	    !capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
 	return 0;
-- 
2.14.1

^ permalink raw reply related	[flat|nested] 14+ messages in thread

end of thread, other threads:[~2018-06-18  8:07 UTC | newest]

Thread overview: 14+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-05-21  9:25 [RFC] perf: Allow fine-grained PMU access control Tvrtko Ursulin
2018-05-22  9:05 ` Peter Zijlstra
2018-05-22  9:29   ` Tvrtko Ursulin
2018-05-22 12:32     ` Peter Zijlstra
2018-05-22 13:01       ` Alexey Budankov
2018-05-22 17:19         ` Andi Kleen
2018-06-11  8:08           ` Tvrtko Ursulin
2018-06-18  8:06             ` Alexey Budankov
2018-05-22 16:15       ` Tvrtko Ursulin
  -- strict thread matches above, loose matches on Subject: below --
2018-01-18 18:40 Tvrtko Ursulin
2018-01-19 16:45 ` Peter Zijlstra
2018-01-19 16:45   ` Peter Zijlstra
2018-01-19 17:10   ` Tvrtko Ursulin
2018-02-23 15:58     ` Tvrtko Ursulin

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.