[-v2.1] x86/msr: Filter MSR writes
diff mbox series

Message ID 20200615063837.GA14668@zn.tnic
State In Next
Commit 8f4014b89002e02d6bf278c577402d8a08d9b516
Headers show
Series
  • [-v2.1] x86/msr: Filter MSR writes
Related show

Commit Message

Borislav Petkov June 15, 2020, 6:38 a.m. UTC
Here's an improved v2 with sparse warnings fixed:

---
Disable writing to MSRs from userspace by default. Writes can still be
allowed by supplying the allow_writes=1 module parameter and the kernel
will be tainted so that it shows in oopses.

Having unfettered access to all MSRs on a system is and has always been
a disaster waiting to happen. Think performance counter MSRs, MSRs with
sticky or locked bits, MSRs making major system changes like loading
microcode, MTRRs, PAT configuration, TSC counter, security mitigations
MSRs, you name it.

This also destroys all the kernel's caching of MSR values for
performance, as the recent case with MSR_AMD64_LS_CFG showed.

Another example is writing MSRs by mistake by simply typing the wrong
MSR address. System freezes have been experienced that way.

In general, poking at MSRs under the kernel's feet is a bad bad idea.

So disable poking directly at the MSRs by default. If userspace still
wants to do that, then proper interfaces should be defined which
are under the kernel's control and accesses to those MSRs can be
synchronized and sanitized properly.

Changelog:
- taint before WRMSR, all
- make param 0600, Sean.
- do not deny but log writes by default, Linus.

[ Fix sparse warnings ]
Reported-by: kernel test robot <lkp@intel.com>
Signed-off-by: Borislav Petkov <bp@suse.de>
---
 arch/x86/kernel/msr.c | 69 +++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 69 insertions(+)

Comments

Sean Christopherson June 25, 2020, 5:51 a.m. UTC | #1
On Mon, Jun 15, 2020 at 08:38:37AM +0200, Borislav Petkov wrote:
> Here's an improved v2 with sparse warnings fixed:
> 
> ---
> Disable writing to MSRs from userspace by default. Writes can still be
> allowed by supplying the allow_writes=1 module parameter and the kernel
> will be tainted so that it shows in oopses.
> 
> Having unfettered access to all MSRs on a system is and has always been
> a disaster waiting to happen. Think performance counter MSRs, MSRs with
> sticky or locked bits, MSRs making major system changes like loading
> microcode, MTRRs, PAT configuration, TSC counter, security mitigations
> MSRs, you name it.
> 
> This also destroys all the kernel's caching of MSR values for
> performance, as the recent case with MSR_AMD64_LS_CFG showed.
> 
> Another example is writing MSRs by mistake by simply typing the wrong
> MSR address. System freezes have been experienced that way.
> 
> In general, poking at MSRs under the kernel's feet is a bad bad idea.
> 
> So disable poking directly at the MSRs by default. If userspace still
> wants to do that, then proper interfaces should be defined which
> are under the kernel's control and accesses to those MSRs can be
> synchronized and sanitized properly.
> 
> Changelog:
> - taint before WRMSR, all
> - make param 0600, Sean.
> - do not deny but log writes by default, Linus.
> 
> [ Fix sparse warnings ]
> Reported-by: kernel test robot <lkp@intel.com>
> Signed-off-by: Borislav Petkov <bp@suse.de>

A few non-functional nits below.

Tested-by: Sean Christopherson <sean.j.christopherson@intel.com>

> ---
>  arch/x86/kernel/msr.c | 69 +++++++++++++++++++++++++++++++++++++++++++
>  1 file changed, 69 insertions(+)
> 
> diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
> index 1547be359d7f..576c43e39247 100644
> --- a/arch/x86/kernel/msr.c
> +++ b/arch/x86/kernel/msr.c
> @@ -42,6 +42,14 @@
>  static struct class *msr_class;
>  static enum cpuhp_state cpuhp_msr_state;
>  
> +enum allow_write_msrs {
> +	MSR_WRITES_ON,
> +	MSR_WRITES_OFF,
> +	MSR_WRITES_DEFAULT,
> +};
> +
> +static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
> +
>  static ssize_t msr_read(struct file *file, char __user *buf,
>  			size_t count, loff_t *ppos)
>  {
> @@ -70,6 +78,24 @@ static ssize_t msr_read(struct file *file, char __user *buf,
>  	return bytes ? bytes : err;
>  }
>  
> +static int filter_write(u32 reg)
> +{
> +     switch (allow_writes) {
> +     case MSR_WRITES_ON:  return 0;          break;
> +     case MSR_WRITES_OFF: return -EPERM;     break;

The breaks after the returns are unnecessary.

> +     default: break;
> +     }
> +
> +     if (reg == MSR_IA32_ENERGY_PERF_BIAS)
> +             return 0;
> +
> +     pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n"
> +                        "Please report to x86@kernel.org\n",
> +                        reg, current->comm);


Maybe s/unrecognized/unauthorized?  Unrecognized implies the kernel doesn't
know anything about the MSR being written, which may not hold true.

> +     return 0;
> +}
> +
>  static ssize_t msr_write(struct file *file, const char __user *buf,
>                        size_t count, loff_t *ppos)
>  {
> @@ -84,6 +110,10 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
>       if (err)
>               return err;
>
> +     err = filter_write(reg);
> +     if (err)
> +             return err;
> +
>       if (count % 8)
>               return -EINVAL; /* Invalid chunk size */
>
> @@ -92,9 +122,13 @@ static ssize_t msr_write(struct file *file, const char __user *buf,
>                       err = -EFAULT;
>                       break;
>               }
> +
> +             add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
> +
>               err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
>               if (err)
>                       break;
> +

Random leftover whitespace change.

>  		tmp += 2;
>  		bytes += 8;
>  	}
Borislav Petkov June 25, 2020, 8:37 a.m. UTC | #2
On Wed, Jun 24, 2020 at 10:51:40PM -0700, Sean Christopherson wrote:
> Maybe s/unrecognized/unauthorized?  Unrecognized implies the kernel doesn't
> know anything about the MSR being written, which may not hold true.

"unrecognized" in the sense that it is not on the whitelist above. It
contains only one MSR and when the patches for the interface to that MSR
get upstreamed, it won't be needed anymore.

But this all is WIP anyway so we'll adjust that printk when we know
whether we wanna have a whitelist at all...

> Random leftover whitespace change.

That's spreading out the statements for better readability. That file
could use more spreading out too.

Thx.
Chris Down July 14, 2020, 12:19 p.m. UTC | #3
Hi Borislav,

This is certainly a good idea, but I wonder whether we should be more pragmatic 
about the printk ratelimiting while we give userspace time to react and update 
their methodologies.

As one example, there is a common MSR hack which is verging on essential if 
you're doing thermally intensive work on some recent ThinkPads[0][1], and this 
drastically reduces the signal-to-noise ratio in kmsg (and this is only about 
five minutes after boot):

     % dmesg | wc -l
     2963
     % dmesg | grep -c 'unrecognized MSR'
     2411

That is, even with pr_err_ratelimited, we still end up logging on basically 
every single write, even though it's from the same TGID writing to the same 
MSRs, and end up becoming >80% of kmsg.

Of course, one can boot with `allow_writes=1` to avoid these messages at all, 
but that then has the downfall that one doesn't get _any_ notification at all 
about these problems in the first place, and so is much less likely to forget 
to fix it. One might rather it was less binary: it was still logged, just less 
often, so that application developers _do_ have the incentive to improve their 
current methods, without us having to push other useful stuff out of the kmsg 
buffer.

This one example isn't the point, of course: I'm sure there are plenty of other 
non-ideal-but-pragmatic cases where people are writing to MSRs from userspace 
right now, and it will take time for those people to find other solutions.

I completely agree with you that there should be a better solution for these 
cases, and that writing to MSRs from userspace is really not a good idea.  
However, going from zero to over 80% of dmesg in cases where these MSRs are 
repeatedly used seems too fast to me.

Have you considered perhaps making the ramping up of error logging more gradual 
by having this printk have its own, more conservative `struct ratelimit_state`, 
as we do in some other places with similar noise concerns? Then we could 
gradually make the warnings more aggressive as time goes on, up until the point 
where we make allow_writes=0 the default.

Thanks,

Chris

0: Lenovo is supposedly fixing this since last year, but no news yet.
1: https://github.com/erpalma/throttled
Borislav Petkov July 14, 2020, 3:47 p.m. UTC | #4
On Tue, Jul 14, 2020 at 01:19:55PM +0100, Chris Down wrote:
> That is, even with pr_err_ratelimited, we still end up logging on basically
> every single write, even though it's from the same TGID writing to the same
> MSRs, and end up becoming >80% of kmsg.
> 
> Of course, one can boot with `allow_writes=1` to avoid these messages at

Yes, use that.

From a quick scan over that "tool" you pointed me at, it pokes at some
MSRs from userspace which the kernel *also* writes to and this is
exactly what should not be allowed.

As to the "MSR hack", please describe what the issue is exactly so that
we can get the proper people involved. If anything, this needs to be
fixed in the kernel properly. If people are waiting for a year for a
BIOS fix, I'd say there's a very slim chance for that to ever happen...

Thx.
Chris Down July 14, 2020, 4:04 p.m. UTC | #5
Borislav Petkov writes:
>On Tue, Jul 14, 2020 at 01:19:55PM +0100, Chris Down wrote:
>> That is, even with pr_err_ratelimited, we still end up logging on basically
>> every single write, even though it's from the same TGID writing to the same
>> MSRs, and end up becoming >80% of kmsg.
>>
>> Of course, one can boot with `allow_writes=1` to avoid these messages at
>
>Yes, use that.
>
>From a quick scan over that "tool" you pointed me at, it pokes at some
>MSRs from userspace which the kernel *also* writes to and this is
>exactly what should not be allowed.

I don't think we're in disagreement about that. My concern is strictly about 
the amount of spam caused for some of those existing use cases during the 
transition phase. People should know that their tools would break, but there 
shouldn't be so many messages generated that it inevitably pushes other useful 
information out of the kmsg buffer.

>As to the "MSR hack", please describe what the issue is exactly so that
>we can get the proper people involved. If anything, this needs to be
>fixed in the kernel properly. If people are waiting for a year for a
>BIOS fix, I'd say there's a very slim chance for that to ever happen...

Since the issue involves DPTF which is only supported via binary blobs, I can't 
say for certain what the issue is. As I understand it, when the throttling 
behaviour isn't explicitly configured by the OS kernel, the default policy is 
extremely overeager. Matthew also had a look at it[0], but I don't know if 
anything eventually happened there. I've cc'ed him.

Either way, again, this isn't really the point. :-) The point is that there 
_are_ currently widespread cases involving poking MSRs from userspace, however 
sacrilegious or ugly (which I agree with!), and while people should be told 
about that, it's excessive to have the potential to take up 80% of kmsg in the 
default configuration. It doesn't take thousands of messages to get the message 
across, that's what a custom printk ratelimit is for.

0: https://twitter.com/mjg59/status/1034132444201582596
Luck, Tony July 14, 2020, 4:46 p.m. UTC | #6
On Tue, Jul 14, 2020 at 05:04:48PM +0100, Chris Down wrote:
> Borislav Petkov writes:
> > On Tue, Jul 14, 2020 at 01:19:55PM +0100, Chris Down wrote:
> > > That is, even with pr_err_ratelimited, we still end up logging on basically
> > > every single write, even though it's from the same TGID writing to the same
> > > MSRs, and end up becoming >80% of kmsg.
> > > 
> > > Of course, one can boot with `allow_writes=1` to avoid these messages at
> > 
> > Yes, use that.
> > 
> > From a quick scan over that "tool" you pointed me at, it pokes at some
> > MSRs from userspace which the kernel *also* writes to and this is
> > exactly what should not be allowed.
> 
> I don't think we're in disagreement about that. My concern is strictly about
> the amount of spam caused for some of those existing use cases during the
> transition phase. People should know that their tools would break, but there
> shouldn't be so many messages generated that it inevitably pushes other
> useful information out of the kmsg buffer.

Maybe we just need smarter filtering of warnings.  It doesn't
seem at all useful to warn for the same MSR 1000's of times.
Maybe keep a count of warnings for each MSR and just stop
all reports when reach a threshold?

-Tony
Borislav Petkov July 14, 2020, 4:56 p.m. UTC | #7
On Tue, Jul 14, 2020 at 05:04:48PM +0100, Chris Down wrote:
> Since the issue involves DPTF which is only supported via binary blobs, I
> can't say for certain what the issue is. As I understand it, when the
> throttling behaviour isn't explicitly configured by the OS kernel, the
> default policy is extremely overeager. Matthew also had a look at it[0], but
> I don't know if anything eventually happened there. I've cc'ed him.
> 
> Either way, again, this isn't really the point. :-) The point is that there
> _are_ currently widespread cases involving poking MSRs from userspace,
> however sacrilegious or ugly (which I agree with!), and while people should
> be told about that, it's excessive to have the potential to take up 80% of
> kmsg in the default configuration. It doesn't take thousands of messages to
> get the message across, that's what a custom printk ratelimit is for.

Ok, feel free to suggest a fix, better yet send a patch. Otherwise,
you'd have to wait for my vacation to end first. :-)

> 0: https://twitter.com/mjg59/status/1034132444201582596

As to the power issue, lemme CC some Intel folks I found in MAINTAINERS.

Intel folks, pls check the link above and upthread: Why TF do people
need to use some luserspace daemon which pokes at MSRs which the kernel
writes to too, in order to bypass some apparently too conservative
throttling, AFAIU?

And why does this work on windoze reportedly?
Borislav Petkov July 14, 2020, 4:58 p.m. UTC | #8
On Tue, Jul 14, 2020 at 09:46:12AM -0700, Luck, Tony wrote:
> Maybe we just need smarter filtering of warnings.  It doesn't
> seem at all useful to warn for the same MSR 1000's of times.
> Maybe keep a count of warnings for each MSR and just stop
> all reports when reach a threshold?

No, not stop - just make them more rare.

And this is where the bikeshedding starts about how rare.

/me runs away to the beach and lets the others fight it out.
Chris Down July 14, 2020, 5:02 p.m. UTC | #9
Luck, Tony writes:
>On Tue, Jul 14, 2020 at 05:04:48PM +0100, Chris Down wrote:
>> Borislav Petkov writes:
>> > On Tue, Jul 14, 2020 at 01:19:55PM +0100, Chris Down wrote:
>> > > That is, even with pr_err_ratelimited, we still end up logging on basically
>> > > every single write, even though it's from the same TGID writing to the same
>> > > MSRs, and end up becoming >80% of kmsg.
>> > >
>> > > Of course, one can boot with `allow_writes=1` to avoid these messages at
>> >
>> > Yes, use that.
>> >
>> > From a quick scan over that "tool" you pointed me at, it pokes at some
>> > MSRs from userspace which the kernel *also* writes to and this is
>> > exactly what should not be allowed.
>>
>> I don't think we're in disagreement about that. My concern is strictly about
>> the amount of spam caused for some of those existing use cases during the
>> transition phase. People should know that their tools would break, but there
>> shouldn't be so many messages generated that it inevitably pushes other
>> useful information out of the kmsg buffer.
>
>Maybe we just need smarter filtering of warnings.  It doesn't
>seem at all useful to warn for the same MSR 1000's of times.
>Maybe keep a count of warnings for each MSR and just stop
>all reports when reach a threshold?

That also a fine good solution, albeit more complex than just using the 
existing custom ratelimit_state infrastructure. Doing so probably also means 
we'd miss out on some of the other stuff that comes for free with it.

My only other concern with ratelimiting per-TGID or per-MSR was that the 
ratelimit cache table could become unwieldy, but if we keep it simple by 
limiting the size and not printing after we reach that, that sounds fine too.

Any solution which means that we avoid saturating kmsg for workloads which 
currently twiddle MSRs sounds fine to me. People should know that we don't 
support or encourage this, but it shouldn't be at the cost of potentially 
pushing everything else out of the kmsg buffer.
Chris Down July 14, 2020, 5:04 p.m. UTC | #10
Borislav Petkov writes:
>On Tue, Jul 14, 2020 at 05:04:48PM +0100, Chris Down wrote:
>> Since the issue involves DPTF which is only supported via binary blobs, I
>> can't say for certain what the issue is. As I understand it, when the
>> throttling behaviour isn't explicitly configured by the OS kernel, the
>> default policy is extremely overeager. Matthew also had a look at it[0], but
>> I don't know if anything eventually happened there. I've cc'ed him.
>>
>> Either way, again, this isn't really the point. :-) The point is that there
>> _are_ currently widespread cases involving poking MSRs from userspace,
>> however sacrilegious or ugly (which I agree with!), and while people should
>> be told about that, it's excessive to have the potential to take up 80% of
>> kmsg in the default configuration. It doesn't take thousands of messages to
>> get the message across, that's what a custom printk ratelimit is for.
>
>Ok, feel free to suggest a fix, better yet send a patch. Otherwise,
>you'd have to wait for my vacation to end first. :-)

Sure thing, I'll send a patch tomorrow, then. :-)
Srinivas Pandruvada July 14, 2020, 6:52 p.m. UTC | #11
On Tue, 2020-07-14 at 18:56 +0200, Borislav Petkov wrote:
> On Tue, Jul 14, 2020 at 05:04:48PM +0100, Chris Down wrote:
> > Since the issue involves DPTF which is only supported via binary
> > blobs, I
> > can't say for certain what the issue is. As I understand it, when
> > the
> > throttling behaviour isn't explicitly configured by the OS kernel,
> > the
> > default policy is extremely overeager. Matthew also had a look at
> > it[0], but
> > I don't know if anything eventually happened there. I've cc'ed him.
> > 
> > Either way, again, this isn't really the point. :-) The point is
> > that there
> > _are_ currently widespread cases involving poking MSRs from
> > userspace,
> > however sacrilegious or ugly (which I agree with!), and while
> > people should
> > be told about that, it's excessive to have the potential to take up
> > 80% of
> > kmsg in the default configuration. It doesn't take thousands of
> > messages to
> > get the message across, that's what a custom printk ratelimit is
> > for.
> 
> Ok, feel free to suggest a fix, better yet send a patch. Otherwise,
> you'd have to wait for my vacation to end first. :-)
> 
> > 0: https://twitter.com/mjg59/status/1034132444201582596
> 
> As to the power issue, lemme CC some Intel folks I found in
> MAINTAINERS.
> 
> Intel folks, pls check the link above and upthread: Why TF do people
> need to use some luserspace daemon which pokes at MSRs which the
> kernel
> writes to too, in order to bypass some apparently too conservative
> throttling, AFAIU?
For issues related to thermal or power, we don't expect to poke MSRs
from user space by any daemon. We have sysfs interfaces for the
required controls. This is also true for controls via MMIO space.
Anytime if it is safe to add, we are adding controls via sysfs.

The tool in question from the link (not from Intel), when developed may
not have TCC or RAPL-MMIO controls via sysfs. We have sysfs interfaces
for a while. They can send email to me to justify other controls if
any.
 
Only time direct MSR access is required is for debug tools like
turbostat.

> 
> And why does this work on windoze reportedly?
This is not related to MSR or MMIO. This is related to some ACPI
tables. In Linux, thermald will adjust these knobs like Windows. It was
missing some ACPI details, which Matthew Garrett submitted patches to
kernel and getting merged with 5.8 series.

Thanks,
Srinivas
Matthew Garrett July 14, 2020, 7:17 p.m. UTC | #12
On Tue, Jul 14, 2020 at 9:04 AM Chris Down <chris@chrisdown.name> wrote:
> Either way, again, this isn't really the point. :-) The point is that there
> _are_ currently widespread cases involving poking MSRs from userspace, however
> sacrilegious or ugly (which I agree with!), and while people should be told
> about that, it's excessive to have the potential to take up 80% of kmsg in the
> default configuration. It doesn't take thousands of messages to get the message
> across, that's what a custom printk ratelimit is for.

Agreed - we should now offer all the necessary interfaces to avoid
userspace having to hit MSRs directly for thermal management, but that
wasn't always the case, and as a result there's tooling that still
behaves this way.
Borislav Petkov July 15, 2020, 4:26 a.m. UTC | #13
On Tue, Jul 14, 2020 at 11:52:47AM -0700, Srinivas Pandruvada wrote:
> > As to the power issue, lemme CC some Intel folks I found in
> > MAINTAINERS.
> > 
> > Intel folks, pls check the link above and upthread: Why TF do people
> > need to use some luserspace daemon which pokes at MSRs which the
> > kernel
> > writes to too, in order to bypass some apparently too conservative
> > throttling, AFAIU?
> For issues related to thermal or power, we don't expect to poke MSRs
> from user space by any daemon. We have sysfs interfaces for the
> required controls. This is also true for controls via MMIO space.
> Anytime if it is safe to add, we are adding controls via sysfs.
> 
> The tool in question from the link (not from Intel), when developed may
> not have TCC or RAPL-MMIO controls via sysfs. We have sysfs interfaces
> for a while. They can send email to me to justify other controls if
> any.

CCed. (I think I got the right email from the repo).

Francesco, see the whole thread starting here:

https://lkml.kernel.org/r/20200714121955.GA2080@chrisdown.name

> > And why does this work on windoze reportedly?
> This is not related to MSR or MMIO. This is related to some ACPI
> tables. In Linux, thermald will adjust these knobs like Windows. It was
> missing some ACPI details, which Matthew Garrett submitted patches to
> kernel and getting merged with 5.8 series.

Good.

Which means that that throttled tool could do the same thing thermald is
doing so that they're all on the same page. Or simply not do anything
and tell users to install thermald instead.

Thx.
Mathieu Chouquet-Stringer Nov. 17, 2020, 9 p.m. UTC | #14
Hello all,

On Tue, Jul 14, 2020 at 12:17:50PM -0700, Matthew Garrett wrote:
> On Tue, Jul 14, 2020 at 9:04 AM Chris Down <chris@chrisdown.name> wrote:
> > Either way, again, this isn't really the point. :-) The point is that there
> > _are_ currently widespread cases involving poking MSRs from userspace, however
> > sacrilegious or ugly (which I agree with!), and while people should be told
> > about that, it's excessive to have the potential to take up 80% of kmsg in the
> > default configuration. It doesn't take thousands of messages to get the message
> > across, that's what a custom printk ratelimit is for.

> Agreed - we should now offer all the necessary interfaces to avoid
> userspace having to hit MSRs directly for thermal management, but that
> wasn't always the case, and as a result there's tooling that still
> behaves this way.

I'm late to the party but it seems allowing MSR_IA32_ENERGY_PERF_BIAS
has the downside of flagging the kernel as tainted without telling you
why if you use something like x86_energy_perf_policy (from
tools/power/x86/x86_energy_perf_policy) which itself is used by tuned.

I can taint my kernel manually by just running:
x86_energy_perf_policy -c all performance

The net impact is an OOPS triggered on such kernel won't necessarily be
read by anyone nor analyzed by reporting tools as the kernel is now
considered tainted.

For instance abrt reports the following:
===========8<===========8<===========8<===========8<===========8<===========8<
A kernel problem occurred, but your kernel has been tainted (flags:GS).
Explanation:
S - SMP with CPUs not designed for SMP.
Kernel maintainers are unable to diagnose tainted reports.
===========8<===========8<===========8<===========8<===========8<===========8<

To add to the confusion, kernel documentation
(Documentation/admin-guide/tainted-kernels.rst) is not up to date so
while looking for an explanation, one gets to wonder how what used to be
a regular average computer can now be classified as something using "an
officially SMP incapable processor"...

So while both documentation and tools should be updated as to be clearer
and to not taint the kernel respectively, there's something that remains
to be done to explain why or how the kernel got tainted because of
poking into MSRs...
Borislav Petkov Nov. 17, 2020, 9:20 p.m. UTC | #15
On Tue, Nov 17, 2020 at 10:00:18PM +0100, Mathieu Chouquet-Stringer wrote:
> I'm late to the party but it seems allowing MSR_IA32_ENERGY_PERF_BIAS
> has the downside of flagging the kernel as tainted without telling you
> why if you use something like x86_energy_perf_policy (from
> tools/power/x86/x86_energy_perf_policy) which itself is used by tuned.

Not for long:

https://git.kernel.org/tip/fe0a5788624c8b8f113a35bbe4636e37f9321241

> So while both documentation and tools should be updated as to be clearer
> and to not taint the kernel respectively, there's something that remains
> to be done to explain why or how the kernel got tainted because of
> poking into MSRs...

Because if you poke at random MSRs and you manage to "configure" your
CPU to run "out of spec" - this is what the taint flag is called:
TAINT_CPU_OUT_OF_SPEC - then this is exactly the case you've created: a
CPU executing outside of specifications.

I agree with the update-the-documentation aspect - S does not mean only
SMP kernel on !SMP-capable CPU but the more general, CPU is out of spec.

Thx.
Matthew Garrett Nov. 17, 2020, 9:21 p.m. UTC | #16
On Tue, Nov 17, 2020 at 1:00 PM Mathieu Chouquet-Stringer
<me@mathieu.digital> wrote:

> I'm late to the party but it seems allowing MSR_IA32_ENERGY_PERF_BIAS
> has the downside of flagging the kernel as tainted without telling you
> why if you use something like x86_energy_perf_policy (from
> tools/power/x86/x86_energy_perf_policy) which itself is used by tuned.

I initially pushed back against a kernel interface for
MSR_IA32_ENERGY_PERF_BIAS (cc: Len Brown, who tried mightily to
convince me I was wrong here) on the grounds that it was exporting an
implementation detail rather than providing a generic interface, and
that it was something that could be done via userland instead. I
thought we'd end up with more examples of similar functionality and
could tie it into something more reasonable - history has proven me
wrong on that. I think it's probably reasonable to dust off the driver
that Len submitted however many years ago and push that into the
kernel now.
Matthew Garrett Nov. 17, 2020, 9:22 p.m. UTC | #17
On Tue, Nov 17, 2020 at 1:21 PM Matthew Garrett <mjg59@google.com> wrote:
>
> On Tue, Nov 17, 2020 at 1:00 PM Mathieu Chouquet-Stringer
> <me@mathieu.digital> wrote:
>
> > I'm late to the party but it seems allowing MSR_IA32_ENERGY_PERF_BIAS
> > has the downside of flagging the kernel as tainted without telling you
> > why if you use something like x86_energy_perf_policy (from
> > tools/power/x86/x86_energy_perf_policy) which itself is used by tuned.
>
> I initially pushed back against a kernel interface for
> MSR_IA32_ENERGY_PERF_BIAS (cc: Len Brown, who tried mightily to
> convince me I was wrong here) on the grounds that it was exporting an
> implementation detail rather than providing a generic interface, and
> that it was something that could be done via userland instead. I
> thought we'd end up with more examples of similar functionality and
> could tie it into something more reasonable - history has proven me
> wrong on that. I think it's probably reasonable to dust off the driver
> that Len submitted however many years ago and push that into the
> kernel now.

But ha ok based on Borislav's response it looks like someone's already
done that.
Mathieu Chouquet-Stringer Nov. 18, 2020, 8:58 a.m. UTC | #18
On Tue, Nov 17, 2020 at 10:20:16PM +0100, Borislav Petkov wrote:
> Not for long:
> 
> https://git.kernel.org/tip/fe0a5788624c8b8f113a35bbe4636e37f9321241

That's fantastic.

> Because if you poke at random MSRs and you manage to "configure" your
> CPU to run "out of spec" - this is what the taint flag is called:
> TAINT_CPU_OUT_OF_SPEC - then this is exactly the case you've created: a
> CPU executing outside of specifications.

Don't get me wrong, it makes total sense to do that, it's just the
original reason of !SMP-capable isn't so clear while CPU out of spec is.
Mathieu Chouquet-Stringer Nov. 18, 2020, 9:02 a.m. UTC | #19
On Tue, Nov 17, 2020 at 01:22:49PM -0800, Matthew Garrett wrote:
> > MSR_IA32_ENERGY_PERF_BIAS (cc: Len Brown, who tried mightily to
> > convince me I was wrong here) on the grounds that it was exporting an
> > implementation detail rather than providing a generic interface, and
> > that it was something that could be done via userland instead. I
> > thought we'd end up with more examples of similar functionality and
> > could tie it into something more reasonable - history has proven me
> > wrong on that. I think it's probably reasonable to dust off the driver
> > that Len submitted however many years ago and push that into the
> > kernel now.
> 
> But ha ok based on Borislav's response it looks like someone's already
> done that.

Indeed, all is good. Just have to wait for it to be merged and the
proper kernel to be released...
Mathieu Chouquet-Stringer Nov. 18, 2020, 9:09 a.m. UTC | #20
On Tue, Nov 17, 2020 at 10:20:16PM +0100, Borislav Petkov wrote:
> I agree with the update-the-documentation aspect - S does not mean only
> SMP kernel on !SMP-capable CPU but the more general, CPU is out of spec.

Speaking of doc, looking at the patches you submitted, I didn't see any
update to the documentation. Would you like me to create a patch for
that?
Borislav Petkov Nov. 18, 2020, 11:50 a.m. UTC | #21
On Wed, Nov 18, 2020 at 10:09:29AM +0100, Mathieu Chouquet-Stringer wrote:
> Speaking of doc, looking at the patches you submitted, I didn't see any
> update to the documentation. Would you like me to create a patch for
> that?

Sure, that would be appreciated.

Thx.

Patch
diff mbox series

diff --git a/arch/x86/kernel/msr.c b/arch/x86/kernel/msr.c
index 1547be359d7f..576c43e39247 100644
--- a/arch/x86/kernel/msr.c
+++ b/arch/x86/kernel/msr.c
@@ -42,6 +42,14 @@ 
 static struct class *msr_class;
 static enum cpuhp_state cpuhp_msr_state;
 
+enum allow_write_msrs {
+	MSR_WRITES_ON,
+	MSR_WRITES_OFF,
+	MSR_WRITES_DEFAULT,
+};
+
+static enum allow_write_msrs allow_writes = MSR_WRITES_DEFAULT;
+
 static ssize_t msr_read(struct file *file, char __user *buf,
 			size_t count, loff_t *ppos)
 {
@@ -70,6 +78,24 @@  static ssize_t msr_read(struct file *file, char __user *buf,
 	return bytes ? bytes : err;
 }
 
+static int filter_write(u32 reg)
+{
+	switch (allow_writes) {
+	case MSR_WRITES_ON:  return 0;		break;
+	case MSR_WRITES_OFF: return -EPERM;	break;
+	default: break;
+	}
+
+	if (reg == MSR_IA32_ENERGY_PERF_BIAS)
+		return 0;
+
+	pr_err_ratelimited("Write to unrecognized MSR 0x%x by %s\n"
+			   "Please report to x86@kernel.org\n",
+			   reg, current->comm);
+
+	return 0;
+}
+
 static ssize_t msr_write(struct file *file, const char __user *buf,
 			 size_t count, loff_t *ppos)
 {
@@ -84,6 +110,10 @@  static ssize_t msr_write(struct file *file, const char __user *buf,
 	if (err)
 		return err;
 
+	err = filter_write(reg);
+	if (err)
+		return err;
+
 	if (count % 8)
 		return -EINVAL;	/* Invalid chunk size */
 
@@ -92,9 +122,13 @@  static ssize_t msr_write(struct file *file, const char __user *buf,
 			err = -EFAULT;
 			break;
 		}
+
+		add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK);
+
 		err = wrmsr_safe_on_cpu(cpu, reg, data[0], data[1]);
 		if (err)
 			break;
+
 		tmp += 2;
 		bytes += 8;
 	}
@@ -242,6 +276,41 @@  static void __exit msr_exit(void)
 }
 module_exit(msr_exit)
 
+static int set_allow_writes(const char *val, const struct kernel_param *cp)
+{
+	/* val is NUL-terminated, see kernfs_fop_write() */
+	char *s = strstrip((char *)val);
+
+	if (!strcmp(s, "on"))
+		allow_writes = MSR_WRITES_ON;
+	else if (!strcmp(s, "off"))
+		allow_writes = MSR_WRITES_OFF;
+	else
+		allow_writes = MSR_WRITES_DEFAULT;
+
+	return 0;
+}
+
+static int get_allow_writes(char *buf, const struct kernel_param *kp)
+{
+	const char *res;
+
+	switch (allow_writes) {
+	case MSR_WRITES_ON:  res = "on"; break;
+	case MSR_WRITES_OFF: res = "off"; break;
+	default: res = "default"; break;
+	}
+
+	return sprintf(buf, "%s\n", res);
+}
+
+static const struct kernel_param_ops allow_writes_ops = {
+	.set = set_allow_writes,
+	.get = get_allow_writes
+};
+
+module_param_cb(allow_writes, &allow_writes_ops, NULL, 0600);
+
 MODULE_AUTHOR("H. Peter Anvin <hpa@zytor.com>");
 MODULE_DESCRIPTION("x86 generic MSR driver");
 MODULE_LICENSE("GPL");