[v2] sparc64: Add support for Application Data Integrity (ADI)
diff mbox series

Message ID 1456951177-23579-1-git-send-email-khalid.aziz@oracle.com
State New, archived
Headers show
Series
  • [v2] sparc64: Add support for Application Data Integrity (ADI)
Related show

Commit Message

Khalid Aziz March 2, 2016, 8:39 p.m. UTC
Enable Application Data Integrity (ADI) support in the sparc
kernel for applications to use ADI in userspace. ADI is a new
feature supported on sparc M7 and newer processors. ADI is supported
for data fetches only and not instruction fetches. This patch adds
prctl commands to enable and disable ADI (TSTATE.mcde), return ADI
parameters to userspace, enable/disable MCD (Memory Corruption
Detection) on selected memory ranges and enable TTE.mcd in PTEs. It
also adds handlers for all traps related to MCD. ADI is not enabled
by default for any task and a task must explicitly enable ADI
(TSTATE.mcde), turn MCD on on a memory range and set version tag
for ADI to be effective for the task. This patch adds support for
ADI for hugepages only. Addresses passed into system calls must be
non-ADI tagged addresses.

Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
---
NOTES: ADI is a new feature added to M7 processor to allow hardware
	to catch rogue accesses to memory. An app can enable ADI on
	its data pages, set version tags on them and use versioned
	addresses (bits 63-60 of the address contain a version tag)
	to access the data pages. If a rogue app attempts to access
	ADI enabled data pages, its access is blocked and processor
	generates an exception. Enabling this functionality for all
	data pages of an app requires adding infrastructure to save
	version tags for any data pages that get swapped out and
	restoring those tags when pages are swapped back in. In this
	first implementation I am enabling ADI for hugepages only
	since these pages are locked in memory and hence avoid the
	issue of saving and restoring tags. Once this core functionality
	is stable, ADI for other memory pages can be enabled more
	easily.

v2:
	- Fixed a build error

 Documentation/prctl/sparc_adi.txt     |  62 ++++++++++
 Documentation/sparc/adi.txt           | 206 +++++++++++++++++++++++++++++++
 arch/sparc/Kconfig                    |  12 ++
 arch/sparc/include/asm/hugetlb.h      |  14 +++
 arch/sparc/include/asm/hypervisor.h   |   2 +
 arch/sparc/include/asm/mmu_64.h       |   1 +
 arch/sparc/include/asm/pgtable_64.h   |  15 +++
 arch/sparc/include/asm/processor_64.h |  19 +++
 arch/sparc/include/asm/ttable.h       |  10 ++
 arch/sparc/include/uapi/asm/asi.h     |   3 +
 arch/sparc/include/uapi/asm/pstate.h  |  10 ++
 arch/sparc/kernel/entry.h             |   3 +
 arch/sparc/kernel/head_64.S           |   1 +
 arch/sparc/kernel/mdesc.c             |  81 +++++++++++++
 arch/sparc/kernel/process_64.c        | 222 ++++++++++++++++++++++++++++++++++
 arch/sparc/kernel/sun4v_mcd.S         |  16 +++
 arch/sparc/kernel/traps_64.c          |  96 ++++++++++++++-
 arch/sparc/kernel/ttable_64.S         |   6 +-
 include/linux/mm.h                    |   2 +
 include/uapi/asm-generic/siginfo.h    |   5 +-
 include/uapi/linux/prctl.h            |  16 +++
 kernel/sys.c                          |  30 +++++
 22 files changed, 826 insertions(+), 6 deletions(-)
 create mode 100644 Documentation/prctl/sparc_adi.txt
 create mode 100644 Documentation/sparc/adi.txt
 create mode 100644 arch/sparc/kernel/sun4v_mcd.S

Comments

Julian Calaby March 2, 2016, 11:08 p.m. UTC | #1
Hi Khalid,

On Thu, Mar 3, 2016 at 7:39 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>
> Enable Application Data Integrity (ADI) support in the sparc
> kernel for applications to use ADI in userspace. ADI is a new
> feature supported on sparc M7 and newer processors. ADI is supported
> for data fetches only and not instruction fetches. This patch adds
> prctl commands to enable and disable ADI (TSTATE.mcde), return ADI
> parameters to userspace, enable/disable MCD (Memory Corruption
> Detection) on selected memory ranges and enable TTE.mcd in PTEs. It
> also adds handlers for all traps related to MCD. ADI is not enabled
> by default for any task and a task must explicitly enable ADI
> (TSTATE.mcde), turn MCD on on a memory range and set version tag
> for ADI to be effective for the task. This patch adds support for
> ADI for hugepages only. Addresses passed into system calls must be
> non-ADI tagged addresses.

I can't comment on the actual functionality here, but I do see a few
minor style issues in your patch.

My big concern is that you're defining a lot of new code that is ADI
specific but isn't inside a CONFIG_SPARC_ADI ifdef. (That said,
handling ADI specific traps if ADI isn't enabled looks like a good
idea to me, however most of the other stuff is just dead code if
CONFIG_SPARC_ADI isn't enabled.)

> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
> ---
> NOTES: ADI is a new feature added to M7 processor to allow hardware
>         to catch rogue accesses to memory. An app can enable ADI on
>         its data pages, set version tags on them and use versioned
>         addresses (bits 63-60 of the address contain a version tag)
>         to access the data pages. If a rogue app attempts to access
>         ADI enabled data pages, its access is blocked and processor
>         generates an exception. Enabling this functionality for all
>         data pages of an app requires adding infrastructure to save
>         version tags for any data pages that get swapped out and
>         restoring those tags when pages are swapped back in. In this
>         first implementation I am enabling ADI for hugepages only
>         since these pages are locked in memory and hence avoid the
>         issue of saving and restoring tags. Once this core functionality
>         is stable, ADI for other memory pages can be enabled more
>         easily.
>
> v2:
>         - Fixed a build error
>
>  Documentation/prctl/sparc_adi.txt     |  62 ++++++++++
>  Documentation/sparc/adi.txt           | 206 +++++++++++++++++++++++++++++++
>  arch/sparc/Kconfig                    |  12 ++
>  arch/sparc/include/asm/hugetlb.h      |  14 +++
>  arch/sparc/include/asm/hypervisor.h   |   2 +
>  arch/sparc/include/asm/mmu_64.h       |   1 +
>  arch/sparc/include/asm/pgtable_64.h   |  15 +++
>  arch/sparc/include/asm/processor_64.h |  19 +++
>  arch/sparc/include/asm/ttable.h       |  10 ++
>  arch/sparc/include/uapi/asm/asi.h     |   3 +
>  arch/sparc/include/uapi/asm/pstate.h  |  10 ++
>  arch/sparc/kernel/entry.h             |   3 +
>  arch/sparc/kernel/head_64.S           |   1 +
>  arch/sparc/kernel/mdesc.c             |  81 +++++++++++++
>  arch/sparc/kernel/process_64.c        | 222 ++++++++++++++++++++++++++++++++++
>  arch/sparc/kernel/sun4v_mcd.S         |  16 +++
>  arch/sparc/kernel/traps_64.c          |  96 ++++++++++++++-
>  arch/sparc/kernel/ttable_64.S         |   6 +-
>  include/linux/mm.h                    |   2 +
>  include/uapi/asm-generic/siginfo.h    |   5 +-
>  include/uapi/linux/prctl.h            |  16 +++
>  kernel/sys.c                          |  30 +++++
>  22 files changed, 826 insertions(+), 6 deletions(-)
>  create mode 100644 Documentation/prctl/sparc_adi.txt
>  create mode 100644 Documentation/sparc/adi.txt
>  create mode 100644 arch/sparc/kernel/sun4v_mcd.S

I must admit that I'm slightly impressed that the documentation is
over a quarter of the lines added. =)

> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
> index 56442d2..0aac0ae 100644
> --- a/arch/sparc/Kconfig
> +++ b/arch/sparc/Kconfig
> @@ -80,6 +80,7 @@ config SPARC64
>         select NO_BOOTMEM
>         select HAVE_ARCH_AUDITSYSCALL
>         select ARCH_SUPPORTS_ATOMIC_RMW
> +       select SPARC_ADI

This doesn't look right.

>  config ARCH_DEFCONFIG
>         string
> @@ -314,6 +315,17 @@ if SPARC64
>  source "kernel/power/Kconfig"
>  endif
>
> +config SPARC_ADI
> +       bool "Application Data Integrity support"
> +       def_bool y if SPARC64

def_bool is for config options without names (i.e. "this is a boolean
value and it's default is...")

So if you want people to be able to disable this option, then you
should remove the select above and just have:

bool "Application Data Integrity support"
default y if SPARC64

If you don't want people disabling it, then there's no point in having
a separate Kconfig symbol.

> +       help
> +         Support for Application Data Integrity (ADI). ADI feature allows
> +         a process to tag memory blocks with version tags. Once ADI is
> +         enabled and version tag is set on a memory block, any access to
> +         it is allowed only if the correct version tag is presented by
> +         a process. This feature is meant to help catch rogue accesses
> +         to memory.
> +

You should probably mention that it's only available on newer
processors and recommend that it's enabled on them.

This code won't break anything on older processors, right? I haven't
looked very closely, but I don't see anything that specifically
disables the code if it's run on, say, a UltraSparc I.

>  config SCHED_SMT
>         bool "SMT (Hyperthreading) scheduler support"
>         depends on SPARC64 && SMP
> diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
> index 6924bde..9a71701 100644
> --- a/arch/sparc/include/asm/processor_64.h
> +++ b/arch/sparc/include/asm/processor_64.h
> @@ -97,6 +97,25 @@ struct thread_struct {
>  struct task_struct;
>  unsigned long thread_saved_pc(struct task_struct *);
>
> +#ifdef CONFIG_SPARC_ADI
> +extern struct adi_caps *get_adi_caps(void);
> +extern long get_sparc_adicaps(unsigned long);
> +extern long set_sparc_pstate_mcde(unsigned long);
> +extern long enable_sparc_adi(unsigned long, unsigned long);
> +extern long disable_sparc_adi(unsigned long, unsigned long);
> +extern long get_sparc_adi_status(unsigned long);
> +extern bool adi_capable(void);
> +
> +#define GET_SPARC_ADICAPS(a)   get_sparc_adicaps(a)
> +#define SET_SPARC_MCDE(a)      set_sparc_pstate_mcde(a)
> +#define ENABLE_SPARC_ADI(a, b) enable_sparc_adi(a, b)
> +#define DISABLE_SPARC_ADI(a, b)        disable_sparc_adi(a, b)
> +#define GET_SPARC_ADI_STATUS(a)        get_sparc_adi_status(a)
> +#define ADI_CAPABLE()          adi_capable()

Get rid of the ADI_CAPABLE macro, the usual pattern here is to define
a static inline function for the entire API when the symbol is
disabled, i.e.

#ifdef CONFIG_SPARC_ADI
...
extern bool adi_capable(void);
#else
...
static inline bool adi_capable(void) {
    return false;
}
#endif

That way you get type checking on the arguments even if the option is
disabled and modern compilers are smart enough to optimise all the
no-op code away. (Not that the type checking is needed here.)

Also, in all but one place you use the ADI_CAPABLE() macro when the
adi_capable() function is defined and available.

> +#else
> +#define ADI_CAPABLE()          0
> +#endif
> +
>  /* On Uniprocessor, even in RMO processes see TSO semantics */
>  #ifdef CONFIG_SMP
>  #define TSTATE_INITIAL_MM      TSTATE_TSO
> diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
> index 6f80936..79f981c 100644
> --- a/arch/sparc/kernel/mdesc.c
> +++ b/arch/sparc/kernel/mdesc.c
> @@ -1007,6 +1013,80 @@ static int mdesc_open(struct inode *inode, struct file *file)
>         return 0;
>  }
>
> +bool adi_capable(void)
> +{
> +       return adi_state.enabled;
> +}
> +
> +struct adi_caps *get_adi_caps(void)
> +{
> +       return &adi_state.caps;
> +}
> +
> +void __init
> +init_adi(void)
> +{
> +       struct mdesc_handle *hp = mdesc_grab();
> +       const char *prop;
> +       u64 pn, *val;
> +       int len;
> +
> +       adi_state.enabled = false;
> +
> +       if (!hp)
> +               return;
> +
> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "cpu");
> +       if (pn == MDESC_NODE_NULL)
> +               goto out;
> +
> +       prop = mdesc_get_property(hp, pn, "hwcap-list", &len);
> +       if (!prop)
> +               goto out;
> +
> +       /*
> +        * Look for "adp" keyword in hwcap-list which would indicate
> +        * ADI support
> +        */
> +       while (len) {
> +               int plen;
> +
> +               if (!strcmp(prop, "adp")) {
> +                       adi_state.enabled = true;
> +                       break;
> +               }
> +
> +               plen = strlen(prop) + 1;
> +               prop += plen;
> +               len -= plen;
> +       }
> +
> +       if (!adi_state.enabled)
> +               goto out;
> +
> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
> +       if (pn == MDESC_NODE_NULL)
> +               goto out;
> +
> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-blksz", &len);
> +       if (!val)
> +               goto out;
> +       adi_state.caps.blksz = *val;
> +
> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-nbits", &len);
> +       if (!val)
> +               goto out;
> +       adi_state.caps.nbits = *val;
> +
> +       val = (u64 *) mdesc_get_property(hp, pn, "ue-on-adp", &len);
> +       if (!val)
> +               goto out;
> +       adi_state.caps.ue_on_adi = *val;
> +
> +out:
> +       mdesc_release(hp);
> +}
> +

Should all the ADI related functions above be within a #ifdef CONFIG_SPARC_ADI?

>  static ssize_t mdesc_read(struct file *file, char __user *buf,
>                           size_t len, loff_t *offp)
>  {
> diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
> index d21cd62..29db583 100644
> --- a/arch/sparc/kernel/traps_64.c
> +++ b/arch/sparc/kernel/traps_64.c
> @@ -2531,6 +2589,38 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
>         force_sig_info(SIGBUS, &info, current);
>  }
>
> +void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
> +                                     unsigned long context)
> +{
> +       siginfo_t info;
> +
> +       if (!ADI_CAPABLE()) {
> +               bad_trap(regs, 0x1a);
> +               return;
> +       }
> +
> +       if (notify_die(DIE_TRAP, "memory corruption precise exception", regs,
> +                      0, 0x8, SIGSEGV) == NOTIFY_STOP)
> +               return;
> +
> +       if (regs->tstate & TSTATE_PRIV) {
> +               pr_emerg("sun4v_mem_corrupt_detect_precise: ADDR[%016lx] "
> +                       "CTX[%lx], going.\n", addr, context);
> +               die_if_kernel("MCD precise", regs);
> +       }
> +
> +       if (test_thread_flag(TIF_32BIT)) {
> +               regs->tpc &= 0xffffffff;
> +               regs->tnpc &= 0xffffffff;
> +       }
> +       info.si_signo = SIGSEGV;
> +       info.si_code = SEGV_ADIPERR;
> +       info.si_errno = 0;
> +       info.si_addr = (void __user *) addr;
> +       info.si_trapno = 0;
> +       force_sig_info(SIGSEGV, &info, current);
> +}
> +

Should this be ifdef'd too?

>  void do_privop(struct pt_regs *regs)
>  {
>         enum ctx_state prev_state = exception_enter();
> diff --git a/kernel/sys.c b/kernel/sys.c
> index 6af9212..fa7b5d9 100644
> --- a/kernel/sys.c
> +++ b/kernel/sys.c
> @@ -103,6 +103,21 @@
>  #ifndef SET_FP_MODE
>  # define SET_FP_MODE(a,b)      (-EINVAL)
>  #endif
> +#ifndef GET_SPARC_ADICAPS
> +# define GET_SPARC_ADICAPS(a)          (-EINVAL)
> +#endif
> +#ifndef SET_SPARC_MCDE
> +# define SET_SPARC_MCDE(a)             (-EINVAL)
> +#endif
> +#ifndef ENABLE_SPARC_ADI
> +# define ENABLE_SPARC_ADI(a, b)                (-EINVAL)
> +#endif
> +#ifndef DISABLE_SPARC_ADI
> +# define DISABLE_SPARC_ADI(a, b)       (-EINVAL)
> +#endif
> +#ifndef GET_SPARC_ADI_STATUS
> +# define GET_SPARC_ADI_STATUS(a)       (-EINVAL)
> +#endif

Ah, I was wondering why you were defining macros in processor_64.h.

>  /*
>   * this is where the system-wide overflow UID and GID are defined, for

I've got a couple more comments, I'll send another email with them shortly.

Thanks,
Khalid Aziz March 3, 2016, 12:25 a.m. UTC | #2
Thanks, Julian! I really appreciate your feedback.

My comments below.

On 03/02/2016 04:08 PM, Julian Calaby wrote:
> Hi Khalid,
>
> On Thu, Mar 3, 2016 at 7:39 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>>
>> Enable Application Data Integrity (ADI) support in the sparc
>> kernel for applications to use ADI in userspace. ADI is a new
>> feature supported on sparc M7 and newer processors. ADI is supported
>> for data fetches only and not instruction fetches. This patch adds
>> prctl commands to enable and disable ADI (TSTATE.mcde), return ADI
>> parameters to userspace, enable/disable MCD (Memory Corruption
>> Detection) on selected memory ranges and enable TTE.mcd in PTEs. It
>> also adds handlers for all traps related to MCD. ADI is not enabled
>> by default for any task and a task must explicitly enable ADI
>> (TSTATE.mcde), turn MCD on on a memory range and set version tag
>> for ADI to be effective for the task. This patch adds support for
>> ADI for hugepages only. Addresses passed into system calls must be
>> non-ADI tagged addresses.
>
> I can't comment on the actual functionality here, but I do see a few
> minor style issues in your patch.
>
> My big concern is that you're defining a lot of new code that is ADI
> specific but isn't inside a CONFIG_SPARC_ADI ifdef. (That said,
> handling ADI specific traps if ADI isn't enabled looks like a good
> idea to me, however most of the other stuff is just dead code if
> CONFIG_SPARC_ADI isn't enabled.)

Some of the code will be executed when CONFIG_SPARC_ADI is not enabled, 
for instance init_adi() which will parse machine description to 
determine if platform supports ADI. On the other hand, it might still 
make sense to enclose this code in #ifdef. More on that below.

>
>> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
>> ---
>> NOTES: ADI is a new feature added to M7 processor to allow hardware
>>          to catch rogue accesses to memory. An app can enable ADI on
>>          its data pages, set version tags on them and use versioned
>>          addresses (bits 63-60 of the address contain a version tag)
>>          to access the data pages. If a rogue app attempts to access
>>          ADI enabled data pages, its access is blocked and processor
>>          generates an exception. Enabling this functionality for all
>>          data pages of an app requires adding infrastructure to save
>>          version tags for any data pages that get swapped out and
>>          restoring those tags when pages are swapped back in. In this
>>          first implementation I am enabling ADI for hugepages only
>>          since these pages are locked in memory and hence avoid the
>>          issue of saving and restoring tags. Once this core functionality
>>          is stable, ADI for other memory pages can be enabled more
>>          easily.
>>
>> v2:
>>          - Fixed a build error
>>
>>   Documentation/prctl/sparc_adi.txt     |  62 ++++++++++
>>   Documentation/sparc/adi.txt           | 206 +++++++++++++++++++++++++++++++
>>   arch/sparc/Kconfig                    |  12 ++
>>   arch/sparc/include/asm/hugetlb.h      |  14 +++
>>   arch/sparc/include/asm/hypervisor.h   |   2 +
>>   arch/sparc/include/asm/mmu_64.h       |   1 +
>>   arch/sparc/include/asm/pgtable_64.h   |  15 +++
>>   arch/sparc/include/asm/processor_64.h |  19 +++
>>   arch/sparc/include/asm/ttable.h       |  10 ++
>>   arch/sparc/include/uapi/asm/asi.h     |   3 +
>>   arch/sparc/include/uapi/asm/pstate.h  |  10 ++
>>   arch/sparc/kernel/entry.h             |   3 +
>>   arch/sparc/kernel/head_64.S           |   1 +
>>   arch/sparc/kernel/mdesc.c             |  81 +++++++++++++
>>   arch/sparc/kernel/process_64.c        | 222 ++++++++++++++++++++++++++++++++++
>>   arch/sparc/kernel/sun4v_mcd.S         |  16 +++
>>   arch/sparc/kernel/traps_64.c          |  96 ++++++++++++++-
>>   arch/sparc/kernel/ttable_64.S         |   6 +-
>>   include/linux/mm.h                    |   2 +
>>   include/uapi/asm-generic/siginfo.h    |   5 +-
>>   include/uapi/linux/prctl.h            |  16 +++
>>   kernel/sys.c                          |  30 +++++
>>   22 files changed, 826 insertions(+), 6 deletions(-)
>>   create mode 100644 Documentation/prctl/sparc_adi.txt
>>   create mode 100644 Documentation/sparc/adi.txt
>>   create mode 100644 arch/sparc/kernel/sun4v_mcd.S
>
> I must admit that I'm slightly impressed that the documentation is
> over a quarter of the lines added. =)
>
>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>> index 56442d2..0aac0ae 100644
>> --- a/arch/sparc/Kconfig
>> +++ b/arch/sparc/Kconfig
>> @@ -80,6 +80,7 @@ config SPARC64
>>          select NO_BOOTMEM
>>          select HAVE_ARCH_AUDITSYSCALL
>>          select ARCH_SUPPORTS_ATOMIC_RMW
>> +       select SPARC_ADI
>
> This doesn't look right.
>
>>   config ARCH_DEFCONFIG
>>          string
>> @@ -314,6 +315,17 @@ if SPARC64
>>   source "kernel/power/Kconfig"
>>   endif
>>
>> +config SPARC_ADI
>> +       bool "Application Data Integrity support"
>> +       def_bool y if SPARC64
>
> def_bool is for config options without names (i.e. "this is a boolean
> value and it's default is...")
>
> So if you want people to be able to disable this option, then you
> should remove the select above and just have:
>
> bool "Application Data Integrity support"
> default y if SPARC64
>
> If you don't want people disabling it, then there's no point in having
> a separate Kconfig symbol.
>

Ah, I see. I do not want people disabling it. I will make changes.

>> +       help
>> +         Support for Application Data Integrity (ADI). ADI feature allows
>> +         a process to tag memory blocks with version tags. Once ADI is
>> +         enabled and version tag is set on a memory block, any access to
>> +         it is allowed only if the correct version tag is presented by
>> +         a process. This feature is meant to help catch rogue accesses
>> +         to memory.
>> +
>
> You should probably mention that it's only available on newer
> processors and recommend that it's enabled on them.

Good point.

>
> This code won't break anything on older processors, right? I haven't
> looked very closely, but I don't see anything that specifically
> disables the code if it's run on, say, a UltraSparc I.

Right, this code does not break anything on older processors and has 
been tested on older machines. init_adi() will detect the platform does 
not support ADI when it parses machine description and will leave ADI 
disabled in that case (adi_state.enabled=false).

>
>>   config SCHED_SMT
>>          bool "SMT (Hyperthreading) scheduler support"
>>          depends on SPARC64 && SMP
>> diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
>> index 6924bde..9a71701 100644
>> --- a/arch/sparc/include/asm/processor_64.h
>> +++ b/arch/sparc/include/asm/processor_64.h
>> @@ -97,6 +97,25 @@ struct thread_struct {
>>   struct task_struct;
>>   unsigned long thread_saved_pc(struct task_struct *);
>>
>> +#ifdef CONFIG_SPARC_ADI
>> +extern struct adi_caps *get_adi_caps(void);
>> +extern long get_sparc_adicaps(unsigned long);
>> +extern long set_sparc_pstate_mcde(unsigned long);
>> +extern long enable_sparc_adi(unsigned long, unsigned long);
>> +extern long disable_sparc_adi(unsigned long, unsigned long);
>> +extern long get_sparc_adi_status(unsigned long);
>> +extern bool adi_capable(void);
>> +
>> +#define GET_SPARC_ADICAPS(a)   get_sparc_adicaps(a)
>> +#define SET_SPARC_MCDE(a)      set_sparc_pstate_mcde(a)
>> +#define ENABLE_SPARC_ADI(a, b) enable_sparc_adi(a, b)
>> +#define DISABLE_SPARC_ADI(a, b)        disable_sparc_adi(a, b)
>> +#define GET_SPARC_ADI_STATUS(a)        get_sparc_adi_status(a)
>> +#define ADI_CAPABLE()          adi_capable()
>
> Get rid of the ADI_CAPABLE macro, the usual pattern here is to define
> a static inline function for the entire API when the symbol is
> disabled, i.e.
>
> #ifdef CONFIG_SPARC_ADI
> ...
> extern bool adi_capable(void);
> #else
> ...
> static inline bool adi_capable(void) {
>      return false;
> }
> #endif
>
> That way you get type checking on the arguments even if the option is
> disabled and modern compilers are smart enough to optimise all the
> no-op code away. (Not that the type checking is needed here.)
>
> Also, in all but one place you use the ADI_CAPABLE() macro when the
> adi_capable() function is defined and available.

I defined ADI_CAPABLE() 0 for the case when CONFIG_SPARC_ADI is not set 
to help compiler optimize sun4v_mem_corrupt_detect_precise(). Since 
sun4v_mem_corrupt_detect_precise() is exception handler, optimizing it 
can be good for performance but perhaps compiler is smart enough to do 
that any way if adi_capable() is defined inline as you show above? I do 
like that doing it this way retains type checking.

>
>> +#else
>> +#define ADI_CAPABLE()          0
>> +#endif
>> +
>>   /* On Uniprocessor, even in RMO processes see TSO semantics */
>>   #ifdef CONFIG_SMP
>>   #define TSTATE_INITIAL_MM      TSTATE_TSO
>> diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
>> index 6f80936..79f981c 100644
>> --- a/arch/sparc/kernel/mdesc.c
>> +++ b/arch/sparc/kernel/mdesc.c
>> @@ -1007,6 +1013,80 @@ static int mdesc_open(struct inode *inode, struct file *file)
>>          return 0;
>>   }
>>
>> +bool adi_capable(void)
>> +{
>> +       return adi_state.enabled;
>> +}
>> +
>> +struct adi_caps *get_adi_caps(void)
>> +{
>> +       return &adi_state.caps;
>> +}
>> +
>> +void __init
>> +init_adi(void)
>> +{
>> +       struct mdesc_handle *hp = mdesc_grab();
>> +       const char *prop;
>> +       u64 pn, *val;
>> +       int len;
>> +
>> +       adi_state.enabled = false;
>> +
>> +       if (!hp)
>> +               return;
>> +
>> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "cpu");
>> +       if (pn == MDESC_NODE_NULL)
>> +               goto out;
>> +
>> +       prop = mdesc_get_property(hp, pn, "hwcap-list", &len);
>> +       if (!prop)
>> +               goto out;
>> +
>> +       /*
>> +        * Look for "adp" keyword in hwcap-list which would indicate
>> +        * ADI support
>> +        */
>> +       while (len) {
>> +               int plen;
>> +
>> +               if (!strcmp(prop, "adp")) {
>> +                       adi_state.enabled = true;
>> +                       break;
>> +               }
>> +
>> +               plen = strlen(prop) + 1;
>> +               prop += plen;
>> +               len -= plen;
>> +       }
>> +
>> +       if (!adi_state.enabled)
>> +               goto out;
>> +
>> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
>> +       if (pn == MDESC_NODE_NULL)
>> +               goto out;
>> +
>> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-blksz", &len);
>> +       if (!val)
>> +               goto out;
>> +       adi_state.caps.blksz = *val;
>> +
>> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-nbits", &len);
>> +       if (!val)
>> +               goto out;
>> +       adi_state.caps.nbits = *val;
>> +
>> +       val = (u64 *) mdesc_get_property(hp, pn, "ue-on-adp", &len);
>> +       if (!val)
>> +               goto out;
>> +       adi_state.caps.ue_on_adi = *val;
>> +
>> +out:
>> +       mdesc_release(hp);
>> +}
>> +
>
> Should all the ADI related functions above be within a #ifdef CONFIG_SPARC_ADI?
>

CONFIG_SPARC_ADI is selected for 64-bit kernels only since M7 is 64-bit 
only. init_adi() will do the right thing whether CONFIG_SPARC_ADI is 
enabled or not. It will parse machine description on 32-bit kernels, 
detect ADI is not supported by the platform and leave 
adi_state.enabled=false. I was considering adding something like 
/proc/sys/vm/sparc_adi_available at later point which would get its data 
from what init_adi() detects. On the other hand, since 32-bit processors 
do not support ADI, why have even this much code run on them. I can 
enclose this code as well inside #ifdef.

>>   static ssize_t mdesc_read(struct file *file, char __user *buf,
>>                            size_t len, loff_t *offp)
>>   {
>> diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
>> index d21cd62..29db583 100644
>> --- a/arch/sparc/kernel/traps_64.c
>> +++ b/arch/sparc/kernel/traps_64.c
>> @@ -2531,6 +2589,38 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
>>          force_sig_info(SIGBUS, &info, current);
>>   }
>>
>> +void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
>> +                                     unsigned long context)
>> +{
>> +       siginfo_t info;
>> +
>> +       if (!ADI_CAPABLE()) {
>> +               bad_trap(regs, 0x1a);
>> +               return;
>> +       }
>> +
>> +       if (notify_die(DIE_TRAP, "memory corruption precise exception", regs,
>> +                      0, 0x8, SIGSEGV) == NOTIFY_STOP)
>> +               return;
>> +
>> +       if (regs->tstate & TSTATE_PRIV) {
>> +               pr_emerg("sun4v_mem_corrupt_detect_precise: ADDR[%016lx] "
>> +                       "CTX[%lx], going.\n", addr, context);
>> +               die_if_kernel("MCD precise", regs);
>> +       }
>> +
>> +       if (test_thread_flag(TIF_32BIT)) {
>> +               regs->tpc &= 0xffffffff;
>> +               regs->tnpc &= 0xffffffff;
>> +       }
>> +       info.si_signo = SIGSEGV;
>> +       info.si_code = SEGV_ADIPERR;
>> +       info.si_errno = 0;
>> +       info.si_addr = (void __user *) addr;
>> +       info.si_trapno = 0;
>> +       force_sig_info(SIGSEGV, &info, current);
>> +}
>> +
>
> Should this be ifdef'd too?

I would prefer to leave exception handlers in place any way unless there 
are strong objections.

>
>>   void do_privop(struct pt_regs *regs)
>>   {
>>          enum ctx_state prev_state = exception_enter();
>> diff --git a/kernel/sys.c b/kernel/sys.c
>> index 6af9212..fa7b5d9 100644
>> --- a/kernel/sys.c
>> +++ b/kernel/sys.c
>> @@ -103,6 +103,21 @@
>>   #ifndef SET_FP_MODE
>>   # define SET_FP_MODE(a,b)      (-EINVAL)
>>   #endif
>> +#ifndef GET_SPARC_ADICAPS
>> +# define GET_SPARC_ADICAPS(a)          (-EINVAL)
>> +#endif
>> +#ifndef SET_SPARC_MCDE
>> +# define SET_SPARC_MCDE(a)             (-EINVAL)
>> +#endif
>> +#ifndef ENABLE_SPARC_ADI
>> +# define ENABLE_SPARC_ADI(a, b)                (-EINVAL)
>> +#endif
>> +#ifndef DISABLE_SPARC_ADI
>> +# define DISABLE_SPARC_ADI(a, b)       (-EINVAL)
>> +#endif
>> +#ifndef GET_SPARC_ADI_STATUS
>> +# define GET_SPARC_ADI_STATUS(a)       (-EINVAL)
>> +#endif
>
> Ah, I was wondering why you were defining macros in processor_64.h.
>
>>   /*
>>    * this is where the system-wide overflow UID and GID are defined, for
>
> I've got a couple more comments, I'll send another email with them shortly.
>
> Thanks,
>

Thanks,
Khalid
Julian Calaby March 3, 2016, 12:48 a.m. UTC | #3
Hi Khalid,

On Thu, Mar 3, 2016 at 11:25 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
> Thanks, Julian! I really appreciate your feedback.

No problem!

> My comments below.
>
> On 03/02/2016 04:08 PM, Julian Calaby wrote:
>>
>> Hi Khalid,
>>
>> On Thu, Mar 3, 2016 at 7:39 AM, Khalid Aziz <khalid.aziz@oracle.com>
>> wrote:
>>>
>>>
>>> Enable Application Data Integrity (ADI) support in the sparc
>>> kernel for applications to use ADI in userspace. ADI is a new
>>> feature supported on sparc M7 and newer processors. ADI is supported
>>> for data fetches only and not instruction fetches. This patch adds
>>> prctl commands to enable and disable ADI (TSTATE.mcde), return ADI
>>> parameters to userspace, enable/disable MCD (Memory Corruption
>>> Detection) on selected memory ranges and enable TTE.mcd in PTEs. It
>>> also adds handlers for all traps related to MCD. ADI is not enabled
>>> by default for any task and a task must explicitly enable ADI
>>> (TSTATE.mcde), turn MCD on on a memory range and set version tag
>>> for ADI to be effective for the task. This patch adds support for
>>> ADI for hugepages only. Addresses passed into system calls must be
>>> non-ADI tagged addresses.
>>
>>
>> I can't comment on the actual functionality here, but I do see a few
>> minor style issues in your patch.
>>
>> My big concern is that you're defining a lot of new code that is ADI
>> specific but isn't inside a CONFIG_SPARC_ADI ifdef. (That said,
>> handling ADI specific traps if ADI isn't enabled looks like a good
>> idea to me, however most of the other stuff is just dead code if
>> CONFIG_SPARC_ADI isn't enabled.)
>
>
> Some of the code will be executed when CONFIG_SPARC_ADI is not enabled, for
> instance init_adi() which will parse machine description to determine if
> platform supports ADI. On the other hand, it might still make sense to
> enclose this code in #ifdef. More on that below.
>
>
>>
>>> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
>>> ---
>>> NOTES: ADI is a new feature added to M7 processor to allow hardware
>>>          to catch rogue accesses to memory. An app can enable ADI on
>>>          its data pages, set version tags on them and use versioned
>>>          addresses (bits 63-60 of the address contain a version tag)
>>>          to access the data pages. If a rogue app attempts to access
>>>          ADI enabled data pages, its access is blocked and processor
>>>          generates an exception. Enabling this functionality for all
>>>          data pages of an app requires adding infrastructure to save
>>>          version tags for any data pages that get swapped out and
>>>          restoring those tags when pages are swapped back in. In this
>>>          first implementation I am enabling ADI for hugepages only
>>>          since these pages are locked in memory and hence avoid the
>>>          issue of saving and restoring tags. Once this core functionality
>>>          is stable, ADI for other memory pages can be enabled more
>>>          easily.
>>>
>>> v2:
>>>          - Fixed a build error
>>>
>>>   Documentation/prctl/sparc_adi.txt     |  62 ++++++++++
>>>   Documentation/sparc/adi.txt           | 206
>>> +++++++++++++++++++++++++++++++
>>>   arch/sparc/Kconfig                    |  12 ++
>>>   arch/sparc/include/asm/hugetlb.h      |  14 +++
>>>   arch/sparc/include/asm/hypervisor.h   |   2 +
>>>   arch/sparc/include/asm/mmu_64.h       |   1 +
>>>   arch/sparc/include/asm/pgtable_64.h   |  15 +++
>>>   arch/sparc/include/asm/processor_64.h |  19 +++
>>>   arch/sparc/include/asm/ttable.h       |  10 ++
>>>   arch/sparc/include/uapi/asm/asi.h     |   3 +
>>>   arch/sparc/include/uapi/asm/pstate.h  |  10 ++
>>>   arch/sparc/kernel/entry.h             |   3 +
>>>   arch/sparc/kernel/head_64.S           |   1 +
>>>   arch/sparc/kernel/mdesc.c             |  81 +++++++++++++
>>>   arch/sparc/kernel/process_64.c        | 222
>>> ++++++++++++++++++++++++++++++++++
>>>   arch/sparc/kernel/sun4v_mcd.S         |  16 +++
>>>   arch/sparc/kernel/traps_64.c          |  96 ++++++++++++++-
>>>   arch/sparc/kernel/ttable_64.S         |   6 +-
>>>   include/linux/mm.h                    |   2 +
>>>   include/uapi/asm-generic/siginfo.h    |   5 +-
>>>   include/uapi/linux/prctl.h            |  16 +++
>>>   kernel/sys.c                          |  30 +++++
>>>   22 files changed, 826 insertions(+), 6 deletions(-)
>>>   create mode 100644 Documentation/prctl/sparc_adi.txt
>>>   create mode 100644 Documentation/sparc/adi.txt
>>>   create mode 100644 arch/sparc/kernel/sun4v_mcd.S
>>
>>
>> I must admit that I'm slightly impressed that the documentation is
>> over a quarter of the lines added. =)
>>
>>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>>> index 56442d2..0aac0ae 100644
>>> --- a/arch/sparc/Kconfig
>>> +++ b/arch/sparc/Kconfig
>>> @@ -80,6 +80,7 @@ config SPARC64
>>>          select NO_BOOTMEM
>>>          select HAVE_ARCH_AUDITSYSCALL
>>>          select ARCH_SUPPORTS_ATOMIC_RMW
>>> +       select SPARC_ADI
>>
>>
>> This doesn't look right.
>>
>>>   config ARCH_DEFCONFIG
>>>          string
>>> @@ -314,6 +315,17 @@ if SPARC64
>>>   source "kernel/power/Kconfig"
>>>   endif
>>>
>>> +config SPARC_ADI
>>> +       bool "Application Data Integrity support"
>>> +       def_bool y if SPARC64
>>
>>
>> def_bool is for config options without names (i.e. "this is a boolean
>> value and it's default is...")
>>
>> So if you want people to be able to disable this option, then you
>> should remove the select above and just have:
>>
>> bool "Application Data Integrity support"
>> default y if SPARC64
>>
>> If you don't want people disabling it, then there's no point in having
>> a separate Kconfig symbol.
>>
>
> Ah, I see. I do not want people disabling it. I will make changes.

Why don't you want people disabling it? I must acknowledge that it's
not a lot of code, but I can see people wanting to build "minimal"
kernels for processors without ADI or to run some specific thing that
doesn't use ADI. Providing the kernel responds appropriately if
there's an unexpected ADI fault I don't see why the code would be
needed if it'll never be used.

>>> +       help
>>> +         Support for Application Data Integrity (ADI). ADI feature
>>> allows
>>> +         a process to tag memory blocks with version tags. Once ADI is
>>> +         enabled and version tag is set on a memory block, any access to
>>> +         it is allowed only if the correct version tag is presented by
>>> +         a process. This feature is meant to help catch rogue accesses
>>> +         to memory.
>>> +
>>
>>
>> You should probably mention that it's only available on newer
>> processors and recommend that it's enabled on them.
>
>
> Good point.
>
>>
>> This code won't break anything on older processors, right? I haven't
>> looked very closely, but I don't see anything that specifically
>> disables the code if it's run on, say, a UltraSparc I.
>
>
> Right, this code does not break anything on older processors and has been
> tested on older machines. init_adi() will detect the platform does not
> support ADI when it parses machine description and will leave ADI disabled
> in that case (adi_state.enabled=false).

Awesome, I just wanted to check =)

>>>   config SCHED_SMT
>>>          bool "SMT (Hyperthreading) scheduler support"
>>>          depends on SPARC64 && SMP
>>> diff --git a/arch/sparc/include/asm/processor_64.h
>>> b/arch/sparc/include/asm/processor_64.h
>>> index 6924bde..9a71701 100644
>>> --- a/arch/sparc/include/asm/processor_64.h
>>> +++ b/arch/sparc/include/asm/processor_64.h
>>> @@ -97,6 +97,25 @@ struct thread_struct {
>>>   struct task_struct;
>>>   unsigned long thread_saved_pc(struct task_struct *);
>>>
>>> +#ifdef CONFIG_SPARC_ADI
>>> +extern struct adi_caps *get_adi_caps(void);
>>> +extern long get_sparc_adicaps(unsigned long);
>>> +extern long set_sparc_pstate_mcde(unsigned long);
>>> +extern long enable_sparc_adi(unsigned long, unsigned long);
>>> +extern long disable_sparc_adi(unsigned long, unsigned long);
>>> +extern long get_sparc_adi_status(unsigned long);
>>> +extern bool adi_capable(void);
>>> +
>>> +#define GET_SPARC_ADICAPS(a)   get_sparc_adicaps(a)
>>> +#define SET_SPARC_MCDE(a)      set_sparc_pstate_mcde(a)
>>> +#define ENABLE_SPARC_ADI(a, b) enable_sparc_adi(a, b)
>>> +#define DISABLE_SPARC_ADI(a, b)        disable_sparc_adi(a, b)
>>> +#define GET_SPARC_ADI_STATUS(a)        get_sparc_adi_status(a)
>>> +#define ADI_CAPABLE()          adi_capable()
>>
>>
>> Get rid of the ADI_CAPABLE macro, the usual pattern here is to define
>> a static inline function for the entire API when the symbol is
>> disabled, i.e.
>>
>> #ifdef CONFIG_SPARC_ADI
>> ...
>> extern bool adi_capable(void);
>> #else
>> ...
>> static inline bool adi_capable(void) {
>>      return false;
>> }
>> #endif
>>
>> That way you get type checking on the arguments even if the option is
>> disabled and modern compilers are smart enough to optimise all the
>> no-op code away. (Not that the type checking is needed here.)
>>
>> Also, in all but one place you use the ADI_CAPABLE() macro when the
>> adi_capable() function is defined and available.
>
>
> I defined ADI_CAPABLE() 0 for the case when CONFIG_SPARC_ADI is not set to
> help compiler optimize sun4v_mem_corrupt_detect_precise(). Since
> sun4v_mem_corrupt_detect_precise() is exception handler, optimizing it can
> be good for performance but perhaps compiler is smart enough to do that any
> way if adi_capable() is defined inline as you show above? I do like that
> doing it this way retains type checking.

Inlines can usually be treated as if they're macros from an optimising
perspective.

>>> +#else
>>> +#define ADI_CAPABLE()          0
>>> +#endif
>>> +
>>>   /* On Uniprocessor, even in RMO processes see TSO semantics */
>>>   #ifdef CONFIG_SMP
>>>   #define TSTATE_INITIAL_MM      TSTATE_TSO
>>> diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
>>> index 6f80936..79f981c 100644
>>> --- a/arch/sparc/kernel/mdesc.c
>>> +++ b/arch/sparc/kernel/mdesc.c
>>> @@ -1007,6 +1013,80 @@ static int mdesc_open(struct inode *inode, struct
>>> file *file)
>>>          return 0;
>>>   }
>>>
>>> +bool adi_capable(void)
>>> +{
>>> +       return adi_state.enabled;
>>> +}
>>> +
>>> +struct adi_caps *get_adi_caps(void)
>>> +{
>>> +       return &adi_state.caps;
>>> +}
>>> +
>>> +void __init
>>> +init_adi(void)
>>> +{
>>> +       struct mdesc_handle *hp = mdesc_grab();
>>> +       const char *prop;
>>> +       u64 pn, *val;
>>> +       int len;
>>> +
>>> +       adi_state.enabled = false;
>>> +
>>> +       if (!hp)
>>> +               return;
>>> +
>>> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "cpu");
>>> +       if (pn == MDESC_NODE_NULL)
>>> +               goto out;
>>> +
>>> +       prop = mdesc_get_property(hp, pn, "hwcap-list", &len);
>>> +       if (!prop)
>>> +               goto out;
>>> +
>>> +       /*
>>> +        * Look for "adp" keyword in hwcap-list which would indicate
>>> +        * ADI support
>>> +        */
>>> +       while (len) {
>>> +               int plen;
>>> +
>>> +               if (!strcmp(prop, "adp")) {
>>> +                       adi_state.enabled = true;
>>> +                       break;
>>> +               }
>>> +
>>> +               plen = strlen(prop) + 1;
>>> +               prop += plen;
>>> +               len -= plen;
>>> +       }
>>> +
>>> +       if (!adi_state.enabled)
>>> +               goto out;
>>> +
>>> +       pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
>>> +       if (pn == MDESC_NODE_NULL)
>>> +               goto out;
>>> +
>>> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-blksz", &len);
>>> +       if (!val)
>>> +               goto out;
>>> +       adi_state.caps.blksz = *val;
>>> +
>>> +       val = (u64 *) mdesc_get_property(hp, pn, "adp-nbits", &len);
>>> +       if (!val)
>>> +               goto out;
>>> +       adi_state.caps.nbits = *val;
>>> +
>>> +       val = (u64 *) mdesc_get_property(hp, pn, "ue-on-adp", &len);
>>> +       if (!val)
>>> +               goto out;
>>> +       adi_state.caps.ue_on_adi = *val;
>>> +
>>> +out:
>>> +       mdesc_release(hp);
>>> +}
>>> +
>>
>>
>> Should all the ADI related functions above be within a #ifdef
>> CONFIG_SPARC_ADI?
>>
>
> CONFIG_SPARC_ADI is selected for 64-bit kernels only since M7 is 64-bit
> only. init_adi() will do the right thing whether CONFIG_SPARC_ADI is enabled
> or not. It will parse machine description on 32-bit kernels, detect ADI is
> not supported by the platform and leave adi_state.enabled=false. I was
> considering adding something like /proc/sys/vm/sparc_adi_available at later
> point which would get its data from what init_adi() detects. On the other
> hand, since 32-bit processors do not support ADI, why have even this much
> code run on them. I can enclose this code as well inside #ifdef.

Precisely.

>>>   static ssize_t mdesc_read(struct file *file, char __user *buf,
>>>                            size_t len, loff_t *offp)
>>>   {
>>> diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
>>> index d21cd62..29db583 100644
>>> --- a/arch/sparc/kernel/traps_64.c
>>> +++ b/arch/sparc/kernel/traps_64.c
>>> @@ -2531,6 +2589,38 @@ void sun4v_do_mna(struct pt_regs *regs, unsigned
>>> long addr, unsigned long type_c
>>>          force_sig_info(SIGBUS, &info, current);
>>>   }
>>>
>>> +void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned
>>> long addr,
>>> +                                     unsigned long context)
>>> +{
>>> +       siginfo_t info;
>>> +
>>> +       if (!ADI_CAPABLE()) {
>>> +               bad_trap(regs, 0x1a);
>>> +               return;
>>> +       }
>>> +
>>> +       if (notify_die(DIE_TRAP, "memory corruption precise exception",
>>> regs,
>>> +                      0, 0x8, SIGSEGV) == NOTIFY_STOP)
>>> +               return;
>>> +
>>> +       if (regs->tstate & TSTATE_PRIV) {
>>> +               pr_emerg("sun4v_mem_corrupt_detect_precise: ADDR[%016lx]
>>> "
>>> +                       "CTX[%lx], going.\n", addr, context);
>>> +               die_if_kernel("MCD precise", regs);
>>> +       }
>>> +
>>> +       if (test_thread_flag(TIF_32BIT)) {
>>> +               regs->tpc &= 0xffffffff;
>>> +               regs->tnpc &= 0xffffffff;
>>> +       }
>>> +       info.si_signo = SIGSEGV;
>>> +       info.si_code = SEGV_ADIPERR;
>>> +       info.si_errno = 0;
>>> +       info.si_addr = (void __user *) addr;
>>> +       info.si_trapno = 0;
>>> +       force_sig_info(SIGSEGV, &info, current);
>>> +}
>>> +
>>
>>
>> Should this be ifdef'd too?
>
>
> I would prefer to leave exception handlers in place any way unless there are
> strong objections.

Thinking over it again, you're right. It's possible, however unlikely,
that this trap could occur without ADI being enabled anywhere, so we
should handle it unconditionally.

>>>   void do_privop(struct pt_regs *regs)
>>>   {
>>>          enum ctx_state prev_state = exception_enter();

Thanks,
Khalid Aziz March 3, 2016, 5:28 p.m. UTC | #4
On 03/02/2016 05:48 PM, Julian Calaby wrote:
> Hi Khalid,
>
> On Thu, Mar 3, 2016 at 11:25 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>> Thanks, Julian! I really appreciate your feedback.
>
> No problem!
>
>> My comments below.
>>
>> On 03/02/2016 04:08 PM, Julian Calaby wrote:
>>>
>>> Hi Khalid,
>>>
>>> On Thu, Mar 3, 2016 at 7:39 AM, Khalid Aziz <khalid.aziz@oracle.com>
>>> wrote:
>>>>
>>>>
>>>> Enable Application Data Integrity (ADI) support in the sparc
>>>> kernel for applications to use ADI in userspace. ADI is a new
>>>> feature supported on sparc M7 and newer processors. ADI is supported
>>>> for data fetches only and not instruction fetches. This patch adds
>>>> prctl commands to enable and disable ADI (TSTATE.mcde), return ADI
>>>> parameters to userspace, enable/disable MCD (Memory Corruption
>>>> Detection) on selected memory ranges and enable TTE.mcd in PTEs. It
>>>> also adds handlers for all traps related to MCD. ADI is not enabled
>>>> by default for any task and a task must explicitly enable ADI
>>>> (TSTATE.mcde), turn MCD on on a memory range and set version tag
>>>> for ADI to be effective for the task. This patch adds support for
>>>> ADI for hugepages only. Addresses passed into system calls must be
>>>> non-ADI tagged addresses.
>>>
>>>
>>> I can't comment on the actual functionality here, but I do see a few
>>> minor style issues in your patch.
>>>
>>> My big concern is that you're defining a lot of new code that is ADI
>>> specific but isn't inside a CONFIG_SPARC_ADI ifdef. (That said,
>>> handling ADI specific traps if ADI isn't enabled looks like a good
>>> idea to me, however most of the other stuff is just dead code if
>>> CONFIG_SPARC_ADI isn't enabled.)
>>
>>
>> Some of the code will be executed when CONFIG_SPARC_ADI is not enabled, for
>> instance init_adi() which will parse machine description to determine if
>> platform supports ADI. On the other hand, it might still make sense to
>> enclose this code in #ifdef. More on that below.
>>
>>
>>>
>>>> Signed-off-by: Khalid Aziz <khalid.aziz@oracle.com>
>>>> ---
>>>> NOTES: ADI is a new feature added to M7 processor to allow hardware
>>>>           to catch rogue accesses to memory. An app can enable ADI on
>>>>           its data pages, set version tags on them and use versioned
>>>>           addresses (bits 63-60 of the address contain a version tag)
>>>>           to access the data pages. If a rogue app attempts to access
>>>>           ADI enabled data pages, its access is blocked and processor
>>>>           generates an exception. Enabling this functionality for all
>>>>           data pages of an app requires adding infrastructure to save
>>>>           version tags for any data pages that get swapped out and
>>>>           restoring those tags when pages are swapped back in. In this
>>>>           first implementation I am enabling ADI for hugepages only
>>>>           since these pages are locked in memory and hence avoid the
>>>>           issue of saving and restoring tags. Once this core functionality
>>>>           is stable, ADI for other memory pages can be enabled more
>>>>           easily.
>>>>
>>>> v2:
>>>>           - Fixed a build error
>>>>
>>>>    Documentation/prctl/sparc_adi.txt     |  62 ++++++++++
>>>>    Documentation/sparc/adi.txt           | 206
>>>> +++++++++++++++++++++++++++++++
>>>>    arch/sparc/Kconfig                    |  12 ++
>>>>    arch/sparc/include/asm/hugetlb.h      |  14 +++
>>>>    arch/sparc/include/asm/hypervisor.h   |   2 +
>>>>    arch/sparc/include/asm/mmu_64.h       |   1 +
>>>>    arch/sparc/include/asm/pgtable_64.h   |  15 +++
>>>>    arch/sparc/include/asm/processor_64.h |  19 +++
>>>>    arch/sparc/include/asm/ttable.h       |  10 ++
>>>>    arch/sparc/include/uapi/asm/asi.h     |   3 +
>>>>    arch/sparc/include/uapi/asm/pstate.h  |  10 ++
>>>>    arch/sparc/kernel/entry.h             |   3 +
>>>>    arch/sparc/kernel/head_64.S           |   1 +
>>>>    arch/sparc/kernel/mdesc.c             |  81 +++++++++++++
>>>>    arch/sparc/kernel/process_64.c        | 222
>>>> ++++++++++++++++++++++++++++++++++
>>>>    arch/sparc/kernel/sun4v_mcd.S         |  16 +++
>>>>    arch/sparc/kernel/traps_64.c          |  96 ++++++++++++++-
>>>>    arch/sparc/kernel/ttable_64.S         |   6 +-
>>>>    include/linux/mm.h                    |   2 +
>>>>    include/uapi/asm-generic/siginfo.h    |   5 +-
>>>>    include/uapi/linux/prctl.h            |  16 +++
>>>>    kernel/sys.c                          |  30 +++++
>>>>    22 files changed, 826 insertions(+), 6 deletions(-)
>>>>    create mode 100644 Documentation/prctl/sparc_adi.txt
>>>>    create mode 100644 Documentation/sparc/adi.txt
>>>>    create mode 100644 arch/sparc/kernel/sun4v_mcd.S
>>>
>>>
>>> I must admit that I'm slightly impressed that the documentation is
>>> over a quarter of the lines added. =)
>>>
>>>> diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
>>>> index 56442d2..0aac0ae 100644
>>>> --- a/arch/sparc/Kconfig
>>>> +++ b/arch/sparc/Kconfig
>>>> @@ -80,6 +80,7 @@ config SPARC64
>>>>           select NO_BOOTMEM
>>>>           select HAVE_ARCH_AUDITSYSCALL
>>>>           select ARCH_SUPPORTS_ATOMIC_RMW
>>>> +       select SPARC_ADI
>>>
>>>
>>> This doesn't look right.
>>>
>>>>    config ARCH_DEFCONFIG
>>>>           string
>>>> @@ -314,6 +315,17 @@ if SPARC64
>>>>    source "kernel/power/Kconfig"
>>>>    endif
>>>>
>>>> +config SPARC_ADI
>>>> +       bool "Application Data Integrity support"
>>>> +       def_bool y if SPARC64
>>>
>>>
>>> def_bool is for config options without names (i.e. "this is a boolean
>>> value and it's default is...")
>>>
>>> So if you want people to be able to disable this option, then you
>>> should remove the select above and just have:
>>>
>>> bool "Application Data Integrity support"
>>> default y if SPARC64
>>>
>>> If you don't want people disabling it, then there's no point in having
>>> a separate Kconfig symbol.
>>>
>>
>> Ah, I see. I do not want people disabling it. I will make changes.
>
> Why don't you want people disabling it? I must acknowledge that it's
> not a lot of code, but I can see people wanting to build "minimal"
> kernels for processors without ADI or to run some specific thing that
> doesn't use ADI. Providing the kernel responds appropriately if
> there's an unexpected ADI fault I don't see why the code would be
> needed if it'll never be used.
>

Hi Julian,

My goal in making CONFIG_SPARC_ADI auto-selected was to not add yet 
another config option that end user has to understand and figure out 
what to do with, and make the kernel self-configuring where ADI simply 
becomes available if platform supports it. Kernel auto-detecting 
platform features is especially useful for distro kernels. I do see your 
point in being able to build a minimal kernel when building a custom 
kernel. Both options of making CONFIG_SPARC_ADI auto-selected or not, 
have pros and cons. I don't have a strong feeling about it one way or 
the other and can go either way.

Thanks,
Khalid
David Miller March 6, 2016, 4:07 a.m. UTC | #5
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Wed,  2 Mar 2016 13:39:37 -0700

> 	In this
> 	first implementation I am enabling ADI for hugepages only
> 	since these pages are locked in memory and hence avoid the
> 	issue of saving and restoring tags.

This makes the feature almost entire useless.

Non-hugepages must be in the initial implementation.

> +	PR_ENABLE_SPARC_ADI - Enable ADI checking in all pages in the address
> +		range specified. The pages in the range must be already
> +		locked. This operation enables the TTE.mcd bit for the
> +		pages specified. arg2 is the starting address for address
> +		range and must be page aligned. arg3 is the length of
> +		memory address range and must be a multiple of page size.

I strongly dislike this interface, and it makes the prtctl cases look
extremely ugly and hide to the casual reader what the code is actually
doing.

This is an mprotect() operation, so add a new flag bit and implement
this via mprotect please.

Then since you are guarenteed to have a consistent ADI setting for
every single VMA region, you never "lose" the ADI state when you swap
out.  It's implicit in the VMA itself, because you'll store in the VMA
that this is an ADI region.

I also want this enabled unconditionally, without any Kconfig knobs.

Thanks.
Khalid Aziz March 7, 2016, 3:07 p.m. UTC | #6
On 03/05/2016 09:07 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Wed,  2 Mar 2016 13:39:37 -0700
>
>> 	In this
>> 	first implementation I am enabling ADI for hugepages only
>> 	since these pages are locked in memory and hence avoid the
>> 	issue of saving and restoring tags.
>
> This makes the feature almost entire useless.
>
> Non-hugepages must be in the initial implementation.

Hi David,

Thanks for the feedback. I will get this working for non-hugepages as 
well. ADI state of each VMA region is already stored in the VMA itself 
in my first implementation, so I do not lose it when the page is swapped 
out. The trouble is ADI version tags for each VMA region have to be 
stored on the swapped out pages since the ADI version tags are flushed 
when TLB entry for a page is flushed. When that page is brought back in, 
its version tags have to be set up again. Version tags are set on 
cacheline boundary and hence there can be multiple version tags for a 
single page. Version tags have to be stored in the swap space somehow 
along with the page. I can start out with allowing ADI to be enabled 
only on pages locked in memory.

>
>> +	PR_ENABLE_SPARC_ADI - Enable ADI checking in all pages in the address
>> +		range specified. The pages in the range must be already
>> +		locked. This operation enables the TTE.mcd bit for the
>> +		pages specified. arg2 is the starting address for address
>> +		range and must be page aligned. arg3 is the length of
>> +		memory address range and must be a multiple of page size.
>
> I strongly dislike this interface, and it makes the prtctl cases look
> extremely ugly and hide to the casual reader what the code is actually
> doing.
>
> This is an mprotect() operation, so add a new flag bit and implement
> this via mprotect please.

That is an interesting idea. Adding a PROT_ADI protection to mprotect() 
sounds cleaner. There are three steps to enabling ADI - (1) set 
PSTATE.mcde bit which is not tied to any VMA, (2) set TTE.mcd for each 
VMA, and (3) set the version tag on cacheline using MCD ASI. I can 
combine steps 1 and 2 in one mprotect() call. That will leave 
PR_GET_SPARC_ADICAPS and PR_GET_SPARC_ADI_STATUS prctl commands still to 
be implemented. PR_SET_SPARC_ADI is also used to check if the process 
has PSTATE.mcde bit set. I could use PR_GET_SPARC_ADI_STATUS to do that 
where return values of 0 and 1 mean the same as before and possibly add 
return value of 2 to mean PSTATE.mcde is not set?

>
> Then since you are guarenteed to have a consistent ADI setting for
> every single VMA region, you never "lose" the ADI state when you swap
> out.  It's implicit in the VMA itself, because you'll store in the VMA
> that this is an ADI region.
>
> I also want this enabled unconditionally, without any Kconfig knobs.
>

I can remove CONFIG_SPARC_ADI. It does mean this code will be built into 
32-bit kernels as well but it will be inactive code.

Thanks,
Khalid
Rob Gardner March 7, 2016, 3:30 p.m. UTC | #7
On 03/07/2016 07:07 AM, Khalid Aziz wrote:
> On 03/05/2016 09:07 PM, David Miller wrote:
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Wed,  2 Mar 2016 13:39:37 -0700
>>
>>>     In this
>>>     first implementation I am enabling ADI for hugepages only
>>>     since these pages are locked in memory and hence avoid the
>>>     issue of saving and restoring tags.
>>
>> This makes the feature almost entire useless.
>>
>> Non-hugepages must be in the initial implementation.
>
> Hi David,
>
> Thanks for the feedback. I will get this working for non-hugepages as 
> well. ADI state of each VMA region is already stored in the VMA itself 
> in my first implementation, so I do not lose it when the page is 
> swapped out. The trouble is ADI version tags for each VMA region have 
> to be stored on the swapped out pages since the ADI version tags are 
> flushed when TLB entry for a page is flushed. 


Khalid,

Are you sure about that last statement? My understanding is that the 
tags are stored in physical memory, and remain there until explicitly 
changed or removed, and so flushing a TLB entry has no effect on the ADI 
tags. If it worked the way you think, then somebody would have to 
potentially reload a long list of ADI tags on every TLB miss.

Rob



> When that page is brought back in, its version tags have to be set up 
> again. Version tags are set on cacheline boundary and hence there can 
> be multiple version tags for a single page. Version tags have to be 
> stored in the swap space somehow along with the page. I can start out 
> with allowing ADI to be enabled only on pages locked in memory.
>
>>
>>> +    PR_ENABLE_SPARC_ADI - Enable ADI checking in all pages in the 
>>> address
>>> +        range specified. The pages in the range must be already
>>> +        locked. This operation enables the TTE.mcd bit for the
>>> +        pages specified. arg2 is the starting address for address
>>> +        range and must be page aligned. arg3 is the length of
>>> +        memory address range and must be a multiple of page size.
>>
>> I strongly dislike this interface, and it makes the prtctl cases look
>> extremely ugly and hide to the casual reader what the code is actually
>> doing.
>>
>> This is an mprotect() operation, so add a new flag bit and implement
>> this via mprotect please.
>
> That is an interesting idea. Adding a PROT_ADI protection to 
> mprotect() sounds cleaner. There are three steps to enabling ADI - (1) 
> set PSTATE.mcde bit which is not tied to any VMA, (2) set TTE.mcd for 
> each VMA, and (3) set the version tag on cacheline using MCD ASI. I 
> can combine steps 1 and 2 in one mprotect() call. That will leave 
> PR_GET_SPARC_ADICAPS and PR_GET_SPARC_ADI_STATUS prctl commands still 
> to be implemented. PR_SET_SPARC_ADI is also used to check if the 
> process has PSTATE.mcde bit set. I could use PR_GET_SPARC_ADI_STATUS 
> to do that where return values of 0 and 1 mean the same as before and 
> possibly add return value of 2 to mean PSTATE.mcde is not set?
>
>>
>> Then since you are guarenteed to have a consistent ADI setting for
>> every single VMA region, you never "lose" the ADI state when you swap
>> out.  It's implicit in the VMA itself, because you'll store in the VMA
>> that this is an ADI region.
>>
>> I also want this enabled unconditionally, without any Kconfig knobs.
>>
>
> I can remove CONFIG_SPARC_ADI. It does mean this code will be built 
> into 32-bit kernels as well but it will be inactive code.
>
> Thanks,
> Khalid
>
>
>
Andy Lutomirski March 7, 2016, 3:43 p.m. UTC | #8
On Mon, Mar 7, 2016 at 7:30 AM, Rob Gardner <rob.gardner@oracle.com> wrote:
> On 03/07/2016 07:07 AM, Khalid Aziz wrote:
>>
>> On 03/05/2016 09:07 PM, David Miller wrote:
>>>
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Wed,  2 Mar 2016 13:39:37 -0700
>>>
>>>>     In this
>>>>     first implementation I am enabling ADI for hugepages only
>>>>     since these pages are locked in memory and hence avoid the
>>>>     issue of saving and restoring tags.
>>>
>>>
>>> This makes the feature almost entire useless.
>>>
>>> Non-hugepages must be in the initial implementation.
>>
>>
>> Hi David,
>>
>> Thanks for the feedback. I will get this working for non-hugepages as
>> well. ADI state of each VMA region is already stored in the VMA itself in my
>> first implementation, so I do not lose it when the page is swapped out. The
>> trouble is ADI version tags for each VMA region have to be stored on the
>> swapped out pages since the ADI version tags are flushed when TLB entry for
>> a page is flushed.
>
>
>
> Khalid,
>
> Are you sure about that last statement? My understanding is that the tags
> are stored in physical memory, and remain there until explicitly changed or
> removed, and so flushing a TLB entry has no effect on the ADI tags. If it
> worked the way you think, then somebody would have to potentially reload a
> long list of ADI tags on every TLB miss.
>

I'll bite, since this was sent to linux-api:

Can someone explain what this feature does for the benefit of people
who haven't read the manual (and who don't even know where to find the
manual)?

Are the top few bits of a sparc64 virtual address currently
must-be-zero?  Does this feature change the semantics so that those
bits are ignored for address resolution and instead must match
whatever the ADI tag is determined to be during address resolution?

Is this enforced for both user and kernel accesses?

Is the actual ADI tag associated with a "page" associated with the
page of physical memory or is it associated with a mapping?  That is,
if there are two virtual aliases of the same physical page (in the
same process or otherwise), does the hardware require them to have the
same ADI tag?  If the answer is no, then IMO this is definitely
something that should use mprotect and you should seriously consider
using something like mprotect_key (new syscall, not in Linus' tree
yet) for it.  In fact, you might consider a possible extra parameter
to that syscall for this purpose.

Cc: Dave Hansen.  It seems to be the zeitgeist to throw tag bits at
PTEs these days.
Khalid Aziz March 7, 2016, 3:45 p.m. UTC | #9
On 03/07/2016 08:30 AM, Rob Gardner wrote:
> On 03/07/2016 07:07 AM, Khalid Aziz wrote:
>> On 03/05/2016 09:07 PM, David Miller wrote:
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Wed,  2 Mar 2016 13:39:37 -0700
>>>
>>>>     In this
>>>>     first implementation I am enabling ADI for hugepages only
>>>>     since these pages are locked in memory and hence avoid the
>>>>     issue of saving and restoring tags.
>>>
>>> This makes the feature almost entire useless.
>>>
>>> Non-hugepages must be in the initial implementation.
>>
>> Hi David,
>>
>> Thanks for the feedback. I will get this working for non-hugepages as
>> well. ADI state of each VMA region is already stored in the VMA itself
>> in my first implementation, so I do not lose it when the page is
>> swapped out. The trouble is ADI version tags for each VMA region have
>> to be stored on the swapped out pages since the ADI version tags are
>> flushed when TLB entry for a page is flushed.
>
>
> Khalid,
>
> Are you sure about that last statement? My understanding is that the
> tags are stored in physical memory, and remain there until explicitly
> changed or removed, and so flushing a TLB entry has no effect on the ADI
> tags. If it worked the way you think, then somebody would have to
> potentially reload a long list of ADI tags on every TLB miss.
>
> Rob
>

Hi Rob,

I am fairly sure that is the case. This is what I found from the 
processor guys and others working on ADI. I tested it out by setting up 
ADI on normal malloc'd pages that got swapped out and I got MCD 
exceptions when those pages were swapped back in on access.

I mis-spoke when I said "....ADI version tags are flushed when TLB entry 
for a page is flushed". I meant ADI version tags are flushed when 
mapping for a virtual address is removed from TSB, not when TLB entry is 
flushed. Yes, ADI tags are stored in physical memory and removed when 
mapping is removed.

Thanks,
Khalid
Khalid Aziz March 7, 2016, 4:06 p.m. UTC | #10
On 03/07/2016 08:43 AM, Andy Lutomirski wrote:
> On Mon, Mar 7, 2016 at 7:30 AM, Rob Gardner <rob.gardner@oracle.com> wrote:
>> On 03/07/2016 07:07 AM, Khalid Aziz wrote:
>>>
>>> On 03/05/2016 09:07 PM, David Miller wrote:
>>>>
>>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>>> Date: Wed,  2 Mar 2016 13:39:37 -0700
>>>>
>>>>>      In this
>>>>>      first implementation I am enabling ADI for hugepages only
>>>>>      since these pages are locked in memory and hence avoid the
>>>>>      issue of saving and restoring tags.
>>>>
>>>>
>>>> This makes the feature almost entire useless.
>>>>
>>>> Non-hugepages must be in the initial implementation.
>>>
>>>
>>> Hi David,
>>>
>>> Thanks for the feedback. I will get this working for non-hugepages as
>>> well. ADI state of each VMA region is already stored in the VMA itself in my
>>> first implementation, so I do not lose it when the page is swapped out. The
>>> trouble is ADI version tags for each VMA region have to be stored on the
>>> swapped out pages since the ADI version tags are flushed when TLB entry for
>>> a page is flushed.
>>
>>
>>
>> Khalid,
>>
>> Are you sure about that last statement? My understanding is that the tags
>> are stored in physical memory, and remain there until explicitly changed or
>> removed, and so flushing a TLB entry has no effect on the ADI tags. If it
>> worked the way you think, then somebody would have to potentially reload a
>> long list of ADI tags on every TLB miss.
>>
>
> I'll bite, since this was sent to linux-api:
>
> Can someone explain what this feature does for the benefit of people
> who haven't read the manual (and who don't even know where to find the
> manual)?
>
> Are the top few bits of a sparc64 virtual address currently
> must-be-zero?  Does this feature change the semantics so that those
> bits are ignored for address resolution and instead must match
> whatever the ADI tag is determined to be during address resolution?
>
> Is this enforced for both user and kernel accesses?
>
> Is the actual ADI tag associated with a "page" associated with the
> page of physical memory or is it associated with a mapping?  That is,
> if there are two virtual aliases of the same physical page (in the
> same process or otherwise), does the hardware require them to have the
> same ADI tag?  If the answer is no, then IMO this is definitely
> something that should use mprotect and you should seriously consider
> using something like mprotect_key (new syscall, not in Linus' tree
> yet) for it.  In fact, you might consider a possible extra parameter
> to that syscall for this purpose.
>
> Cc: Dave Hansen.  It seems to be the zeitgeist to throw tag bits at
> PTEs these days.
>

Hi Andy,

The primary purpose of this feature is to prevent rogue accesses to 
memory regions. If a database were to allocate memory pages to cache 
database, it can enable ADI on those pages and set version tags. Version 
tag for a memory address is encoded in bits 63-60 in the virtual 
address. When accessing an ADI enabled memory region, top 4 bits of the 
virtual address presented to the MMU must match the version tag set 
earlier. When these bits do not match a tag, an MCD (Memory Corruption 
Detected) exception is raised. Kernel sends a SIGBUS to the offending 
process in response. There is some more info on ADI at 
<https://swisdev.oracle.com/_files/What-Is-ADI.html>.

Top 4-bits of sparc64 virtual address are used for version tag only when 
a process has its PSTATE.mcde bit set and it is accessing a memory 
region that has ADI enabled on it (TTE.mcd set) and a version tag was 
set on the virtual address being accessed. These 4-bits retain their 
original semantics in all other cases.

ADI version tags are checked for data fetches only. My implementation 
enforces this for userspace addresses only. Expanding this to include 
kernel data addresses as well will be a good thing to do to protect 
kernel data but I want to try to do this incrementally - (1) ADI for 
userspace addresses only for mlock'd pages, (2) expand support to 
swappable pages, (3) ADI for kernel data pages, (4)......whatever else 
makes sense...

ADI version tag applies to virtual addresses only. If two processes have 
virtual addresses mapping to the same physical page, they must use the 
same tag. Hardware will send MCD exception if the tags do not match. 
This was done to ensure a hack does not bypass ADI protection by simply 
inserting another VA-to-PA mapping. I do like the idea of mprotect() as 
David suggested and it can be done with existing mprotect() call. I will 
have to add a new key PROT_ADI to support this.

Thanks,
Khalid
David Miller March 7, 2016, 4:45 p.m. UTC | #11
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 08:07:53 -0700

> I can remove CONFIG_SPARC_ADI. It does mean this code will be built
> into 32-bit kernels as well but it will be inactive code.

The code should be built only into obj-$(CONFIG_SPARC64) just like the
rest of the 64-bit specific code.  I don't know why in the world you
would build it into the 32-bit kernel.
David Miller March 7, 2016, 4:56 p.m. UTC | #12
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 08:07:53 -0700

> PR_GET_SPARC_ADICAPS

Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.

So now all that's left is supposedly the TAG stuff, please explain
that to me so I can direct you to the correct existing interface to
provide that as well.

Really, try to avoid prtctl, it's poorly typed and almost worse than
ioctl().
Dave Hansen March 7, 2016, 5:32 p.m. UTC | #13
On 03/02/2016 12:39 PM, Khalid Aziz wrote:
> --- a/include/uapi/asm-generic/siginfo.h
> +++ b/include/uapi/asm-generic/siginfo.h
> @@ -206,7 +206,10 @@ typedef struct siginfo {
>  #define SEGV_MAPERR	(__SI_FAULT|1)	/* address not mapped to object */
>  #define SEGV_ACCERR	(__SI_FAULT|2)	/* invalid permissions for mapped object */
>  #define SEGV_BNDERR	(__SI_FAULT|3)  /* failed address bound checks */
> -#define NSIGSEGV	3
> +#define SEGV_ACCADI	(__SI_FAULT|4)	/* ADI not enabled for mapped object */
> +#define SEGV_ADIDERR	(__SI_FAULT|5)	/* Disrupting MCD error */
> +#define SEGV_ADIPERR	(__SI_FAULT|6)	/* Precise MCD exception */
> +#define NSIGSEGV	6

FYI, this will conflict with code in -tip right now:

> http://git.kernel.org/cgit/linux/kernel/git/tip/tip.git/commit/?h=mm/pkeys&id=cd0ea35ff5511cde299a61c21a95889b4a71464e

It's not a big deal to resolve, of course.
Dave Hansen March 7, 2016, 5:35 p.m. UTC | #14
On 03/02/2016 12:39 PM, Khalid Aziz wrote:
> +long enable_sparc_adi(unsigned long addr, unsigned long len)
> +{
> +	unsigned long end, pagemask;
> +	int error;
> +	struct vm_area_struct *vma, *vma2;
> +	struct mm_struct *mm;
> +
> +	if (!ADI_CAPABLE())
> +		return -EINVAL;
...

This whole thing with the VMA splitting and so forth looks pretty darn
arch-independent.  Are you sure you need that much arch-specific code
for it, or can you share more of the generic VMA management code?
Dave Hansen March 7, 2016, 5:46 p.m. UTC | #15
On 03/07/2016 08:06 AM, Khalid Aziz wrote:
> Top 4-bits of sparc64 virtual address are used for version tag only when
> a process has its PSTATE.mcde bit set and it is accessing a memory
> region that has ADI enabled on it (TTE.mcd set) and a version tag was
> set on the virtual address being accessed. These 4-bits retain their
> original semantics in all other cases.

OK, so this effectively reduces the address space of a process using the
feature.  Do we need to do anything explicit to keep an app from using
that address space?  Do we make sure the kernel doesn't place VMAs
there?  Do we respect mmap() hints that try to place memory there?
Khalid Aziz March 7, 2016, 5:51 p.m. UTC | #16
On 03/07/2016 09:45 AM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 08:07:53 -0700
>
>> I can remove CONFIG_SPARC_ADI. It does mean this code will be built
>> into 32-bit kernels as well but it will be inactive code.
>
> The code should be built only into obj-$(CONFIG_SPARC64) just like the
> rest of the 64-bit specific code.  I don't know why in the world you
> would build it into the 32-bit kernel.
>

You are right. I did not understand you correctly the first time.

Thanks,
Khalid
Andy Lutomirski March 7, 2016, 5:53 p.m. UTC | #17
On Mon, Mar 7, 2016 at 9:46 AM, Dave Hansen <dave.hansen@linux.intel.com> wrote:
> On 03/07/2016 08:06 AM, Khalid Aziz wrote:
>> Top 4-bits of sparc64 virtual address are used for version tag only when
>> a process has its PSTATE.mcde bit set and it is accessing a memory
>> region that has ADI enabled on it (TTE.mcd set) and a version tag was
>> set on the virtual address being accessed. These 4-bits retain their
>> original semantics in all other cases.
>
> OK, so this effectively reduces the address space of a process using the
> feature.  Do we need to do anything explicit to keep an app from using
> that address space?  Do we make sure the kernel doesn't place VMAs
> there?  Do we respect mmap() hints that try to place memory there?

Also, what happens when someone does this to an aliased page?  This
could be a MAP_SHARED mapping or a not-yet-COWed MAP_ANONYMOUS
mapping.

Also, what am I missing?  Tying these tags to the physical page seems
like a poor design to me.  This seems really awkward to use.
Khalid Aziz March 7, 2016, 6:04 p.m. UTC | #18
On 03/07/2016 09:56 AM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 08:07:53 -0700
>
>> PR_GET_SPARC_ADICAPS
>
> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>
> So now all that's left is supposedly the TAG stuff, please explain
> that to me so I can direct you to the correct existing interface to
> provide that as well.
>
> Really, try to avoid prtctl, it's poorly typed and almost worse than
> ioctl().
>

The two remaining operations I am looking at are:

1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides 
this in its return value in the patch I sent.

2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS 
provides this function in the patch I sent.

Setting and clearing version tags can be done entirely from userspace:

         while (addr < end) {
                 asm volatile(
                         "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
                         :
                         : "r" (addr), "r" (version));
                 addr += adicap.blksz;
         }
so I do not have to add any kernel code for tags.

Thanks,
Khalid
Andy Lutomirski March 7, 2016, 6:08 p.m. UTC | #19
On Mon, Mar 7, 2016 at 10:04 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
> On 03/07/2016 09:56 AM, David Miller wrote:
>>
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>
>>> PR_GET_SPARC_ADICAPS
>>
>>
>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>
>> So now all that's left is supposedly the TAG stuff, please explain
>> that to me so I can direct you to the correct existing interface to
>> provide that as well.
>>
>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>> ioctl().
>>
>
> The two remaining operations I am looking at are:
>
> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides this in
> its return value in the patch I sent.
>
> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
> provides this function in the patch I sent.
>
> Setting and clearing version tags can be done entirely from userspace:
>
>         while (addr < end) {
>                 asm volatile(
>                         "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
>                         :
>                         : "r" (addr), "r" (version));
>                 addr += adicap.blksz;
>         }
> so I do not have to add any kernel code for tags.

Is the effect of that to change the tag associated with a page to
which the caller has write access?

I sense DoS issues in your future.
Rob Gardner March 7, 2016, 6:09 p.m. UTC | #20
On 03/07/2016 10:04 AM, Khalid Aziz wrote:
> On 03/07/2016 09:56 AM, David Miller wrote:
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>
>>> PR_GET_SPARC_ADICAPS
>>
>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>
>> So now all that's left is supposedly the TAG stuff, please explain
>> that to me so I can direct you to the correct existing interface to
>> provide that as well.
>>
>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>> ioctl().
>>
>
> The two remaining operations I am looking at are:
>
> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides 
> this in its return value in the patch I sent.
>
> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS 
> provides this function in the patch I sent.
>
> Setting and clearing version tags can be done entirely from userspace:
>
>         while (addr < end) {
>                 asm volatile(
>                         "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
>                         :
>                         : "r" (addr), "r" (version));
>                 addr += adicap.blksz;
>         }
> so I do not have to add any kernel code for tags.
>

What about clearing the tags when the user is done with the memory? You 
can't count on the user to do that, so doesn't the kernel have to do it 
someplace?

Rob
Dave Hansen March 7, 2016, 6:12 p.m. UTC | #21
On 03/07/2016 09:53 AM, Andy Lutomirski wrote:
> Also, what am I missing?  Tying these tags to the physical page seems
> like a poor design to me.  This seems really awkward to use.

Yeah, can you describe the structures that store these things?  Surely
the hardware has some kind of lookup tables for them and stores them in
memory _somewhere_.
Khalid Aziz March 7, 2016, 6:15 p.m. UTC | #22
On 03/07/2016 10:35 AM, Dave Hansen wrote:
> On 03/02/2016 12:39 PM, Khalid Aziz wrote:
>> +long enable_sparc_adi(unsigned long addr, unsigned long len)
>> +{
>> +	unsigned long end, pagemask;
>> +	int error;
>> +	struct vm_area_struct *vma, *vma2;
>> +	struct mm_struct *mm;
>> +
>> +	if (!ADI_CAPABLE())
>> +		return -EINVAL;
> ...
>
> This whole thing with the VMA splitting and so forth looks pretty darn
> arch-independent.  Are you sure you need that much arch-specific code
> for it, or can you share more of the generic VMA management code?
>

All of the VMA splitting/merging code is rather generic and is very 
similar to the code for mbind, mlock, madavise and mprotect. Currently 
there is no code sharing across all of these implementations. Maybe that 
should change. In any case, I am looking at changing the interface to go 
through mprotect instead as Dave suggested. I can share the code in 
mprotect in that case. The only arch dependent part will be to set the 
VM_SPARC_ADI flag on the VMA.

Thanks,
Khalid
Khalid Aziz March 7, 2016, 6:22 p.m. UTC | #23
On 03/07/2016 11:08 AM, Andy Lutomirski wrote:
> On Mon, Mar 7, 2016 at 10:04 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>> On 03/07/2016 09:56 AM, David Miller wrote:
>>>
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>>
>>>> PR_GET_SPARC_ADICAPS
>>>
>>>
>>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>>
>>> So now all that's left is supposedly the TAG stuff, please explain
>>> that to me so I can direct you to the correct existing interface to
>>> provide that as well.
>>>
>>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>>> ioctl().
>>>
>>
>> The two remaining operations I am looking at are:
>>
>> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides this in
>> its return value in the patch I sent.
>>
>> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
>> provides this function in the patch I sent.
>>
>> Setting and clearing version tags can be done entirely from userspace:
>>
>>          while (addr < end) {
>>                  asm volatile(
>>                          "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
>>                          :
>>                          : "r" (addr), "r" (version));
>>                  addr += adicap.blksz;
>>          }
>> so I do not have to add any kernel code for tags.
>
> Is the effect of that to change the tag associated with a page to
> which the caller has write access?

No, it changes the tag associated with the virtual address for the 
caller. Physical page backing this virtual address is unaffected. Tag 
checking is done for virtual addresses. The one restriction where 
physical address is relevant is when two processes map the same physical 
page, they both have to use the same tag for the virtual addresses that 
map on to the shared physical pages.

>
> I sense DoS issues in your future.
>

Are you concerned about DoS even if the tag is associated with virtual 
address, not physical address?

Thanks,
Khalid
Khalid Aziz March 7, 2016, 6:24 p.m. UTC | #24
On 03/07/2016 11:09 AM, Rob Gardner wrote:
> On 03/07/2016 10:04 AM, Khalid Aziz wrote:
>> On 03/07/2016 09:56 AM, David Miller wrote:
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>>
>>>> PR_GET_SPARC_ADICAPS
>>>
>>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>>
>>> So now all that's left is supposedly the TAG stuff, please explain
>>> that to me so I can direct you to the correct existing interface to
>>> provide that as well.
>>>
>>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>>> ioctl().
>>>
>>
>> The two remaining operations I am looking at are:
>>
>> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides
>> this in its return value in the patch I sent.
>>
>> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
>> provides this function in the patch I sent.
>>
>> Setting and clearing version tags can be done entirely from userspace:
>>
>>         while (addr < end) {
>>                 asm volatile(
>>                         "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
>>                         :
>>                         : "r" (addr), "r" (version));
>>                 addr += adicap.blksz;
>>         }
>> so I do not have to add any kernel code for tags.
>>
>
> What about clearing the tags when the user is done with the memory? You
> can't count on the user to do that, so doesn't the kernel have to do it
> someplace?
>

Tags can be cleared by user by setting tag to 0. Tags are automatically 
cleared by the hardware when the mapping for a virtual address is 
removed from TSB (which is why swappable pages are a problem), so kernel 
does not have to do it as part of clean up.

Thanks,
Khalid
Khalid Aziz March 7, 2016, 6:39 p.m. UTC | #25
On 03/07/2016 11:12 AM, Dave Hansen wrote:
> On 03/07/2016 09:53 AM, Andy Lutomirski wrote:
>> Also, what am I missing?  Tying these tags to the physical page seems
>> like a poor design to me.  This seems really awkward to use.
>
> Yeah, can you describe the structures that store these things?  Surely
> the hardware has some kind of lookup tables for them and stores them in
> memory _somewhere_.
>

Version tags are tied to virtual addresses, not physical pages.

Where exactly are the tags stored is part of processor architecture and 
I am not privy to that. MMU stores these lookup tables somewhere and 
uses it to authenticate access to virtual addresses. It really is 
irrelevant to kernel how MMU implements access controls as long as we 
have access to the knowledge of how to use it.

Thanks,
Khalid
Andy Lutomirski March 7, 2016, 6:49 p.m. UTC | #26
On Mon, Mar 7, 2016 at 10:22 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
> On 03/07/2016 11:08 AM, Andy Lutomirski wrote:
>>
>> On Mon, Mar 7, 2016 at 10:04 AM, Khalid Aziz <khalid.aziz@oracle.com>
>> wrote:
>>>
>>> On 03/07/2016 09:56 AM, David Miller wrote:
>>>>
>>>>
>>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>>>
>>>>> PR_GET_SPARC_ADICAPS
>>>>
>>>>
>>>>
>>>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>>>
>>>> So now all that's left is supposedly the TAG stuff, please explain
>>>> that to me so I can direct you to the correct existing interface to
>>>> provide that as well.
>>>>
>>>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>>>> ioctl().
>>>>
>>>
>>> The two remaining operations I am looking at are:
>>>
>>> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides this
>>> in
>>> its return value in the patch I sent.
>>>
>>> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
>>> provides this function in the patch I sent.
>>>
>>> Setting and clearing version tags can be done entirely from userspace:
>>>
>>>          while (addr < end) {
>>>                  asm volatile(
>>>                          "stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
>>>                          :
>>>                          : "r" (addr), "r" (version));
>>>                  addr += adicap.blksz;
>>>          }
>>> so I do not have to add any kernel code for tags.
>>
>>
>> Is the effect of that to change the tag associated with a page to
>> which the caller has write access?
>
>
> No, it changes the tag associated with the virtual address for the caller.
> Physical page backing this virtual address is unaffected. Tag checking is
> done for virtual addresses. The one restriction where physical address is
> relevant is when two processes map the same physical page, they both have to
> use the same tag for the virtual addresses that map on to the shared
> physical pages.

Slow down, please.  *Why* do the tags for two different VAs that map
to the same PA have to match?  What goes wrong if they don't, and why
is requiring them to be the same a good idea?

>
>>
>> I sense DoS issues in your future.
>>
>
> Are you concerned about DoS even if the tag is associated with virtual
> address, not physical address?

Yes, absolutely.

fd = open("/lib/ld.so");
mmap(fd)
stxa to write the tag

*boom*, presumably, because the tags apparently have to match for all mappings.

What data structure or structures changes when this stxa instruction happens?

--Andy
Andy Lutomirski March 7, 2016, 6:53 p.m. UTC | #27
On Mon, Mar 7, 2016 at 10:39 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
> On 03/07/2016 11:12 AM, Dave Hansen wrote:
>>
>> On 03/07/2016 09:53 AM, Andy Lutomirski wrote:
>>>
>>> Also, what am I missing?  Tying these tags to the physical page seems
>>> like a poor design to me.  This seems really awkward to use.
>>
>>
>> Yeah, can you describe the structures that store these things?  Surely
>> the hardware has some kind of lookup tables for them and stores them in
>> memory _somewhere_.
>>
>
> Version tags are tied to virtual addresses, not physical pages.
>
> Where exactly are the tags stored is part of processor architecture and I am
> not privy to that. MMU stores these lookup tables somewhere and uses it to
> authenticate access to virtual addresses. It really is irrelevant to kernel
> how MMU implements access controls as long as we have access to the
> knowledge of how to use it.
>

Can you translate this for people who don't know all the SPARC acronyms?

x86 has an upcoming feature called protection keys.  A page of virtual
memory has a protection key, which is a number from 0 through 16.  The
master copy is in the PTE, i.e. page table entry, which is a
software-managed data structure in memory and is exactly the thing
that Linux calls "pte".  The processor can cache that value in the TLB
(translation lookaside buffer), which is a hardware cache that caches
PTEs.  On access to a page of virtual memory, the processor does a
certain calculation involving a new register called PKRU and the
protection key and may deny access.

Hopefully that description makes sense even to people completely
unfamiliar with x86.

Can you try something similar for SPARC?  So far I'm lost, because
you've said that the ADI tag is associated with a VA, but it has to
match for aliases, and you've mentioned a bunch of acronyms, and I
have no clue what's going on.

--Andy
David Miller March 7, 2016, 7:06 p.m. UTC | #28
From: Dave Hansen <dave.hansen@intel.com>
Date: Mon, 7 Mar 2016 09:35:57 -0800

> On 03/02/2016 12:39 PM, Khalid Aziz wrote:
>> +long enable_sparc_adi(unsigned long addr, unsigned long len)
>> +{
>> +	unsigned long end, pagemask;
>> +	int error;
>> +	struct vm_area_struct *vma, *vma2;
>> +	struct mm_struct *mm;
>> +
>> +	if (!ADI_CAPABLE())
>> +		return -EINVAL;
> ...
> 
> This whole thing with the VMA splitting and so forth looks pretty darn
> arch-independent.  Are you sure you need that much arch-specific code
> for it, or can you share more of the generic VMA management code?

This is exactly what I have suggested to him, and he has agreed to pursue.
David Miller March 7, 2016, 7:09 p.m. UTC | #29
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 11:04:38 -0700

> On 03/07/2016 09:56 AM, David Miller wrote:
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>
>>> PR_GET_SPARC_ADICAPS
>>
>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>
>> So now all that's left is supposedly the TAG stuff, please explain
>> that to me so I can direct you to the correct existing interface to
>> provide that as well.
>>
>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>> ioctl().
>>
> 
> The two remaining operations I am looking at are:
> 
> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides
> this in its return value in the patch I sent.

Unnecessary.  If any ADI mappings exist then mcde is set, otherwise it is
clear.  This is internal state and the application has no need to every
set nor query it.

It is implicit from the mprotect() calls the user makes to enable ADI
regions.

> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
> provides this function in the patch I sent.

Again, implied by the mprotect() calls.
David Miller March 7, 2016, 7:16 p.m. UTC | #30
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 11:24:54 -0700

> Tags can be cleared by user by setting tag to 0. Tags are
> automatically cleared by the hardware when the mapping for a virtual
> address is removed from TSB (which is why swappable pages are a
> problem), so kernel does not have to do it as part of clean up.

You might be able to crib some bits for the Tag in the swp_entry_t, it's
64-bit and you can therefore steal bits from the offset field.

That way you'll have the ADI tag in the page tables, ready to re-install
at swapin time.
David Miller March 7, 2016, 7:19 p.m. UTC | #31
From: Andy Lutomirski <luto@amacapital.net>
Date: Mon, 7 Mar 2016 10:49:57 -0800

> What data structure or structures changes when this stxa instruction happens?

An internal table, maintained by the CPU and/or hypervisor, and if in physical
addresses then in a region which is only accessible by the hypervisor.

The table is not accessible by the kernel at all via loads or stores.
David Miller March 7, 2016, 7:22 p.m. UTC | #32
From: Andy Lutomirski <luto@amacapital.net>
Date: Mon, 7 Mar 2016 10:53:23 -0800

> x86 has an upcoming feature called protection keys.  A page of virtual
> memory has a protection key, which is a number from 0 through 16.  The
> master copy is in the PTE, i.e. page table entry, which is a
> software-managed data structure in memory and is exactly the thing
> that Linux calls "pte".  The processor can cache that value in the TLB
> (translation lookaside buffer), which is a hardware cache that caches
> PTEs.  On access to a page of virtual memory, the processor does a
> certain calculation involving a new register called PKRU and the
> protection key and may deny access.

ADI is similar, except the "keys" (or "tags") are stored externally
rather than in the PTEs.  A bit in the PTE is used to enable tag match
checking.

The tags live in an external table, which is populated by ASI store
instructions.  The location of the table is implementation specific,
it could be hypervisor or CPU managed, but if stored in memory it is
to a region of memory accessible only to the hypervisor at best.

Khalid, maybe you should share notes with the folks working on x86
protection keys.
Khalid Aziz March 7, 2016, 7:44 p.m. UTC | #33
On 03/07/2016 11:49 AM, Andy Lutomirski wrote:
> On Mon, Mar 7, 2016 at 10:22 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>> No, it changes the tag associated with the virtual address for the caller.
>> Physical page backing this virtual address is unaffected. Tag checking is
>> done for virtual addresses. The one restriction where physical address is
>> relevant is when two processes map the same physical page, they both have to
>> use the same tag for the virtual addresses that map on to the shared
>> physical pages.
>
> Slow down, please.  *Why* do the tags for two different VAs that map
> to the same PA have to match?  What goes wrong if they don't, and why
> is requiring them to be the same a good idea?
>

Consider this scenario:

1. Process A creates a shm and attaches to it.
2. Process A fills shm with data it wants to share with only known 
processes. It enables ADI and sets tags on the shm.
3. Hacker triggers something like stack overflow on process A, exec's a 
new rogue binary and manages to attach to this shm. MMU knows tags were 
set on the virtual address mapping to the physical pages hosting the 
shm. If MMU does not require the rogue process to set the exact same 
tags on its mapping of the same shm, rogue process has defeated the ADI 
protection easily.

Does this make sense?

>>
>>>
>>> I sense DoS issues in your future.
>>>
>>
>> Are you concerned about DoS even if the tag is associated with virtual
>> address, not physical address?
>
> Yes, absolutely.
>
> fd = open("/lib/ld.so");
> mmap(fd)
> stxa to write the tag
>
> *boom*, presumably, because the tags apparently have to match for all mappings.
>

A process can not just write version tags and make the file inaccessible 
to others. It takes three steps to enable ADI:

1. Set PSTATE.mcde for the process.
2. Set TTE.mcd on all PTEs for the virtual addresses ADI is being 
enabled on.
3. Set version tags.

Unless all three steps are taken, tag checking will not be done. stxa 
will fail unless step 2 is completed. In your example, the step of 
setting TTE.mcd will force sharing to stop for the process through 
change_protection(), right?

Thanks for asking these tough questions. These are very helpful in 
refining my implementation and avoiding silly bugs.

--
Khalid
Khalid Aziz March 7, 2016, 7:46 p.m. UTC | #34
On 03/07/2016 12:22 PM, David Miller wrote:
> Khalid, maybe you should share notes with the folks working on x86
> protection keys.
>

Good idea. Sparc ADI feature is indeed similar to x86 protection keys 
sounds like.

Thanks,
Khalid
Andy Lutomirski March 7, 2016, 7:54 p.m. UTC | #35
On Mon, Mar 7, 2016 at 11:44 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
> On 03/07/2016 11:49 AM, Andy Lutomirski wrote:
>>
>> On Mon, Mar 7, 2016 at 10:22 AM, Khalid Aziz <khalid.aziz@oracle.com>
>> wrote:
>>>
>>> No, it changes the tag associated with the virtual address for the
>>> caller.
>>> Physical page backing this virtual address is unaffected. Tag checking is
>>> done for virtual addresses. The one restriction where physical address is
>>> relevant is when two processes map the same physical page, they both have
>>> to
>>> use the same tag for the virtual addresses that map on to the shared
>>> physical pages.
>>
>>
>> Slow down, please.  *Why* do the tags for two different VAs that map
>> to the same PA have to match?  What goes wrong if they don't, and why
>> is requiring them to be the same a good idea?
>>
>
> Consider this scenario:
>
> 1. Process A creates a shm and attaches to it.
> 2. Process A fills shm with data it wants to share with only known
> processes. It enables ADI and sets tags on the shm.
> 3. Hacker triggers something like stack overflow on process A, exec's a new
> rogue binary and manages to attach to this shm. MMU knows tags were set on
> the virtual address mapping to the physical pages hosting the shm. If MMU
> does not require the rogue process to set the exact same tags on its mapping
> of the same shm, rogue process has defeated the ADI protection easily.
>
> Does this make sense?

This makes sense, but I still think the design is poor.  If the hacker
gets code execution, then they can trivially brute force the ADI bits.

Also, if this is the use case in mind, shouldn't the ADI bits bet set
on the file, not the mapping?  E.g. have an ioctl on the shmfs file
that sets its ADI bits?

>
>>>
>>>>
>>>> I sense DoS issues in your future.
>>>>
>>>
>>> Are you concerned about DoS even if the tag is associated with virtual
>>> address, not physical address?
>>
>>
>> Yes, absolutely.
>>
>> fd = open("/lib/ld.so");
>> mmap(fd)
>> stxa to write the tag
>>
>> *boom*, presumably, because the tags apparently have to match for all
>> mappings.
>>
>
> A process can not just write version tags and make the file inaccessible to
> others. It takes three steps to enable ADI:
>
> 1. Set PSTATE.mcde for the process.
> 2. Set TTE.mcd on all PTEs for the virtual addresses ADI is being enabled
> on.
> 3. Set version tags.
>
> Unless all three steps are taken, tag checking will not be done. stxa will
> fail unless step 2 is completed. In your example, the step of setting
> TTE.mcd will force sharing to stop for the process through
> change_protection(), right?

OK, that makes some sense.

Can a shared page ever have TTE.mcd set?  How does one share a page,
even deliberately, between two processes with cmd set?

--Andy
Khalid Aziz March 7, 2016, 8:41 p.m. UTC | #36
On 03/07/2016 12:54 PM, Andy Lutomirski wrote:
> On Mon, Mar 7, 2016 at 11:44 AM, Khalid Aziz <khalid.aziz@oracle.com> wrote:
>>
>> Consider this scenario:
>>
>> 1. Process A creates a shm and attaches to it.
>> 2. Process A fills shm with data it wants to share with only known
>> processes. It enables ADI and sets tags on the shm.
>> 3. Hacker triggers something like stack overflow on process A, exec's a new
>> rogue binary and manages to attach to this shm. MMU knows tags were set on
>> the virtual address mapping to the physical pages hosting the shm. If MMU
>> does not require the rogue process to set the exact same tags on its mapping
>> of the same shm, rogue process has defeated the ADI protection easily.
>>
>> Does this make sense?
>
> This makes sense, but I still think the design is poor.  If the hacker
> gets code execution, then they can trivially brute force the ADI bits.

True, with only 16 possible tag values (actually only 14 since 0 and 15 
are reserved values), it is entirely possible to brute force the ADI 
tag. ADI is just another tool one can use to mitigate attacks. A process 
that accesses an ADI enabled memory with invalid tag gets a SIGBUS and 
is terminated. This can trigger alerts on the system and system policies 
could block the next attack. If a daemon is compromised and is forced to 
hand out data from memory it should not be reading (similar to 
heartbleed bug). the daemon itself is terminated with SIGBUS which 
should be enough to alert system admins. A rotating set of tags would 
reduce the risk from brute force attacks. Tags are set on cacheline 
(which is 64 bytes on M7). A single regular sized page can have 128 sets 
of tags. Allowing for 14 possible values for each set, that is a lot of 
possible combinations of tags making it very hard to brute force tags 
for more than a cacheline at a time. There are probably other better 
ways to make the tags harder to crack.

>
> Also, if this is the use case in mind, shouldn't the ADI bits bet set
> on the file, not the mapping?  E.g. have an ioctl on the shmfs file
> that sets its ADI bits?

Shared data may not always be backed by a file. My understanding is one 
of the use cases is for in-memory databases. This shared space could 
also be used to hand off transactions in flight to other processes. 
These transactions in flight would not be backed by a file. Some of 
these use cases might not use shmfs even. Setting ADI bits at virtual 
address level catches all these cases since what backs the tagged 
virtual address can be anything - a mapped file, mmio space, just plain 
chunk of memory.

>
>> A process can not just write version tags and make the file inaccessible to
>> others. It takes three steps to enable ADI:
>>
>> 1. Set PSTATE.mcde for the process.
>> 2. Set TTE.mcd on all PTEs for the virtual addresses ADI is being enabled
>> on.
>> 3. Set version tags.
>>
>> Unless all three steps are taken, tag checking will not be done. stxa will
>> fail unless step 2 is completed. In your example, the step of setting
>> TTE.mcd will force sharing to stop for the process through
>> change_protection(), right?
>
> OK, that makes some sense.
>
> Can a shared page ever have TTE.mcd set?  How does one share a page,
> even deliberately, between two processes with cmd set?

For two processes to share a page, their VMAs have to be identical as I 
understand it. If one process has TTE.mcd set (which means vma->vm_flags 
is different) while the other does not, they do not share a page.

Thanks,
Khalid
David Miller March 7, 2016, 8:58 p.m. UTC | #37
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 13:41:39 -0700

> Shared data may not always be backed by a file. My understanding is
> one of the use cases is for in-memory databases. This shared space
> could also be used to hand off transactions in flight to other
> processes. These transactions in flight would not be backed by a
> file. Some of these use cases might not use shmfs even. Setting ADI
> bits at virtual address level catches all these cases since what backs
> the tagged virtual address can be anything - a mapped file, mmio
> space, just plain chunk of memory.

Frankly the most interesting use case to me is simply finding bugs
and memory scribbles, and for that we're want to be able to ADI
arbitrary memory returned from malloc() and friends.

I personally see ADI more as a debugging than a security feature,
but that's just my view.
Andy Lutomirski March 7, 2016, 9:02 p.m. UTC | #38
On Mon, Mar 7, 2016 at 12:58 PM, David Miller <davem@davemloft.net> wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 13:41:39 -0700
>
>> Shared data may not always be backed by a file. My understanding is
>> one of the use cases is for in-memory databases. This shared space
>> could also be used to hand off transactions in flight to other
>> processes. These transactions in flight would not be backed by a
>> file. Some of these use cases might not use shmfs even. Setting ADI
>> bits at virtual address level catches all these cases since what backs
>> the tagged virtual address can be anything - a mapped file, mmio
>> space, just plain chunk of memory.
>
> Frankly the most interesting use case to me is simply finding bugs
> and memory scribbles, and for that we're want to be able to ADI
> arbitrary memory returned from malloc() and friends.
>
> I personally see ADI more as a debugging than a security feature,
> but that's just my view.

The thing that seems awkward to me is that setting, say, ADI=1 seems
almost equivalent to remapping the memory up to 0x10...whatever, and
the latter is a heck of a lot simpler to think about.
Khalid Aziz March 7, 2016, 9:06 p.m. UTC | #39
On 03/07/2016 10:46 AM, Dave Hansen wrote:
> On 03/07/2016 08:06 AM, Khalid Aziz wrote:
>> Top 4-bits of sparc64 virtual address are used for version tag only when
>> a process has its PSTATE.mcde bit set and it is accessing a memory
>> region that has ADI enabled on it (TTE.mcd set) and a version tag was
>> set on the virtual address being accessed. These 4-bits retain their
>> original semantics in all other cases.
>
> OK, so this effectively reduces the address space of a process using the
> feature.  Do we need to do anything explicit to keep an app from using
> that address space?  Do we make sure the kernel doesn't place VMAs
> there?  Do we respect mmap() hints that try to place memory there?
>

Good questions. Isn't set of valid VAs already constrained by VA_BITS 
(set to 44 in arch/sparc/include/asm/processor_64.h)? As I see it we are 
already not using the top 4 bits. Please correct me if I am wrong.

Thanks,
Khalid
Khalid Aziz March 7, 2016, 9:09 p.m. UTC | #40
On 03/07/2016 01:58 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 13:41:39 -0700
>
>> Shared data may not always be backed by a file. My understanding is
>> one of the use cases is for in-memory databases. This shared space
>> could also be used to hand off transactions in flight to other
>> processes. These transactions in flight would not be backed by a
>> file. Some of these use cases might not use shmfs even. Setting ADI
>> bits at virtual address level catches all these cases since what backs
>> the tagged virtual address can be anything - a mapped file, mmio
>> space, just plain chunk of memory.
>
> Frankly the most interesting use case to me is simply finding bugs
> and memory scribbles, and for that we're want to be able to ADI
> arbitrary memory returned from malloc() and friends.
>
> I personally see ADI more as a debugging than a security feature,
> but that's just my view.
>

I think that is a very strong use case. It can be a very effective tool 
for debugging especially when it comes to catching wild writes.

--
Khalid
Khalid Aziz March 7, 2016, 9:27 p.m. UTC | #41
On 03/07/2016 12:09 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 11:04:38 -0700
>
>> On 03/07/2016 09:56 AM, David Miller wrote:
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Mon, 7 Mar 2016 08:07:53 -0700
>>>
>>>> PR_GET_SPARC_ADICAPS
>>>
>>> Put this into a new ELF auxiliary vector entry via ARCH_DLINFO.
>>>
>>> So now all that's left is supposedly the TAG stuff, please explain
>>> that to me so I can direct you to the correct existing interface to
>>> provide that as well.
>>>
>>> Really, try to avoid prtctl, it's poorly typed and almost worse than
>>> ioctl().
>>>
>>
>> The two remaining operations I am looking at are:
>>
>> 1. Is PSTATE.mcde bit set for the process? PR_SET_SPARC_ADI provides
>> this in its return value in the patch I sent.
>
> Unnecessary.  If any ADI mappings exist then mcde is set, otherwise it is
> clear.  This is internal state and the application has no need to every
> set nor query it.
>
> It is implicit from the mprotect() calls the user makes to enable ADI
> regions.
>
>> 2. Is TTE.mcd set for a given virtual address? PR_GET_SPARC_ADI_STATUS
>> provides this function in the patch I sent.
>
> Again, implied by the mprotect() calls.
>

Hi Dave,

I agree with your point of view. PSTATE.mcde and TTE.mcd are set in 
response to request from userspace. If userspace asked for them to be 
set, they already know but it was the database guys that asked for these 
two functions and they are the primary customers for the ADI feature. I 
am not crazy about this idea since this extends the mprotect API even 
further but would you consider using the return value from mprotect to 
indicate if PSTATE.mcde or TTE.mcd were already set on the given address?

Thanks,
Khalid
Khalid Aziz March 7, 2016, 9:33 p.m. UTC | #42
On 03/07/2016 12:16 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 11:24:54 -0700
>
>> Tags can be cleared by user by setting tag to 0. Tags are
>> automatically cleared by the hardware when the mapping for a virtual
>> address is removed from TSB (which is why swappable pages are a
>> problem), so kernel does not have to do it as part of clean up.
>
> You might be able to crib some bits for the Tag in the swp_entry_t, it's
> 64-bit and you can therefore steal bits from the offset field.
>
> That way you'll have the ADI tag in the page tables, ready to re-install
> at swapin time.
>

That is a possibility but limited in scope. An address range covered by 
a single TTE can have large number of tags. Version tags are set on 
cacheline. In extreme case, one could set a tag for each set of 64-bytes 
in a page. Also tags are set completely in userspace and no transition 
occurs to kernel space, so kernel has no idea of what tags have been 
set. I have not found a way to query the MMU on tags.

I will think some more about it.

Thanks,
Khalid
David Miller March 7, 2016, 9:34 p.m. UTC | #43
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 14:27:09 -0700

> I agree with your point of view. PSTATE.mcde and TTE.mcd are set in
> response to request from userspace. If userspace asked for them to be
> set, they already know but it was the database guys that asked for
> these two functions and they are the primary customers for the ADI
> feature. I am not crazy about this idea since this extends the
> mprotect API even further but would you consider using the return
> value from mprotect to indicate if PSTATE.mcde or TTE.mcd were already
> set on the given address?

Well, that's the idea.

If the mprotect using MAP_ADI or whatever succeeds, then ADI is
enabled.

Users can thus also pass MAP_ADI as a flag to mmap() to get ADI
protection from the very beginning.
David Miller March 7, 2016, 9:38 p.m. UTC | #44
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 14:33:56 -0700

> On 03/07/2016 12:16 PM, David Miller wrote:
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Mon, 7 Mar 2016 11:24:54 -0700
>>
>>> Tags can be cleared by user by setting tag to 0. Tags are
>>> automatically cleared by the hardware when the mapping for a virtual
>>> address is removed from TSB (which is why swappable pages are a
>>> problem), so kernel does not have to do it as part of clean up.
>>
>> You might be able to crib some bits for the Tag in the swp_entry_t,
>> it's
>> 64-bit and you can therefore steal bits from the offset field.
>>
>> That way you'll have the ADI tag in the page tables, ready to
>> re-install
>> at swapin time.
>>
> 
> That is a possibility but limited in scope. An address range covered
> by a single TTE can have large number of tags. Version tags are set on
> cacheline. In extreme case, one could set a tag for each set of
> 64-bytes in a page. Also tags are set completely in userspace and no
> transition occurs to kernel space, so kernel has no idea of what tags
> have been set. I have not found a way to query the MMU on tags.
> 
> I will think some more about it.

That would mean that ADI is impossible to use for swappable memory.

...

If that's true I'm extremely disappointed that they devoted so much
silicon and engineering to this feature yet didn't take that one
critical step to make it generally useful. :(

We could have a way to do this via the kernel, wherein the user has a
contract with us.  Basically we have a call to pass the Tags (what
granularity to use for this is a design point, pages, cache lines,
etc.)  into the kernel and the user agrees not to change them behind
the kernel's back.

In return the kernel agrees to restore the tags upon swapin.

So we could support something for swappable pages, it would just be
more work.
Khalid Aziz March 7, 2016, 10:30 p.m. UTC | #45
On 03/07/2016 02:34 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 14:27:09 -0700
>
>> I agree with your point of view. PSTATE.mcde and TTE.mcd are set in
>> response to request from userspace. If userspace asked for them to be
>> set, they already know but it was the database guys that asked for
>> these two functions and they are the primary customers for the ADI
>> feature. I am not crazy about this idea since this extends the
>> mprotect API even further but would you consider using the return
>> value from mprotect to indicate if PSTATE.mcde or TTE.mcd were already
>> set on the given address?
>
> Well, that's the idea.
>
> If the mprotect using MAP_ADI or whatever succeeds, then ADI is
> enabled.
>
> Users can thus also pass MAP_ADI as a flag to mmap() to get ADI
> protection from the very beginning.
>

MAP_ADI has been sitting in my backlog for some time. Looks like you 
just raised its priority ;)

--
Khalid
Dave Hansen March 7, 2016, 10:40 p.m. UTC | #46
On 03/07/2016 11:46 AM, Khalid Aziz wrote:
> On 03/07/2016 12:22 PM, David Miller wrote:
>> Khalid, maybe you should share notes with the folks working on x86
>> protection keys.
> 
> Good idea. Sparc ADI feature is indeed similar to x86 protection keys
> sounds like.

There are definitely some similarities.  But protection keys doesn't
have any additional tables in which to keep metadata.  It keeps all of
its data in the page tables.  It also doesn't have an impact on the
virtual address layout.

But, it does have metadata to store in the VMA, has a special
siginfo->si_code, and it uses mprotect() (although a new pkey_mprotect()
variant that takes an extra argument).

Protection Keys are described a bit more here:

> http://git.kernel.org/cgit/linux/kernel/git/daveh/x86-pkeys.git/tree/Documentation/x86/protection-keys.txt?h=pkeys-v025&id=1b5b8a8836de8eb667027178b4820665dea5a038

MPX is another Intel feature separate from protection keys, but *it* has
some tables that it keep its metadata memory and special special
instructions to move metadata in and out of it.  It also has a prctl()
to enable/disable kernel assistance for the feature.  Unlike ADI, the
tables are exposed (and accessible) to user applications in normal
application memory.

MPX's documentation is here:

> http://git.kernel.org/cgit/linux/kernel/git/daveh/x86-pkeys.git/tree/Documentation/x86/intel_mpx.txt

Overall, I'm not seeing much overlap at all between the features, honestly.
Rob Gardner March 7, 2016, 11:12 p.m. UTC | #47
On 03/07/2016 01:33 PM, Khalid Aziz wrote:
>
> That is a possibility but limited in scope. An address range covered 
> by a single TTE can have large number of tags. Version tags are set on 
> cacheline. In extreme case, one could set a tag for each set of 
> 64-bytes in a page. Also tags are set completely in userspace and no 
> transition occurs to kernel space, so kernel has no idea of what tags 
> have been set.

   ...
> I have not found a way to query the MMU on tags.
>

To query the tag for a cache line, you just read it back with ldxa and 
ASI_MCD_PRIMARY (ie, asi 0x90), basically the same way you stored the 
tag in the first place.

Rob
Rob Gardner March 7, 2016, 11:13 p.m. UTC | #48
On 03/07/2016 01:38 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 14:33:56 -0700
>
>> On 03/07/2016 12:16 PM, David Miller wrote:
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Mon, 7 Mar 2016 11:24:54 -0700
>>>
>>>> Tags can be cleared by user by setting tag to 0. Tags are
>>>> automatically cleared by the hardware when the mapping for a virtual
>>>> address is removed from TSB (which is why swappable pages are a
>>>> problem), so kernel does not have to do it as part of clean up.
>>> You might be able to crib some bits for the Tag in the swp_entry_t,
>>> it's
>>> 64-bit and you can therefore steal bits from the offset field.
>>>
>>> That way you'll have the ADI tag in the page tables, ready to
>>> re-install
>>> at swapin time.
>>>
>> That is a possibility but limited in scope. An address range covered
>> by a single TTE can have large number of tags. Version tags are set on
>> cacheline. In extreme case, one could set a tag for each set of
>> 64-bytes in a page. Also tags are set completely in userspace and no
>> transition occurs to kernel space, so kernel has no idea of what tags
>> have been set. I have not found a way to query the MMU on tags.
>>
>> I will think some more about it.
> That would mean that ADI is impossible to use for swappable memory.
>
> ...
>
> If that's true I'm extremely disappointed that they devoted so much
> silicon and engineering to this feature yet didn't take that one
> critical step to make it generally useful. :(

You can easily read ADI tags with a simple ldxa #ASI_MCD_PRIMARY 
instruction.

Rob
Khalid Aziz March 7, 2016, 11:27 p.m. UTC | #49
On 03/07/2016 04:12 PM, Rob Gardner wrote:
> On 03/07/2016 01:33 PM, Khalid Aziz wrote:
>>
>> That is a possibility but limited in scope. An address range covered
>> by a single TTE can have large number of tags. Version tags are set on
>> cacheline. In extreme case, one could set a tag for each set of
>> 64-bytes in a page. Also tags are set completely in userspace and no
>> transition occurs to kernel space, so kernel has no idea of what tags
>> have been set.
>
>    ...
>> I have not found a way to query the MMU on tags.
>>
>
> To query the tag for a cache line, you just read it back with ldxa and
> ASI_MCD_PRIMARY (ie, asi 0x90), basically the same way you stored the
> tag in the first place.
>

Thanks, Rob. I just saw it while reading through the manual.

--
Khalid
Rob Gardner March 7, 2016, 11:32 p.m. UTC | #50
On 03/07/2016 10:24 AM, Khalid Aziz wrote:
>
> Tags can be cleared by user by setting tag to 0. Tags are 
> automatically cleared by the hardware when the mapping for a virtual 
> address is removed from TSB (which is why swappable pages are a 
> problem), so kernel does not have to do it as part of clean up.
>

I don't understand this. The hardware isn't involved  when a mapping for 
a virtual address is removed from the TSB, so how could it automatically 
clear tags?

Rob
James Morris March 7, 2016, 11:34 p.m. UTC | #51
On 03/08/2016 07:58 AM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 13:41:39 -0700
>
>> Shared data may not always be backed by a file. My understanding is
>> one of the use cases is for in-memory databases. This shared space
>> could also be used to hand off transactions in flight to other
>> processes. These transactions in flight would not be backed by a
>> file. Some of these use cases might not use shmfs even. Setting ADI
>> bits at virtual address level catches all these cases since what backs
>> the tagged virtual address can be anything - a mapped file, mmio
>> space, just plain chunk of memory.
>
> Frankly the most interesting use case to me is simply finding bugs
> and memory scribbles, and for that we're want to be able to ADI
> arbitrary memory returned from malloc() and friends.
>
> I personally see ADI more as a debugging than a security feature,
> but that's just my view.

This is certainly a major use of the feature. The Solaris folks have 
made some interesting use of it here:

https://docs.oracle.com/cd/E37069_01/html/E37085/gphwb.html
James Morris March 7, 2016, 11:48 p.m. UTC | #52
On 03/08/2016 06:54 AM, Andy Lutomirski wrote:
>
> This makes sense, but I still think the design is poor.  If the hacker
> gets code execution, then they can trivially brute force the ADI bits.
>

ADI in this scenario is intended to prevent the attacker from gaining 
code execution in the first place.
Khalid Aziz March 8, 2016, 12:21 a.m. UTC | #53
On 03/07/2016 12:16 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 11:24:54 -0700
>
>> Tags can be cleared by user by setting tag to 0. Tags are
>> automatically cleared by the hardware when the mapping for a virtual
>> address is removed from TSB (which is why swappable pages are a
>> problem), so kernel does not have to do it as part of clean up.
>
> You might be able to crib some bits for the Tag in the swp_entry_t, it's
> 64-bit and you can therefore steal bits from the offset field.
>
> That way you'll have the ADI tag in the page tables, ready to re-install
> at swapin time.
>

Hi Dave,

Can we enable ADI support for swappable pages in a subsequent update 
after the core functionality is stable on mlock'd pages?

Thanks,
Khalid
Rob Gardner March 8, 2016, 1:31 a.m. UTC | #54
On 03/07/2016 10:39 AM, Khalid Aziz wrote:
> On 03/07/2016 11:12 AM, Dave Hansen wrote:
>> On 03/07/2016 09:53 AM, Andy Lutomirski wrote:
>>> Also, what am I missing?  Tying these tags to the physical page seems
>>> like a poor design to me.  This seems really awkward to use.
>>
>> Yeah, can you describe the structures that store these things? Surely
>> the hardware has some kind of lookup tables for them and stores them in
>> memory _somewhere_.
>>
>
> Version tags are tied to virtual addresses, not physical pages.
>
> Where exactly are the tags stored is part of processor architecture 
> and I am not privy to that. MMU stores these lookup tables somewhere 
> and uses it to authenticate access to virtual addresses. It really is 
> irrelevant to kernel how MMU implements access controls as long as we 
> have access to the knowledge of how to use it.

The tags are stored in physical memory, and you can write a tag directly 
to that memory via stxa with ASI_MCD_REAL and completely bypass the MMU. 
When you do that, the tag will still be seen by any virtual address that 
maps to that physical address.

Rob
David Miller March 8, 2016, 4:13 a.m. UTC | #55
From: Rob Gardner <rob.gardner@oracle.com>
Date: Mon, 7 Mar 2016 15:13:31 -0800

> You can easily read ADI tags with a simple ldxa #ASI_MCD_PRIMARY
> instruction.

Awesome!
David Miller March 8, 2016, 4:24 a.m. UTC | #56
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 17:21:05 -0700

> Can we enable ADI support for swappable pages in a subsequent update
> after the core functionality is stable on mlock'd pages?

I already said no.
James Morris March 8, 2016, 9:33 a.m. UTC | #57
On 03/08/2016 10:48 AM, James Morris wrote:
> On 03/08/2016 06:54 AM, Andy Lutomirski wrote:
>>
>> This makes sense, but I still think the design is poor.  If the hacker
>> gets code execution, then they can trivially brute force the ADI bits.
>>
>
> ADI in this scenario is intended to prevent the attacker from gaining
> code execution in the first place.

Here's some more background from Enrico Perla (who literally wrote the 
book on kernel exploitation):

https://blogs.oracle.com/enrico/entry/hardening_allocators_with_adi

Probably the most significant advantage from a security point of view is 
the ability to eliminate an entire class of vulnerability: adjacent heap 
overflows, as discussed above, where, for example, adjacent heap objects 
are tagged differently.  Classic linear buffer overflows can be eliminated.

As Kees Cook outlined at the 2015 kernel summit, it's best to mitigate 
classes of vulnerabilities rather than patch each instance:

https://outflux.net/slides/2011/defcon/kernel-exploitation.pdf

The Linux ADI implementation is currently very rudimentary, and we 
definitely welcome continued feedback from the community and ideas as it 
evolves.

- James
David Miller March 8, 2016, 7:57 p.m. UTC | #58
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Mon, 7 Mar 2016 14:06:43 -0700

> Good questions. Isn't set of valid VAs already constrained by VA_BITS
> (set to 44 in arch/sparc/include/asm/processor_64.h)? As I see it we
> are already not using the top 4 bits. Please correct me if I am wrong.

Another limiting constraint is the number of address bits coverable by
the 4-level page tables we use.  And this is sign extended so we have
a top-half and a bottom-half with a "hole" in the center of the VA
space.

I want some clarification on the top bits during ADI accesses.

If ADI is enabled, then the top bits of the virtual address are
intepreted as tag bits.  Once "verified" with the ADI settings, what
happense to these tag bits?  Are they dropped from the virtual address
before being passed down the TLB et al. for translations?

If not, then this means you have to map ADI memory to the correct
location so that the tags match up.

And if that's the case, if you really wanted to mix tags within a
single page, you'd have to map that page several times, once for each
and every cacheline granular tag you'd like to use within that page.
Khalid Aziz March 8, 2016, 8:16 p.m. UTC | #59
On 03/08/2016 12:57 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Mon, 7 Mar 2016 14:06:43 -0700
>
>> Good questions. Isn't set of valid VAs already constrained by VA_BITS
>> (set to 44 in arch/sparc/include/asm/processor_64.h)? As I see it we
>> are already not using the top 4 bits. Please correct me if I am wrong.
>
> Another limiting constraint is the number of address bits coverable by
> the 4-level page tables we use.  And this is sign extended so we have
> a top-half and a bottom-half with a "hole" in the center of the VA
> space.
>
> I want some clarification on the top bits during ADI accesses.
>
> If ADI is enabled, then the top bits of the virtual address are
> intepreted as tag bits.  Once "verified" with the ADI settings, what
> happense to these tag bits?  Are they dropped from the virtual address
> before being passed down the TLB et al. for translations?

Bits 63-60 (tag bits) are dropped from the virtual address before being 
passed down the TLB for translation when PSTATE.mcde = 1.

--
Khalid

>
> If not, then this means you have to map ADI memory to the correct
> location so that the tags match up.
>
> And if that's the case, if you really wanted to mix tags within a
> single page, you'd have to map that page several times, once for each
> and every cacheline granular tag you'd like to use within that page.
>
David Miller March 8, 2016, 8:27 p.m. UTC | #60
From: Khalid Aziz <khalid.aziz@oracle.com>
Date: Tue, 8 Mar 2016 13:16:11 -0700

> On 03/08/2016 12:57 PM, David Miller wrote:
>> From: Khalid Aziz <khalid.aziz@oracle.com>
>> Date: Mon, 7 Mar 2016 14:06:43 -0700
>>
>>> Good questions. Isn't set of valid VAs already constrained by VA_BITS
>>> (set to 44 in arch/sparc/include/asm/processor_64.h)? As I see it we
>>> are already not using the top 4 bits. Please correct me if I am wrong.
>>
>> Another limiting constraint is the number of address bits coverable by
>> the 4-level page tables we use.  And this is sign extended so we have
>> a top-half and a bottom-half with a "hole" in the center of the VA
>> space.
>>
>> I want some clarification on the top bits during ADI accesses.
>>
>> If ADI is enabled, then the top bits of the virtual address are
>> intepreted as tag bits.  Once "verified" with the ADI settings, what
>> happense to these tag bits?  Are they dropped from the virtual address
>> before being passed down the TLB et al. for translations?
> 
> Bits 63-60 (tag bits) are dropped from the virtual address before
> being passed down the TLB for translation when PSTATE.mcde = 1.

Ok and you said that values 15 and 0 are special.

I'm just wondering if this means you can't really use ADI mappings in
the top half of the 64-bit address space.  If the bits are dropped, they
will be zero, but they need to be all 1's for the top-half of the VA
space since it's sign extended.
Khalid Aziz March 8, 2016, 8:59 p.m. UTC | #61
On 03/08/2016 01:27 PM, David Miller wrote:
> From: Khalid Aziz <khalid.aziz@oracle.com>
> Date: Tue, 8 Mar 2016 13:16:11 -0700
>
>> On 03/08/2016 12:57 PM, David Miller wrote:
>>> From: Khalid Aziz <khalid.aziz@oracle.com>
>>> Date: Mon, 7 Mar 2016 14:06:43 -0700
>>>
>>>> Good questions. Isn't set of valid VAs already constrained by VA_BITS
>>>> (set to 44 in arch/sparc/include/asm/processor_64.h)? As I see it we
>>>> are already not using the top 4 bits. Please correct me if I am wrong.
>>>
>>> Another limiting constraint is the number of address bits coverable by
>>> the 4-level page tables we use.  And this is sign extended so we have
>>> a top-half and a bottom-half with a "hole" in the center of the VA
>>> space.
>>>
>>> I want some clarification on the top bits during ADI accesses.
>>>
>>> If ADI is enabled, then the top bits of the virtual address are
>>> intepreted as tag bits.  Once "verified" with the ADI settings, what
>>> happense to these tag bits?  Are they dropped from the virtual address
>>> before being passed down the TLB et al. for translations?
>>
>> Bits 63-60 (tag bits) are dropped from the virtual address before
>> being passed down the TLB for translation when PSTATE.mcde = 1.
>
> Ok and you said that values 15 and 0 are special.
>
> I'm just wondering if this means you can't really use ADI mappings in
> the top half of the 64-bit address space.  If the bits are dropped, they
> will be zero, but they need to be all 1's for the top-half of the VA
> space since it's sign extended.
>

According to the manual when PSTATE.mcde=1, bits 63:60 of the virtual 
address of any load or store (using virtual address) are masked before 
being sent to memory system which includes MMU. Hardware TSB walker 
masks bits 63:60 and then sign extends from bit 59 before generating TSB 
pointer and before comparison to TSB TTE VAs but the virtual address in 
the TTE tag that is written to DTLB is masked and not sign extended. 
Manual also states that for implementations that fully support 60 bits 
or more of virtual address, they must sign-extend virtual address in TSB 
TTE tag.

--
Khalid

Patch
diff mbox series

diff --git a/Documentation/prctl/sparc_adi.txt b/Documentation/prctl/sparc_adi.txt
new file mode 100644
index 0000000..9cbdcae
--- /dev/null
+++ b/Documentation/prctl/sparc_adi.txt
@@ -0,0 +1,62 @@ 
+========
+Overview
+========
+
+SPARC M7 processor includes the feature Application Data Integrity (ADI).
+ADI allows a tag to be associated with a virtual memory address range
+and a process must access that memory range with the correct tag. ADI
+tag is embedded in bits 63-60 of virtual address. Once ADI is enabled
+on a range of memory addresses, the process can set a tag for blocks
+in this memory range n the cache using ASI_MCD_PRIMARY or
+ASI_MCD_ST_BLKINIT_PRIMARY. This tag is set for ADI block sized blocks
+which is provided to the kernel by machine description table.
+
+Linux kernel supports an application enabling and setting the ADI tag
+for a subset of its data pages. Those data pages have to be locked in
+memory since saving ADI tags to swap is not supported.
+
+
+New prctl options for ADI
+-------------------------
+
+Following new options to prctl() have been added to support ADI.
+
+	PR_GET_SPARC_ADICAPS - Get ADI capabilities for the processor.
+		These capabilities are used to set up ADI correctly
+		from userspace. Machine description table provides all
+		of the ADI capabilities information. arg2 to prctl() is
+		a pointer to struct adi_caps which is defined in
+		linux/prctl.h.
+
+
+	PR_SET_SPARC_ADI - Set the state of ADI in a user thread by
+		setting PSTATE.mcde bit in the user mode PSTATE register
+		of the calling thread based on the value passed in arg2:
+			1 == enable, 0 == disable, other == no change
+		Return the previous state of the PSTATE.mcde bit:
+			0 == was disabled, 1 == was enabled.
+		Set errno to EINVAL and return -1 if ADI is not available.
+
+
+	PR_ENABLE_SPARC_ADI - Enable ADI checking in all pages in the address
+		range specified. The pages in the range must be already
+		locked. This operation enables the TTE.mcd bit for the
+		pages specified. arg2 is the starting address for address
+		range and must be page aligned. arg3 is the length of
+		memory address range and must be a multiple of page size.
+
+
+	PR_DISABLE_SPARC_ADI - Disable ADI checking on all the pages in the
+		address range specified. This operation disables the
+		TTE.mcd bit for the pages specified. arg2 is the
+		starting address for address range and must be page
+		aligned. arg3 is the length of memory address range and
+		must be a multiple of page size.
+
+
+	PR_GET_SPARC_ADI_STATUS - Check if ADI is enabled or not for a
+		given virtual address. Returns 1 for enabled, else 0.
+
+
+All addresses passed to kernel must be non-ADI tagged addresses.
+Kernel does not enable ADI for kernel code.
diff --git a/Documentation/sparc/adi.txt b/Documentation/sparc/adi.txt
new file mode 100644
index 0000000..ac4a9d9
--- /dev/null
+++ b/Documentation/sparc/adi.txt
@@ -0,0 +1,206 @@ 
+Application Data Integrity (ADI)
+================================
+
+Sparc M7 processor adds the Application Data Integrity (ADI) feature.
+ADI allows a task to set version tags on any subset of its address
+space. Once ADI is enabled and version tags are set for ranges of
+address space of a task, the processor will compare the tag in pointers
+to memory in these ranges to the version set by the application
+previously. Access to memory is granted only if the tag in given
+pointer matches the tag set by the application. In case of mismatch,
+processor raises an exception.  ADI can be enabled on pages that are
+locked in memory, i.e.  are not swappable.
+
+Following steps must be taken by a task to enable ADI fully:
+
+1. Set the user mode PSTATE.mcde bit
+
+2. Set TTE.mcd bit on any TLB entries that correspond to the range of
+addresses ADI is being enabled on.
+
+3. Set the version tag for memory addresses.
+
+Kernel provides prctl() calls to perform steps 1 (PR_SET_SPARC_ADI) and
+2 (PR_ENABLE_SPARC_ADI). Please see Documentation/prctl/sparc_adi.txt
+for more details on these prctl calls. Step 3 is performed with an
+stxa instruction on the address using ASI_MCD_PRIMARY or
+ASI_MCD_ST_BLKINIT_PRIMARY. Version tags are stoed in bits 63-60 of
+address and are set on a cache line. Version tag values of 0x0 and 0xf
+are reserved.
+
+NOTE:	ADI is supported on hugepage only. Hugepages are already locked
+	in memory.
+
+
+ADI related traps
+-----------------
+
+With ADI enabled, following new traps may occur:
+
+Disrupting memory corruption
+
+	When a store accesses a memory localtion that has TTE.mcd=1,
+	the task is running with ADI enabled (PSTATE.mcde=1), and the ADI
+	tag in the address used (bits 63:60) does not match the tag set on
+	the corresponding cacheline, a memory corruption trap occurs. By
+	default, it is a disrupting trap and is sent to the hypervisor
+	first. Hypervisor creates a sun4v error report and sends a
+	resumable error (TT=0x7e) trap to the kernel. The kernel sends
+	a SIGSEGV to the task that resulted in this trap with the following
+	info:
+
+		siginfo.si_signo = SIGSEGV;
+		siginfo.errno = 0;
+		siginfo.si_code = SEGV_ADIDERR;
+		siginfo.si_addr = addr; /* address that caused trap */
+		siginfo.si_trapno = 0;
+
+
+Precise memory corruption
+
+	When a store accesses a memory location that has TTE.mcd=1,
+	the task is running with ADI enabled (PSTATE.mcde=1), and the ADI
+	tag in the address used (bits 63:60) does not match the tag set on
+	the corresponding cacheline, a memory corruption trap occurs. If
+	MCD precise exception is enabled (MCDPERR=1), a precise
+	exception is sent to the kernel with TT=0x1a. The kernel sends
+	a SIGSEGV to the task that resulted in this trap with the following
+	info:
+
+		siginfo.si_signo = SIGSEGV;
+		siginfo.errno = 0;
+		siginfo.si_code = SEGV_ADIPERR;
+		siginfo.si_addr = addr;	/* address that caused trap */
+		siginfo.si_trapno = 0;
+
+	NOTE: ADI tag mismatch on a load always results in precise trap.
+
+
+MCD disabled
+
+	When a task has not enabled ADI and attempts to set ADI version
+	on a memory address, processor sends an MCD disabled trap. This
+	trap is handled by hypervisor first and the hypervisor vectors this
+	trap through to the kernel as Data Access Exception trap with
+	fault type set to 0xa (invalid ASI). When this occurs, the kernel
+	sends the task SIGBUS signal with following info:
+
+		siginfo.si_signo = SIGBUS;
+		siginfo.errno = 0;
+		siginfo.si_code = SEGV_ACCADI;
+		siginfo.si_addr = addr;	/* address that caused trap */
+		siginfo.si_trapno = 0;
+
+
+Sample program to use ADI
+-------------------------
+
+Following sample program is meant to illustrate how to use the ADI
+functionality with the default 8M hugepages.
+
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/ipc.h>
+#include <sys/shm.h>
+#include <asm/asi.h>
+#include <linux/prctl.h>
+
+#define BUFFER_SIZE	32*1024*1024
+
+struct adi_caps adicap;
+
+main()
+{
+	unsigned long i, mcde;
+	char *shmaddr, *tmp_addr, *end, *veraddr, *clraddr;
+	int shmid, version;
+
+	if ((shmid = shmget(2, BUFFER_SIZE,
+				SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W)) < 0) {
+		perror("shmget failed");
+		exit(1);
+	}
+
+	shmaddr = shmat(shmid, NULL, 0);
+	if (shmaddr == (char *)-1) {
+		perror("shm attach failed");
+		shmctl(shmid, IPC_RMID, NULL);
+		exit(1);
+	}
+
+	/* Get the values for various ADI capabilities bits. These will
+	 * be used later for setting the ADI tag
+	 */
+	if (prctl(PR_GET_SPARC_ADICAPS, &adicap) < 0) {
+		perror("PR_GET_SPARC_ADICAPS failed");
+		goto err_out;
+	}
+
+	/* Set PSTATE.mcde
+	 */
+	if ((mcde = prctl(PR_SET_SPARC_ADI, PR_SET_SPARC_ADI_SET)) < 0) {
+		perror("PR_SET_SPARC_ADI failed");
+		goto err_out;
+	}
+
+	/* Set TTE.mcd on the address range for shm segment
+	 */
+	if (prctl(PR_ENABLE_SPARC_ADI, shmaddr, BUFFER_SIZE) < 0) {
+		perror("prctl failed");
+		goto err_out;
+	}
+
+	/* Set the ADI version tag on the shm segment
+	 */
+	version = 10;
+	tmp_addr = shmaddr;
+	end = shmaddr + BUFFER_SIZE;
+	while (tmp_addr < end) {
+		asm volatile(
+			"stxa %1, [%0]ASI_MCD_PRIMARY\n\t"
+			:
+			: "r" (tmp_addr), "r" (version));
+		tmp_addr += adicap.blksz;
+	}
+
+	/* Create a versioned address from the normal address
+	 */
+	tmp_addr = (void *) ((unsigned long)shmaddr << adicap.nbits);
+	tmp_addr = (void *) ((unsigned long)tmp_addr >> adicap.nbits);
+	veraddr = (void *) (((unsigned long)version << (64-adicap.nbits))
+			| (unsigned long)tmp_addr);
+
+	printf("Starting the writes:\n");
+	for (i = 0; i < BUFFER_SIZE; i++) {
+		veraddr[i] = (char)(i);
+		if (!(i % (1024 * 1024)))
+			printf(".");
+	}
+	printf("\n");
+
+	printf("Verifying data...");
+	for (i = 0; i < BUFFER_SIZE; i++)
+		if (veraddr[i] != (char)i)
+			printf("\nIndex %lu mismatched\n", i);
+	printf("Done.\n");
+
+	/* Disable ADI and clean up
+	 */
+	if (prctl(PR_DISABLE_SPARC_ADI, shmaddr, BUFFER_SIZE) < 0) {
+		perror("prctl failed");
+		goto err_out;
+	}
+
+	if (shmdt((const void *)shmaddr) != 0)
+		perror("Detach failure");
+	shmctl(shmid, IPC_RMID, NULL);
+
+	exit(0);
+
+err_out:
+	if (shmdt((const void *)shmaddr) != 0)
+		perror("Detach failure");
+	shmctl(shmid, IPC_RMID, NULL);
+	exit(1);
+}
diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig
index 56442d2..0aac0ae 100644
--- a/arch/sparc/Kconfig
+++ b/arch/sparc/Kconfig
@@ -80,6 +80,7 @@  config SPARC64
 	select NO_BOOTMEM
 	select HAVE_ARCH_AUDITSYSCALL
 	select ARCH_SUPPORTS_ATOMIC_RMW
+	select SPARC_ADI
 
 config ARCH_DEFCONFIG
 	string
@@ -314,6 +315,17 @@  if SPARC64
 source "kernel/power/Kconfig"
 endif
 
+config SPARC_ADI
+	bool "Application Data Integrity support"
+	def_bool y if SPARC64
+	help
+	  Support for Application Data Integrity (ADI). ADI feature allows
+	  a process to tag memory blocks with version tags. Once ADI is
+	  enabled and version tag is set on a memory block, any access to
+	  it is allowed only if the correct version tag is presented by
+	  a process. This feature is meant to help catch rogue accesses
+	  to memory.
+
 config SCHED_SMT
 	bool "SMT (Hyperthreading) scheduler support"
 	depends on SPARC64 && SMP
diff --git a/arch/sparc/include/asm/hugetlb.h b/arch/sparc/include/asm/hugetlb.h
index 139e711..5e7547c 100644
--- a/arch/sparc/include/asm/hugetlb.h
+++ b/arch/sparc/include/asm/hugetlb.h
@@ -82,4 +82,18 @@  static inline void arch_clear_hugepage_flags(struct page *page)
 {
 }
 
+#ifdef CONFIG_SPARC_ADI
+static inline pte_t arch_make_huge_pte(pte_t entry, struct vm_area_struct *vma,
+			 struct page *page, int writeable)
+{
+	/* If this vma has ADI enabled on it, turn on TTE.mcd
+	 */
+	if (vma->vm_flags & VM_SPARC_ADI)
+		return pte_mkmcd(entry);
+	else
+		return pte_mknotmcd(entry);
+}
+#define arch_make_huge_pte arch_make_huge_pte
+#endif
+
 #endif /* _ASM_SPARC64_HUGETLB_H */
diff --git a/arch/sparc/include/asm/hypervisor.h b/arch/sparc/include/asm/hypervisor.h
index f5b6537..2940bb3 100644
--- a/arch/sparc/include/asm/hypervisor.h
+++ b/arch/sparc/include/asm/hypervisor.h
@@ -547,6 +547,8 @@  struct hv_fault_status {
 #define HV_FAULT_TYPE_RESV1	13
 #define HV_FAULT_TYPE_UNALIGNED	14
 #define HV_FAULT_TYPE_INV_PGSZ	15
+#define HV_FAULT_TYPE_MCD	17
+#define HV_FAULT_TYPE_MCD_DIS	18
 /* Values 16 --> -2 are reserved.  */
 #define HV_FAULT_TYPE_MULTIPLE	-1
 
diff --git a/arch/sparc/include/asm/mmu_64.h b/arch/sparc/include/asm/mmu_64.h
index 70067ce..8e98741 100644
--- a/arch/sparc/include/asm/mmu_64.h
+++ b/arch/sparc/include/asm/mmu_64.h
@@ -95,6 +95,7 @@  typedef struct {
 	unsigned long		huge_pte_count;
 	struct tsb_config	tsb_block[MM_NUM_TSBS];
 	struct hv_tsb_descr	tsb_descr[MM_NUM_TSBS];
+	unsigned char		adi;
 } mm_context_t;
 
 #endif /* !__ASSEMBLY__ */
diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h
index 131d36f..cddea30 100644
--- a/arch/sparc/include/asm/pgtable_64.h
+++ b/arch/sparc/include/asm/pgtable_64.h
@@ -162,6 +162,9 @@  bool kern_addr_valid(unsigned long addr);
 #define _PAGE_E_4V	  _AC(0x0000000000000800,UL) /* side-Effect          */
 #define _PAGE_CP_4V	  _AC(0x0000000000000400,UL) /* Cacheable in P-Cache */
 #define _PAGE_CV_4V	  _AC(0x0000000000000200,UL) /* Cacheable in V-Cache */
+/* Bit 9 is used to enable MCD corruption detection instead on M7
+ */
+#define _PAGE_MCD_4V	  _AC(0x0000000000000200,UL) /* Memory Corruption    */
 #define _PAGE_P_4V	  _AC(0x0000000000000100,UL) /* Privileged Page      */
 #define _PAGE_EXEC_4V	  _AC(0x0000000000000080,UL) /* Executable Page      */
 #define _PAGE_W_4V	  _AC(0x0000000000000040,UL) /* Writable             */
@@ -541,6 +544,18 @@  static inline pte_t pte_mkspecial(pte_t pte)
 	return pte;
 }
 
+static inline pte_t pte_mkmcd(pte_t pte)
+{
+	pte_val(pte) |= _PAGE_MCD_4V;
+	return pte;
+}
+
+static inline pte_t pte_mknotmcd(pte_t pte)
+{
+	pte_val(pte) &= ~_PAGE_MCD_4V;
+	return pte;
+}
+
 static inline unsigned long pte_young(pte_t pte)
 {
 	unsigned long mask;
diff --git a/arch/sparc/include/asm/processor_64.h b/arch/sparc/include/asm/processor_64.h
index 6924bde..9a71701 100644
--- a/arch/sparc/include/asm/processor_64.h
+++ b/arch/sparc/include/asm/processor_64.h
@@ -97,6 +97,25 @@  struct thread_struct {
 struct task_struct;
 unsigned long thread_saved_pc(struct task_struct *);
 
+#ifdef CONFIG_SPARC_ADI
+extern struct adi_caps *get_adi_caps(void);
+extern long get_sparc_adicaps(unsigned long);
+extern long set_sparc_pstate_mcde(unsigned long);
+extern long enable_sparc_adi(unsigned long, unsigned long);
+extern long disable_sparc_adi(unsigned long, unsigned long);
+extern long get_sparc_adi_status(unsigned long);
+extern bool adi_capable(void);
+
+#define GET_SPARC_ADICAPS(a)	get_sparc_adicaps(a)
+#define SET_SPARC_MCDE(a)	set_sparc_pstate_mcde(a)
+#define ENABLE_SPARC_ADI(a, b)	enable_sparc_adi(a, b)
+#define DISABLE_SPARC_ADI(a, b)	disable_sparc_adi(a, b)
+#define GET_SPARC_ADI_STATUS(a)	get_sparc_adi_status(a)
+#define ADI_CAPABLE()		adi_capable()
+#else
+#define ADI_CAPABLE()		0
+#endif
+
 /* On Uniprocessor, even in RMO processes see TSO semantics */
 #ifdef CONFIG_SMP
 #define TSTATE_INITIAL_MM	TSTATE_TSO
diff --git a/arch/sparc/include/asm/ttable.h b/arch/sparc/include/asm/ttable.h
index 71b5a67..342b457 100644
--- a/arch/sparc/include/asm/ttable.h
+++ b/arch/sparc/include/asm/ttable.h
@@ -212,6 +212,16 @@ 
 	nop;						\
 	nop;
 
+#define SUN4V_MCD_PRECISE				\
+	ldxa	[%g0] ASI_SCRATCHPAD, %g2;		\
+	ldx	[%g2 + HV_FAULT_D_ADDR_OFFSET], %g4;	\
+	ldx	[%g2 + HV_FAULT_D_CTX_OFFSET], %g5;	\
+	ba,pt	%xcc, sun4v_mcd_detect_precise;		\
+	 nop;						\
+	nop;						\
+	nop;						\
+	nop;
+
 /* Before touching these macros, you owe it to yourself to go and
  * see how arch/sparc64/kernel/winfixup.S works... -DaveM
  *
diff --git a/arch/sparc/include/uapi/asm/asi.h b/arch/sparc/include/uapi/asm/asi.h
index 7ad7203d..7d099ac 100644
--- a/arch/sparc/include/uapi/asm/asi.h
+++ b/arch/sparc/include/uapi/asm/asi.h
@@ -244,6 +244,9 @@ 
 #define ASI_UDBL_CONTROL_R	0x7f /* External UDB control regs rd low*/
 #define ASI_INTR_R		0x7f /* IRQ vector dispatch read	*/
 #define ASI_INTR_DATAN_R	0x7f /* (III) In irq vector data reg N	*/
+#define ASI_MCD_PRIMARY		0x90 /* (NG7) MCD version load/store	*/
+#define ASI_MCD_ST_BLKINIT_PRIMARY	\
+				0x92 /* (NG7) MCD store BLKINIT primary	*/
 #define ASI_PIC			0xb0 /* (NG4) PIC registers		*/
 #define ASI_PST8_P		0xc0 /* Primary, 8 8-bit, partial	*/
 #define ASI_PST8_S		0xc1 /* Secondary, 8 8-bit, partial	*/
diff --git a/arch/sparc/include/uapi/asm/pstate.h b/arch/sparc/include/uapi/asm/pstate.h
index cf832e1..d0521db 100644
--- a/arch/sparc/include/uapi/asm/pstate.h
+++ b/arch/sparc/include/uapi/asm/pstate.h
@@ -10,7 +10,12 @@ 
  * -----------------------------------------------------------------------
  *  63  12  11   10    9     8    7   6   5     4     3     2     1    0
  */
+/* IG on V9 conflicts with MCDE on M7. PSTATE_MCDE will only be used on
+ * processors that support ADI which do not use IG, hence there is no
+ * functional conflict
+ */
 #define PSTATE_IG   _AC(0x0000000000000800,UL) /* Interrupt Globals.	*/
+#define PSTATE_MCDE _AC(0x0000000000000800,UL) /* MCD Enable		*/
 #define PSTATE_MG   _AC(0x0000000000000400,UL) /* MMU Globals.		*/
 #define PSTATE_CLE  _AC(0x0000000000000200,UL) /* Current Little Endian.*/
 #define PSTATE_TLE  _AC(0x0000000000000100,UL) /* Trap Little Endian.	*/
@@ -47,7 +52,12 @@ 
 #define TSTATE_ASI	_AC(0x00000000ff000000,UL) /* AddrSpace ID.	*/
 #define TSTATE_PIL	_AC(0x0000000000f00000,UL) /* %pil (Linux traps)*/
 #define TSTATE_PSTATE	_AC(0x00000000000fff00,UL) /* PSTATE.		*/
+/* IG on V9 conflicts with MCDE on M7. TSTATE_MCDE will only be used on
+ * processors that support ADI which do not support IG, hence there is
+ * no functional conflict
+ */
 #define TSTATE_IG	_AC(0x0000000000080000,UL) /* Interrupt Globals.*/
+#define TSTATE_MCDE	_AC(0x0000000000080000,UL) /* MCD enable.       */
 #define TSTATE_MG	_AC(0x0000000000040000,UL) /* MMU Globals.	*/
 #define TSTATE_CLE	_AC(0x0000000000020000,UL) /* CurrLittleEndian.	*/
 #define TSTATE_TLE	_AC(0x0000000000010000,UL) /* TrapLittleEndian.	*/
diff --git a/arch/sparc/kernel/entry.h b/arch/sparc/kernel/entry.h
index 0f67942..2078468 100644
--- a/arch/sparc/kernel/entry.h
+++ b/arch/sparc/kernel/entry.h
@@ -159,6 +159,9 @@  void sun4v_resum_overflow(struct pt_regs *regs);
 void sun4v_nonresum_error(struct pt_regs *regs,
 			  unsigned long offset);
 void sun4v_nonresum_overflow(struct pt_regs *regs);
+void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs,
+				      unsigned long addr,
+				      unsigned long context);
 
 extern unsigned long sun4v_err_itlb_vaddr;
 extern unsigned long sun4v_err_itlb_ctx;
diff --git a/arch/sparc/kernel/head_64.S b/arch/sparc/kernel/head_64.S
index f2d30ca..f4a880b 100644
--- a/arch/sparc/kernel/head_64.S
+++ b/arch/sparc/kernel/head_64.S
@@ -878,6 +878,7 @@  sparc64_boot_end:
 #include "helpers.S"
 #include "hvcalls.S"
 #include "sun4v_tlb_miss.S"
+#include "sun4v_mcd.S"
 #include "sun4v_ivec.S"
 #include "ktlb.S"
 #include "tsb.S"
diff --git a/arch/sparc/kernel/mdesc.c b/arch/sparc/kernel/mdesc.c
index 6f80936..79f981c 100644
--- a/arch/sparc/kernel/mdesc.c
+++ b/arch/sparc/kernel/mdesc.c
@@ -12,6 +12,7 @@ 
 #include <linux/miscdevice.h>
 #include <linux/bootmem.h>
 #include <linux/export.h>
+#include <linux/prctl.h>
 
 #include <asm/cpudata.h>
 #include <asm/hypervisor.h>
@@ -512,6 +513,11 @@  EXPORT_SYMBOL(mdesc_node_name);
 
 static u64 max_cpus = 64;
 
+static struct {
+	bool enabled;
+	struct adi_caps caps;
+} adi_state;
+
 static void __init report_platform_properties(void)
 {
 	struct mdesc_handle *hp = mdesc_grab();
@@ -1007,6 +1013,80 @@  static int mdesc_open(struct inode *inode, struct file *file)
 	return 0;
 }
 
+bool adi_capable(void)
+{
+	return adi_state.enabled;
+}
+
+struct adi_caps *get_adi_caps(void)
+{
+	return &adi_state.caps;
+}
+
+void __init
+init_adi(void)
+{
+	struct mdesc_handle *hp = mdesc_grab();
+	const char *prop;
+	u64 pn, *val;
+	int len;
+
+	adi_state.enabled = false;
+
+	if (!hp)
+		return;
+
+	pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "cpu");
+	if (pn == MDESC_NODE_NULL)
+		goto out;
+
+	prop = mdesc_get_property(hp, pn, "hwcap-list", &len);
+	if (!prop)
+		goto out;
+
+	/*
+	 * Look for "adp" keyword in hwcap-list which would indicate
+	 * ADI support
+	 */
+	while (len) {
+		int plen;
+
+		if (!strcmp(prop, "adp")) {
+			adi_state.enabled = true;
+			break;
+		}
+
+		plen = strlen(prop) + 1;
+		prop += plen;
+		len -= plen;
+	}
+
+	if (!adi_state.enabled)
+		goto out;
+
+	pn = mdesc_node_by_name(hp, MDESC_NODE_NULL, "platform");
+	if (pn == MDESC_NODE_NULL)
+		goto out;
+
+	val = (u64 *) mdesc_get_property(hp, pn, "adp-blksz", &len);
+	if (!val)
+		goto out;
+	adi_state.caps.blksz = *val;
+
+	val = (u64 *) mdesc_get_property(hp, pn, "adp-nbits", &len);
+	if (!val)
+		goto out;
+	adi_state.caps.nbits = *val;
+
+	val = (u64 *) mdesc_get_property(hp, pn, "ue-on-adp", &len);
+	if (!val)
+		goto out;
+	adi_state.caps.ue_on_adi = *val;
+
+out:
+	mdesc_release(hp);
+}
+
 static ssize_t mdesc_read(struct file *file, char __user *buf,
 			  size_t len, loff_t *offp)
 {
@@ -1110,5 +1190,6 @@  void __init sun4v_mdesc_init(void)
 
 	cur_mdesc = hp;
 
+	init_adi();
 	report_platform_properties();
 }
diff --git a/arch/sparc/kernel/process_64.c b/arch/sparc/kernel/process_64.c
index 46a5964..d2a7cac 100644
--- a/arch/sparc/kernel/process_64.c
+++ b/arch/sparc/kernel/process_64.c
@@ -32,6 +32,9 @@ 
 #include <linux/sysrq.h>
 #include <linux/nmi.h>
 #include <linux/context_tracking.h>
+#include <linux/prctl.h>
+#include <linux/hugetlb.h>
+#include <linux/mempolicy.h>
 
 #include <asm/uaccess.h>
 #include <asm/page.h>
@@ -777,3 +780,222 @@  unsigned long get_wchan(struct task_struct *task)
 out:
 	return ret;
 }
+
+#ifdef CONFIG_SPARC_ADI
+long get_sparc_adicaps(unsigned long val)
+{
+	struct adi_caps *caps;
+
+	if (!ADI_CAPABLE())
+		return -EINVAL;
+
+	caps = get_adi_caps();
+	if (val)
+		if (copy_to_user((void *)val, caps, sizeof(struct adi_caps)))
+			return -EFAULT;
+	return 0;
+}
+
+long set_sparc_pstate_mcde(unsigned long val)
+{
+	unsigned long error;
+	struct pt_regs *regs;
+
+	if (!ADI_CAPABLE())
+		return -EINVAL;
+
+	/* We do not allow anonymous tasks to enable ADI because they
+	 * run in borrowed aadress space.
+	 */
+	if (current->mm == NULL)
+		return -EINVAL;
+
+	regs = task_pt_regs(current);
+	if (regs->tstate & TSTATE_MCDE)
+		error = 1;
+	else
+		error = 0;
+	switch (val) {
+	case 1:
+		regs->tstate |= TSTATE_MCDE;
+		current->mm->context.adi = 1;
+		break;
+	case 0:
+		regs->tstate &= ~TSTATE_MCDE;
+		current->mm->context.adi = 0;
+		break;
+	default:
+		break;
+	}
+
+	return error;
+}
+
+long enable_sparc_adi(unsigned long addr, unsigned long len)
+{
+	unsigned long end, pagemask;
+	int error;
+	struct vm_area_struct *vma, *vma2;
+	struct mm_struct *mm;
+
+	if (!ADI_CAPABLE())
+		return -EINVAL;
+
+	vma = find_vma(current->mm, addr);
+	if (unlikely(!vma) || (vma->vm_start > addr))
+		return -EFAULT;
+
+	/* ADI is supported for hugepages only
+	 */
+	if (!is_vm_hugetlb_page(vma))
+		return -EFAULT;
+
+	/* Is the start address page aligned and is the length multiple
+	 * of page size?
+	 */
+	pagemask = ~(vma_kernel_pagesize(vma) - 1);
+	if (addr & ~pagemask)
+		return -EINVAL;
+	if (len & ~pagemask)
+		return -EINVAL;
+
+	end = addr + len;
+	if (end == addr)
+		return 0;
+
+	/* Verify end of the region is not out of bounds
+	 */
+	vma2 = find_vma(current->mm, end-1);
+	if (unlikely(!vma2) || (vma2->vm_start > end))
+		return -EFAULT;
+
+	error = 0;
+	while (1) {
+		/* If the address space ADI is to be enabled in, does not cover
+		 * this vma in its entirety, we will need to split it.
+		 */
+		mm = vma->vm_mm;
+		if (addr != vma->vm_start) {
+			error = split_vma(mm, vma, addr, 1);
+			if (error)
+				goto out;
+		}
+
+		if (end < vma->vm_end) {
+			error = split_vma(mm, vma, end, 0);
+			if (error)
+				goto out;
+		}
+
+		/* Update the ADI info in vma and PTE
+		 */
+		vma->vm_flags |= VM_SPARC_ADI;
+
+		if (end > vma->vm_end) {
+			change_protection(vma, addr, vma->vm_end,
+					  vma->vm_page_prot,
+					  vma_wants_writenotify(vma), 0);
+			addr = vma->vm_end;
+		} else {
+			change_protection(vma, addr, end, vma->vm_page_prot,
+					vma_wants_writenotify(vma), 0);
+			break;
+		}
+
+		vma = find_vma(current->mm, addr);
+		if (unlikely(!vma) || (vma->vm_start > addr))
+			return -EFAULT;
+	}
+out:
+	if (error == -ENOMEM)
+		error = -EAGAIN;
+	return error;
+}
+
+long disable_sparc_adi(unsigned long addr, unsigned long len)
+{
+	unsigned long end, pagemask;
+	struct vm_area_struct *vma, *vma2, *prev;
+	struct mm_struct *mm;
+	pgoff_t pgoff;
+
+	if (!ADI_CAPABLE())
+		return -EINVAL;
+
+	vma = find_vma(current->mm, addr);
+	if (unlikely(!vma) || (vma->vm_start > addr))
+		return -EFAULT;
+
+	/* ADI is supported for hugepages only
+	 */
+	if (!is_vm_hugetlb_page(vma))
+		return -EINVAL;
+
+	/* Is the start address page aligned and is the length multiple
+	 * of page size?
+	 */
+	pagemask = ~(vma_kernel_pagesize(vma) - 1);
+	if (addr & ~pagemask)
+		return -EINVAL;
+	if (len & ~pagemask)
+		return -EINVAL;
+
+	end = addr + len;
+	if (end == addr)
+		return 0;
+
+	/* Verify end of the region is not out of bounds
+	 */
+	vma2 = find_vma(current->mm, end-1);
+	if (unlikely(!vma2) || (vma2->vm_start > end))
+		return -EFAULT;
+
+	while (1) {
+		mm = vma->vm_mm;
+
+		/* Update the ADI info in vma and check if this vma can
+		 * be merged with adjacent ones
+		 */
+		pgoff = vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT);
+		prev = vma_merge(mm, prev, addr, end, vma->vm_flags,
+				 vma->anon_vma, vma->vm_file, pgoff,
+				 vma_policy(vma), vma->vm_userfaultfd_ctx);
+		if (prev)
+			vma = prev;
+
+		vma->vm_flags &= ~VM_SPARC_ADI;
+		if (end > vma->vm_end) {
+			change_protection(vma, addr, vma->vm_end,
+					  vma->vm_page_prot,
+					  vma_wants_writenotify(vma), 0);
+			addr = vma->vm_end;
+		} else {
+			change_protection(vma, addr, end, vma->vm_page_prot,
+					  vma_wants_writenotify(vma), 0);
+			break;
+		}
+
+		vma = find_vma_prev(current->mm, addr, &prev);
+		if (unlikely(!vma) || (vma->vm_start > addr))
+			return -EFAULT;
+	}
+	return 0;
+}
+
+long get_sparc_adi_status(unsigned long addr)
+{
+	struct vm_area_struct *vma;
+
+	if (!ADI_CAPABLE())
+		return -EINVAL;
+
+	vma = find_vma(current->mm, addr);
+	if (unlikely(!vma) || (vma->vm_start > addr))
+		return -EFAULT;
+
+	if (vma->vm_flags & VM_SPARC_ADI)
+		return 1;
+
+	return 0;
+}
+#endif
diff --git a/arch/sparc/kernel/sun4v_mcd.S b/arch/sparc/kernel/sun4v_mcd.S
new file mode 100644
index 0000000..d1d1259
--- /dev/null
+++ b/arch/sparc/kernel/sun4v_mcd.S
@@ -0,0 +1,16 @@ 
+/* sun4v_mcd.S: Sun4v memory corruption detected precise exception handler
+ *
+ * Copyright (C) 2015 Bob Picco <bob.picco@oracle.com>
+ * Copyright (C) 2015 Khalid Aziz <khalid.aziz@oracle.com>
+ */
+	.text
+	.align 32
+
+sun4v_mcd_detect_precise:
+	ba,pt	%xcc, etrap
+	rd	%pc, %g7
+	or	%l4, %g0, %o1
+	or 	%l5, %g0, %o2
+	call	sun4v_mem_corrupt_detect_precise
+	add	%sp, PTREGS_OFF, %o0
+	ba,a,pt	%xcc, rtrap
diff --git a/arch/sparc/kernel/traps_64.c b/arch/sparc/kernel/traps_64.c
index d21cd62..29db583 100644
--- a/arch/sparc/kernel/traps_64.c
+++ b/arch/sparc/kernel/traps_64.c
@@ -351,12 +351,31 @@  void sun4v_data_access_exception(struct pt_regs *regs, unsigned long addr, unsig
 		regs->tpc &= 0xffffffff;
 		regs->tnpc &= 0xffffffff;
 	}
-	info.si_signo = SIGSEGV;
+
+	/* MCD (Memory Corruption Detection) disabled trap (TT=0x19) in HV
+	 * is vectored thorugh data access exception trap with fault type
+	 * set to HV_FAULT_TYPE_MCD_DIS. Check for MCD disabled trap
+	 */
 	info.si_errno = 0;
-	info.si_code = SEGV_MAPERR;
 	info.si_addr = (void __user *) addr;
 	info.si_trapno = 0;
-	force_sig_info(SIGSEGV, &info, current);
+	switch (type) {
+	case HV_FAULT_TYPE_INV_ASI:
+		info.si_signo = SIGILL;
+		info.si_code = ILL_ILLADR;
+		force_sig_info(SIGILL, &info, current);
+		break;
+	case HV_FAULT_TYPE_MCD_DIS:
+		info.si_signo = SIGSEGV;
+		info.si_code = SEGV_ACCADI;
+		force_sig_info(SIGSEGV, &info, current);
+		break;
+	default:
+		info.si_signo = SIGSEGV;
+		info.si_code = SEGV_MAPERR;
+		force_sig_info(SIGSEGV, &info, current);
+		break;
+	}
 }
 
 void sun4v_data_access_exception_tl1(struct pt_regs *regs, unsigned long addr, unsigned long type_ctx)
@@ -1801,6 +1820,7 @@  struct sun4v_error_entry {
 #define SUN4V_ERR_ATTRS_ASI		0x00000080
 #define SUN4V_ERR_ATTRS_PRIV_REG	0x00000100
 #define SUN4V_ERR_ATTRS_SPSTATE_MSK	0x00000600
+#define SUN4V_ERR_ATTRS_MCD		0x00000800
 #define SUN4V_ERR_ATTRS_SPSTATE_SHFT	9
 #define SUN4V_ERR_ATTRS_MODE_MSK	0x03000000
 #define SUN4V_ERR_ATTRS_MODE_SHFT	24
@@ -1998,6 +2018,36 @@  static void sun4v_log_error(struct pt_regs *regs, struct sun4v_error_entry *ent,
 	}
 }
 
+/* Handle memory corruption detected error which is vectored in
+ * through resumable error trap.
+ */
+void do_mcd_err(struct pt_regs *regs, struct sun4v_error_entry ent)
+{
+	siginfo_t info;
+
+	if (notify_die(DIE_TRAP, "MCD error", regs,
+		       0, 0x34, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		/* ADI tag mismatch in kernel mode means illegal access to
+		 * kernel memory through rogue means potentially.
+		 */
+		pr_emerg("mcd_err: ADI tag mismatch in kernel at "
+			"ADDR[%016llx], going.\n", ent.err_raddr);
+		die_if_kernel("MCD error", regs);
+	}
+
+	/* Send SIGSEGV to the userspace process with the right code
+	 */
+	info.si_signo = SIGSEGV;
+	info.si_errno = 0;
+	info.si_code = SEGV_ADIDERR;
+	info.si_addr = (void __user *)ent.err_raddr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
 /* We run with %pil set to PIL_NORMAL_MAX and PSTATE_IE enabled in %pstate.
  * Log the event and clear the first word of the entry.
  */
@@ -2035,6 +2085,14 @@  void sun4v_resum_error(struct pt_regs *regs, unsigned long offset)
 		goto out;
 	}
 
+	/* If this is a memory corruption detected error, call the
+	 * handler
+	 */
+	if (local_copy.err_attrs & SUN4V_ERR_ATTRS_MCD) {
+		do_mcd_err(regs, local_copy);
+		return;
+	}
+
 	sun4v_log_error(regs, &local_copy, cpu,
 			KERN_ERR "RESUMABLE ERROR",
 			&sun4v_resum_oflow_cnt);
@@ -2531,6 +2589,38 @@  void sun4v_do_mna(struct pt_regs *regs, unsigned long addr, unsigned long type_c
 	force_sig_info(SIGBUS, &info, current);
 }
 
+void sun4v_mem_corrupt_detect_precise(struct pt_regs *regs, unsigned long addr,
+				      unsigned long context)
+{
+	siginfo_t info;
+
+	if (!ADI_CAPABLE()) {
+		bad_trap(regs, 0x1a);
+		return;
+	}
+
+	if (notify_die(DIE_TRAP, "memory corruption precise exception", regs,
+		       0, 0x8, SIGSEGV) == NOTIFY_STOP)
+		return;
+
+	if (regs->tstate & TSTATE_PRIV) {
+		pr_emerg("sun4v_mem_corrupt_detect_precise: ADDR[%016lx] "
+			"CTX[%lx], going.\n", addr, context);
+		die_if_kernel("MCD precise", regs);
+	}
+
+	if (test_thread_flag(TIF_32BIT)) {
+		regs->tpc &= 0xffffffff;
+		regs->tnpc &= 0xffffffff;
+	}
+	info.si_signo = SIGSEGV;
+	info.si_code = SEGV_ADIPERR;
+	info.si_errno = 0;
+	info.si_addr = (void __user *) addr;
+	info.si_trapno = 0;
+	force_sig_info(SIGSEGV, &info, current);
+}
+
 void do_privop(struct pt_regs *regs)
 {
 	enum ctx_state prev_state = exception_enter();
diff --git a/arch/sparc/kernel/ttable_64.S b/arch/sparc/kernel/ttable_64.S
index c6dfdaa..2343bf0 100644
--- a/arch/sparc/kernel/ttable_64.S
+++ b/arch/sparc/kernel/ttable_64.S
@@ -25,8 +25,10 @@  tl0_ill:	membar #Sync
 		TRAP_7INSNS(do_illegal_instruction)
 tl0_privop:	TRAP(do_privop)
 tl0_resv012:	BTRAP(0x12) BTRAP(0x13) BTRAP(0x14) BTRAP(0x15) BTRAP(0x16) BTRAP(0x17)
-tl0_resv018:	BTRAP(0x18) BTRAP(0x19) BTRAP(0x1a) BTRAP(0x1b) BTRAP(0x1c) BTRAP(0x1d)
-tl0_resv01e:	BTRAP(0x1e) BTRAP(0x1f)
+tl0_resv018:	BTRAP(0x18) BTRAP(0x19)
+tl0_mcd:	SUN4V_MCD_PRECISE
+tl0_resv01b:	BTRAP(0x1b)
+tl0_resv01c:	BTRAP(0x1c) BTRAP(0x1d)	BTRAP(0x1e) BTRAP(0x1f)
 tl0_fpdis:	TRAP_NOSAVE(do_fpdis)
 tl0_fpieee:	TRAP_SAVEFPU(do_fpieee)
 tl0_fpother:	TRAP_NOSAVE(do_fpother_check_fitos)
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 00bad77..5a80219 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -168,6 +168,8 @@  extern unsigned int kobjsize(const void *objp);
 # define VM_GROWSUP	VM_ARCH_1
 #elif defined(CONFIG_IA64)
 # define VM_GROWSUP	VM_ARCH_1
+#elif defined(CONFIG_SPARC64)
+# define VM_SPARC_ADI	VM_ARCH_1	/* Uses ADI tag for access control */
 #elif !defined(CONFIG_MMU)
 # define VM_MAPPED_COPY	VM_ARCH_1	/* T if mapped copy of data (nommu mmap) */
 #endif
diff --git a/include/uapi/asm-generic/siginfo.h b/include/uapi/asm-generic/siginfo.h
index 1e35520..8235d6e 100644
--- a/include/uapi/asm-generic/siginfo.h
+++ b/include/uapi/asm-generic/siginfo.h
@@ -206,7 +206,10 @@  typedef struct siginfo {
 #define SEGV_MAPERR	(__SI_FAULT|1)	/* address not mapped to object */
 #define SEGV_ACCERR	(__SI_FAULT|2)	/* invalid permissions for mapped object */
 #define SEGV_BNDERR	(__SI_FAULT|3)  /* failed address bound checks */
-#define NSIGSEGV	3
+#define SEGV_ACCADI	(__SI_FAULT|4)	/* ADI not enabled for mapped object */
+#define SEGV_ADIDERR	(__SI_FAULT|5)	/* Disrupting MCD error */
+#define SEGV_ADIPERR	(__SI_FAULT|6)	/* Precise MCD exception */
+#define NSIGSEGV	6
 
 /*
  * SIGBUS si_codes
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index a8d0759..422c246 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -197,4 +197,20 @@  struct prctl_mm_map {
 # define PR_CAP_AMBIENT_LOWER		3
 # define PR_CAP_AMBIENT_CLEAR_ALL	4
 
+/* SPARC ADI operations, see Documentation/prctl/sparc_adi.txt for details */
+#define PR_GET_SPARC_ADICAPS	48
+#define PR_SET_SPARC_ADI	49
+# define PR_SET_SPARC_ADI_CLEAR	0
+# define PR_SET_SPARC_ADI_SET	1
+#define PR_ENABLE_SPARC_ADI	50
+#define PR_DISABLE_SPARC_ADI	51
+#define PR_GET_SPARC_ADI_STATUS	52
+
+/* Data structure returned by PR_GET_SPARC_ADICAPS */
+struct adi_caps {
+	__u64 blksz;
+	__u64 nbits;
+	__u64 ue_on_adi;
+};
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/sys.c b/kernel/sys.c
index 6af9212..fa7b5d9 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -103,6 +103,21 @@ 
 #ifndef SET_FP_MODE
 # define SET_FP_MODE(a,b)	(-EINVAL)
 #endif
+#ifndef GET_SPARC_ADICAPS
+# define GET_SPARC_ADICAPS(a)		(-EINVAL)
+#endif
+#ifndef SET_SPARC_MCDE
+# define SET_SPARC_MCDE(a)		(-EINVAL)
+#endif
+#ifndef ENABLE_SPARC_ADI
+# define ENABLE_SPARC_ADI(a, b)		(-EINVAL)
+#endif
+#ifndef DISABLE_SPARC_ADI
+# define DISABLE_SPARC_ADI(a, b)	(-EINVAL)
+#endif
+#ifndef GET_SPARC_ADI_STATUS
+# define GET_SPARC_ADI_STATUS(a)	(-EINVAL)
+#endif
 
 /*
  * this is where the system-wide overflow UID and GID are defined, for
@@ -2266,6 +2281,21 @@  SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3,
 	case PR_GET_FP_MODE:
 		error = GET_FP_MODE(me);
 		break;
+	case PR_GET_SPARC_ADICAPS:
+		error = GET_SPARC_ADICAPS(arg2);
+		break;
+	case PR_SET_SPARC_ADI:
+		error = SET_SPARC_MCDE(arg2);
+		break;
+	case PR_ENABLE_SPARC_ADI:
+		error = ENABLE_SPARC_ADI(arg2, arg3);
+		break;
+	case PR_DISABLE_SPARC_ADI:
+		error = DISABLE_SPARC_ADI(arg2, arg3);
+		break;
+	case PR_GET_SPARC_ADI_STATUS:
+		error = GET_SPARC_ADI_STATUS(arg2);
+		break;
 	default:
 		error = -EINVAL;
 		break;