* [PATCH] x86: Implement Linear Address Masking support
@ 2022-04-07  1:01 Kirill A. Shutemov
  2022-04-07  3:34 ` Richard Henderson
  0 siblings, 1 reply; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-04-07  1:01 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kirill A. Shutemov

The Linear Address Masking feature makes the CPU ignore some bits of the
virtual address. These bits can be used to encode metadata.

The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].

CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
user pointers.

CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
of user pointers.

CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.

QEMU strips the metadata bits from the address and brings it to canonical
form before handling the memory access. This has to be done very early,
before the TLB lookup.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 accel/tcg/cputlb.c                   | 20 +++++++++++++++++---
 include/hw/core/tcg-cpu-ops.h        |  5 +++++
 target/i386/cpu.c                    |  4 ++--
 target/i386/cpu.h                    | 26 +++++++++++++++++++++++++-
 target/i386/helper.c                 |  2 +-
 target/i386/tcg/helper-tcg.h         |  1 +
 target/i386/tcg/sysemu/excp_helper.c | 28 +++++++++++++++++++++++++++-
 target/i386/tcg/sysemu/misc_helper.c |  3 +--
 target/i386/tcg/sysemu/svm_helper.c  |  3 +--
 target/i386/tcg/tcg-cpu.c            |  1 +
 10 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 2035b2ac0ac0..15eff0df39c1 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1295,6 +1295,17 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
     return ram_addr;
 }
 
+static vaddr clean_addr(CPUArchState *env, vaddr addr)
+{
+    CPUClass *cc = CPU_GET_CLASS(env_cpu(env));
+
+    if (cc->tcg_ops->do_clean_addr) {
+        addr = cc->tcg_ops->do_clean_addr(env_cpu(env), addr);
+    }
+
+    return addr;
+}
+
 /*
  * Note: tlb_fill() can trigger a resize of the TLB. This means that all of the
  * caller's prior references to the TLB table (e.g. CPUTLBEntry pointers) must
@@ -1757,10 +1768,11 @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
  *
  * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
  */
-static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
+static void *atomic_mmu_lookup(CPUArchState *env, target_ulong address,
                                MemOpIdx oi, int size, int prot,
                                uintptr_t retaddr)
 {
+    target_ulong addr = clean_addr(env, address);
     size_t mmu_idx = get_mmuidx(oi);
     MemOp mop = get_memop(oi);
     int a_bits = get_alignment_bits(mop);
@@ -1904,10 +1916,11 @@ load_memop(const void *haddr, MemOp op)
 }
 
 static inline uint64_t QEMU_ALWAYS_INLINE
-load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
+load_helper(CPUArchState *env, target_ulong address, MemOpIdx oi,
             uintptr_t retaddr, MemOp op, bool code_read,
             FullLoadHelper *full_load)
 {
+    target_ulong addr = clean_addr(env, address);
     uintptr_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
     CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
@@ -2307,9 +2320,10 @@ store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
 }
 
 static inline void QEMU_ALWAYS_INLINE
-store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+store_helper(CPUArchState *env, target_ulong address, uint64_t val,
              MemOpIdx oi, uintptr_t retaddr, MemOp op)
 {
+    target_ulong addr = clean_addr(env, address);
     uintptr_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
     CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index e13898553aff..8e81f45510bf 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -82,6 +82,11 @@ struct TCGCPUOps {
                                 MMUAccessType access_type,
                                 int mmu_idx, uintptr_t retaddr) QEMU_NORETURN;
 
+    /**
+     * @do_clean_addr: Callback for clearing metadata/tags from the address.
+     */
+    vaddr (*do_clean_addr)(CPUState *cpu, vaddr addr);
+
     /**
      * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
      */
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index cb6b5467d067..6e3e8473bf04 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -662,7 +662,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
           /* CPUID_7_0_ECX_OSPKE is dynamic */ \
           CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS)
 #define TCG_7_0_EDX_FEATURES 0
-#define TCG_7_1_EAX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES CPUID_7_1_EAX_LAM
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -876,7 +876,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
-            NULL, NULL, NULL, NULL,
+            NULL, NULL, "lam", NULL,
             NULL, NULL, NULL, NULL,
         },
         .cpuid = {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 982c5323537c..5d6cc8efb7da 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -232,6 +232,9 @@ typedef enum X86Seg {
 #define CR0_CD_MASK  (1U << 30)
 #define CR0_PG_MASK  (1U << 31)
 
+#define CR3_LAM_U57  (1ULL << 61)
+#define CR3_LAM_U48  (1ULL << 62)
+
 #define CR4_VME_MASK  (1U << 0)
 #define CR4_PVI_MASK  (1U << 1)
 #define CR4_TSD_MASK  (1U << 2)
@@ -255,6 +258,7 @@ typedef enum X86Seg {
 #define CR4_SMAP_MASK   (1U << 21)
 #define CR4_PKE_MASK   (1U << 22)
 #define CR4_PKS_MASK   (1U << 24)
+#define CR4_LAM_SUP    (1U << 28)
 
 #define CR4_RESERVED_MASK \
 (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \
@@ -263,7 +267,8 @@ typedef enum X86Seg {
                 | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \
                 | CR4_LA57_MASK \
                 | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \
-                | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK))
+                | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \
+                | CR4_LAM_SUP))
 
 #define DR6_BD          (1 << 13)
 #define DR6_BS          (1 << 14)
@@ -877,6 +882,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
 #define CPUID_7_1_EAX_AVX_VNNI          (1U << 4)
 /* AVX512 BFloat16 Instruction */
 #define CPUID_7_1_EAX_AVX512_BF16       (1U << 5)
+/* Linear Address Masking */
+#define CPUID_7_1_EAX_LAM               (1U << 26)
 /* XFD Extend Feature Disabled */
 #define CPUID_D_1_EAX_XFD               (1U << 4)
 
@@ -2287,6 +2294,23 @@ static inline bool hyperv_feat_enabled(X86CPU *cpu, int feat)
     return !!(cpu->hyperv_features & BIT(feat));
 }
 
+static inline uint64_t cr3_reserved_bits(CPUX86State *env)
+{
+    uint64_t reserved_bits;
+
+    if (!(env->efer & MSR_EFER_LMA)) {
+        return 0;
+    }
+
+    reserved_bits = (~0ULL) << env_archcpu(env)->phys_bits;
+
+    if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM) {
+        reserved_bits &= ~(CR3_LAM_U48 | CR3_LAM_U57);
+    }
+
+    return reserved_bits;
+}
+
 static inline uint64_t cr4_reserved_bits(CPUX86State *env)
 {
     uint64_t reserved_bits = CR4_RESERVED_MASK;
diff --git a/target/i386/helper.c b/target/i386/helper.c
index fa409e9c44a8..f91ebab840d6 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -247,7 +247,7 @@ hwaddr x86_cpu_get_phys_page_attrs_debug(CPUState *cs, vaddr addr,
             }
 
             if (la57) {
-                pml5e_addr = ((env->cr[3] & ~0xfff) +
+                pml5e_addr = ((env->cr[3] & PG_ADDRESS_MASK) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
                 if (!(pml5e & PG_PRESENT_MASK)) {
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index 0a4401e917f9..03ab858598d2 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -51,6 +51,7 @@ void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
 bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
                       MMUAccessType access_type, int mmu_idx,
                       bool probe, uintptr_t retaddr);
+vaddr x86_cpu_clean_addr(CPUState *cpu, vaddr addr);
 #endif
 
 void breakpoint_handler(CPUState *cs);
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index e1b6d8868338..caaab413381b 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -64,7 +64,7 @@ static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_f
             uint64_t pml4e_addr, pml4e;
 
             if (la57) {
-                pml5e_addr = ((cr3 & ~0xfff) +
+                pml5e_addr = ((cr3 & PG_ADDRESS_MASK) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
                 pml5e_addr = GET_HPHYS(cs, pml5e_addr, MMU_DATA_STORE, NULL);
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
@@ -437,3 +437,29 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
     }
     return true;
 }
+
+static inline int64_t sign_extend64(uint64_t value, int index)
+{
+    int shift = 63 - index;
+    return (int64_t)(value << shift) >> shift;
+}
+
+vaddr x86_cpu_clean_addr(CPUState *cs, vaddr addr)
+{
+    CPUX86State *env = &X86_CPU(cs)->env;
+    bool la57 = env->cr[4] & CR4_LA57_MASK;
+
+    if (addr >> 63) {
+        if (env->cr[4] & CR4_LAM_SUP) {
+            return sign_extend64(addr, la57 ? 56 : 47);
+        }
+    } else {
+        if (env->cr[3] & CR3_LAM_U57) {
+            return sign_extend64(addr, 56);
+        } else if (env->cr[3] & CR3_LAM_U48) {
+            return sign_extend64(addr, 47);
+        }
+    }
+
+    return addr;
+}
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index 3715c1e2625b..faeb4a16383c 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -97,8 +97,7 @@ void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
         cpu_x86_update_cr0(env, t0);
         break;
     case 3:
-        if ((env->efer & MSR_EFER_LMA) &&
-                (t0 & ((~0ULL) << env_archcpu(env)->phys_bits))) {
+        if (t0 & cr3_reserved_bits(env)) {
             cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
         }
         if (!(env->efer & MSR_EFER_LMA)) {
diff --git a/target/i386/tcg/sysemu/svm_helper.c b/target/i386/tcg/sysemu/svm_helper.c
index 2b6f450af959..cbd99f240bb8 100644
--- a/target/i386/tcg/sysemu/svm_helper.c
+++ b/target/i386/tcg/sysemu/svm_helper.c
@@ -287,8 +287,7 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
         cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
     }
     new_cr3 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr3));
-    if ((env->efer & MSR_EFER_LMA) &&
-            (new_cr3 & ((~0ULL) << cpu->phys_bits))) {
+    if (new_cr3 & cr3_reserved_bits(env)) {
         cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
     }
     new_cr4 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr4));
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index 6fdfdf959899..754454d19041 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -77,6 +77,7 @@ static const struct TCGCPUOps x86_tcg_ops = {
     .record_sigsegv = x86_cpu_record_sigsegv,
 #else
     .tlb_fill = x86_cpu_tlb_fill,
+    .do_clean_addr = x86_cpu_clean_addr,
     .do_interrupt = x86_cpu_do_interrupt,
     .cpu_exec_interrupt = x86_cpu_exec_interrupt,
     .debug_excp_handler = breakpoint_handler,
-- 
2.35.1




* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07  1:01 [PATCH] x86: Implement Linear Address Masking support Kirill A. Shutemov
@ 2022-04-07  3:34 ` Richard Henderson
  2022-04-07 13:18   ` Kirill A. Shutemov
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Henderson @ 2022-04-07  3:34 UTC (permalink / raw)
  To: Kirill A. Shutemov, qemu-devel

On 4/6/22 20:01, Kirill A. Shutemov wrote:
> The Linear Address Masking feature makes the CPU ignore some bits of the
> virtual address. These bits can be used to encode metadata.
> 
> The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].
> 
> CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
> user pointers.
> 
> CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
> of user pointers.
> 
> CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
> If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
> For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.
> 
> QEMU strips the metadata bits from the address and brings it to canonical
> form before handling the memory access. This has to be done very early,
> before the TLB lookup.

The new hook is incorrect, in that it doesn't apply to addresses along the tlb fast path.

But it isn't really needed.  You can do all of the work in the existing tlb_fill hook. 
AArch64 has a similar feature, and that works fine.
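
Roughly, reusing your x86_cpu_clean_addr() inside the existing hook (sketch
only -- x86_mmu_translate_canonical is a made-up name standing in for the
current page-walk path, not existing code):

bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
                      MMUAccessType access_type, int mmu_idx,
                      bool probe, uintptr_t retaddr)
{
    hwaddr paddr;
    int prot;

    /* Translate and fault-check the canonical form of the address... */
    if (!x86_mmu_translate_canonical(cs, x86_cpu_clean_addr(cs, addr),
                                     access_type, mmu_idx, probe,
                                     retaddr, &paddr, &prot)) {
        return false;
    }

    /* ...but key the TLB entry on the original (tagged) address, so later
     * accesses with the same metadata hit the fast path, as on AArch64. */
    tlb_set_page(cs, addr & TARGET_PAGE_MASK, paddr, prot,
                 mmu_idx, TARGET_PAGE_SIZE);
    return true;
}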


r~



* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07  3:34 ` Richard Henderson
@ 2022-04-07 13:18   ` Kirill A. Shutemov
  2022-04-07 14:28     ` Richard Henderson
  0 siblings, 1 reply; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-04-07 13:18 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Wed, Apr 06, 2022 at 10:34:41PM -0500, Richard Henderson wrote:
> On 4/6/22 20:01, Kirill A. Shutemov wrote:
> > The Linear Address Masking feature makes the CPU ignore some bits of the
> > virtual address. These bits can be used to encode metadata.
> > 
> > The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].
> > 
> > CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
> > user pointers.
> > 
> > CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
> > of user pointers.
> > 
> > CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
> > If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
> > For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.
> > 
> > QEMU strips the metadata bits from the address and brings it to canonical
> > form before handling the memory access. This has to be done very early,
> > before the TLB lookup.
> 
> The new hook is incorrect, in that it doesn't apply to addresses along
> the tlb fast path.

I'm not sure what you mean by that. tlb_hit() mechanics works. We strip
the tag bits before tlb lookup.

Could you elaborate?

> But it isn't really needed.  You can do all of the work in the existing
> tlb_fill hook. AArch64 has a similar feature, and that works fine.

To be honest I don't fully understand how TBI emulation works.

Consider store_helper(). I failed to find where tag bits get stripped
before getting there for !CONFIG_USER_ONLY. clean_data_tbi() only covers
user-only case.

And if we get there with tags, I don't see how we will ever get to fast
path: tlb_hit() should never return true there if any bit in top byte is
set as cached tlb_addr has them stripped.

tlb_fill() will get it handled correctly, but it is wasteful to go through
pagewalk on every tagged pointer dereference.

Hm?

-- 
 Kirill A. Shutemov



* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07 13:18   ` Kirill A. Shutemov
@ 2022-04-07 14:28     ` Richard Henderson
  2022-04-07 15:27       ` Kirill A. Shutemov
  0 siblings, 1 reply; 15+ messages in thread
From: Richard Henderson @ 2022-04-07 14:28 UTC (permalink / raw)
  To: Kirill A. Shutemov; +Cc: qemu-devel

On 4/7/22 06:18, Kirill A. Shutemov wrote:
>> The new hook is incorrect, in that it doesn't apply to addresses along
>> the tlb fast path.
> 
> I'm not sure what you mean by that. tlb_hit() mechanics works. We strip
> the tag bits before tlb lookup.
> 
> Could you elaborate?

The fast path does not clear the bits, so you enter the slow path before you get to 
clearing the bits.  You've lost most of the advantage of the tlb already.

> To be honest I don't fully understand how TBI emulation works.

In get_phys_addr_lpae:

         addrsize = 64 - 8 * param.tbi;
...
         target_ulong top_bits = sextract64(address, inputsize,
                                            addrsize - inputsize);
         if (-top_bits != param.select) {
             /* The gap between the two regions is a Translation fault */
             fault_type = ARMFault_Translation;
             goto do_fault;
         }

which does not include TBI bits in the validation of the sign-extended address.

> Consider store_helper(). I failed to find where tag bits get stripped
> before getting there for !CONFIG_USER_ONLY. clean_data_tbi() only covers
> user-only case.
> 
> And if we get there with tags, I don't see how we will ever get to fast
> path: tlb_hit() should never return true there if any bit in top byte is
> set as cached tlb_addr has them stripped.
> 
> tlb_fill() will get it handled correctly, but it is wasteful to go through
> pagewalk on every tagged pointer dereference.

We won't do a pagewalk for every tagged pointer dereference.  It'll be pointer 
dereferences with differing tags past the limit of the victim cache (CPU_VTLB_SIZE).  And 
one tag will get to use the fast path, e.g. on the store following a load.

I've just now had a browse through the Intel docs, and I see that you're not performing 
the required modified canonicality check.  While a proper tagged address will have the tag 
removed in CR2 during a page fault, an improper tagged address (with bit 63 != {47,56}) 
should have the original address reported to CR2.

I could imagine a hook that could aid the victim cache in ignoring the tag, so that we
need to go through tlb_fill fewer times.  But I wouldn't want to include that in the base
version of this feature, and I'd want to take more than a moment in the design so that it
could be used by ARM and RISC-V as well.


r~



* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07 14:28     ` Richard Henderson
@ 2022-04-07 15:27       ` Kirill A. Shutemov
  2022-04-07 16:38         ` Paolo Bonzini
  2022-04-08 14:39         ` Richard Henderson
  0 siblings, 2 replies; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-04-07 15:27 UTC (permalink / raw)
  To: Richard Henderson; +Cc: qemu-devel

On Thu, Apr 07, 2022 at 07:28:54AM -0700, Richard Henderson wrote:
> On 4/7/22 06:18, Kirill A. Shutemov wrote:
> > > The new hook is incorrect, in that it doesn't apply to addresses along
> > > the tlb fast path.
> > 
> > I'm not sure what you mean by that. tlb_hit() mechanics works. We strip
> > the tag bits before tlb lookup.
> > 
> > Could you elaborate?
> 
> The fast path does not clear the bits, so you enter the slow path before you
> get to clearing the bits.  You've lost most of the advantage of the tlb
> already.

Sorry for my ignorance, but what do you mean by fast path here?

My understanding is that it is the case when tlb_hit() is true and you
don't need to get into tlb_fill(). Are we talking about the same scheme?

For store_helper() I clear the bits before doing the TLB lookup and fill. So the
TLB will always deal with clean addresses.

Hm?

> > To be honest I don't fully understand how TBI emulation works.
> 
> In get_phys_addr_lpae:
> 
>         addrsize = 64 - 8 * param.tbi;
> ...
>         target_ulong top_bits = sextract64(address, inputsize,
>                                            addrsize - inputsize);
>         if (-top_bits != param.select) {
>             /* The gap between the two regions is a Translation fault */
>             fault_type = ARMFault_Translation;
>             goto do_fault;
>         }
> 
> which does not include TBI bits in the validation of the sign-extended address.
> 
> > Consider store_helper(). I failed to find where tag bits get stripped
> > before getting there for !CONFIG_USER_ONLY. clean_data_tbi() only covers
> > user-only case.
> > 
> > And if we get there with tags, I don't see how we will ever get to fast
> > path: tlb_hit() should never return true there if any bit in top byte is
> > set as cached tlb_addr has them stripped.
> > 
> > tlb_fill() will get it handled correctly, but it is wasteful to go through
> > pagewalk on every tagged pointer dereference.
> 
> We won't do a pagewalk for every tagged pointer dereference.  It'll be
> pointer dereferences with differing tags past the limit of the victim cache
> (CPU_VTLB_SIZE).  And one tag will get to use the fast path, e.g. on the
> store following a load.
> 
> I've just now had a browse through the Intel docs, and I see that you're not
> performing the required modified canonicality check.

Modified is effectively done by clearing (and sign-extending) the address
before the check.

> While a proper tagged address will have the tag removed in CR2 during a
> page fault, an improper tagged address (with bit 63 != {47,56}) should
> have the original address reported to CR2.

Hm. I don't see it in the spec. It rather points in the other direction:

	Page faults report the faulting linear address in CR2. Because LAM
	masking (by sign-extension) applies before paging, the faulting
	linear address recorded in CR2 does not contain the masked
	metadata.

Yes, it talks about CR2 in case of page fault, not #GP due to canonicality
checking, but still.

> I could imagine a hook that could aid the victim cache in ignoring the tag,
> so that we need to go through tlb_fill fewer times.  But I wouldn't want to
> include that in the base version of this feature, and I'd want to take more
> than a moment in the design so that it could be used by ARM and RISC-V as
> well.

But what other options do you see? Clearing the bits before TLB lookup
matches the architectural spec and makes INVLPG match the described behaviour
without special handling.

-- 
 Kirill A. Shutemov



* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07 15:27       ` Kirill A. Shutemov
@ 2022-04-07 16:38         ` Paolo Bonzini
  2022-04-07 17:44           ` Kirill A. Shutemov
  2022-04-08 14:39         ` Richard Henderson
  1 sibling, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2022-04-07 16:38 UTC (permalink / raw)
  To: Kirill A. Shutemov, Richard Henderson; +Cc: qemu-devel

On 4/7/22 17:27, Kirill A. Shutemov wrote:
> On Thu, Apr 07, 2022 at 07:28:54AM -0700, Richard Henderson wrote:
>> On 4/7/22 06:18, Kirill A. Shutemov wrote:
>>>> The new hook is incorrect, in that it doesn't apply to addresses along
>>>> the tlb fast path.
>>>
>>> I'm not sure what you mean by that. tlb_hit() mechanics works. We strip
>>> the tag bits before tlb lookup.
>>>
>>> Could you elaborate?
>>
>> The fast path does not clear the bits, so you enter the slow path before you
>> get to clearing the bits.  You've lost most of the advantage of the tlb
>> already.
> 
> Sorry for my ignorance, but what do you mean by fast path here?

The fast path is the TLB lookup code that is generated by the JIT 
compiler.  If the TLB hits, the memory access doesn't go through any C 
code.  I think tagged addresses always fail the fast path in your patch.

>> While a proper tagged address will have the tag removed in CR2 during a
>> page fault, an improper tagged address (with bit 63 != {47,56}) should
>> have the original address reported to CR2.
> 
> Hm. I don't see it in the spec. It rather points in the other direction:
> 
> 	Page faults report the faulting linear address in CR2. Because LAM
> 	masking (by sign-extension) applies before paging, the faulting
> 	linear address recorded in CR2 does not contain the masked
> 	metadata.
> 
> Yes, it talks about CR2 in case of page fault, not #GP due to canonicality
> checking, but still.
> 
>> I could imagine a hook that could aid the victim cache in ignoring the tag,
>> so that we need to go through tlb_fill fewer times.  But I wouldn't want to
>> include that in the base version of this feature, and I'd want to take more
>> than a moment in the design so that it could be used by ARM and RISC-V as
>> well.
> 
> But what other options do you see? Clearing the bits before TLB lookup
> matches the architectural spec and makes INVLPG match the described behaviour
> without special handling.

Ah, INVLPG handling is messy indeed.

Paolo




* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07 16:38         ` Paolo Bonzini
@ 2022-04-07 17:44           ` Kirill A. Shutemov
  0 siblings, 0 replies; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-04-07 17:44 UTC (permalink / raw)
  To: Paolo Bonzini; +Cc: Richard Henderson, qemu-devel

On Thu, Apr 07, 2022 at 06:38:40PM +0200, Paolo Bonzini wrote:
> On 4/7/22 17:27, Kirill A. Shutemov wrote:
> > On Thu, Apr 07, 2022 at 07:28:54AM -0700, Richard Henderson wrote:
> > > On 4/7/22 06:18, Kirill A. Shutemov wrote:
> > > > > The new hook is incorrect, in that it doesn't apply to addresses along
> > > > > the tlb fast path.
> > > > 
> > > > I'm not sure what you mean by that. tlb_hit() mechanics works. We strip
> > > > the tag bits before tlb lookup.
> > > > 
> > > > Could you elaborate?
> > > 
> > > The fast path does not clear the bits, so you enter the slow path before you
> > > get to clearing the bits.  You've lost most of the advantage of the tlb
> > > already.
> > 
> > Sorry for my ignorance, but what do you mean by fast path here?
> 
> The fast path is the TLB lookup code that is generated by the JIT compiler.
> If the TLB hits, the memory access doesn't go through any C code.  I think
> tagged addresses always fail the fast path in your patch.

Ah. Got it.

Could you point me to the key code area relevant to the topic? I'm not
familiar with the JIT side of QEMU.

-- 
 Kirill A. Shutemov



* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-04-07 15:27       ` Kirill A. Shutemov
  2022-04-07 16:38         ` Paolo Bonzini
@ 2022-04-08 14:39         ` Richard Henderson
  1 sibling, 0 replies; 15+ messages in thread
From: Richard Henderson @ 2022-04-08 14:39 UTC (permalink / raw)
  To: Kirill A. Shutemov; +Cc: qemu-devel

On 4/7/22 08:27, Kirill A. Shutemov wrote:
>> The fast path does not clear the bits, so you enter the slow path before you
>> get to clearing the bits.  You've lost most of the advantage of the tlb
>> already.
> 
> Sorry for my ignorance, but what do you mean by fast path here?
> 
> My understanding is that it is the case when tlb_hit() is true and you
> don't need to get into tlb_fill(). Are we talking about the same scheme?

We are not.  Paolo already mentioned the JIT.  One example is tcg_out_tlb_load in
tcg/i386/tcg-target.c.inc.  Obviously, there's an implementation of that for each host 
architecture in the other tcg/arch/ subdirectories.

>> I've just now had a browse through the Intel docs, and I see that you're not
>> performing the required modified canonicality check.
> 
> Modified is effectively done by clearing (and sign-extending) the address
> before the check.
> 
>> While a proper tagged address will have the tag removed in CR2 during a
>> page fault, an improper tagged address (with bit 63 != {47,56}) should
>> have the original address reported to CR2.
> 
> Hm. I don't see it in the spec. It rather points in the other direction:
> 
> 	Page faults report the faulting linear address in CR2. Because LAM
> 	masking (by sign-extension) applies before paging, the faulting
> 	linear address recorded in CR2 does not contain the masked
> 	metadata.
> 

# Regardless of the paging mode, the processor performs a modified
# canonicality check that enforces that bit 47 of the pointer matches
# bit 63. As illustrated in Figure 14-1, bits 62:48 are not checked
# and are thus available for software metadata. After this modified
# canonicality check is performed, bits 62:48 are masked by
# sign-extending the value of bit 47

Note especially that the sign-extension happens after the canonicality check.
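
In C terms, something like this for the LAM48 case (sketch only; it uses the
same implementation-defined arithmetic shift as the patch's sign_extend64()):

/* Modified canonicality check first, masking second -- in that order. */
static bool lam48_check_and_mask(uint64_t addr, uint64_t *masked)
{
    /* Bit 47 must match bit 63; bits 62:48 are not checked. */
    if (((addr >> 47) & 1) != (addr >> 63)) {
        return false;                  /* #GP, with the original address */
    }
    /* Only now are bits 62:48 replaced by sign-extending bit 47. */
    *masked = (uint64_t)((int64_t)(addr << 16) >> 16);
    return true;
}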

> But what other options do you see? Clearing the bits before TLB lookup
> matches the architectural spec and makes INVLPG match the described behaviour
> without special handling.

We have special handling for INVLPG: tlb_flush_page_bits_by_mmuidx.  That's how we handle 
TBI for ARM.  You'd supply 48 or 57 here.
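
For x86 that would mean something along these lines in the INVLPG helper
(sketch; written from memory, and the all-mmu-index mask is illustrative):

void helper_invlpg(CPUX86State *env, target_ulong addr)
{
    /* Only the low 57 (LA57) or 48 address bits are significant, so match
     * TLB entries while ignoring any LAM metadata bits in addr. */
    unsigned bits = (env->cr[4] & CR4_LA57_MASK) ? 57 : 48;

    tlb_flush_page_bits_by_mmuidx(env_cpu(env), addr,
                                  (1 << NB_MMU_MODES) - 1, bits);
}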


r~




* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-05-12 13:01   ` David Laight
  2022-05-12 14:07     ` Matthew Wilcox
  2022-05-12 14:35     ` Peter Zijlstra
@ 2022-05-12 17:00     ` Kirill A. Shutemov
  2 siblings, 0 replies; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-05-12 17:00 UTC (permalink / raw)
  To: David Laight
  Cc: Dave Hansen, Andy Lutomirski, Peter Zijlstra, x86,
	Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, H . J . Lu,
	Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel

On Thu, May 12, 2022 at 01:01:07PM +0000, David Laight wrote:
> From: Kirill A. Shutemov
> > Sent: 11 May 2022 03:28
> > 
> > The Linear Address Masking feature makes the CPU ignore some bits of the
> > virtual address. These bits can be used to encode metadata.
> > 
> > The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].
> > 
> > CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
> > user pointers.
> > 
> > CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
> > of user pointers.
> > 
> > CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
> > If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
> > For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.
> > 
> ...
> > +static vaddr clean_addr(CPUArchState *env, vaddr addr)
> > +{
> > +    CPUClass *cc = CPU_GET_CLASS(env_cpu(env));
> > +
> > +    if (cc->tcg_ops->do_clean_addr) {
> > +        addr = cc->tcg_ops->do_clean_addr(env_cpu(env), addr);
> 
> The performance of a conditional indirect call will be horrid.
> Over-engineered when there is only one possible function.

It is a QEMU patch. As I mentioned in the cover letter, it was rejected by
upstream, but it is functional and can be used for testing before the HW
arrives.

I may give it another try when I get time to look deeper at TCG.

-- 
 Kirill A. Shutemov


* RE: [PATCH] x86: Implement Linear Address Masking support
  2022-05-12 15:06       ` Thomas Gleixner
@ 2022-05-12 15:33         ` David Laight
  0 siblings, 0 replies; 15+ messages in thread
From: David Laight @ 2022-05-12 15:33 UTC (permalink / raw)
  To: 'Thomas Gleixner', Matthew Wilcox
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Andy Lutomirski, Peter Zijlstra, x86,
	Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, H . J . Lu,
	Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel

From: Thomas Gleixner
> Sent: 12 May 2022 16:07
> 
> On Thu, May 12 2022 at 15:07, Matthew Wilcox wrote:
> > On Thu, May 12, 2022 at 01:01:07PM +0000, David Laight wrote:
> >> > +static inline int64_t sign_extend64(uint64_t value, int index)
> >> > +{
> >> > +    int shift = 63 - index;
> >> > +    return (int64_t)(value << shift) >> shift;
> >> > +}
> >>
> >> Shift of signed integers are UB.
> >
> > Citation needed.
> 
> I'll bite :)
> 
> C11/19: 6.5.7 Bitwise shift operators
> 
>   4 The result of E1 << E2 is E1 left-shifted E2 bit positions; vacated
>     bits are filled with zeros. If E1 has an unsigned type, the value of
>     the result is E1 × 2^E2, reduced modulo one more than the maximum
>     value representable in the result type. If E1 has a signed type and
>     nonnegative value, and E1 × 2^E2 is representable in the result type,
>     then that is the resulting value; otherwise, the behavior is
>     undefined.
> 
> This is irrelevant for the case above because the left shift is on an
> unsigned integer. The interesting part is this:
> 
>   5 The result of E1 >> E2 is E1 right-shifted E2 bit positions. If E1
>     has an unsigned type or if E1 has a signed type and a nonnegative
>     value, the value of the result is the integral part of the quotient
>     of E1/2^E2.  If E1 has a signed type and a negative value, the
>     resulting value is implementation-defined.
> 
> So it's not UB, it's implementation-defined. The obvious choice is to
> propagate the sign bit (MSB), i.e. arithmetic shift, which is what both GCC
> and clang do.

I'm sure someone recently said one of the standards had made them UB.

In any case, given the caller seems to know whether the top bit is set
(and does a different call) using |= or &= is distinctly better.
Especially since the required constant can be computed in a slow path.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)


* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-05-12 14:07     ` Matthew Wilcox
@ 2022-05-12 15:06       ` Thomas Gleixner
  2022-05-12 15:33         ` David Laight
  0 siblings, 1 reply; 15+ messages in thread
From: Thomas Gleixner @ 2022-05-12 15:06 UTC (permalink / raw)
  To: Matthew Wilcox, David Laight
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Andy Lutomirski, Peter Zijlstra, x86,
	Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, H . J . Lu,
	Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel

On Thu, May 12 2022 at 15:07, Matthew Wilcox wrote:
> On Thu, May 12, 2022 at 01:01:07PM +0000, David Laight wrote:
>> > +static inline int64_t sign_extend64(uint64_t value, int index)
>> > +{
>> > +    int shift = 63 - index;
>> > +    return (int64_t)(value << shift) >> shift;
>> > +}
>> 
>> Shift of signed integers are UB.
>
> Citation needed.

I'll bite :)

C11/19: 6.5.7 Bitwise shift operators

  4 The result of E1 << E2 is E1 left-shifted E2 bit positions; vacated
    bits are filled with zeros. If E1 has an unsigned type, the value of
    the result is E1 × 2^E2, reduced modulo one more than the maximum
    value representable in the result type. If E1 has a signed type and
    nonnegative value, and E1 × 2^E2 is representable in the result type,
    then that is the resulting value; otherwise, the behavior is
    undefined.

This is irrelevant for the case above because the left shift is on an
unsigned integer. The interesting part is this:

  5 The result of E1 >> E2 is E1 right-shifted E2 bit positions. If E1
    has an unsigned type or if E1 has a signed type and a nonnegative
    value, the value of the result is the integral part of the quotient
    of E1/2^E2.  If E1 has a signed type and a negative value, the
    resulting value is implementation-defined.

So it's not UB, it's implementation-defined. The obvious choice is to
propagate the sign bit (MSB), i.e. arithmetic shift, which is what both GCC
and clang do.
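
For illustration, the patch's sign_extend64() relies on exactly that
arithmetic-shift behaviour (standalone example, assuming gcc/clang):

#include <assert.h>
#include <stdint.h>

static inline int64_t sign_extend64(uint64_t value, int index)
{
    int shift = 63 - index;
    /* Implementation-defined: gcc and clang shift arithmetically. */
    return (int64_t)(value << shift) >> shift;
}

int main(void)
{
    /* User pointer with LAM_U57 metadata in bits 62:57. */
    assert(sign_extend64(0x3e00123456789abcULL, 56) == 0x0000123456789abcLL);
    /* Supervisor pointer (LA57): bits 62:57 refilled from bit 56. */
    assert(sign_extend64(0xbf00123456789abcULL, 56) ==
           (int64_t)0xff00123456789abcULL);
    return 0;
}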

Thanks,

        tglx





* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-05-12 13:01   ` David Laight
  2022-05-12 14:07     ` Matthew Wilcox
@ 2022-05-12 14:35     ` Peter Zijlstra
  2022-05-12 17:00     ` Kirill A. Shutemov
  2 siblings, 0 replies; 15+ messages in thread
From: Peter Zijlstra @ 2022-05-12 14:35 UTC (permalink / raw)
  To: David Laight
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Andy Lutomirski, x86, Andrey Ryabinin,
	Alexander Potapenko, Dmitry Vyukov, H . J . Lu, Andi Kleen,
	Rick Edgecombe, linux-mm, linux-kernel

On Thu, May 12, 2022 at 01:01:07PM +0000, David Laight wrote:

> > +static inline int64_t sign_extend64(uint64_t value, int index)
> > +{
> > +    int shift = 63 - index;
> > +    return (int64_t)(value << shift) >> shift;
> > +}
> 
> Shift of signed integers are UB.

The kernel uses -fno-strict-overflow, all the signed UB is gone.


* Re: [PATCH] x86: Implement Linear Address Masking support
  2022-05-12 13:01   ` David Laight
@ 2022-05-12 14:07     ` Matthew Wilcox
  2022-05-12 15:06       ` Thomas Gleixner
  2022-05-12 14:35     ` Peter Zijlstra
  2022-05-12 17:00     ` Kirill A. Shutemov
  2 siblings, 1 reply; 15+ messages in thread
From: Matthew Wilcox @ 2022-05-12 14:07 UTC (permalink / raw)
  To: David Laight
  Cc: 'Kirill A. Shutemov',
	Dave Hansen, Andy Lutomirski, Peter Zijlstra, x86,
	Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov, H . J . Lu,
	Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel

On Thu, May 12, 2022 at 01:01:07PM +0000, David Laight wrote:
> > +static inline int64_t sign_extend64(uint64_t value, int index)
> > +{
> > +    int shift = 63 - index;
> > +    return (int64_t)(value << shift) >> shift;
> > +}
> 
> Shift of signed integers are UB.

Citation needed.


* RE: [PATCH] x86: Implement Linear Address Masking support
  2022-05-11  2:27 ` [PATCH] x86: Implement Linear Address Masking support Kirill A. Shutemov
@ 2022-05-12 13:01   ` David Laight
  2022-05-12 14:07     ` Matthew Wilcox
                       ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: David Laight @ 2022-05-12 13:01 UTC (permalink / raw)
  To: 'Kirill A. Shutemov',
	Dave Hansen, Andy Lutomirski, Peter Zijlstra
  Cc: x86, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov,
	H . J . Lu, Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel

From: Kirill A. Shutemov
> Sent: 11 May 2022 03:28
> 
> The Linear Address Masking feature makes the CPU ignore some bits of the
> virtual address. These bits can be used to encode metadata.
> 
> The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].
> 
> CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
> user pointers.
> 
> CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
> of user pointers.
> 
> CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
> If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
> For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.
> 
...
> +static vaddr clean_addr(CPUArchState *env, vaddr addr)
> +{
> +    CPUClass *cc = CPU_GET_CLASS(env_cpu(env));
> +
> +    if (cc->tcg_ops->do_clean_addr) {
> +        addr = cc->tcg_ops->do_clean_addr(env_cpu(env), addr);

The performance of a conditional indirect call will be horrid.
Over-engineered when there is only one possible function.

....
> +
> +static inline int64_t sign_extend64(uint64_t value, int index)
> +{
> +    int shift = 63 - index;
> +    return (int64_t)(value << shift) >> shift;
> +}

Shift of signed integers are UB.

> +vaddr x86_cpu_clean_addr(CPUState *cs, vaddr addr)
> +{
> +    CPUX86State *env = &X86_CPU(cs)->env;
> +    bool la57 = env->cr[4] & CR4_LA57_MASK;
> +
> +    if (addr >> 63) {
> +        if (env->cr[4] & CR4_LAM_SUP) {
> +            return sign_extend64(addr, la57 ? 56 : 47);
> +        }
> +    } else {
> +        if (env->cr[3] & CR3_LAM_U57) {
> +            return sign_extend64(addr, 56);
> +        } else if (env->cr[3] & CR3_LAM_U48) {
> +            return sign_extend64(addr, 47);
> +        }
> +    }

That is completely horrid.
Surely it can be just:
	if (addr & (1ull << 63))
		return addr | env->address_mask;
	else
		return addr & ~env->address_mask;
Where 'address_mask' is 0x7ff....
although, since you really want a big gap between valid user and
valid kernel addresses, allowing masked kernel addresses adds
costs elsewhere.
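
Something like this, with the masks cached in hypothetical per-CPU fields
that are recomputed on CR3/CR4 writes (sketch, not tested):

/* Recompute whenever CR3 or CR4 changes. */
static void x86_update_lam_masks(CPUX86State *env)
{
    bool la57 = env->cr[4] & CR4_LA57_MASK;

    env->lam_user_mask = 0;                           /* hypothetical field */
    if (env->cr[3] & CR3_LAM_U57) {
        env->lam_user_mask = 0x7e00000000000000ULL;   /* bits 62:57 */
    } else if (env->cr[3] & CR3_LAM_U48) {
        env->lam_user_mask = 0x7fff000000000000ULL;   /* bits 62:48 */
    }

    env->lam_sup_mask = 0;                            /* hypothetical field */
    if (env->cr[4] & CR4_LAM_SUP) {
        env->lam_sup_mask = la57 ? 0x7e00000000000000ULL
                                 : 0x7fff000000000000ULL;
    }
}

static vaddr x86_clean_addr_fast(CPUX86State *env, vaddr addr)
{
    if (addr & (1ULL << 63)) {
        return addr | env->lam_sup_mask;    /* sign-extend the metadata */
    }
    return addr & ~env->lam_user_mask;      /* clear the metadata */
}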

I've no idea how often the address masking is required?
Hopefully almost never?

copy_to/from_user() (etc) need to be able to use user addresses
without having to mask them.

	David

-
Registered Address Lakeside, Bramley Road, Mount Farm, Milton Keynes, MK1 1PT, UK
Registration No: 1397386 (Wales)



* [PATCH] x86: Implement Linear Address Masking support
  2022-05-11  2:27 [RFCv2 00/10] Linear Address Masking enabling Kirill A. Shutemov
@ 2022-05-11  2:27 ` Kirill A. Shutemov
  2022-05-12 13:01   ` David Laight
  0 siblings, 1 reply; 15+ messages in thread
From: Kirill A. Shutemov @ 2022-05-11  2:27 UTC (permalink / raw)
  To: Dave Hansen, Andy Lutomirski, Peter Zijlstra
  Cc: x86, Andrey Ryabinin, Alexander Potapenko, Dmitry Vyukov,
	H . J . Lu, Andi Kleen, Rick Edgecombe, linux-mm, linux-kernel,
	Kirill A. Shutemov

The Linear Address Masking feature makes the CPU ignore some bits of the
virtual address. These bits can be used to encode metadata.

The feature is enumerated with CPUID.(EAX=07H, ECX=01H):EAX.LAM[bit 26].

CR3.LAM_U57[bit 61] allows encoding 6 bits of metadata in bits 62:57 of
user pointers.

CR3.LAM_U48[bit 62] allows encoding 15 bits of metadata in bits 62:48
of user pointers.

CR4.LAM_SUP[bit 28] allows encoding metadata in supervisor pointers.
If 5-level paging is in use, 6 bits of metadata can be encoded in bits 62:57.
For 4-level paging, 15 bits of metadata can be encoded in bits 62:48.

QEMU strips the metadata bits from the address and brings it to canonical
form before handling the memory access. This has to be done very early,
before the TLB lookup.

Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
---
 accel/tcg/cputlb.c                   | 20 +++++++++++++++++---
 include/hw/core/tcg-cpu-ops.h        |  5 +++++
 target/i386/cpu.c                    |  4 ++--
 target/i386/cpu.h                    | 26 +++++++++++++++++++++++++-
 target/i386/helper.c                 |  2 +-
 target/i386/tcg/helper-tcg.h         |  1 +
 target/i386/tcg/sysemu/excp_helper.c | 28 +++++++++++++++++++++++++++-
 target/i386/tcg/sysemu/misc_helper.c |  3 +--
 target/i386/tcg/sysemu/svm_helper.c  |  3 +--
 target/i386/tcg/tcg-cpu.c            |  1 +
 10 files changed, 81 insertions(+), 12 deletions(-)

diff --git a/accel/tcg/cputlb.c b/accel/tcg/cputlb.c
index 2035b2ac0ac0..15eff0df39c1 100644
--- a/accel/tcg/cputlb.c
+++ b/accel/tcg/cputlb.c
@@ -1295,6 +1295,17 @@ static inline ram_addr_t qemu_ram_addr_from_host_nofail(void *ptr)
     return ram_addr;
 }
 
+static vaddr clean_addr(CPUArchState *env, vaddr addr)
+{
+    CPUClass *cc = CPU_GET_CLASS(env_cpu(env));
+
+    if (cc->tcg_ops->do_clean_addr) {
+        addr = cc->tcg_ops->do_clean_addr(env_cpu(env), addr);
+    }
+
+    return addr;
+}
+
 /*
  * Note: tlb_fill() can trigger a resize of the TLB. This means that all of the
  * caller's prior references to the TLB table (e.g. CPUTLBEntry pointers) must
@@ -1757,10 +1768,11 @@ bool tlb_plugin_lookup(CPUState *cpu, target_ulong addr, int mmu_idx,
  *
  * @prot may be PAGE_READ, PAGE_WRITE, or PAGE_READ|PAGE_WRITE.
  */
-static void *atomic_mmu_lookup(CPUArchState *env, target_ulong addr,
+static void *atomic_mmu_lookup(CPUArchState *env, target_ulong address,
                                MemOpIdx oi, int size, int prot,
                                uintptr_t retaddr)
 {
+    target_ulong addr = clean_addr(env, address);
     size_t mmu_idx = get_mmuidx(oi);
     MemOp mop = get_memop(oi);
     int a_bits = get_alignment_bits(mop);
@@ -1904,10 +1916,11 @@ load_memop(const void *haddr, MemOp op)
 }
 
 static inline uint64_t QEMU_ALWAYS_INLINE
-load_helper(CPUArchState *env, target_ulong addr, MemOpIdx oi,
+load_helper(CPUArchState *env, target_ulong address, MemOpIdx oi,
             uintptr_t retaddr, MemOp op, bool code_read,
             FullLoadHelper *full_load)
 {
+    target_ulong addr = clean_addr(env, address);
     uintptr_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
     CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
@@ -2307,9 +2320,10 @@ store_helper_unaligned(CPUArchState *env, target_ulong addr, uint64_t val,
 }
 
 static inline void QEMU_ALWAYS_INLINE
-store_helper(CPUArchState *env, target_ulong addr, uint64_t val,
+store_helper(CPUArchState *env, target_ulong address, uint64_t val,
              MemOpIdx oi, uintptr_t retaddr, MemOp op)
 {
+    target_ulong addr = clean_addr(env, address);
     uintptr_t mmu_idx = get_mmuidx(oi);
     uintptr_t index = tlb_index(env, mmu_idx, addr);
     CPUTLBEntry *entry = tlb_entry(env, mmu_idx, addr);
diff --git a/include/hw/core/tcg-cpu-ops.h b/include/hw/core/tcg-cpu-ops.h
index e13898553aff..8e81f45510bf 100644
--- a/include/hw/core/tcg-cpu-ops.h
+++ b/include/hw/core/tcg-cpu-ops.h
@@ -82,6 +82,11 @@ struct TCGCPUOps {
                                 MMUAccessType access_type,
                                 int mmu_idx, uintptr_t retaddr) QEMU_NORETURN;
 
+    /**
+     * @do_clean_addr: Callback for clearing metadata/tags from the address.
+     */
+    vaddr (*do_clean_addr)(CPUState *cpu, vaddr addr);
+
     /**
      * @adjust_watchpoint_address: hack for cpu_check_watchpoint used by ARM
      */
diff --git a/target/i386/cpu.c b/target/i386/cpu.c
index cb6b5467d067..6e3e8473bf04 100644
--- a/target/i386/cpu.c
+++ b/target/i386/cpu.c
@@ -662,7 +662,7 @@ void x86_cpu_vendor_words2str(char *dst, uint32_t vendor1,
           /* CPUID_7_0_ECX_OSPKE is dynamic */ \
           CPUID_7_0_ECX_LA57 | CPUID_7_0_ECX_PKS)
 #define TCG_7_0_EDX_FEATURES 0
-#define TCG_7_1_EAX_FEATURES 0
+#define TCG_7_1_EAX_FEATURES CPUID_7_1_EAX_LAM
 #define TCG_APM_FEATURES 0
 #define TCG_6_EAX_FEATURES CPUID_6_EAX_ARAT
 #define TCG_XSAVE_FEATURES (CPUID_XSAVE_XSAVEOPT | CPUID_XSAVE_XGETBV1)
@@ -876,7 +876,7 @@ FeatureWordInfo feature_word_info[FEATURE_WORDS] = {
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
             NULL, NULL, NULL, NULL,
-            NULL, NULL, NULL, NULL,
+            NULL, NULL, "lam", NULL,
             NULL, NULL, NULL, NULL,
         },
         .cpuid = {
diff --git a/target/i386/cpu.h b/target/i386/cpu.h
index 982c5323537c..5d6cc8efb7da 100644
--- a/target/i386/cpu.h
+++ b/target/i386/cpu.h
@@ -232,6 +232,9 @@ typedef enum X86Seg {
 #define CR0_CD_MASK  (1U << 30)
 #define CR0_PG_MASK  (1U << 31)
 
+#define CR3_LAM_U57  (1ULL << 61)
+#define CR3_LAM_U48  (1ULL << 62)
+
 #define CR4_VME_MASK  (1U << 0)
 #define CR4_PVI_MASK  (1U << 1)
 #define CR4_TSD_MASK  (1U << 2)
@@ -255,6 +258,7 @@ typedef enum X86Seg {
 #define CR4_SMAP_MASK   (1U << 21)
 #define CR4_PKE_MASK   (1U << 22)
 #define CR4_PKS_MASK   (1U << 24)
+#define CR4_LAM_SUP    (1U << 28)
 
 #define CR4_RESERVED_MASK \
 (~(target_ulong)(CR4_VME_MASK | CR4_PVI_MASK | CR4_TSD_MASK \
@@ -263,7 +267,8 @@ typedef enum X86Seg {
                 | CR4_OSFXSR_MASK | CR4_OSXMMEXCPT_MASK | CR4_UMIP_MASK \
                 | CR4_LA57_MASK \
                 | CR4_FSGSBASE_MASK | CR4_PCIDE_MASK | CR4_OSXSAVE_MASK \
-                | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK))
+                | CR4_SMEP_MASK | CR4_SMAP_MASK | CR4_PKE_MASK | CR4_PKS_MASK \
+                | CR4_LAM_SUP))
 
 #define DR6_BD          (1 << 13)
 #define DR6_BS          (1 << 14)
@@ -877,6 +882,8 @@ uint64_t x86_cpu_get_supported_feature_word(FeatureWord w,
 #define CPUID_7_1_EAX_AVX_VNNI          (1U << 4)
 /* AVX512 BFloat16 Instruction */
 #define CPUID_7_1_EAX_AVX512_BF16       (1U << 5)
+/* Linear Address Masking */
+#define CPUID_7_1_EAX_LAM               (1U << 26)
 /* XFD Extend Feature Disabled */
 #define CPUID_D_1_EAX_XFD               (1U << 4)
 
@@ -2287,6 +2294,23 @@ static inline bool hyperv_feat_enabled(X86CPU *cpu, int feat)
     return !!(cpu->hyperv_features & BIT(feat));
 }
 
+static inline uint64_t cr3_reserved_bits(CPUX86State *env)
+{
+    uint64_t reserved_bits;
+
+    if (!(env->efer & MSR_EFER_LMA)) {
+        return 0;
+    }
+
+    reserved_bits = (~0ULL) << env_archcpu(env)->phys_bits;
+
+    if (env->features[FEAT_7_1_EAX] & CPUID_7_1_EAX_LAM) {
+        reserved_bits &= ~(CR3_LAM_U48 | CR3_LAM_U57);
+    }
+
+    return reserved_bits;
+}
+
 static inline uint64_t cr4_reserved_bits(CPUX86State *env)
 {
     uint64_t reserved_bits = CR4_RESERVED_MASK;
diff --git a/target/i386/helper.c b/target/i386/helper.c
index fa409e9c44a8..f91ebab840d6 100644
--- a/target/i386/helper.c
+++ b/target/i386/helper.c
@@ -247,7 +247,7 @@ hwaddr x86_cpu_get_phys_page_attrs_debug(CPUState *cs, vaddr addr,
             }
 
             if (la57) {
-                pml5e_addr = ((env->cr[3] & ~0xfff) +
+                pml5e_addr = ((env->cr[3] & PG_ADDRESS_MASK) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
                 if (!(pml5e & PG_PRESENT_MASK)) {
diff --git a/target/i386/tcg/helper-tcg.h b/target/i386/tcg/helper-tcg.h
index 0a4401e917f9..03ab858598d2 100644
--- a/target/i386/tcg/helper-tcg.h
+++ b/target/i386/tcg/helper-tcg.h
@@ -51,6 +51,7 @@ void x86_cpu_record_sigsegv(CPUState *cs, vaddr addr,
 bool x86_cpu_tlb_fill(CPUState *cs, vaddr address, int size,
                       MMUAccessType access_type, int mmu_idx,
                       bool probe, uintptr_t retaddr);
+vaddr x86_cpu_clean_addr(CPUState *cpu, vaddr addr);
 #endif
 
 void breakpoint_handler(CPUState *cs);
diff --git a/target/i386/tcg/sysemu/excp_helper.c b/target/i386/tcg/sysemu/excp_helper.c
index e1b6d8868338..caaab413381b 100644
--- a/target/i386/tcg/sysemu/excp_helper.c
+++ b/target/i386/tcg/sysemu/excp_helper.c
@@ -64,7 +64,7 @@ static int mmu_translate(CPUState *cs, hwaddr addr, MMUTranslateFunc get_hphys_f
             uint64_t pml4e_addr, pml4e;
 
             if (la57) {
-                pml5e_addr = ((cr3 & ~0xfff) +
+                pml5e_addr = ((cr3 & PG_ADDRESS_MASK) +
                         (((addr >> 48) & 0x1ff) << 3)) & a20_mask;
                 pml5e_addr = GET_HPHYS(cs, pml5e_addr, MMU_DATA_STORE, NULL);
                 pml5e = x86_ldq_phys(cs, pml5e_addr);
@@ -437,3 +437,29 @@ bool x86_cpu_tlb_fill(CPUState *cs, vaddr addr, int size,
     }
     return true;
 }
+
+static inline int64_t sign_extend64(uint64_t value, int index)
+{
+    int shift = 63 - index;
+    return (int64_t)(value << shift) >> shift;
+}
+
+vaddr x86_cpu_clean_addr(CPUState *cs, vaddr addr)
+{
+    CPUX86State *env = &X86_CPU(cs)->env;
+    bool la57 = env->cr[4] & CR4_LA57_MASK;
+
+    if (addr >> 63) {
+        if (env->cr[4] & CR4_LAM_SUP) {
+            return sign_extend64(addr, la57 ? 56 : 47);
+        }
+    } else {
+        if (env->cr[3] & CR3_LAM_U57) {
+            return sign_extend64(addr, 56);
+        } else if (env->cr[3] & CR3_LAM_U48) {
+            return sign_extend64(addr, 47);
+        }
+    }
+
+    return addr;
+}
diff --git a/target/i386/tcg/sysemu/misc_helper.c b/target/i386/tcg/sysemu/misc_helper.c
index 3715c1e2625b..faeb4a16383c 100644
--- a/target/i386/tcg/sysemu/misc_helper.c
+++ b/target/i386/tcg/sysemu/misc_helper.c
@@ -97,8 +97,7 @@ void helper_write_crN(CPUX86State *env, int reg, target_ulong t0)
         cpu_x86_update_cr0(env, t0);
         break;
     case 3:
-        if ((env->efer & MSR_EFER_LMA) &&
-                (t0 & ((~0ULL) << env_archcpu(env)->phys_bits))) {
+        if (t0 & cr3_reserved_bits(env)) {
             cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
         }
         if (!(env->efer & MSR_EFER_LMA)) {
diff --git a/target/i386/tcg/sysemu/svm_helper.c b/target/i386/tcg/sysemu/svm_helper.c
index 2b6f450af959..cbd99f240bb8 100644
--- a/target/i386/tcg/sysemu/svm_helper.c
+++ b/target/i386/tcg/sysemu/svm_helper.c
@@ -287,8 +287,7 @@ void helper_vmrun(CPUX86State *env, int aflag, int next_eip_addend)
         cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
     }
     new_cr3 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr3));
-    if ((env->efer & MSR_EFER_LMA) &&
-            (new_cr3 & ((~0ULL) << cpu->phys_bits))) {
+    if (new_cr3 & cr3_reserved_bits(env)) {
         cpu_vmexit(env, SVM_EXIT_ERR, 0, GETPC());
     }
     new_cr4 = x86_ldq_phys(cs, env->vm_vmcb + offsetof(struct vmcb, save.cr4));
diff --git a/target/i386/tcg/tcg-cpu.c b/target/i386/tcg/tcg-cpu.c
index 6fdfdf959899..754454d19041 100644
--- a/target/i386/tcg/tcg-cpu.c
+++ b/target/i386/tcg/tcg-cpu.c
@@ -77,6 +77,7 @@ static const struct TCGCPUOps x86_tcg_ops = {
     .record_sigsegv = x86_cpu_record_sigsegv,
 #else
     .tlb_fill = x86_cpu_tlb_fill,
+    .do_clean_addr = x86_cpu_clean_addr,
     .do_interrupt = x86_cpu_do_interrupt,
     .cpu_exec_interrupt = x86_cpu_exec_interrupt,
     .debug_excp_handler = breakpoint_handler,
-- 
2.35.1



Thread overview: 15+ messages
2022-04-07  1:01 [PATCH] x86: Implement Linear Address Masking support Kirill A. Shutemov
2022-04-07  3:34 ` Richard Henderson
2022-04-07 13:18   ` Kirill A. Shutemov
2022-04-07 14:28     ` Richard Henderson
2022-04-07 15:27       ` Kirill A. Shutemov
2022-04-07 16:38         ` Paolo Bonzini
2022-04-07 17:44           ` Kirill A. Shutemov
2022-04-08 14:39         ` Richard Henderson
2022-05-11  2:27 [RFCv2 00/10] Linear Address Masking enabling Kirill A. Shutemov
2022-05-11  2:27 ` [PATCH] x86: Implement Linear Address Masking support Kirill A. Shutemov
2022-05-12 13:01   ` David Laight
2022-05-12 14:07     ` Matthew Wilcox
2022-05-12 15:06       ` Thomas Gleixner
2022-05-12 15:33         ` David Laight
2022-05-12 14:35     ` Peter Zijlstra
2022-05-12 17:00     ` Kirill A. Shutemov
