All of lore.kernel.org
 help / color / mirror / Atom feed
* [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-20 15:52 David Woodhouse
  2015-10-20 16:03 ` Joerg Roedel
  0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-20 15:52 UTC (permalink / raw)
  To: linux-mm; +Cc: iommu, Sudeep Dutt

[-- Attachment #1: Type: text/plain, Size: 6764 bytes --]

On top of the tree at git.infradead.org/users/dwmw2/linux-svm.git
(http:// or git://).

For userspace addresses, we use the MMU notifiers and flush the IOTLB
as appropriate.

However, we need to do it for kernel addresses too — which basically
means adding a hook to tlb_flush_kernel_range(). Does this look
reasonable? I was trying to avoid it and insist on supporting addresses
within the kernel's static mapping only. But it doesn't look like
that's a reasonable thing to require.

Signed-off-by: David Woodhouse <David.Woodhouse@intel.com>
---
 arch/x86/mm/tlb.c           |    2 ++
 drivers/iommu/intel-svm.c   |   37 ++++++++++++++++++++++++++++++++++---
 include/linux/intel-iommu.h |    6 +++++-
 include/linux/intel-svm.h   |   13 +++++--------
 4 files changed, 46 insertions(+), 12 deletions(-)

diff --git a/include/linux/intel-svm.h b/include/linux/intel-svm.h
index 0a48ccf..61d9533 100644
--- a/include/linux/intel-svm.h
+++ b/include/linux/intel-svm.h
@@ -44,14 +44,11 @@ struct svm_dev_ops {
 
 /*
  * The SVM_FLAG_SUPERVISOR_MODE flag requests a PASID which can be used only
- * for access to kernel addresses. No IOTLB flushes are automatically done
- * for kernel mappings; it is valid only for access to the kernel's static
- * 1:1 mapping of physical memory — not to vmalloc or even module mappings.
- * A future API addition may permit the use of such ranges, by means of an
- * explicit IOTLB flush call (akin to the DMA API's unmap method).
- *
- * It is unlikely that we will ever hook into flush_tlb_kernel_range() to
- * do such IOTLB flushes automatically.
+ * for access to kernel addresses. IOTLB flushes are performed as required
+ * by means of a hook from flush_tlb_kernel_range(). This flag is mutually
+ * exclusive with the SVM_FLAG_PRIVATE_PASID flag — there can be only one
+ * PASID used for kernel mode, to keep the performance implications of the
+ * IOTLB flush hook relatively sane.
  */
 #define SVM_FLAG_SUPERVISOR_MODE	(1<<1)
 diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..40ebe83 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -6,6 +6,7 @@
 #include <linux/interrupt.h>
 #include <linux/module.h>
 #include <linux/cpu.h>
+#include <linux/intel-iommu.h>
 
 #include <asm/tlbflush.h>
 #include <asm/mmu_context.h>
@@ -266,6 +267,7 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
+	intel_iommu_flush_kernel_pasid(start, end);
 
 	/* Balance as user space task's flush, a bit conservative */
 	if (end == TLB_FLUSH_ALL ||
diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c
index a584df0..f8ca3c1 100644
--- a/drivers/iommu/intel-svm.c
+++ b/drivers/iommu/intel-svm.c
@@ -23,6 +23,7 @@
 #include <linux/pci-ats.h>
 #include <linux/dmar.h>
 #include <linux/interrupt.h>
+#include <asm/tlbflush.h>
 
 static irqreturn_t prq_event_thread(int irq, void *d);
 
@@ -264,6 +265,26 @@ static const struct mmu_notifier_ops intel_mmuops = {
 	.invalidate_range = intel_invalidate_range,
 };
 
+void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
+{
+	struct dmar_drhd_unit *drhd;
+	struct intel_iommu *iommu;
+	unsigned long pages;
+
+	if (end == TLB_FLUSH_ALL)
+		pages = end;
+	else
+		pages = (end - start) >> VTD_PAGE_SHIFT;
+
+	rcu_read_lock();
+	for_each_active_iommu(iommu, drhd) {
+		struct intel_svm *svm = rcu_dereference(iommu->kernel_svm);
+		if (svm)
+			intel_flush_svm_range(svm, start, pages, 0, 1);
+	}
+	rcu_read_unlock();
+}
+
 static DEFINE_MUTEX(pasid_mutex);
 
 int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_ops *ops)
@@ -286,6 +307,8 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 		pasid_max = 1 << 20;
 
 	if ((flags & SVM_FLAG_SUPERVISOR_MODE)) {
+		if (flags & SVM_FLAG_PRIVATE_PASID)
+			return -EINVAL;
 		if (!ecap_srs(iommu->ecap))
 			return -EINVAL;
 	} else if (pasid) {
@@ -294,7 +317,9 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 	}
 
 	mutex_lock(&pasid_mutex);
-	if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
+	if (SVM_FLAG_SUPERVISOR_MODE)
+		svm = iommu->kernel_svm;
+	else if (pasid && !(flags & SVM_FLAG_PRIVATE_PASID)) {
 		int i;
 
 		idr_for_each_entry(&iommu->pasid_idr, svm, i) {
@@ -378,8 +403,10 @@ int intel_svm_bind_mm(struct device *dev, int *pasid, int flags, struct svm_dev_
 			}
 			iommu->pasid_table[svm->pasid].val = (u64)__pa(mm->pgd) | 1;
 			mm = NULL;
-		} else
+		} else {
 			iommu->pasid_table[svm->pasid].val = (u64)__pa(init_mm.pgd) | 1 | (1ULL << 11);
+			rcu_assign_pointer(iommu->kernel_svm, svm);
+		}
 		wmb();
 	}
 	list_add_rcu(&sdev->list, &svm->devs);
@@ -432,8 +459,12 @@ int intel_svm_unbind_mm(struct device *dev, int pasid)
 					mmu_notifier_unregister(&svm->notifier, svm->mm);
 
 					idr_remove(&svm->iommu->pasid_idr, svm->pasid);
-					if (svm->mm)
+					if (svm->mm) {
 						mmput(svm->mm);
+					} else {
+						rcu_assign_pointer(iommu->kernel_svm, NULL);
+						synchronize_rcu();
+					}
 					/* We mandate that no page faults may be outstanding
 					 * for the PASID when intel_svm_unbind_mm() is called.
 					 * If that is not obeyed, subtle errors will happen.
diff --git a/include/linux/intel-iommu.h b/include/linux/intel-iommu.h
index 821273c..169bc84 100644
--- a/include/linux/intel-iommu.h
+++ b/include/linux/intel-iommu.h
@@ -391,6 +391,7 @@ enum {
 struct pasid_entry;
 struct pasid_state_entry;
 struct page_req_dsc;
+struct intel_svm;
 
 struct intel_iommu {
 	void __iomem	*reg; /* Pointer to hardware regs, virtual addr */
@@ -426,6 +427,7 @@ struct intel_iommu {
 	struct page_req_dsc *prq;
 	unsigned char prq_name[16];    /* Name for PRQ interrupt */
 	struct idr pasid_idr;
+	struct intel_svm __rcu *kernel_svm;
 #endif
 	struct q_inval  *qi;            /* Queued invalidation info */
 	u32 *iommu_state; /* Store iommu states between suspend and resume.*/
@@ -496,8 +498,10 @@ struct intel_svm {
 
 extern int intel_iommu_enable_pasid(struct intel_iommu *iommu, struct intel_svm_dev *sdev);
 extern struct intel_iommu *intel_svm_device_to_iommu(struct device *dev);
+extern void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end);
+#else
+#define intel_iommu_flush_kernel_pasid(start, end) do { ; } while(0)
 #endif
-
 extern const struct attribute_group *intel_iommu_groups[];
 
 #endif

-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
  2015-10-20 15:52 [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses David Woodhouse
@ 2015-10-20 16:03 ` Joerg Roedel
  2015-10-20 16:17   ` David Woodhouse
  0 siblings, 1 reply; 10+ messages in thread
From: Joerg Roedel @ 2015-10-20 16:03 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-mm, iommu, Sudeep Dutt

Hi David,

On Tue, Oct 20, 2015 at 04:52:59PM +0100, David Woodhouse wrote:
>  void flush_tlb_kernel_range(unsigned long start, unsigned long end)
>  {
> +	intel_iommu_flush_kernel_pasid(start, end);

A more generic naming would be good, and probably expose it through a
function in the IOMMU-API.

> +void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
> +{
> +	struct dmar_drhd_unit *drhd;
> +	struct intel_iommu *iommu;
> +	unsigned long pages;

And I think, as a performance optimiztion, we should bail out early here
if the pasid has no users.



	Joerg

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
  2015-10-20 16:03 ` Joerg Roedel
@ 2015-10-20 16:17   ` David Woodhouse
  2015-10-23 10:20     ` Joerg Roedel
  0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-20 16:17 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: linux-mm, iommu, Sudeep Dutt

[-- Attachment #1: Type: text/plain, Size: 2049 bytes --]

On Tue, 2015-10-20 at 18:03 +0200, Joerg Roedel wrote:
> Hi David,
> 
> On Tue, Oct 20, 2015 at 04:52:59PM +0100, David Woodhouse wrote:
> >  void flush_tlb_kernel_range(unsigned long start, unsigned long end)
> >  {
> > +> > 	> > intel_iommu_flush_kernel_pasid(start, end);
> 
> A more generic naming would be good, and probably expose it through a
> function in the IOMMU-API.

Yeah. *All* the SVM functionality needs to be exposed through the DMA
API (for native drivers like i915) and the IOMMU API (so VM guests can
see an IOMMU and do SVM on assigned devices).

Can we assume that only one type of SVM-capable IOMMU will be present
in the system at a time? Perhaps we could just register a single
function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
as a notifier callback from tlb_flush_kernel_range()? Rather than the
overhead of a *list* of notifiers?

> > +void intel_iommu_flush_kernel_pasid(unsigned long start, unsigned long end)
> > +{
> > +> > 	> > struct dmar_drhd_unit *drhd;
> > +> > 	> > struct intel_iommu *iommu;
> > +> > 	> > unsigned long pages;
> 
> And I think, as a performance optimiztion, we should bail out early here
> if the pasid has no users.

If the PASID has no users, it won't exist (iommu->kernel_svm will be
NULL). We still have to walk each IOMMU to see if it has one.

But... that's because the PASID-space is currently per-IOMMU. The plan
is to have a *single* PASID-space system-wide, And then I still want to
retain the property that there can be only *one* kernel PASID.

I have forbidden the use of a given PASID to access *both* kernel and
user addresses. I'm hoping we can get away with putting that
restriction into the generic SVM APIs.

So yeah, perhaps we can set the notifier pointer to NULL when there's
no kernel PASID assigned, and only set it to point to
${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?

That way, tlb_flush_kernel_range() doesn't even need to make the call
when there's no work to do...

-- 
dwmw2


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
  2015-10-20 16:17   ` David Woodhouse
@ 2015-10-23 10:20     ` Joerg Roedel
  2015-10-23 10:33       ` David Woodhouse
  0 siblings, 1 reply; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 10:20 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-mm, iommu, Sudeep Dutt

On Tue, Oct 20, 2015 at 05:17:04PM +0100, David Woodhouse wrote:
> Can we assume that only one type of SVM-capable IOMMU will be present
> in the system at a time? Perhaps we could just register a single
> function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
> as a notifier callback from tlb_flush_kernel_range()? Rather than the
> overhead of a *list* of notifiers?

Yes, a single notifier is certainly preferable to a list. It is just
too easy for others to attach to this list silently and adding more
overhead to kernel TLB flushing.

> But... that's because the PASID-space is currently per-IOMMU. The plan
> is to have a *single* PASID-space system-wide, And then I still want to
> retain the property that there can be only *one* kernel PASID.

That makes a lot of sense. Then we can check in the call-back simply if
this pasid has users and bail out early when not.

> I have forbidden the use of a given PASID to access *both* kernel and
> user addresses. I'm hoping we can get away with putting that
> restriction into the generic SVM APIs.

We have to, having kernel-pasids already nullifies all protection the
IOMMU provides, giving kernel-access to a process-pasid is security wise
equivalent to accessing /dev/mem.

> So yeah, perhaps we can set the notifier pointer to NULL when there's
> no kernel PASID assigned, and only set it to point to
> ${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?

That sounds like it needs some clever locking. Instead of checking the
function pointer it is probably easier to put the check for pasid-users
into an inline function and just do the real flush-call only when
necessary.


	Joerg

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
  2015-10-23 10:20     ` Joerg Roedel
@ 2015-10-23 10:33       ` David Woodhouse
  2015-10-23 11:03           ` Joerg Roedel
  0 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-23 10:33 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: linux-mm, iommu, Sudeep Dutt

[-- Attachment #1: Type: text/plain, Size: 5388 bytes --]

On Fri, 2015-10-23 at 12:20 +0200, Joerg Roedel wrote:
> On Tue, Oct 20, 2015 at 05:17:04PM +0100, David Woodhouse wrote:
> > Can we assume that only one type of SVM-capable IOMMU will be present
> > in the system at a time? Perhaps we could just register a single
> > function (intel_iommu_flush_kernel_pasid in the VT-d case) to be used
> > as a notifier callback from tlb_flush_kernel_range()? Rather than the
> > overhead of a *list* of notifiers?
> 
> Yes, a single notifier is certainly preferable to a list. It is just
> too easy for others to attach to this list silently and adding more
> overhead to kernel TLB flushing.

Yeah. It's easy enough to add that in the x86 tlb_flush_kernel_range()
but I think we actually want it to be cross-platform.

Which means I'm pondering *renaming* tlb_flush_kernel_range() to
something like arch_tlb_flush_kernel_range() everywhere, then having a
tlb_flush_kernel_range() inline function which optionally calls
iommu_flush_kernel_range() first.

Or I could reduce the churn by adding explicit calls to
iommu_flush_kernel_range() at every location that calls
tlb_flush_kernel_range(), but that's going to lead to some callers
missing the IOMMU flush.

> > But... that's because the PASID-space is currently per-IOMMU. The plan
> > is to have a *single* PASID-space system-wide, And then I still want to
> > retain the property that there can be only *one* kernel PASID.
> 
> That makes a lot of sense. Then we can check in the call-back simply if
> this pasid has users and bail out early when not.
> 
> > I have forbidden the use of a given PASID to access *both* kernel and
> > user addresses. I'm hoping we can get away with putting that
> > restriction into the generic SVM APIs.
> 
> We have to, having kernel-pasids already nullifies all protection the
> IOMMU provides, giving kernel-access to a process-pasid is security wise
> equivalent to accessing /dev/mem.

Not entirely. The device still gets to specify whether it's doing
supervisor or user mode access, for each request it makes. It doesn't
open the door to users just using kernel addresses and getting away
with it!

Sure, we need to trust the *device* — but we need to trust it to
provide the correct PASID too. Which basically means in the VFIO case
where the user gets *full* control of the device, we have to ensure
that it gets its own PASID table with only the *one* PASID in it, and
*that* PASID can't have supervisor mode.

But in the general case, apart from the fact that it makes life hard
for us, there's no fundamental security reason why we couldn't set the
bit which allows supervisor mode access to happen in *any* PASID.

> > So yeah, perhaps we can set the notifier pointer to NULL when there's
> > no kernel PASID assigned, and only set it to point to
> > ${MY_IOMMU}_flush_kernel_pasid() if/when there *is* one?
> 
> That sounds like it needs some clever locking. Instead of checking the
> function pointer it is probably easier to put the check for pasid-users
> into an inline function and just do the real flush-call only when
> necessary.

The locking isn't hard; it's just RCU. Which in the VT-d case is just
the same as the handling of the kernel PASID structure which tells us
if we have any work to do anyway.

What I have in my working tree right now (but will probably throw away)
looks something like...

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6df2029..c9e0b6c 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -7,6 +7,22 @@
 #include <asm/processor.h>
 #include <asm/special_insns.h>
 
+#ifdef CONFIG_IOMMU_TLB_FLUSH
+#include <linux/rcupdate.h>
+typedef void (*iommu_flush_ktlb_fn)(unsigned long start, unsigned long end);
+extern iommu_flush_ktlb_fn __rcu iommu_flush_ktlb;
+
+static inline void do_iommu_flush_ktlb(unsigned long start, unsigned long end)
+{
+	iommu_flush_ktlb_fn *fn;
+	rcu_read_lock();
+	fn = rcu_dereference(iommu_flush_ktlb);
+	if (fn)
+		(*fn)(start, end);
+	rcu_read_unlock();
+}
+#endif
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
@@ -223,6 +239,9 @@ static inline void reset_lazy_tlbstate(void)
 static inline void flush_tlb_kernel_range(unsigned long start,
 					  unsigned long end)
 {
+#ifdef CONFIG_IOMMU_FLUSH_TLB
+	do_iommu_flush_ktlb(start, end);
+#endif
 	flush_tlb_all();
 }
 
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 8ddb5d0..4122b49 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -266,6 +267,9 @@ static void do_kernel_range_flush(void *info)
 
 void flush_tlb_kernel_range(unsigned long start, unsigned long end)
 {
+#ifdef CONFIG_IOMMU_FLUSH_TLB
+	do_iommu_flush_ktlb(start, end);
+#endif
 
 	/* Balance as user space task's flush, a bit conservative */
 	if (end == TLB_FLUSH_ALL ||


Maybe we could keep it simple and just declare that once the function pointer is set, it may never be cleared? But I think we really do want to avoid the out-of-line function call altogether in the case where kernel PASIDs are not being used. Or at *least* the case where SVM isn't being used at all.

-- 
-- 
David Woodhouse                            Open Source Technology Centre
David.Woodhouse@intel.com                              Intel Corporation


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]

^ permalink raw reply related	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-23 11:03           ` Joerg Roedel
  0 siblings, 0 replies; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 11:03 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-mm, iommu, Sudeep Dutt

On Fri, Oct 23, 2015 at 11:33:33AM +0100, David Woodhouse wrote:
> Which means I'm pondering *renaming* tlb_flush_kernel_range() to
> something like arch_tlb_flush_kernel_range() everywhere, then having a
> tlb_flush_kernel_range() inline function which optionally calls
> iommu_flush_kernel_range() first.

That sounds like some work, but would be the super clean solution :)
Given that only a handful of architecture besides x86 will need it
(thinking of ARM64 and PPC), I prefer the solution below:

> Or I could reduce the churn by adding explicit calls to
> iommu_flush_kernel_range() at every location that calls
> tlb_flush_kernel_range(), but that's going to lead to some callers
> missing the IOMMU flush.

Exactly like this, but when do we miss a flush here?

> Not entirely. The device still gets to specify whether it's doing
> supervisor or user mode access, for each request it makes. It doesn't
> open the door to users just using kernel addresses and getting away
> with it!
> 
> Sure, we need to trust the *device* a?? but we need to trust it to
> provide the correct PASID too. Which basically means in the VFIO case
> where the user gets *full* control of the device, we have to ensure
> that it gets its own PASID table with only the *one* PASID in it, and
> *that* PASID can't have supervisor mode.

Exactly, we need to trust the device and the device driver. But thats
not different to a situation without an iommu. We just run into problems
when a device-driver allows sending requests to access kernel-memory
from user-space, so it needs more care from the driver writers/reviewers
too.

> +static inline void do_iommu_flush_ktlb(unsigned long start, unsigned long end)
> +{
> +	iommu_flush_ktlb_fn *fn;
> +	rcu_read_lock();
> +	fn = rcu_dereference(iommu_flush_ktlb);
> +	if (fn)
> +		(*fn)(start, end);
> +	rcu_read_unlock();
> +}

Yes, that'll work too. When you read/update the iommu_flush_ktlb pointer
atomically, you can even get away without rcu. The function it points to
will not go away, so you can still call it even when the pointer turned
NULL.

> Maybe we could keep it simple and just declare that once the function
> pointer is set, it may never be cleared? But I think we really do want
> to avoid the out-of-line function call altogether in the case where
> kernel PASIDs are not being used. Or at *least* the case where SVM
> isn't being used at all.

Yes, thats why I think an inline function which does the checks would be
a better solution. The mmu_notifiers implement it in the same way, so we
would stay consistent with them.


	Joerg

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-23 11:03           ` Joerg Roedel
  0 siblings, 0 replies; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 11:03 UTC (permalink / raw)
  To: David Woodhouse
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Sudeep Dutt

On Fri, Oct 23, 2015 at 11:33:33AM +0100, David Woodhouse wrote:
> Which means I'm pondering *renaming* tlb_flush_kernel_range() to
> something like arch_tlb_flush_kernel_range() everywhere, then having a
> tlb_flush_kernel_range() inline function which optionally calls
> iommu_flush_kernel_range() first.

That sounds like some work, but would be the super clean solution :)
Given that only a handful of architecture besides x86 will need it
(thinking of ARM64 and PPC), I prefer the solution below:

> Or I could reduce the churn by adding explicit calls to
> iommu_flush_kernel_range() at every location that calls
> tlb_flush_kernel_range(), but that's going to lead to some callers
> missing the IOMMU flush.

Exactly like this, but when do we miss a flush here?

> Not entirely. The device still gets to specify whether it's doing
> supervisor or user mode access, for each request it makes. It doesn't
> open the door to users just using kernel addresses and getting away
> with it!
> 
> Sure, we need to trust the *device* — but we need to trust it to
> provide the correct PASID too. Which basically means in the VFIO case
> where the user gets *full* control of the device, we have to ensure
> that it gets its own PASID table with only the *one* PASID in it, and
> *that* PASID can't have supervisor mode.

Exactly, we need to trust the device and the device driver. But thats
not different to a situation without an iommu. We just run into problems
when a device-driver allows sending requests to access kernel-memory
from user-space, so it needs more care from the driver writers/reviewers
too.

> +static inline void do_iommu_flush_ktlb(unsigned long start, unsigned long end)
> +{
> +	iommu_flush_ktlb_fn *fn;
> +	rcu_read_lock();
> +	fn = rcu_dereference(iommu_flush_ktlb);
> +	if (fn)
> +		(*fn)(start, end);
> +	rcu_read_unlock();
> +}

Yes, that'll work too. When you read/update the iommu_flush_ktlb pointer
atomically, you can even get away without rcu. The function it points to
will not go away, so you can still call it even when the pointer turned
NULL.

> Maybe we could keep it simple and just declare that once the function
> pointer is set, it may never be cleared? But I think we really do want
> to avoid the out-of-line function call altogether in the case where
> kernel PASIDs are not being used. Or at *least* the case where SVM
> isn't being used at all.

Yes, thats why I think an inline function which does the checks would be
a better solution. The mmu_notifiers implement it in the same way, so we
would stay consistent with them.


	Joerg

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
  2015-10-23 11:03           ` Joerg Roedel
  (?)
@ 2015-10-23 11:37           ` David Woodhouse
  2015-10-23 12:42               ` Joerg Roedel
  -1 siblings, 1 reply; 10+ messages in thread
From: David Woodhouse @ 2015-10-23 11:37 UTC (permalink / raw)
  To: Joerg Roedel; +Cc: linux-mm, iommu, Sudeep Dutt

[-- Attachment #1: Type: text/plain, Size: 4681 bytes --]

On Fri, 2015-10-23 at 13:03 +0200, Joerg Roedel wrote:
> On Fri, Oct 23, 2015 at 11:33:33AM +0100, David Woodhouse wrote:
> > Which means I'm pondering *renaming* tlb_flush_kernel_range() to
> > something like arch_tlb_flush_kernel_range() everywhere, then
> > having a
> > tlb_flush_kernel_range() inline function which optionally calls
> > iommu_flush_kernel_range() first.
> 
> That sounds like some work, but would be the super clean solution :)
> Given that only a handful of architecture besides x86 will need it
> (thinking of ARM64 and PPC), I prefer the solution below:
> 
> > Or I could reduce the churn by adding explicit calls to
> > iommu_flush_kernel_range() at every location that calls
> > tlb_flush_kernel_range(), but that's going to lead to some callers
> > missing the IOMMU flush.
> 
> Exactly like this, but when do we miss a flush here?

When we miss a call to flush_tlb_kernel_range() and forget to add the
corresponding call to iommu_flush_kernel_range() next to it. We'd need
to go trawling through arch code and make sure we get them all.

Or more likely, when someone adds a *new* call to
flush_tlb_kernel_range() and doesn't realise that they also need to
call iommu_flush_kernel_range().

It's more work up front to do the "super clean solution", but I think
it's worth it.

> > Not entirely. The device still gets to specify whether it's doing
> > supervisor or user mode access, for each request it makes. It doesn't
> > open the door to users just using kernel addresses and getting away
> > with it!
> > 
> > Sure, we need to trust the *device* — but we need to trust it to
> > provide the correct PASID too. Which basically means in the VFIO case
> > where the user gets *full* control of the device, we have to ensure
> > that it gets its own PASID table with only the *one* PASID in it, and
> > *that* PASID can't have supervisor mode.
> 
> Exactly, we need to trust the device and the device driver. But thats
> not different to a situation without an iommu. We just run into problems
> when a device-driver allows sending requests to access kernel-memory
> from user-space, so it needs more care from the driver writers/reviewers
> too.

It's more than that — it's equivalent to the situation *with* the
IOMMU.

Having a *separate* PASID which is the only PASID we can use for kernel
mode is *not* a security improvement. In the general case, if a user
can trick the device into setting the 'supervisor mode' bit on a given
access, it could probably just as easily trick the device into using
the separate kernel PASID for that access. In neither case is it as
simple as just asking the device to use a kernel address.

I'm not proposing it for that reason, which is why I'm objecting to
your 'we have to...' response. Although maybe I should shut up, because
I'm pleased you aren't objecting to my plan and saying that we *do*
need to permit supervisor-mode access in normal PASIDs.

> > +static inline void do_iommu_flush_ktlb(unsigned long start,
> > unsigned long end)
> > +{
> > +	iommu_flush_ktlb_fn *fn;
> > +	rcu_read_lock();
> > +	fn = rcu_dereference(iommu_flush_ktlb);
> > +	if (fn)
> > +		(*fn)(start, end);
> > +	rcu_read_unlock();
> > +}
> 
> Yes, that'll work too. When you read/update the iommu_flush_ktlb pointer
> atomically, you can even get away without rcu. The function it points to
> will not go away, so you can still call it even when the pointer turned
> NULL.

That's true. I do need to think hard about architectures with function
descriptors though, where it might not be possible to atomically update
a function pointer. Need to dust off my rusty PPC64 knowledge... :)

(Note that this could screw the RCU approach too.)

> > Maybe we could keep it simple and just declare that once the function
> > pointer is set, it may never be cleared? But I think we really do want
> > to avoid the out-of-line function call altogether in the case where
> > kernel PASIDs are not being used. Or at *least* the case where SVM
> > isn't being used at all.
> 
> Yes, thats why I think an inline function which does the checks would be
> a better solution. The mmu_notifiers implement it in the same way, so we
> would stay consistent with them.

You mean an inline function which checks for iommu->kernel_svm ∀ iommu?
And does the equivalent for other IOMMUs? I wouldn't want IOMMU
-specific code in there; just a decision about whether to call the out
-of-line function.

Or maybe if we are making PASID handling generic and system-wide, it
really does become a case of 'if (init_mm.pasid != -1)' ...?

-- 
dwmw2


[-- Attachment #2: smime.p7s --]
[-- Type: application/x-pkcs7-signature, Size: 5691 bytes --]

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-23 12:42               ` Joerg Roedel
  0 siblings, 0 replies; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 12:42 UTC (permalink / raw)
  To: David Woodhouse; +Cc: linux-mm, iommu, Sudeep Dutt

On Fri, Oct 23, 2015 at 12:37:06PM +0100, David Woodhouse wrote:
> It's more than that a?? it's equivalent to the situation *with* the
> IOMMU.
> 
> Having a *separate* PASID which is the only PASID we can use for kernel
> mode is *not* a security improvement. In the general case, if a user
> can trick the device into setting the 'supervisor mode' bit on a given
> access, it could probably just as easily trick the device into using
> the separate kernel PASID for that access. In neither case is it as
> simple as just asking the device to use a kernel address.
> 
> I'm not proposing it for that reason, which is why I'm objecting to
> your 'we have to...' response. Although maybe I should shut up, because
> I'm pleased you aren't objecting to my plan and saying that we *do*
> need to permit supervisor-mode access in normal PASIDs.

At best I'd like to avoid supervisor access for devices at all, but
there seems to be a need for it, so I looks like we need to provide it.
Therefore I think that your idea to have a seperate PASID for kernel
access, and only kernel access, is a good one. We even don't need to use
a defined PASID, we can randomize the PASID used for kernel accesses and
make it harder to guess this way.

But having both, kernel and supervisor access, allowed for a PASID is
another story, and I think we need to be careful with that (or at least
avoid that the driver writers need to care that much about it to prevent
userspace from getting access to kernel memory).

> You mean an inline function which checks for iommu->kernel_svm a?? iommu?
> And does the equivalent for other IOMMUs? I wouldn't want IOMMU
> -specific code in there; just a decision about whether to call the out
> -of-line function.
> 
> Or maybe if we are making PASID handling generic and system-wide, it
> really does become a case of 'if (init_mm.pasid != -1)' ...?

Yes, something like that, and of course independent of the iommu. When
we have a system-wide PASID registry we can check against that, or
introduce a global read_mostly flag. Using init_mm refcounting or flags
also sounds like a good idea.


	Joerg

--
To unsubscribe, send a message with 'unsubscribe linux-mm' in
the body to majordomo@kvack.org.  For more info on Linux MM,
see: http://www.linux-mm.org/ .
Don't email: <a href=mailto:"dont@kvack.org"> email@kvack.org </a>

^ permalink raw reply	[flat|nested] 10+ messages in thread

* Re: [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses
@ 2015-10-23 12:42               ` Joerg Roedel
  0 siblings, 0 replies; 10+ messages in thread
From: Joerg Roedel @ 2015-10-23 12:42 UTC (permalink / raw)
  To: David Woodhouse
  Cc: linux-mm-Bw31MaZKKs3YtjvyW6yDsg,
	iommu-cunTk1MwBs9QetFLy7KEm3xJsTq8ys+cHZ5vskTnxNA, Sudeep Dutt

On Fri, Oct 23, 2015 at 12:37:06PM +0100, David Woodhouse wrote:
> It's more than that — it's equivalent to the situation *with* the
> IOMMU.
> 
> Having a *separate* PASID which is the only PASID we can use for kernel
> mode is *not* a security improvement. In the general case, if a user
> can trick the device into setting the 'supervisor mode' bit on a given
> access, it could probably just as easily trick the device into using
> the separate kernel PASID for that access. In neither case is it as
> simple as just asking the device to use a kernel address.
> 
> I'm not proposing it for that reason, which is why I'm objecting to
> your 'we have to...' response. Although maybe I should shut up, because
> I'm pleased you aren't objecting to my plan and saying that we *do*
> need to permit supervisor-mode access in normal PASIDs.

At best I'd like to avoid supervisor access for devices at all, but
there seems to be a need for it, so I looks like we need to provide it.
Therefore I think that your idea to have a seperate PASID for kernel
access, and only kernel access, is a good one. We even don't need to use
a defined PASID, we can randomize the PASID used for kernel accesses and
make it harder to guess this way.

But having both, kernel and supervisor access, allowed for a PASID is
another story, and I think we need to be careful with that (or at least
avoid that the driver writers need to care that much about it to prevent
userspace from getting access to kernel memory).

> You mean an inline function which checks for iommu->kernel_svm ∀ iommu?
> And does the equivalent for other IOMMUs? I wouldn't want IOMMU
> -specific code in there; just a decision about whether to call the out
> -of-line function.
> 
> Or maybe if we are making PASID handling generic and system-wide, it
> really does become a case of 'if (init_mm.pasid != -1)' ...?

Yes, something like that, and of course independent of the iommu. When
we have a system-wide PASID registry we can check against that, or
introduce a global read_mostly flag. Using init_mm refcounting or flags
also sounds like a good idea.


	Joerg

_______________________________________________
iommu mailing list
iommu@lists.linux-foundation.org
https://lists.linuxfoundation.org/mailman/listinfo/iommu

^ permalink raw reply	[flat|nested] 10+ messages in thread

end of thread, other threads:[~2015-10-23 12:43 UTC | newest]

Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2015-10-20 15:52 [RFC PATCH] iommu/vt-d: Add IOTLB flush support for kernel addresses David Woodhouse
2015-10-20 16:03 ` Joerg Roedel
2015-10-20 16:17   ` David Woodhouse
2015-10-23 10:20     ` Joerg Roedel
2015-10-23 10:33       ` David Woodhouse
2015-10-23 11:03         ` Joerg Roedel
2015-10-23 11:03           ` Joerg Roedel
2015-10-23 11:37           ` David Woodhouse
2015-10-23 12:42             ` Joerg Roedel
2015-10-23 12:42               ` Joerg Roedel

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.