* [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
@ 2020-11-20  1:27 ` Tom Roeder
  0 siblings, 0 replies; 13+ messages in thread
From: Tom Roeder @ 2020-11-20  1:27 UTC (permalink / raw)
  To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg
  Cc: Peter Gonda, Marios Pomonis, linux-nvme, linux-kernel, Tom Roeder

This patch changes the NVMe PCI implementation to cache host_mem_descs
in non-DMA memory instead of depending on descriptors stored in DMA
memory. This change is needed under the malicious-hypervisor threat
model assumed by the AMD SEV and Intel TDX architectures, which encrypt
guest memory to make it unreadable. Some versions of these architectures
also make it cryptographically hard to modify guest memory without
detection.

On these architectures, Linux generally leaves DMA memory unencrypted so
that devices can still communicate directly with the kernel: DMA memory
remains readable to and modifiable by devices. This means that this
memory is also accessible to a hypervisor.

A malicious hypervisor could therefore modify the addr or size fields of
these descriptors and cause the NVMe driver to call dma_free_attrs on an
arbitrary address, or on the right address but with the wrong size. To
prevent this attack, this commit caches those descriptors in non-DMA
memory and uses the cached values when freeing the memory they describe.

Tested: Built and ran with Google-internal NVMe tests.
Tested-by: Tom Roeder <tmroeder@google.com>
Signed-off-by: Tom Roeder <tmroeder@google.com>
---
Changes from v1:
- Use native integers instead of __le{32,64} for the addr and size.
- Rename added fields/variables for better consistency.
- Make comment style consistent with other comments in pci.c.

 drivers/nvme/host/pci.c | 35 ++++++++++++++++++++++++++++-------
 include/linux/nvme.h    |  5 +++++
 2 files changed, 33 insertions(+), 7 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3be352403839..4c55a96f9e34 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -148,6 +148,11 @@ struct nvme_dev {
 	u32 nr_host_mem_descs;
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
+	/*
+	 * A cache for the host_mem_descs in non-DMA memory so a malicious
+	 * hypervisor can't change them.
+	 */
+	struct nvme_host_mem_buf_cached_desc *host_mem_cached_descs;
 	void **host_mem_desc_bufs;
 	unsigned int nr_allocated_queues;
 	unsigned int nr_write_queues;
@@ -1874,11 +1879,16 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 	int i;
 
 	for (i = 0; i < dev->nr_host_mem_descs; i++) {
-		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
-		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
+		/*
+		 * Use the cached version to free the DMA allocations, not a
+		 * version that could be controlled by a malicious hypervisor.
+		 */
+		struct nvme_host_mem_buf_cached_desc *desc =
+			&dev->host_mem_cached_descs[i];
+		size_t size = desc->size * NVME_CTRL_PAGE_SIZE;
 
 		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
-			       le64_to_cpu(desc->addr),
+			       desc->addr,
 			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
 	}
 
@@ -1888,6 +1898,8 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
 			dev->host_mem_descs, dev->host_mem_descs_dma);
 	dev->host_mem_descs = NULL;
+	kfree(dev->host_mem_cached_descs);
+	dev->host_mem_cached_descs = NULL;
 	dev->nr_host_mem_descs = 0;
 }
 
@@ -1895,6 +1907,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 		u32 chunk_size)
 {
 	struct nvme_host_mem_buf_desc *descs;
+	struct nvme_host_mem_buf_cached_desc *cached_descs;
 	u32 max_entries, len;
 	dma_addr_t descs_dma;
 	int i = 0;
@@ -1913,9 +1926,13 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	if (!descs)
 		goto out;
 
+	cached_descs = kcalloc(max_entries, sizeof(*cached_descs), GFP_KERNEL);
+	if (!cached_descs)
+		goto out_free_descs;
+
 	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
 	if (!bufs)
-		goto out_free_descs;
+		goto out_free_cached_descs;
 
 	for (size = 0; size < preferred && i < max_entries; size += len) {
 		dma_addr_t dma_addr;
@@ -1928,6 +1945,8 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 
 		descs[i].addr = cpu_to_le64(dma_addr);
 		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
+		cached_descs[i].addr = dma_addr;
+		cached_descs[i].size = len / NVME_CTRL_PAGE_SIZE;
 		i++;
 	}
 
@@ -1937,20 +1956,22 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	dev->nr_host_mem_descs = i;
 	dev->host_mem_size = size;
 	dev->host_mem_descs = descs;
+	dev->host_mem_cached_descs = cached_descs;
 	dev->host_mem_descs_dma = descs_dma;
 	dev->host_mem_desc_bufs = bufs;
 	return 0;
 
 out_free_bufs:
 	while (--i >= 0) {
-		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
+		size_t size = cached_descs[i].size * NVME_CTRL_PAGE_SIZE;
 
-		dma_free_attrs(dev->dev, size, bufs[i],
-			       le64_to_cpu(descs[i].addr),
+		dma_free_attrs(dev->dev, size, bufs[i], cached_descs[i].addr,
 			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
 	}
 
 	kfree(bufs);
+out_free_cached_descs:
+	kfree(cached_descs);
 out_free_descs:
 	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
 			descs_dma);
diff --git a/include/linux/nvme.h b/include/linux/nvme.h
index d92535997687..e9e14df417bc 100644
--- a/include/linux/nvme.h
+++ b/include/linux/nvme.h
@@ -1114,6 +1114,11 @@ struct nvme_host_mem_buf_desc {
 	__u32			rsvd;
 };
 
+struct nvme_host_mem_buf_cached_desc {
+	__u64			addr;
+	__u32			size;
+};
+
 struct nvme_create_cq {
 	__u8			opcode;
 	__u8			flags;
-- 
2.29.2.454.gaff20da3a2-goog


* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-20  1:27 ` Tom Roeder
@ 2020-11-20  8:02   ` Christoph Hellwig
  -1 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2020-11-20  8:02 UTC (permalink / raw)
  To: Tom Roeder
  Cc: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg,
	Peter Gonda, Marios Pomonis, linux-nvme, linux-kernel

On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
> This patch changes the NVMe PCI implementation to cache host_mem_descs
> in non-DMA memory instead of depending on descriptors stored in DMA
> memory. This change is needed under the malicious-hypervisor threat
> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
> guest memory to make it unreadable. Some versions of these architectures
> also make it cryptographically hard to modify guest memory without
> detection.

I don't think this is a useful threat model, and I've not seen a
discussion on lkml where we had any discussion on this kind of threat
model either.

Before you start sending patches that regress optimizations in various
drivers (and there will be lots with this model) we need to have a
broader discussion first.

And HMB support, which is for low-end consumer devices that are usually
not directly assigned to VMs, isn't a good starting point for this.

* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-20  8:02   ` Christoph Hellwig
@ 2020-11-20 14:29     ` Keith Busch
  -1 siblings, 0 replies; 13+ messages in thread
From: Keith Busch @ 2020-11-20 14:29 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Tom Roeder, Jens Axboe, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel

On Fri, Nov 20, 2020 at 09:02:43AM +0100, Christoph Hellwig wrote:
> On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
> > This patch changes the NVMe PCI implementation to cache host_mem_descs
> > in non-DMA memory instead of depending on descriptors stored in DMA
> > memory. This change is needed under the malicious-hypervisor threat
> > model assumed by the AMD SEV and Intel TDX architectures, which encrypt
> > guest memory to make it unreadable. Some versions of these architectures
> > also make it cryptographically hard to modify guest memory without
> > detection.
> 
> I don't think this is a useful threat model, and I've not seen a
> discussion on lkml where we had any discussion on this kind of threat
> model either.
> 
> Before you start sending patches that regress optimizations in various
> drivers (and there will be lots with this model) we need to have a
> broader discussion first.
> 
> And HMB support, which is for low-end consumer devices that are usually
> not directly assigned to VMs aren't a good starting point for this.

Yeah, while doing this for HMB isn't really a performance concern, this
method for chaining SGL/PRP lists would be.

And perhaps more importantly, the proposed mitigation only lets the
guest silently carry on from such an attack while the device is surely
corrupting something. I think we'd rather free the wrong address since
that may at least eventually raise an error.

* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-20  8:02   ` Christoph Hellwig
@ 2020-11-30 18:50     ` Tom Roeder
  -1 siblings, 0 replies; 13+ messages in thread
From: Tom Roeder @ 2020-11-30 18:50 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Keith Busch, Jens Axboe, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel, Thomas.Lendacky,
	David.Kaplan

On Fri, Nov 20, 2020 at 09:02:43AM +0100, Christoph Hellwig wrote:
>On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
>> This patch changes the NVMe PCI implementation to cache host_mem_descs
>> in non-DMA memory instead of depending on descriptors stored in DMA
>> memory. This change is needed under the malicious-hypervisor threat
>> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
>> guest memory to make it unreadable. Some versions of these architectures
>> also make it cryptographically hard to modify guest memory without
>> detection.
>
>I don't think this is a useful threat model, and I've not seen a
>discussion on lkml where we had any discussion on this kind of threat
>model either.

Thanks for the feedback and apologies for the lack of context.

I was under the impression that support for AMD SEV SNP will start 
showing up in KVM soon, and my understanding of SNP is that it 
implies this threat model for the guest. See the patchset 
for SEV-ES, which is the generation before SNP: 
https://lkml.org/lkml/2020/9/14/1168. This doesn't get quite to the SNP 
threat model, but it starts to assume more maliciousness on the part of 
the hypervisor.

You can also see the talk from David Kaplan of AMD from the 2019 Linux 
Security Summit for info about SNP: 
https://www.youtube.com/watch?v=yr56SaJ_0QI.

>
>Before you start sending patches that regress optimizations in various
>drivers (and there will be lots with this model) we need to have a
>broader discussion first.

I've added Tom Lendacky and David Kaplan from AMD on the thread now, 
since I don't think I have enough context to say where this discussion 
should take place or the degree to which they think it has or hasn't.

Tom, David: can you please comment on this?

>
>And HMB support, which is for low-end consumer devices that are usually
>not directly assigned to VMs aren't a good starting point for this.

I'm glad to hear that this case doesn't apply directly to cases we would 
care about for assignment to guests. I'm not very familiar with this 
codebase, unfortunately. Do the same kinds of issues apply for the kinds 
of devices that would be assigned to guests?

* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-20 14:29     ` Keith Busch
@ 2020-11-30 18:55       ` Tom Roeder
  -1 siblings, 0 replies; 13+ messages in thread
From: Tom Roeder @ 2020-11-30 18:55 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Jens Axboe, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel

On Fri, Nov 20, 2020 at 06:29:54AM -0800, Keith Busch wrote:
>On Fri, Nov 20, 2020 at 09:02:43AM +0100, Christoph Hellwig wrote:
>> On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
>> > This patch changes the NVMe PCI implementation to cache host_mem_descs
>> > in non-DMA memory instead of depending on descriptors stored in DMA
>> > memory. This change is needed under the malicious-hypervisor threat
>> > model assumed by the AMD SEV and Intel TDX architectures, which encrypt
>> > guest memory to make it unreadable. Some versions of these architectures
>> > also make it cryptographically hard to modify guest memory without
>> > detection.
>>
>> I don't think this is a useful threat model, and I've not seen a
>> discussion on lkml where we had any discussion on this kind of threat
>> model either.
>>
>> Before you start sending patches that regress optimizations in various
>> drivers (and there will be lots with this model) we need to have a
>> broader discussion first.
>>
>> And HMB support, which is for low-end consumer devices that are usually
>> not directly assigned to VMs aren't a good starting point for this.
>
>Yeah, while doing this for HMB isn't really a performance concern, this
>method for chaining SGL/PRP lists would be.

I see that this answers a question I just asked in my reply to the 
previous message. Sorry about that. Can you please point me to the code 
in question?

>
>And perhaps more importantly, the proposed mitigation only lets the
>guest silently carry on from such an attack while the device is surely
>corrupting something. I think we'd rather free the wrong address since
>that may at least eventually raise an error.

From a security perspective, I'd rather not free the wrong address, 
since that could lead to an attack on the guest (use-after-free). But I 
agree with the concern about fixing the problem silently. Maybe this 
code should instead raise an error itself in this case after comparing 
the cached values with the values stored in the DMA memory?
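
For concreteness, a minimal, untested sketch of that comparison as it might
look in the free loop of nvme_free_host_mem(), building on the
host_mem_cached_descs cache added by this patch. The mismatch check and the
warning message are illustrative only, and the policy shown (warn, then free
with the cached values anyway) is just one possible option:

	for (i = 0; i < dev->nr_host_mem_descs; i++) {
		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
		struct nvme_host_mem_buf_cached_desc *cached =
			&dev->host_mem_cached_descs[i];
		size_t size = cached->size * NVME_CTRL_PAGE_SIZE;

		/*
		 * Illustrative check: the DMA-visible descriptor should still
		 * match the trusted cached copy; if it doesn't, report the
		 * tampering instead of failing silently, then free using the
		 * cached values as before.
		 */
		if (le64_to_cpu(desc->addr) != cached->addr ||
		    le32_to_cpu(desc->size) != cached->size)
			dev_warn(dev->dev,
				 "host memory descriptor %d was modified\n", i);

		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
			       cached->addr,
			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
	}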

* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-30 18:50     ` Tom Roeder
@ 2020-12-02 16:31       ` Tom Lendacky
  -1 siblings, 0 replies; 13+ messages in thread
From: Tom Lendacky @ 2020-12-02 16:31 UTC (permalink / raw)
  To: Tom Roeder, Christoph Hellwig
  Cc: Keith Busch, Jens Axboe, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel, David.Kaplan

On 11/30/20 12:50 PM, Tom Roeder wrote:
> On Fri, Nov 20, 2020 at 09:02:43AM +0100, Christoph Hellwig wrote:
>> On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
>>> This patch changes the NVMe PCI implementation to cache host_mem_descs
>>> in non-DMA memory instead of depending on descriptors stored in DMA
>>> memory. This change is needed under the malicious-hypervisor threat
>>> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
>>> guest memory to make it unreadable. Some versions of these architectures
>>> also make it cryptographically hard to modify guest memory without
>>> detection.
>>
>> I don't think this is a useful threat model, and I've not seen a
>> discussion on lkml where we had any discussion on this kind of threat
>> model either.
> 
> Thanks for the feedback and apologies for the lack of context.
> 
> I was under the impression that support for AMD SEV SNP will start showing 
> up in KVM soon, and my understanding of SNP is that it implies this threat 
> model for the guest. See the patchset for SEV-ES, which is the generation 
> before SNP: https://lkml.org/lkml/2020/9/14/1168. This doesn't get quite
> to the SNP threat model, but it starts to assume more maliciousness on
> the part of the hypervisor.
> 
> You can also see the talk from David Kaplan of AMD from the 2019 Linux 
> Security Summit for info about SNP: 
> https://www.youtube.com/watch?v=yr56SaJ_0QI.
> 
> 
>>
>> Before you start sending patches that regress optimizations in various
>> drivers (and there will be lots with this model) we need to have a
>> broader discussion first.
> 
> I've added Tom Lendacky and David Kaplan from AMD on the thread now, since 
> I don't think I have enough context to say where this discussion should 
> take place or the degree to which they think it has or hasn't.
> 
> Tom, David: can you please comment on this?

Any discussion should certainly take place in the open on the mailing
lists.

Further information on SEV-SNP can be found on the SEV developer web page
at https://developer.amd.com/sev.

There is a white paper specific to SNP:
  https://www.amd.com/system/files/TechDocs/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf

Also, volume 2 of the AMD APM provides further information on the various
SEV features (sections 15.34 to 15.36):
  https://www.amd.com/system/files/TechDocs/24593.pdf

It is a good idea to go through the various drivers and promote changes
to provide protection from a malicious hypervisor, but, as Christoph
states, it needs to be discussed in order to determine the best approach.

Thanks,
Tom

> 
>>
>> And HMB support, which is for low-end consumer devices that are usually
>> not directly assigned to VMs aren't a good starting point for this.
> 
> I'm glad to hear that this case doesn't apply directly to cases we would 
> care about for assignment to guests. I'm not very familiar with this 
> codebase, unfortunately. Do the same kinds of issues apply for the kinds 
> of devices that would be assigned to guests?

* Re: [PATCH v2] nvme: Cache DMA descriptors to prevent corruption.
@ 2023-01-24 17:24 Julien Bachmann
  0 siblings, 0 replies; 13+ messages in thread
From: Julien Bachmann @ 2023-01-24 17:24 UTC (permalink / raw)
  To: thomas.lendacky
  Cc: David.Kaplan, axboe, hch, kbusch, linux-kernel, linux-nvme,
	Peter Gonda, Marios Pomonis, sagi, Tom Roeder

On 12/02/20 07:31 PM, Tom Lendacky wrote:
> On 11/30/20 12:50 PM, Tom Roeder wrote:
>> On Fri, Nov 20, 2020 at 09:02:43AM +0100, Christoph Hellwig wrote:
>>> On Thu, Nov 19, 2020 at 05:27:37PM -0800, Tom Roeder wrote:
>>>> This patch changes the NVMe PCI implementation to cache host_mem_descs
>>>> in non-DMA memory instead of depending on descriptors stored in DMA
>>>> memory. This change is needed under the malicious-hypervisor threat
>>>> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
>>>> guest memory to make it unreadable. Some versions of these architectures
>>>> also make it cryptographically hard to modify guest memory without
>>>> detection.
>>>
>>> I don't think this is a useful threat model, and I've not seen a
>>> discussion on lkml where we had any discussion on this kind of threat
>>> model either.
>>
>> Thanks for the feedback and apologies for the lack of context.
>>
>> I was under the impression that support for AMD SEV SNP will start showing
>> up in KVM soon, and my understanding of SNP is that it implies this threat
>> model for the guest. See the patchset for SEV-ES, which is the generation
>> before SNP: https://lkml.org/lkml/2020/9/14/1168. This doesn't get quite
>> to the SNP threat model, but it starts to assume more maliciousness on
>> the part of the hypervisor.
>>
>> You can also see the talk from David Kaplan of AMD from the 2019 Linux
>> Security Summit for info about SNP:
>> https://www.youtube.com/watch?v=yr56SaJ_0QI.
>>
>>
>>>
>>> Before you start sending patches that regress optimizations in various
>>> drivers (and there will be lots with this model) we need to have a
>>> broader discussion first.
>>
>> I've added Tom Lendacky and David Kaplan from AMD on the thread now, since
>> I don't think I have enough context to say where this discussion should
>> take place or the degree to which they think it has or hasn't.
>>
>> Tom, David: can you please comment on this?
>
> Any discussion should certainly take place in the open on the mailing
> lists.
>
> Further information on SEV-SNP can be found on the SEV developer web page
> at https://developer.amd.com/sev.
>
> There is a white paper specific to SNP:
>   https://www.amd.com/system/files/TechDocs/SEV-SNP-strengthening-vm-isolation-with-integrity-protection-and-more.pdf
>
> Also, volume 2 of the AMD APM provides further information on the various
> SEV features (sections 15.34 to 15.36):
>   https://www.amd.com/system/files/TechDocs/24593.pdf
>
> It is a good idea to go through the various drivers and promote changes
> to provide protection from a malicious hypervisor, but, as Christoph
> states, it needs to be discussed in order to determine the best approach.

Following up on this thread, since Confidential Computing (CC) has gained
popularity over the last two years. The host-to-guest threat model for CC
is now more widely researched and discussed (e.g. "Hardening Linux guest
kernel for CC" at the Linux Plumbers Conference 2022 [1]).

Has a more general discussion of this threat model happened on lkml since
then? Cloud providers, chip makers, and academic researchers have patched
multiple drivers for host-to-guest vulnerabilities uncovered by this
research [2].

>>> And HMB support, which is for low-end consumer devices that are usually
>>> not directly assigned to VMs aren't a good starting point for this.
>>
>> I'm glad to hear that this case doesn't apply directly to cases we would
>> care about for assignment to guests. I'm not very familiar with this
>> codebase, unfortunately. Do the same kinds of issues apply for the kinds
>> of devices that would be assigned to guests?

I’m also not familiar with this codebase, but would it be possible for
a malicious hypervisor to present a crafted vendor_id or device_id so
that this code is reached during the kernel’s PCI probing?

Would the patch now be acceptable given the development of CC, or do you
see updates that should be made?

Let me know what you think and what would be the preferred next steps.

Best regards

[1] https://lpc.events/event/16/contributions/1328/
[2] Examples of research and patches
- https://arxiv.org/pdf/2109.10660.pdf
- https://lore.kernel.org/linux-hyperv/20201117105437.xbyjrs4m7garb2lj@liuwe-devbox-debian-v2/T/#t
- https://github.com/torvalds/linux/commit/5218e919c8d06279884aa0baf76778a6817d5b93

