* [PATCH] nvme: Cache DMA descriptors to prevent corruption.
@ 2020-11-19 18:59 ` Tom Roeder
  0 siblings, 0 replies; 6+ messages in thread
From: Tom Roeder @ 2020-11-19 18:59 UTC (permalink / raw)
  To: Keith Busch, Jens Axboe, Christoph Hellwig, Sagi Grimberg
  Cc: Peter Gonda, Marios Pomonis, linux-nvme, linux-kernel, Tom Roeder

This patch changes the NVMe PCI implementation to cache host_mem_descs
in non-DMA memory instead of depending on descriptors stored in DMA
memory. This change is needed under the malicious-hypervisor threat
model assumed by the AMD SEV and Intel TDX architectures, which encrypt
guest memory to make it unreadable. Some versions of these architectures
also make it cryptographically hard to modify guest memory without
detection.

On these architectures, Linux generally leaves DMA memory unencrypted so
that devices can still communicate directly with the kernel: DMA memory
remains readable to and modifiable by devices. This means that this
memory is also accessible to a hypervisor.

However, this means that a malicious hypervisor could modify the addr or
size fields of descriptors and cause the NVMe driver to call
dma_free_attrs on arbitrary addresses or on the right addresses but with
the wrong size. To prevent this attack, this commit changes the code to
cache those descriptors in non-DMA memory and to use the cached values
when freeing the memory they describe.

Tested: Built and ran with Google-internal NVMe tests.
Tested-by: Tom Roeder <tmroeder@google.com>
Signed-off-by: Tom Roeder <tmroeder@google.com>
---
 drivers/nvme/host/pci.c | 29 +++++++++++++++++++++++++----
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 3be352403839..28ebe1304cae 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -148,6 +148,10 @@ struct nvme_dev {
 	u32 nr_host_mem_descs;
 	dma_addr_t host_mem_descs_dma;
 	struct nvme_host_mem_buf_desc *host_mem_descs;
+	/* Cache the host_mem_descs in non-DMA memory so a malicious hypervisor
+	 * can't change them.
+	 */
+	struct nvme_host_mem_buf_desc *host_mem_descs_cache;
 	void **host_mem_desc_bufs;
 	unsigned int nr_allocated_queues;
 	unsigned int nr_write_queues;
@@ -1874,7 +1878,11 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 	int i;
 
 	for (i = 0; i < dev->nr_host_mem_descs; i++) {
-		struct nvme_host_mem_buf_desc *desc = &dev->host_mem_descs[i];
+		/* Use the cached version to free the DMA allocations, not a
+		 * version that could be controlled by a malicious hypervisor.
+		 */
+		struct nvme_host_mem_buf_desc *desc =
+			&dev->host_mem_descs_cache[i];
 		size_t size = le32_to_cpu(desc->size) * NVME_CTRL_PAGE_SIZE;
 
 		dma_free_attrs(dev->dev, size, dev->host_mem_desc_bufs[i],
@@ -1888,6 +1896,8 @@ static void nvme_free_host_mem(struct nvme_dev *dev)
 			dev->nr_host_mem_descs * sizeof(*dev->host_mem_descs),
 			dev->host_mem_descs, dev->host_mem_descs_dma);
 	dev->host_mem_descs = NULL;
+	kfree(dev->host_mem_descs_cache);
+	dev->host_mem_descs_cache = NULL;
 	dev->nr_host_mem_descs = 0;
 }
 
@@ -1895,6 +1905,7 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 		u32 chunk_size)
 {
 	struct nvme_host_mem_buf_desc *descs;
+	struct nvme_host_mem_buf_desc *descs_cache;
 	u32 max_entries, len;
 	dma_addr_t descs_dma;
 	int i = 0;
@@ -1913,9 +1924,13 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	if (!descs)
 		goto out;
 
+	descs_cache = kcalloc(max_entries, sizeof(*descs), GFP_KERNEL);
+	if (!descs_cache)
+		goto out_free_descs;
+
 	bufs = kcalloc(max_entries, sizeof(*bufs), GFP_KERNEL);
 	if (!bufs)
-		goto out_free_descs;
+		goto out_free_descs_cache;
 
 	for (size = 0; size < preferred && i < max_entries; size += len) {
 		dma_addr_t dma_addr;
@@ -1928,6 +1943,8 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 
 		descs[i].addr = cpu_to_le64(dma_addr);
 		descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
+		descs_cache[i].addr = cpu_to_le64(dma_addr);
+		descs_cache[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
 		i++;
 	}
 
@@ -1937,20 +1954,24 @@ static int __nvme_alloc_host_mem(struct nvme_dev *dev, u64 preferred,
 	dev->nr_host_mem_descs = i;
 	dev->host_mem_size = size;
 	dev->host_mem_descs = descs;
+	dev->host_mem_descs_cache = descs_cache;
 	dev->host_mem_descs_dma = descs_dma;
 	dev->host_mem_desc_bufs = bufs;
 	return 0;
 
 out_free_bufs:
 	while (--i >= 0) {
-		size_t size = le32_to_cpu(descs[i].size) * NVME_CTRL_PAGE_SIZE;
+		size_t size =
+			le32_to_cpu(descs_cache[i].size) * NVME_CTRL_PAGE_SIZE;
 
 		dma_free_attrs(dev->dev, size, bufs[i],
-			       le64_to_cpu(descs[i].addr),
+			       le64_to_cpu(descs_cache[i].addr),
 			       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);
 	}
 
 	kfree(bufs);
+out_free_descs_cache:
+	kfree(descs_cache);
 out_free_descs:
 	dma_free_coherent(dev->dev, max_entries * sizeof(*descs), descs,
 			descs_dma);
-- 
2.29.2.299.gdc1121823c-goog


* Re: [PATCH] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-19 18:59 ` Tom Roeder
@ 2020-11-19 21:09   ` Keith Busch
  -1 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2020-11-19 21:09 UTC (permalink / raw)
  To: Tom Roeder
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel

On Thu, Nov 19, 2020 at 10:59:19AM -0800, Tom Roeder wrote:
> This patch changes the NVMe PCI implementation to cache host_mem_descs
> in non-DMA memory instead of depending on descriptors stored in DMA
> memory. This change is needed under the malicious-hypervisor threat
> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
> guest memory to make it unreadable. Some versions of these architectures
> also make it cryptographically hard to modify guest memory without
> detection.
> 
> On these architectures, Linux generally leaves DMA memory unencrypted so
> that devices can still communicate directly with the kernel: DMA memory
> remains readable to and modifiable by devices. This means that this
> memory is also accessible to a hypervisor.
> 
> However, this means that a malicious hypervisor could modify the addr or
> size fields of descriptors and cause the NVMe driver to call
> dma_free_attrs on arbitrary addresses or on the right addresses but with
> the wrong size. To prevent this attack, this commit changes the code to
> cache those descriptors in non-DMA memory and to use the cached values
> when freeing the memory they describe.
 
If the hypervisor does that, then the device may use the wrong
addresses, too. I guess you can't do anything about that from the
driver, though.

> +	/* Cache the host_mem_descs in non-DMA memory so a malicious hypervisor
> +	 * can't change them.
> +	 */
> +	struct nvme_host_mem_buf_desc *host_mem_descs_cache;
>  	void **host_mem_desc_bufs;

This is never seen by an nvme device, so there's no need for an nvme-specific
type here. You can use arch-native types.
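
Something like the following would be enough (an untested sketch; the struct
name and field choices here are just placeholders), since it only needs to
mirror the values the kernel itself allocated:

	/*
	 * Kernel-private copy of each host memory buffer descriptor. The
	 * controller never reads this, so CPU-native fields are fine.
	 */
	struct nvme_host_mem_desc_cache {
		dma_addr_t	addr;
		size_t		size;	/* bytes */
	};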


* Re: [PATCH] nvme: Cache DMA descriptors to prevent corruption.
  2020-11-19 21:09   ` Keith Busch
@ 2020-11-20  1:09     ` Tom Roeder
  -1 siblings, 0 replies; 6+ messages in thread
From: Tom Roeder @ 2020-11-20  1:09 UTC (permalink / raw)
  To: Keith Busch
  Cc: Jens Axboe, Christoph Hellwig, Sagi Grimberg, Peter Gonda,
	Marios Pomonis, linux-nvme, linux-kernel

On Thu, Nov 19, 2020 at 01:09:14PM -0800, Keith Busch wrote:
>On Thu, Nov 19, 2020 at 10:59:19AM -0800, Tom Roeder wrote:
>> This patch changes the NVMe PCI implementation to cache host_mem_descs
>> in non-DMA memory instead of depending on descriptors stored in DMA
>> memory. This change is needed under the malicious-hypervisor threat
>> model assumed by the AMD SEV and Intel TDX architectures, which encrypt
>> guest memory to make it unreadable. Some versions of these architectures
>> also make it cryptographically hard to modify guest memory without
>> detection.
>>
>> On these architectures, Linux generally leaves DMA memory unencrypted so
>> that devices can still communicate directly with the kernel: DMA memory
>> remains readable to and modifiable by devices. This means that this
>> memory is also accessible to a hypervisor.
>>
>> However, this means that a malicious hypervisor could modify the addr or
>> size fields of descriptors and cause the NVMe driver to call
>> dma_free_attrs on arbitrary addresses or on the right addresses but with
>> the wrong size. To prevent this attack, this commit changes the code to
>> cache those descriptors in non-DMA memory and to use the cached values
>> when freeing the memory they describe.
>
>If the hypervisor does that, then the device may use the wrong
>addresses, too. I guess you can't do anything about that from the
>driver, though.
I agree; I don't think there's anything the driver can do about that.

>
>> +	/* Cache the host_mem_descs in non-DMA memory so a malicious hypervisor
>> +	 * can't change them.
>> +	 */
>> +	struct nvme_host_mem_buf_desc *host_mem_descs_cache;
>>  	void **host_mem_desc_bufs;
>
>This is never seen by an nvme device, so there's no need for an nvme-specific
>type here. You can use arch-native types.

Thanks! I'll change the type to a new struct that stores the addr and size
fields as native integers, and I'll send out a v2 of this patch that makes
that change and cleans up a couple of minor style issues in my code.
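
For concreteness, roughly what I have in mind (a sketch with placeholder
names, not the final v2): the cache entries become plain CPU-native
addr/size pairs, the allocation loop records the values the kernel itself
chose, and the free paths consult only the cached copy:

	descs[i].addr = cpu_to_le64(dma_addr);
	descs[i].size = cpu_to_le32(len / NVME_CTRL_PAGE_SIZE);
	/* keep a CPU-native copy of what we just chose */
	descs_cache[i].addr = dma_addr;	/* dma_addr_t */
	descs_cache[i].size = len;	/* bytes */

and in nvme_free_host_mem():

	dma_free_attrs(dev->dev, dev->host_mem_descs_cache[i].size,
		       dev->host_mem_desc_bufs[i],
		       dev->host_mem_descs_cache[i].addr,
		       DMA_ATTR_NO_KERNEL_MAPPING | DMA_ATTR_NO_WARN);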
