linux-nvme.lists.infradead.org archive mirror
* [PATCH v4] nvme: fix corruption for passthrough meta/data
       [not found] <CGME20231013052157epcas5p3dc0698c56f9846191d315fa8d33ccb5c@epcas5p3.samsung.com>
@ 2023-10-13  5:14 ` Kanchan Joshi
  2023-10-13  5:26   ` Christoph Hellwig
  2023-10-13  5:32   ` Kanchan Joshi
  0 siblings, 2 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13  5:14 UTC (permalink / raw)
  To: hch, kbusch, axboe, sagi
  Cc: linux-nvme, vincentfu, ankit.kumar, joshiiitr, Kanchan Joshi,
	stable, Vincent Fu

A user can specify a smaller metadata buffer than what the device is
wired to update/access. The kernel makes a copy of the metadata buffer,
into which the device does DMA.
As a result, the device overwrites unrelated kernel memory, causing
random kernel crashes.

The same issue is possible for the extended-LBA case: when the user
specifies a short unaligned buffer, the kernel makes a copy and uses
that for DMA.

Detect these situations and prevent corruption for unprivileged user
passthrough. No change to the status quo for the privileged/root user.
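
To make the failure mode concrete, here is a hypothetical user-space
sketch (not part of this patch). It assumes a namespace formatted with
4096-byte blocks plus 16 bytes of separate metadata per block; the
kernel then bounces an 8-byte user metadata buffer while the device
DMAs (NLB + 1) * ms = 8 * 16 = 128 bytes into it:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/nvme_ioctl.h>

static int issue_short_meta_read(int fd, uint32_t nsid)
{
	static uint8_t data[8 * 4096];
	uint8_t meta[8];			/* deliberately short */
	struct nvme_passthru_cmd cmd;

	memset(&cmd, 0, sizeof(cmd));
	cmd.opcode = 0x02;			/* nvme_cmd_read */
	cmd.nsid = nsid;
	cmd.addr = (uintptr_t)data;
	cmd.data_len = sizeof(data);
	cmd.metadata = (uintptr_t)meta;
	cmd.metadata_len = sizeof(meta);	/* far less than 128 bytes */
	cmd.cdw12 = 7;				/* NLB = 7, i.e. 8 blocks */
	return ioctl(fd, NVME_IOCTL_IO_CMD, &cmd);
}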

Fixes: 63263d60e0f9 ("nvme: Use metadata for passthrough commands")
Cc: stable@vger.kernel.org

Reported-by: Vincent Fu <vincent.fu@samsung.com>
Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
---
Changes since v3:
- Block only unprivileged user
- Harden the checks by disallowing everything for which data length
  (nlb) cannot be determined
- Separate the bounce buffer checks to a different function
- Factor in CSIs beyond NVM and ZNS

Changes since v2:
- Handle extended-lba case: short unaligned buffer IO
- Reduce the scope of check to only well-known commands
- Do not check early. Move it deeper so that the check gets executed
  less often
- Combine two patches into one.

Changes since v1:
- Revise the check to exclude PRACT=1 case

 drivers/nvme/host/ioctl.c | 116 ++++++++++++++++++++++++++++++++++++++
 1 file changed, 116 insertions(+)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..57160ca02e65 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -96,6 +96,76 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 	return (void __user *)ptrval;
 }
 
+static inline bool nvme_nlb_in_cdw12(struct nvme_ns *ns, u8 opcode)
+{
+	u8 csi = ns->head->ids.csi;
+
+	if (csi != NVME_CSI_NVM && csi != NVME_CSI_ZNS)
+		return false;
+
+	switch (opcode) {
+	case nvme_cmd_read:
+	case nvme_cmd_write:
+	case nvme_cmd_compare:
+	case nvme_cmd_zone_append:
+		return true;
+	default:
+		return false;
+	}
+}
+
+/*
+ * NVMe has no separate field to encode the metadata length expected
+ * (except when using SGLs).
+ *
+ * Because of that we can't allow transferring arbitrary metadata, as
+ * a metadata buffer that is shorter than what the device expects for
+ * the command will lead to arbitrary kernel (if bounce buffering) or
+ * userspace (if not) memory corruption.
+ *
+ * Check that external metadata is only specified for the few commands
+ * where we know the length based on other fields, and that it fits
+ * the actual data transfer from/to the device.
+ */
+static bool nvme_validate_metadata_len(struct request *req, unsigned meta_len)
+{
+	struct nvme_ns *ns = req->q->queuedata;
+	struct nvme_command *c = nvme_req(req)->cmd;
+	u32 len_by_nlb;
+
+	/* Do not guard admin */
+	if (capable(CAP_SYS_ADMIN))
+		return true;
+
+	/* Block commands that do not have nlb in cdw12 */
+	if (!nvme_nlb_in_cdw12(ns, c->common.opcode)) {
+		dev_err(ns->ctrl->device,
+			"unknown metadata command %#x\n", c->common.opcode);
+		return false;
+	}
+
+	/* Skip when PI is inserted or stripped and not transferred */
+	if (ns->ms == ns->pi_size &&
+	    (c->rw.control & cpu_to_le16(NVME_RW_PRINFO_PRACT)))
+		return true;
+
+	if (ns->features & NVME_NS_EXT_LBAS) {
+		dev_err(ns->ctrl->device,
+			"requires extended LBAs for metadata\n");
+		return false;
+	}
+
+	len_by_nlb = (le16_to_cpu(c->rw.length) + 1) * ns->ms;
+	if (meta_len < len_by_nlb) {
+		dev_err(ns->ctrl->device,
+			"metadata length (%u instead of %u) is too small.\n",
+			meta_len, len_by_nlb);
+		return false;
+	}
+
+	return true;
+}
+
 static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
 		unsigned len, u32 seed)
 {
@@ -104,6 +174,9 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
 	void *buf;
 	struct bio *bio = req->bio;
 
+	if (!nvme_validate_metadata_len(req, len))
+		return ERR_PTR(-EINVAL);
+
 	buf = kmalloc(len, GFP_KERNEL);
 	if (!buf)
 		goto out;
@@ -134,6 +207,41 @@ static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
 	return ERR_PTR(ret);
 }
 
+static bool nvme_validate_buffer_len(struct nvme_ns *ns, struct nvme_command *c,
+				     unsigned meta_len, unsigned data_len)
+{
+	u32 mlen_by_nlb, dlen_by_nlb;
+
+	/* Do not guard admin */
+	if (capable(CAP_SYS_ADMIN))
+		return true;
+
+	/* Block commands that do not have nlb in cdw12 */
+	if (!nvme_nlb_in_cdw12(ns, c->common.opcode)) {
+		dev_err(ns->ctrl->device,
+			"unknown metadata command %#x.\n", c->common.opcode);
+		return false;
+	}
+
+	/* When PI is inserted or stripped and not transferred. */
+	if (ns->ms == ns->pi_size &&
+	    (c->rw.control & cpu_to_le16(NVME_RW_PRINFO_PRACT)))
+		mlen_by_nlb = 0;
+	else
+		mlen_by_nlb = (le16_to_cpu(c->rw.length) + 1) * ns->ms;
+
+	dlen_by_nlb = (le16_to_cpu(c->rw.length) + 1) << ns->lba_shift;
+
+	if (data_len < (dlen_by_nlb + mlen_by_nlb)) {
+		dev_err(ns->ctrl->device,
+			"buffer length (%u instead of %u) is too small.\n",
+			data_len, dlen_by_nlb + mlen_by_nlb);
+		return false;
+	}
+
+	return true;
+}
+
 static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
 		void *meta, unsigned len, int ret)
 {
@@ -202,6 +310,14 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		}
 		*metap = meta;
 	}
+	/* Guard for a short bounce buffer */
+	if (bio->bi_private) {
+		if (!nvme_validate_buffer_len(ns, nvme_req(req)->cmd,
+					      meta_len, bufflen)) {
+			ret = -EINVAL;
+			goto out_unmap;
+		}
+	}
 
 	return ret;
 
-- 
2.25.1




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13  5:14 ` [PATCH v4] nvme: fix corruption for passthrough meta/data Kanchan Joshi
@ 2023-10-13  5:26   ` Christoph Hellwig
  2023-10-13  5:37     ` Kanchan Joshi
  2023-10-13 10:14     ` Kanchan Joshi
  2023-10-13  5:32   ` Kanchan Joshi
  1 sibling, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2023-10-13  5:26 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: hch, kbusch, axboe, sagi, linux-nvme, vincentfu, ankit.kumar,
	joshiiitr, stable, Vincent Fu

On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
> Changes since v3:
> - Block only unprivileged user

That's not really what at least I had in mind.  I'd much rather
completely disable unprivileged passthrough for now as an easy
backportable patch.  And then only re-enable it later in a way
where it does require using SGLs for all data transfers.




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13  5:14 ` [PATCH v4] nvme: fix corruption for passthrough meta/data Kanchan Joshi
  2023-10-13  5:26   ` Christoph Hellwig
@ 2023-10-13  5:32   ` Kanchan Joshi
  1 sibling, 0 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13  5:32 UTC (permalink / raw)
  To: hch, kbusch, axboe, sagi
  Cc: linux-nvme, vincentfu, ankit.kumar, joshiiitr, stable, Vincent Fu

On 10/13/2023 10:44 AM, Kanchan Joshi wrote:
> A user can specify a smaller metadata buffer than what the device is
> wired to update/access. The kernel makes a copy of the metadata buffer,
> into which the device does DMA.
> As a result, the device overwrites unrelated kernel memory, causing
> random kernel crashes.
> 
> The same issue is possible for the extended-LBA case: when the user
> specifies a short unaligned buffer, the kernel makes a copy and uses
> that for DMA.
> 
> Detect these situations and prevent corruption for unprivileged user
> passthrough. No change to the status quo for the privileged/root user.
> 
> Fixes: 63263d60e0f9 ("nvme: Use metadata for passthrough commands")

Since the change is only for the unprivileged user, I should have
pointed this 'Fixes:' tag at this patch instead:

5b7717f44b1 (nvme: fine-granular CAP_SYS_ADMIN for nvme io commands)



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13  5:26   ` Christoph Hellwig
@ 2023-10-13  5:37     ` Kanchan Joshi
  2023-10-13 10:14     ` Kanchan Joshi
  1 sibling, 0 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13  5:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, linux-nvme, vincentfu, ankit.kumar,
	joshiiitr, stable, Vincent Fu

On 10/13/2023 10:56 AM, Christoph Hellwig wrote:
> On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
>> Changes since v3:
>> - Block only unprivileged user
> 
> That's not really what at least I had in mind.  I'd much rather
> completely disable unprivileged passthrough for now as an easy
> backportable patch.  

As you deem fit, but I think even this will be backported up to the
patch that introduced unprivileged passthrough.




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13  5:26   ` Christoph Hellwig
  2023-10-13  5:37     ` Kanchan Joshi
@ 2023-10-13 10:14     ` Kanchan Joshi
  2023-10-13 12:59       ` Kanchan Joshi
  2023-10-13 13:54       ` Keith Busch
  1 sibling, 2 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13 10:14 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, linux-nvme, vincentfu, ankit.kumar,
	joshiiitr, stable, Vincent Fu

On 10/13/2023 10:56 AM, Christoph Hellwig wrote:
> On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
>> Changes since v3:
>> - Block only unprivileged user
> 
> That's not really what at least I had in mind.  I'd much rather
> completely disable unprivileged passthrough for now as an easy
> backportable patch.  And then only re-enable it later in a way
> where it does require using SGLs for all data transfers.
> 

I did not get how forcing SGLs can solve the issue at hand.
The problem happened because (i) user specified short buffer/len, and 
(ii) kernel allocated buffer. Whether the buffer is fed to device using 
PRP or SGL does not seem to solve the large DMA problem.




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 10:14     ` Kanchan Joshi
@ 2023-10-13 12:59       ` Kanchan Joshi
  2023-10-13 13:54       ` Keith Busch
  1 sibling, 0 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13 12:59 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: kbusch, axboe, sagi, linux-nvme, vincentfu, ankit.kumar,
	joshiiitr, stable, Vincent Fu

On 10/13/2023 3:44 PM, Kanchan Joshi wrote:
> On 10/13/2023 10:56 AM, Christoph Hellwig wrote:
>> On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
>>> Changes since v3:
>>> - Block only unprivileged user
>>
>> That's not really what at least I had in mind.  I'd much rather
>> completely disable unprivileged passthrough for now as an easy
>> backportable patch.  And then only re-enable it later in a way
>> where it does require using SGLs for all data transfers.
>>
> 
> I did not get how forcing SGLs can solve the issue at hand.
> The problem happened because (i) user specified short buffer/len, and
> (ii) kernel allocated buffer. Whether the buffer is fed to device using
> PRP or SGL does not seem to solve the large DMA problem.
> 

FWIW, this is the test patch I wrote to force passthrough to use SGL.

---
  drivers/nvme/host/ioctl.c | 2 ++
  drivers/nvme/host/nvme.h  | 1 +
  drivers/nvme/host/pci.c   | 8 +++++---
  3 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f2..508a813b349e 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -202,6 +202,8 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
                 }
                 *metap = meta;
         }
+       /* force sgl for data transfer */
+       nvme_req(req)->flags |= NVME_REQ_FORCE_SGL;

         return ret;

diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index f35647c470af..9fe91d25cfdd 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -184,6 +184,7 @@ enum {
         NVME_REQ_CANCELLED              = (1 << 0),
         NVME_REQ_USERCMD                = (1 << 1),
         NVME_MPATH_IO_STATS             = (1 << 2),
+       NVME_REQ_FORCE_SGL              = (1 << 3),
  };

  static inline struct nvme_request *nvme_req(struct request *req)
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index 60a08dfe8d75..e28d3b7b14ef 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -772,18 +772,20 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req,
         struct nvme_iod *iod = blk_mq_rq_to_pdu(req);
         blk_status_t ret = BLK_STS_RESOURCE;
         int rc;
+       bool force_sgl = nvme_req(req)->flags & NVME_REQ_FORCE_SGL;

         if (blk_rq_nr_phys_segments(req) == 1) {
                 struct nvme_queue *nvmeq = req->mq_hctx->driver_data;
                 struct bio_vec bv = req_bvec(req);

                 if (!is_pci_p2pdma_page(bv.bv_page)) {
-                       if (bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
+                       if (!force_sgl &&
+                           bv.bv_offset + bv.bv_len <= NVME_CTRL_PAGE_SIZE * 2)
                                 return nvme_setup_prp_simple(dev, req,
                                                              &cmnd->rw, &bv);

-                       if (nvmeq->qid && sgl_threshold &&
-                           nvme_ctrl_sgl_supported(&dev->ctrl))
+                       if (nvmeq->qid && nvme_ctrl_sgl_supported(&dev->ctrl)
+                           && (sgl_threshold || force_sgl))
                                 return nvme_setup_sgl_simple(dev, req,
                                                              &cmnd->rw, &bv);
                 }




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 10:14     ` Kanchan Joshi
  2023-10-13 12:59       ` Kanchan Joshi
@ 2023-10-13 13:54       ` Keith Busch
  2023-10-13 15:11         ` Kanchan Joshi
  1 sibling, 1 reply; 17+ messages in thread
From: Keith Busch @ 2023-10-13 13:54 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, axboe, sagi, linux-nvme, vincentfu,
	ankit.kumar, joshiiitr, stable, Vincent Fu

On Fri, Oct 13, 2023 at 03:44:38PM +0530, Kanchan Joshi wrote:
> On 10/13/2023 10:56 AM, Christoph Hellwig wrote:
> > On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
> >> Changes since v3:
> >> - Block only unprivileged user
> > 
> > That's not really what at least I had in mind.  I'd much rather
> > completely disable unprivileged passthrough for now as an easy
> > backportable patch.  And then only re-enable it later in a way
> > where it does require using SGLs for all data transfers.
> > 
> 
> I did not get how forcing SGLs can solve the issue at hand.
> The problem happened because (i) user specified short buffer/len, and 
> (ii) kernel allocated buffer. Whether the buffer is fed to device using 
> PRP or SGL does not seem to solve the large DMA problem.

The problem is a disconnect between the buffer size provided and the
implied size of the command. The idea with SGL is that it requires an
explicit buffer size, so the device will know the buffer is short and
return an appropriate error.
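
For reference, the data-block SGL descriptor (paraphrased here from
include/linux/nvme.h) shows where that explicit size lives; a PRP
entry, by contrast, is a bare 64-bit page pointer and the transfer
size is only implied by the command's NLB field:

struct nvme_sgl_desc {
	__le64	addr;
	__le32	length;		/* explicit byte count the device checks */
	__u8	rsvd[3];
	__u8	type;		/* descriptor type and subtype */
};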



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 13:54       ` Keith Busch
@ 2023-10-13 15:11         ` Kanchan Joshi
  2023-10-13 15:47           ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13 15:11 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, axboe, sagi, linux-nvme, vincentfu,
	ankit.kumar, joshiiitr, stable, Vincent Fu

On 10/13/2023 7:24 PM, Keith Busch wrote:
> On Fri, Oct 13, 2023 at 03:44:38PM +0530, Kanchan Joshi wrote:
>> On 10/13/2023 10:56 AM, Christoph Hellwig wrote:
>>> On Fri, Oct 13, 2023 at 10:44:58AM +0530, Kanchan Joshi wrote:
>>>> Changes since v3:
>>>> - Block only unprivileged user
>>>
>>> That's not really what at least I had in mind.  I'd much rather
>>> completely disable unprivileged passthrough for now as an easy
>>> backportable patch.  And then only re-enable it later in a way
>>> where it does require using SGLs for all data transfers.
>>>
>>
>> I did not get how forcing SGLs can solve the issue at hand.
>> The problem happened because (i) user specified short buffer/len, and
>> (ii) kernel allocated buffer. Whether the buffer is fed to device using
>> PRP or SGL does not seem to solve the large DMA problem.
> 
> The problem is a disconnect between the buffer size provided and the
> implied size of the command. The idea with SGL is that it requires an
> explicit buffer size, so the device will know the buffer is short and
> return an appropriate error.

Thanks for clearing this up.
It seems we will have two limitations with this approach: (i) SGL
support is needed for the external metadata buffer, and (ii) using SGLs
for the data transfer will reduce passthrough I/O speed, perhaps by more
than the checks would. And if we make SGLs opt-in, that means leaving
the hole open for the case when they are not chosen.



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 15:11         ` Kanchan Joshi
@ 2023-10-13 15:47           ` Christoph Hellwig
  2023-10-13 18:35             ` Kanchan Joshi
  2023-10-15 19:19             ` Kanchan Joshi
  0 siblings, 2 replies; 17+ messages in thread
From: Christoph Hellwig @ 2023-10-13 15:47 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Keith Busch, Christoph Hellwig, axboe, sagi, linux-nvme,
	vincentfu, ankit.kumar, joshiiitr, stable, Vincent Fu

On Fri, Oct 13, 2023 at 08:41:54PM +0530, Kanchan Joshi wrote:
> It seems we will have two limitations with this approach: (i) SGL
> support is needed for the external metadata buffer, and (ii) using SGLs
> for the data transfer will reduce passthrough I/O speed, perhaps by more
> than the checks would. And if we make SGLs opt-in, that means leaving
> the hole open for the case when they are not chosen.

The main limitation is that the device needs to support SGLs, and
we need to as well (we currently don't for metadata).  But for any
non-stupid workload SGLs should be at least as fast if not faster
with modern hardware.  But I see no way out.

Now can we please get a patch to disable the unprivileged passthrough
ASAP to fix this probably exploitable hole?  Or should I write one?



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 15:47           ` Christoph Hellwig
@ 2023-10-13 18:35             ` Kanchan Joshi
  2023-10-16 18:29               ` Keith Busch
  2023-10-15 19:19             ` Kanchan Joshi
  1 sibling, 1 reply; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-13 18:35 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, Keith Busch, axboe, sagi, linux-nvme, vincentfu,
	ankit.kumar, stable, Vincent Fu

On Fri, Oct 13, 2023 at 9:17 PM Christoph Hellwig <hch@lst.de> wrote:
>
> On Fri, Oct 13, 2023 at 08:41:54PM +0530, Kanchan Joshi wrote:
> > It seems we will have two limitations with this approach - (i) sgl for
> > the external metadata buffer, and (ii) using sgl for data-transfer will
> > reduce the speed of passthrough io, perhaps more than what can happen
> > using the checks. And if we make the sgl opt-in, that means leaving the
> > hole for the case when this was not chosen.
>
> The main limitation is that the device needs to support SGLs, and

Indeed. Particularly on non-enterprise drives, SGL is a luxury.

> we need to as well (we currently don't for metadata).  But for any
> non-stupid workload SGLs should be at least as fast if not faster
> with modern hardware.

But nvme-pci selects PRP for small I/O.

> But I see no way out.
> Now can we please get a patch to disable the unprivileged passthrough
> ASAP to fix this probably exploitable hole?  Or should I write one?

I can write. I was waiting to see whether Keith has any different
opinion on the route that v4 takes.
It seems this is a no go from him.

Disabling is possible with a simple patch that just returns false from
nvme_cmd_allowed() if CAP_SYS_ADMIN is not present.
I assume that is not what is sought, but rather a deep revert that
removes everything, such as carrying the file mode down to various
functions. Hope tomorrow is OK for that.
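
For illustration, the simple variant might look like the sketch below.
The signature of nvme_cmd_allowed() is assumed here and may not match
the actual code in drivers/nvme/host/ioctl.c:

/* Minimal sketch, not a posted patch: reject all passthrough from
 * tasks without CAP_SYS_ADMIN until the short-buffer hole is closed
 * properly (e.g. via SGLs). */
static bool nvme_cmd_allowed(struct nvme_ns *ns, struct nvme_command *c,
			     fmode_t mode)
{
	return capable(CAP_SYS_ADMIN);
}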



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 15:47           ` Christoph Hellwig
  2023-10-13 18:35             ` Kanchan Joshi
@ 2023-10-15 19:19             ` Kanchan Joshi
  2023-10-16  5:46               ` Christoph Hellwig
  1 sibling, 1 reply; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-15 19:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, Keith Busch, axboe, sagi, linux-nvme, vincentfu,
	ankit.kumar, stable, Vincent Fu

On Fri, Oct 13, 2023 at 9:17 PM Christoph Hellwig <hch@lst.de> wrote:
> The main limitation is that the device needs to support SGLs, and
> we need to as well (we currently don't for metadata).  But for any
> non-stupid workload SGLs should be at least as fast if not faster
> with modern hardware.  But I see no way out.

You may agree that it's a hardware-assisted way out. It is offloading
the checks to an SGL-capable device.
I wrote some quick code in that direction but could not readily get my
hands on a device that exposes the metadata-with-SGL capability.
That reminded me that we are limiting unprivileged passthrough to a
niche set of devices/users. That is the opposite of what the feature
was for.

OTOH, this patch implemented a software-only way out. There are some
checks, but someone (either SW or HW) has to do those to keep things
right.
The patch ensures the regular user cannot exploit the hole and that
the root user continues to work as before (Keith's concern).
So, I really wonder why we don't want to go with the approach that
solves it generically.



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-15 19:19             ` Kanchan Joshi
@ 2023-10-16  5:46               ` Christoph Hellwig
  2023-10-26 14:33                 ` Kanchan Joshi
  0 siblings, 1 reply; 17+ messages in thread
From: Christoph Hellwig @ 2023-10-16  5:46 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, Kanchan Joshi, Keith Busch, axboe, sagi,
	linux-nvme, vincentfu, ankit.kumar, stable, Vincent Fu

On Mon, Oct 16, 2023 at 12:49:45AM +0530, Kanchan Joshi wrote:
> OTOH, this patch implemented a software-only way out. There are some
> checks, but someone (either SW or HW) has to do those to keep things
> right.

It only verifies the read/write family of commands by
interpreting these commands.  It still leaves a wide hole for any
other command.



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-13 18:35             ` Kanchan Joshi
@ 2023-10-16 18:29               ` Keith Busch
  2023-10-16 18:34                 ` Christoph Hellwig
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2023-10-16 18:29 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, Kanchan Joshi, axboe, sagi, linux-nvme,
	vincentfu, ankit.kumar, stable, Vincent Fu

On Sat, Oct 14, 2023 at 12:05:17AM +0530, Kanchan Joshi wrote:
> On Fri, Oct 13, 2023 at 9:17 PM Christoph Hellwig <hch@lst.de> wrote:
> > But I see no way out.
> > Now can we please get a patch to disable the unprivileged passthrough
> > ASAP to fix this probably exploitable hole?  Or should I write one?
> 
> I can write. I was waiting to see whether Keith has any different
> opinion on the route that v4 takes.

Sorry for the delay, I had some trouble with my test machine last week.

It sounds like the kernel memory is the only reason for the concern, and
you don't really care if we're corrupting user memory. If so, let's just
use that instead of kernel bounce buffers. (Minor digression, the
current bounce 'buf' is leaking kernel memory on reads since it doesn't
zero it).

Anyway, here's a diff that maps userspace buffer for "integrity"
payloads and tests okay on my side. It enforces that you can't cross
virtual page boundaries, so you can't corrupt anything outside your vma.
The allocation error handling isn't here, but it's just a proof of
concept right now.
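
As an aside on that digression, the leak fix would be a one-liner in
nvme_add_user_metadata() -- a sketch, not an actual posted patch --
so a device that writes fewer than 'len' bytes cannot hand stale
kernel memory back to userspace on the read completion path:

-	buf = kmalloc(len, GFP_KERNEL);
+	buf = kzalloc(len, GFP_KERNEL);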

---
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b9..f591ba0771d87 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -91,6 +91,19 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
 }
 EXPORT_SYMBOL(bio_integrity_alloc);
 
+void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+	struct bvec_iter iter;
+	struct bio_vec bv;
+	bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+	bip_for_each_vec(bv, bip, iter) {
+		if (dirty && !PageCompound(bv.bv_page))
+			set_page_dirty_lock(bv.bv_page);
+		unpin_user_page(bv.bv_page);
+	}
+}
+
 /**
  * bio_integrity_free - Free bio integrity payload
  * @bio:	bio containing bip to be freed
@@ -105,6 +118,8 @@ void bio_integrity_free(struct bio *bio)
 
 	if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
 		kfree(bvec_virt(bip->bip_vec));
+	if (bip->bip_flags & BIP_INTEGRITY_USER)
+		bio_integrity_unmap_user(bip);
 
 	__bio_integrity_free(bs, bip);
 	bio->bi_integrity = NULL;
@@ -160,6 +175,47 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
 }
 EXPORT_SYMBOL(bio_integrity_add_page);
 
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len,
+			   u32 seed, u32 maxvecs)
+{
+	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+	unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
+	struct page *stack_pages[UIO_FASTIOV];
+	size_t offset = offset_in_page(ubuf);
+	struct page **pages = stack_pages;
+	struct bio_integrity_payload *bip;
+	unsigned long ptr = (uintptr_t)ubuf;
+	int npages, ret, p;
+	u32 bytes;
+
+	if (ptr & align)
+		return -EINVAL;
+
+	bip = bio_integrity_alloc(bio, GFP_KERNEL, maxvecs);
+	if (IS_ERR(bip))
+		return PTR_ERR(bip);
+
+	ret = pin_user_pages_fast(ptr, UIO_FASTIOV, FOLL_WRITE, pages);
+	if (unlikely(ret < 0))
+		return ret;
+
+	npages = ret;
+	bytes = min_t(u32, len, PAGE_SIZE - offset);
+	for (p = 0; p < npages; p++) {
+		ret = bio_integrity_add_page(bio, pages[p], bytes, offset);
+		if (ret != bytes)
+			return -EINVAL;
+		len -= ret;
+		offset = 0;
+		bytes = min_t(u32, len, PAGE_SIZE);
+	}
+
+	bip->bip_iter.bi_sector = seed;
+	bip->bip_flags |= BIP_INTEGRITY_USER;
+	return 0;
+}
+EXPORT_SYMBOL(bio_integrity_map_user);
+
 /**
  * bio_integrity_process - Process integrity metadata for a bio
  * @bio:	bio to generate/verify integrity metadata for
diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
index d8ff796fd5f21..06d6bba8ed637 100644
--- a/drivers/nvme/host/ioctl.c
+++ b/drivers/nvme/host/ioctl.c
@@ -96,52 +96,18 @@ static void __user *nvme_to_user_ptr(uintptr_t ptrval)
 	return (void __user *)ptrval;
 }
 
-static void *nvme_add_user_metadata(struct request *req, void __user *ubuf,
+static int nvme_add_user_metadata(struct request *req, void __user *ubuf,
 		unsigned len, u32 seed)
 {
-	struct bio_integrity_payload *bip;
-	int ret = -ENOMEM;
-	void *buf;
 	struct bio *bio = req->bio;
+	int ret;
 
-	buf = kmalloc(len, GFP_KERNEL);
-	if (!buf)
-		goto out;
-
-	ret = -EFAULT;
-	if ((req_op(req) == REQ_OP_DRV_OUT) && copy_from_user(buf, ubuf, len))
-		goto out_free_meta;
-
-	bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
-	if (IS_ERR(bip)) {
-		ret = PTR_ERR(bip);
-		goto out_free_meta;
-	}
-
-	bip->bip_iter.bi_sector = seed;
-	ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
-			offset_in_page(buf));
-	if (ret != len) {
-		ret = -ENOMEM;
-		goto out_free_meta;
-	}
+	ret = bio_integrity_map_user(bio, ubuf, len, seed, 1);
+	if (ret)
+		return ret;
 
 	req->cmd_flags |= REQ_INTEGRITY;
-	return buf;
-out_free_meta:
-	kfree(buf);
-out:
-	return ERR_PTR(ret);
-}
-
-static int nvme_finish_user_metadata(struct request *req, void __user *ubuf,
-		void *meta, unsigned len, int ret)
-{
-	if (!ret && req_op(req) == REQ_OP_DRV_IN &&
-	    copy_to_user(ubuf, meta, len))
-		ret = -EFAULT;
-	kfree(meta);
-	return ret;
+	return 0;
 }
 
 static struct request *nvme_alloc_user_request(struct request_queue *q,
@@ -160,14 +126,12 @@ static struct request *nvme_alloc_user_request(struct request_queue *q,
 
 static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		unsigned bufflen, void __user *meta_buffer, unsigned meta_len,
-		u32 meta_seed, void **metap, struct io_uring_cmd *ioucmd,
-		unsigned int flags)
+		u32 meta_seed, struct io_uring_cmd *ioucmd, unsigned int flags)
 {
 	struct request_queue *q = req->q;
 	struct nvme_ns *ns = q->queuedata;
 	struct block_device *bdev = ns ? ns->disk->part0 : NULL;
 	struct bio *bio = NULL;
-	void *meta = NULL;
 	int ret;
 
 	if (ioucmd && (ioucmd->flags & IORING_URING_CMD_FIXED)) {
@@ -194,13 +158,10 @@ static int nvme_map_user_request(struct request *req, u64 ubuffer,
 		bio_set_dev(bio, bdev);
 
 	if (bdev && meta_buffer && meta_len) {
-		meta = nvme_add_user_metadata(req, meta_buffer, meta_len,
+		ret = nvme_add_user_metadata(req, meta_buffer, meta_len,
 				meta_seed);
-		if (IS_ERR(meta)) {
-			ret = PTR_ERR(meta);
+		if (ret)
 			goto out_unmap;
-		}
-		*metap = meta;
 	}
 
 	return ret;
@@ -221,7 +182,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	struct nvme_ns *ns = q->queuedata;
 	struct nvme_ctrl *ctrl;
 	struct request *req;
-	void *meta = NULL;
 	struct bio *bio;
 	u32 effects;
 	int ret;
@@ -233,7 +193,7 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	req->timeout = timeout;
 	if (ubuffer && bufflen) {
 		ret = nvme_map_user_request(req, ubuffer, bufflen, meta_buffer,
-				meta_len, meta_seed, &meta, NULL, flags);
+				meta_len, meta_seed, NULL, flags);
 		if (ret)
 			return ret;
 	}
@@ -245,9 +205,6 @@ static int nvme_submit_user_cmd(struct request_queue *q,
 	ret = nvme_execute_rq(req, false);
 	if (result)
 		*result = le64_to_cpu(nvme_req(req)->result.u64);
-	if (meta)
-		ret = nvme_finish_user_metadata(req, meta_buffer, meta,
-						meta_len, ret);
 	if (bio)
 		blk_rq_unmap_user(bio);
 	blk_mq_free_request(req);
@@ -450,7 +407,6 @@ struct nvme_uring_cmd_pdu {
 	u32 nvme_status;
 	union {
 		struct {
-			void *meta; /* kernel-resident buffer */
 			void __user *meta_buffer;
 		};
 		u64 result;
@@ -478,9 +434,6 @@ static void nvme_uring_task_meta_cb(struct io_uring_cmd *ioucmd,
 
 	result = le64_to_cpu(nvme_req(req)->result.u64);
 
-	if (pdu->meta_len)
-		status = nvme_finish_user_metadata(req, pdu->u.meta_buffer,
-					pdu->u.meta, pdu->meta_len, status);
 	if (req->bio)
 		blk_rq_unmap_user(req->bio);
 	blk_mq_free_request(req);
@@ -560,7 +513,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	struct request *req;
 	blk_opf_t rq_flags = REQ_ALLOC_CACHE;
 	blk_mq_req_flags_t blk_flags = 0;
-	void *meta = NULL;
 	int ret;
 
 	c.common.opcode = READ_ONCE(cmd->opcode);
@@ -608,7 +560,7 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	if (d.addr && d.data_len) {
 		ret = nvme_map_user_request(req, d.addr,
 			d.data_len, nvme_to_user_ptr(d.metadata),
-			d.metadata_len, 0, &meta, ioucmd, vec);
+			d.metadata_len, 0, ioucmd, vec);
 		if (ret)
 			return ret;
 	}
@@ -623,7 +575,6 @@ static int nvme_uring_cmd_io(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
 	pdu->meta_len = d.metadata_len;
 	req->end_io_data = ioucmd;
 	if (pdu->meta_len) {
-		pdu->u.meta = meta;
 		pdu->u.meta_buffer = nvme_to_user_ptr(d.metadata);
 		req->end_io = nvme_uring_cmd_end_io_meta;
 	} else {
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 41d417ee13499..965382e62f137 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -324,6 +324,7 @@ enum bip_flags {
 	BIP_CTRL_NOCHECK	= 1 << 2, /* disable HBA integrity checking */
 	BIP_DISK_NOCHECK	= 1 << 3, /* disable disk integrity checking */
 	BIP_IP_CHECKSUM		= 1 << 4, /* IP checksum */
+	BIP_INTEGRITY_USER	= 1 << 5, /* Integrity payload is user address */
 };
 
 /*
@@ -720,6 +721,7 @@ static inline bool bioset_initialized(struct bio_set *bs)
 
 extern struct bio_integrity_payload *bio_integrity_alloc(struct bio *, gfp_t, unsigned int);
 extern int bio_integrity_add_page(struct bio *, struct page *, unsigned int, unsigned int);
+extern int bio_integrity_map_user(struct bio *bio, void __user *ubuf, unsigned int len, u32 seed, u32 maxvecs);
 extern bool bio_integrity_prep(struct bio *);
 extern void bio_integrity_advance(struct bio *, unsigned int);
 extern void bio_integrity_trim(struct bio *);
@@ -789,6 +791,12 @@ static inline int bio_integrity_add_page(struct bio *bio, struct page *page,
 	return 0;
 }
 
+static inline int bio_integrity_map_user(struct bio *bio, void __user *ubuf,
+					 unsigned int len, u32 seed, u32 maxvecs)
+{
+	return -EINVAL;
+}
+
 #endif /* CONFIG_BLK_DEV_INTEGRITY */
 
 /*
--



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-16 18:29               ` Keith Busch
@ 2023-10-16 18:34                 ` Christoph Hellwig
  0 siblings, 0 replies; 17+ messages in thread
From: Christoph Hellwig @ 2023-10-16 18:34 UTC (permalink / raw)
  To: Keith Busch
  Cc: Kanchan Joshi, Christoph Hellwig, Kanchan Joshi, axboe, sagi,
	linux-nvme, vincentfu, ankit.kumar, stable, Vincent Fu

On Mon, Oct 16, 2023 at 12:29:23PM -0600, Keith Busch wrote:
> It sounds like the kernel memory is the only reason for the concern, and
> you don't really care if we're corrupting user memory. If so, let's just
> use that instead of kernel bounce buffers. (Minor digression, the
> current bounce 'buf' is leaking kernel memory on reads since it doesn't
> zero it).

No, arbitrary memory overwrite is always an issue, userspace or kernel,
data or metadata buffer.

Note that even without block layer bounce buffering, there can always
be other kernel memory involved, e.g. swiotlb.

We need to get the fix to disable the unprivileged passthrough in ASAP.




* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-16  5:46               ` Christoph Hellwig
@ 2023-10-26 14:33                 ` Kanchan Joshi
  2023-10-26 15:08                   ` Keith Busch
  0 siblings, 1 reply; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-26 14:33 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Kanchan Joshi, Keith Busch, axboe, sagi, linux-nvme, vincentfu,
	ankit.kumar, stable, Vincent Fu

On Mon, Oct 16, 2023 at 11:16 AM Christoph Hellwig <hch@lst.de> wrote:
>
> On Mon, Oct 16, 2023 at 12:49:45AM +0530, Kanchan Joshi wrote:
> > OTOH, this patch implemented a software-only way out. There are some
> > checks, but someone (either SW or HW) has to do those to keep things
> > right.
>
> It only verifies the read/write family of commands by
> interpreting these commands.  It still leaves a wide hole for any
> other command.

Can you please explain for which commands you see the hole? I am
trying to see if it is really impossible to fix this hole for good.

We only need to check the specific I/O commands of the NVM/ZNS command
sets that can do extra DMA.
Along the same lines, disallowing KV does not seem necessary either.
For admin commands, we allow only very few, and for those this hole
does not exist.



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-26 14:33                 ` Kanchan Joshi
@ 2023-10-26 15:08                   ` Keith Busch
  2023-10-27  7:09                     ` Kanchan Joshi
  0 siblings, 1 reply; 17+ messages in thread
From: Keith Busch @ 2023-10-26 15:08 UTC (permalink / raw)
  To: Kanchan Joshi
  Cc: Christoph Hellwig, Kanchan Joshi, axboe, sagi, linux-nvme,
	vincentfu, ankit.kumar, stable, Vincent Fu

On Thu, Oct 26, 2023 at 08:03:30PM +0530, Kanchan Joshi wrote:
> On Mon, Oct 16, 2023 at 11:16 AM Christoph Hellwig <hch@lst.de> wrote:
> >
> > On Mon, Oct 16, 2023 at 12:49:45AM +0530, Kanchan Joshi wrote:
> > > OTOH, this patch implemented a software-only way out. There are some
> > > checks, but someone (either SW or HW) has to do those to keep things
> > > right.
> >
> > It only verifies the read/write family of commands by
> > interpreting these commands.  It still leaves a wide hole for any
> > other command.
> 
> Can you please explain for which commands you see the hole? I am
> trying to see if it is really impossible to fix this hole for good.
> 
> We only need to check the specific I/O commands of the NVM/ZNS command
> sets that can do extra DMA.

The spec defines a few commands that may use MPTR, but most of the
possible opcodes that could use it are not defined in the spec. You'd have
to break forward compatibility through this interface for non-root users
by limiting its use to only the known opcodes and rejecting everything
else.
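
To illustrate the limit of generic interpretation: per the NVMe base
specification, only the transfer direction is encoded uniformly in an
opcode (bits 1:0); whether and how MPTR and the length fields are used
is defined per opcode. A sketch of what a generic check can recover:

/* Bits 1:0 of an NVMe opcode encode the data transfer direction:
 * 00b none, 01b host-to-controller, 10b controller-to-host,
 * 11b bidirectional.  Nothing in the opcode says whether or how
 * MPTR is used, so unknown/future opcodes cannot be validated
 * generically. */
static inline bool nvme_opcode_transfers_data(u8 opcode)
{
	return opcode & 0x3;
}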



* Re: [PATCH v4] nvme: fix corruption for passthrough meta/data
  2023-10-26 15:08                   ` Keith Busch
@ 2023-10-27  7:09                     ` Kanchan Joshi
  0 siblings, 0 replies; 17+ messages in thread
From: Kanchan Joshi @ 2023-10-27  7:09 UTC (permalink / raw)
  To: Keith Busch
  Cc: Christoph Hellwig, Kanchan Joshi, axboe, sagi, linux-nvme,
	vincentfu, ankit.kumar, stable, Vincent Fu

On Thu, Oct 26, 2023 at 8:38 PM Keith Busch <kbusch@kernel.org> wrote:
>
> On Thu, Oct 26, 2023 at 08:03:30PM +0530, Kanchan Joshi wrote:
> > On Mon, Oct 16, 2023 at 11:16 AM Christoph Hellwig <hch@lst.de> wrote:
> > >
> > > On Mon, Oct 16, 2023 at 12:49:45AM +0530, Kanchan Joshi wrote:
> > > > OTOH, this patch implemented a software-only way out. There are some
> > > > checks, but someone (either SW or HW) has to do those to keep things
> > > > right.
> > >
> > > It only verifies the read/write family of commands by
> > > interpreting these commands.  It still leaves a wide hole for any
> > > other command.
> >
> > Can you please explain for which commands you see the hole? I am
> > trying to see if it is really impossible to fix this hole for good.
> >
> > We only need to check the specific I/O commands of the NVM/ZNS command
> > sets that can do extra DMA.
>
> The spec defines a few commands that may use MPTR, but most of the
> possible opcodes that could use it are not defined in spec. You'd have
> to break forward compatibility through this interface for non-root users
> by limiting its use to only the known opcodes and reject everything
> else.

I see. Future commands. Looks like every solution involves some form
of inconvenience for non-root users.


