linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH v2] nvme: provide fallback for discard alloc failure
@ 2018-12-12 16:18 Jens Axboe
  2018-12-12 16:28 ` Keith Busch
                   ` (2 more replies)
  0 siblings, 3 replies; 6+ messages in thread
From: Jens Axboe @ 2018-12-12 16:18 UTC (permalink / raw)
  To: linux-block, linux-nvme; +Cc: Christoph Hellwig

When boxes are run near (or to) OOM, we have a problem with the discard
page allocation in nvme. If we fail allocating the special page, we
return busy, and it'll get retried. But since ordering is honored for
dispatch requests, we can keep retrying this same IO and failing. Behind
that IO could be requests that want to free memory, but they never get
the chance.

Allocate a fixed discard page per controller for a safe fallback, and use
that if the initial allocation fails.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

No functional change, just added a comment, and a safeguard BUILD_BUG_ON
to ensure that our max discard segments always fit within a single page.
Also corrected the s/sd/nvme typo in the commit message.

diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index c71e879821ad..c57cf40e7e04 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -564,9 +564,19 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	struct nvme_dsm_range *range;
 	struct bio *bio;
 
-	range = kmalloc_array(segments, sizeof(*range), GFP_ATOMIC);
-	if (!range)
-		return BLK_STS_RESOURCE;
+	range = kmalloc_array(segments, sizeof(*range),
+				GFP_ATOMIC | __GFP_NOWARN);
+	if (!range) {
+		/*
+		 * If we fail allocating our range, fall back to the controller
+		 * discard page. If that's also busy, it's safe to return
+		 * busy, as we know we can make progress once that's freed.
+		 */
+		if (test_and_set_bit_lock(0, &ns->ctrl->discard_page_busy))
+			return BLK_STS_RESOURCE;
+
+		range = page_address(ns->ctrl->discard_page);
+	}
 
 	__rq_for_each_bio(bio, req) {
 		u64 slba = nvme_block_nr(ns, bio->bi_iter.bi_sector);
@@ -581,7 +591,10 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
 	}
 
 	if (WARN_ON_ONCE(n != segments)) {
-		kfree(range);
+		if (virt_to_page(range) == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(range);
 		return BLK_STS_IOERR;
 	}
 
@@ -664,8 +677,13 @@ void nvme_cleanup_cmd(struct request *req)
 				blk_rq_bytes(req) >> ns->lba_shift);
 	}
 	if (req->rq_flags & RQF_SPECIAL_PAYLOAD) {
-		kfree(page_address(req->special_vec.bv_page) +
-		      req->special_vec.bv_offset);
+		struct nvme_ns *ns = req->rq_disk->private_data;
+		struct page *page = req->special_vec.bv_page;
+
+		if (page == ns->ctrl->discard_page)
+			clear_bit_unlock(0, &ns->ctrl->discard_page_busy);
+		else
+			kfree(page_address(page) + req->special_vec.bv_offset);
 	}
 }
 EXPORT_SYMBOL_GPL(nvme_cleanup_cmd);
@@ -3578,6 +3596,7 @@ static void nvme_free_ctrl(struct device *dev)
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 	kfree(ctrl->effects);
 	nvme_mpath_uninit(ctrl);
+	kfree(ctrl->discard_page);
 
 	if (subsys) {
 		mutex_lock(&subsys->lock);
@@ -3618,6 +3637,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 	memset(&ctrl->ka_cmd, 0, sizeof(ctrl->ka_cmd));
 	ctrl->ka_cmd.common.opcode = nvme_admin_keep_alive;
 
+	BUILD_BUG_ON(NVME_DSM_MAX_RANGES * sizeof(struct nvme_dsm_range) >
+			PAGE_SIZE);
+	ctrl->discard_page = alloc_page(GFP_KERNEL);
+	if (!ctrl->discard_page) {
+		ret = -ENOMEM;
+		goto out;
+	}
+
 	ret = ida_simple_get(&nvme_instance_ida, 0, 0, GFP_KERNEL);
 	if (ret < 0)
 		goto out;
@@ -3655,6 +3682,8 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
 out_release_instance:
 	ida_simple_remove(&nvme_instance_ida, ctrl->instance);
 out:
+	if (ctrl->discard_page)
+		__free_page(ctrl->discard_page);
 	return ret;
 }
 EXPORT_SYMBOL_GPL(nvme_init_ctrl);
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index e20e737ac10c..f1fe88598a04 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -241,6 +241,9 @@ struct nvme_ctrl {
 	u16 maxcmd;
 	int nr_reconnects;
 	struct nvmf_ctrl_options *opts;
+
+	struct page *discard_page;
+	unsigned long discard_page_busy;
 };
 
 struct nvme_subsystem {

-- 
Jens Axboe


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] nvme: provide fallback for discard alloc failure
  2018-12-12 16:18 [PATCH v2] nvme: provide fallback for discard alloc failure Jens Axboe
@ 2018-12-12 16:28 ` Keith Busch
  2018-12-12 16:36   ` Jens Axboe
  2018-12-13  3:00 ` Sagi Grimberg
  2018-12-13  8:36 ` Christoph Hellwig
  2 siblings, 1 reply; 6+ messages in thread
From: Keith Busch @ 2018-12-12 16:28 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-nvme, Christoph Hellwig

On Wed, Dec 12, 2018 at 09:18:11AM -0700, Jens Axboe wrote:
> When boxes are run near (or to) OOM, we have a problem with the discard
> page allocation in nvme. If we fail allocating the special page, we
> return busy, and it'll get retried. But since ordering is honored for
> dispatch requests, we can keep retrying this same IO and failing. Behind
> that IO could be requests that want to free memory, but they never get
> the chance.
> 
> Allocate a fixed discard page per controller for a safe fallback, and use
> that if the initial allocation fails.

Do we need to allocate this per controller? One page for the whole driver
may be sufficient to make forward progress, right?

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] nvme: provide fallback for discard alloc failure
  2018-12-12 16:28 ` Keith Busch
@ 2018-12-12 16:36   ` Jens Axboe
  2018-12-12 16:50     ` Keith Busch
  0 siblings, 1 reply; 6+ messages in thread
From: Jens Axboe @ 2018-12-12 16:36 UTC (permalink / raw)
  To: Keith Busch; +Cc: linux-block, linux-nvme, Christoph Hellwig

On 12/12/18 9:28 AM, Keith Busch wrote:
> On Wed, Dec 12, 2018 at 09:18:11AM -0700, Jens Axboe wrote:
>> When boxes are run near (or to) OOM, we have a problem with the discard
>> page allocation in nvme. If we fail allocating the special page, we
>> return busy, and it'll get retried. But since ordering is honored for
>> dispatch requests, we can keep retrying this same IO and failing. Behind
>> that IO could be requests that want to free memory, but they never get
>> the chance.
>>
>> Allocate a fixed discard page per controller for a safe fallback, and use
>> that if the initial allocation fails.
> 
> Do we need to allocate this per controller? One page for the whole driver
> may be sufficient to make forward progress, right?

It should be, but that might create a shit storm if we're OOM and have
tons of drives. I think one per controller is saner, and it's dwarfed
by memory we consume anyway in static allocations.

-- 
Jens Axboe


^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] nvme: provide fallback for discard alloc failure
  2018-12-12 16:36   ` Jens Axboe
@ 2018-12-12 16:50     ` Keith Busch
  0 siblings, 0 replies; 6+ messages in thread
From: Keith Busch @ 2018-12-12 16:50 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-nvme, Christoph Hellwig

On Wed, Dec 12, 2018 at 09:36:36AM -0700, Jens Axboe wrote:
> On 12/12/18 9:28 AM, Keith Busch wrote:
> > On Wed, Dec 12, 2018 at 09:18:11AM -0700, Jens Axboe wrote:
> >> When boxes are run near (or to) OOM, we have a problem with the discard
> >> page allocation in nvme. If we fail allocating the special page, we
> >> return busy, and it'll get retried. But since ordering is honored for
> >> dispatch requests, we can keep retrying this same IO and failing. Behind
> >> that IO could be requests that want to free memory, but they never get
> >> the chance.
> >>
> >> Allocate a fixed discard page per controller for a safe fallback, and use
> >> that if the initial allocation fails.
> > 
> > Do we need to allocate this per controller? One page for the whole driver
> > may be sufficient to make forward progress, right?
> 
> It should be, but that might create a shit storm if we're OOM and have
> tons of drives. I think one per controller is saner, and it's dwarfed
> by memory we consume anyway in static allocations.

Okay fair enough.

Reviewed-by: Keith Busch <keith.busch@intel.com>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] nvme: provide fallback for discard alloc failure
  2018-12-12 16:18 [PATCH v2] nvme: provide fallback for discard alloc failure Jens Axboe
  2018-12-12 16:28 ` Keith Busch
@ 2018-12-13  3:00 ` Sagi Grimberg
  2018-12-13  8:36 ` Christoph Hellwig
  2 siblings, 0 replies; 6+ messages in thread
From: Sagi Grimberg @ 2018-12-13  3:00 UTC (permalink / raw)
  To: Jens Axboe, linux-block, linux-nvme; +Cc: Christoph Hellwig

Reviewed-by: Sagi Grimberg <sagi@grimberg.me>

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH v2] nvme: provide fallback for discard alloc failure
  2018-12-12 16:18 [PATCH v2] nvme: provide fallback for discard alloc failure Jens Axboe
  2018-12-12 16:28 ` Keith Busch
  2018-12-13  3:00 ` Sagi Grimberg
@ 2018-12-13  8:36 ` Christoph Hellwig
  2 siblings, 0 replies; 6+ messages in thread
From: Christoph Hellwig @ 2018-12-13  8:36 UTC (permalink / raw)
  To: Jens Axboe; +Cc: linux-block, linux-nvme, Christoph Hellwig

Thanks, applied to nvme-4.21.

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2018-12-13  8:36 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-12-12 16:18 [PATCH v2] nvme: provide fallback for discard alloc failure Jens Axboe
2018-12-12 16:28 ` Keith Busch
2018-12-12 16:36   ` Jens Axboe
2018-12-12 16:50     ` Keith Busch
2018-12-13  3:00 ` Sagi Grimberg
2018-12-13  8:36 ` Christoph Hellwig

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).