All of lore.kernel.org
 help / color / mirror / Atom feed
From: Damien Le Moal <dlemoal@kernel.org>
To: linux-block@vger.kernel.org, Jens Axboe <axboe@kernel.dk>,
	linux-scsi@vger.kernel.org,
	"Martin K . Petersen" <martin.petersen@oracle.com>,
	dm-devel@lists.linux.dev, Mike Snitzer <snitzer@redhat.com>,
	linux-nvme@lists.infradead.org, Keith Busch <kbusch@kernel.org>,
	Christoph Hellwig <hch@lst.de>
Subject: [PATCH v3 09/30] block: Pre-allocate zone write plugs
Date: Thu, 28 Mar 2024 09:43:48 +0900	[thread overview]
Message-ID: <20240328004409.594888-10-dlemoal@kernel.org> (raw)
In-Reply-To: <20240328004409.594888-1-dlemoal@kernel.org>

Allocating zone write plugs using kmalloc() does not guarantee that
enough write plugs can be allocated to simultaneously write up to
the maximum number of active zones or maximum number of open zones of
a zoned block device.

Avoid any issue with memory allocation by pre-allocating zone write
plugs up to the disk's maximum number of open zones or maximum number
of active zones, whichever is larger. For zoned devices that do not
have open or active zone limits, a default of 128 write plugs (capped
at the number of zones of the device) is pre-allocated.

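Pre-allocated zone write plugs are managed using a free list. When
blk_revalidate_disk_zones() is executed and the device zone limits are
found to have increased, the disk free list is grown accordingly. If
the free list is empty, a zone write plug is still allocated with
kmalloc() and flagged with BLK_ZONE_WPLUG_NEEDS_FREE so that it is
freed instead of being returned to the free list.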

Signed-off-by: Damien Le Moal <dlemoal@kernel.org>
---
 block/blk-zoned.c      | 124 ++++++++++++++++++++++++++++++++++++-----
 include/linux/blkdev.h |   2 +
 2 files changed, 113 insertions(+), 13 deletions(-)

diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 03083522df84..3084dae5408e 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -39,7 +39,8 @@ static const char *const zone_cond_name[] = {
 /*
  * Per-zone write plug.
  * @node: hlist_node structure for managing the plug using a hash table.
- * @link: To list the plug in the zone write plug error list of the disk.
+ * @link: To list the plug in the zone write plug free list or error list of
+ *        the disk.
  * @ref: Zone write plug reference counter. A zone write plug reference is
  *       always at least 1 when the plug is hashed in the disk plug hash table.
  *       The reference is incremented whenever a new BIO needing plugging is
@@ -57,6 +58,7 @@ static const char *const zone_cond_name[] = {
  * @bio_list: The list of BIOs that are currently plugged.
  * @bio_work: Work struct to handle issuing of plugged BIOs
  * @rcu_head: RCU head to free zone write plugs with an RCU grace period.
+ * @disk: The gendisk the plug belongs to.
  */
 struct blk_zone_wplug {
 	struct hlist_node	node;
@@ -69,6 +71,7 @@ struct blk_zone_wplug {
 	struct bio_list		bio_list;
 	struct work_struct	bio_work;
 	struct rcu_head		rcu_head;
+	struct gendisk		*disk;
 };
 
 /*
@@ -85,10 +88,14 @@ struct blk_zone_wplug {
  *    to prevent new references to the zone write plug to be taken for
  *    newly incoming BIOs. A zone write plug flagged with this flag will be
  *    freed once all remaining references from BIOs or functions are dropped.
+ *  - BLK_ZONE_WPLUG_NEEDS_FREE: Indicates that the zone write plug was
+ *    dynamically allocated and needs to be freed instead of returned to the
+ *    free list of zone write plugs of the disk.
  */
 #define BLK_ZONE_WPLUG_PLUGGED		(1U << 0)
 #define BLK_ZONE_WPLUG_ERROR		(1U << 1)
 #define BLK_ZONE_WPLUG_UNHASHED		(1U << 2)
+#define BLK_ZONE_WPLUG_NEEDS_FREE	(1U << 3)
 
 #define BLK_ZONE_WPLUG_BUSY	(BLK_ZONE_WPLUG_PLUGGED | BLK_ZONE_WPLUG_ERROR)
 
@@ -519,23 +526,51 @@ static void disk_init_zone_wplug(struct gendisk *disk,
 	zwplug->wp_offset = sector & (disk->queue->limits.chunk_sectors - 1);
 	bio_list_init(&zwplug->bio_list);
 	INIT_WORK(&zwplug->bio_work, blk_zone_wplug_bio_work);
+	zwplug->disk = disk;
 }
 
 static struct blk_zone_wplug *disk_alloc_zone_wplug(struct gendisk *disk,
 						sector_t sector, gfp_t gfp_mask)
 {
-	struct blk_zone_wplug *zwplug;
+	struct blk_zone_wplug *zwplug = NULL;
+	unsigned int zwp_flags = 0;
+	unsigned long flags;
 
-	/* Allocate a new zone write plug. */
-	zwplug = kmalloc(sizeof(struct blk_zone_wplug), gfp_mask);
-	if (!zwplug)
-		return NULL;
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	zwplug = list_first_entry_or_null(&disk->zone_wplugs_free_list,
+					  struct blk_zone_wplug, link);
+	if (zwplug)
+		list_del_init(&zwplug->link);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
 
-	disk_init_zone_wplug(disk, zwplug, 0, sector);
+	if (!zwplug) {
+		/* Allocate a new zone write plug. */
+		zwplug = kmalloc(sizeof(struct blk_zone_wplug), gfp_mask);
+		if (!zwplug)
+			return NULL;
+		zwp_flags = BLK_ZONE_WPLUG_NEEDS_FREE;
+	}
+
+	disk_init_zone_wplug(disk, zwplug, zwp_flags, sector);
 
 	return zwplug;
 }
 
+static void disk_free_zone_wplug(struct blk_zone_wplug *zwplug)
+{
+	struct gendisk *disk = zwplug->disk;
+	unsigned long flags;
+
+	if (zwplug->flags & BLK_ZONE_WPLUG_NEEDS_FREE) {
+		kfree(zwplug);
+		return;
+	}
+
+	spin_lock_irqsave(&disk->zone_wplugs_lock, flags);
+	list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
+	spin_unlock_irqrestore(&disk->zone_wplugs_lock, flags);
+}
+
 static bool disk_insert_zone_wplug(struct gendisk *disk,
 				   struct blk_zone_wplug *zwplug)
 {
@@ -630,18 +665,24 @@ static struct blk_zone_wplug *disk_get_zone_wplug(struct gendisk *disk,
 	return zwplug;
 }
 
+static void disk_free_zone_wplug_rcu(struct rcu_head *rcu_head)
+{
+	struct blk_zone_wplug *zwplug =
+		container_of(rcu_head, struct blk_zone_wplug, rcu_head);
+
+	disk_free_zone_wplug(zwplug);
+}
+
 static inline void disk_put_zone_wplug(struct blk_zone_wplug *zwplug)
 {
 	if (atomic_dec_and_test(&zwplug->ref)) {
 		WARN_ON_ONCE(!bio_list_empty(&zwplug->bio_list));
 		WARN_ON_ONCE(!list_empty(&zwplug->link));
 
-		kfree_rcu(zwplug, rcu_head);
+		call_rcu(&zwplug->rcu_head, disk_free_zone_wplug_rcu);
 	}
 }
 
-static void blk_zone_wplug_bio_work(struct work_struct *work);
-
 /*
  * Get a reference on the write plug for the zone containing @sector.
  * If the plug does not exist, it is allocated and hashed.
@@ -684,7 +725,7 @@ static struct blk_zone_wplug *disk_get_and_lock_zone_wplug(struct gendisk *disk,
 	 */
 	if (!disk_insert_zone_wplug(disk, zwplug)) {
 		spin_unlock_irqrestore(&zwplug->lock, *flags);
-		kfree(zwplug);
+		disk_free_zone_wplug(zwplug);
 		goto again;
 	}
 
@@ -1401,6 +1442,30 @@ static inline unsigned int disk_zone_wplugs_hash_size(struct gendisk *disk)
 	return 1U << disk->zone_wplugs_hash_bits;
 }
 
+static int disk_alloc_zone_wplugs(struct gendisk *disk,
+				  unsigned int max_nr_zwplugs)
+{
+	struct blk_zone_wplug *zwplug;
+	unsigned int i;
+
+	if (!disk->zone_wplugs_hash)
+		return 0;
+
+	/* Pre-allocate zone write plugs */
+	for (i = 0; i < max_nr_zwplugs; i++) {
+		zwplug = kmalloc(sizeof(struct blk_zone_wplug), GFP_KERNEL);
+		if (!zwplug)
+			return -ENOMEM;
+		disk_init_zone_wplug(disk, zwplug, 0, 0);
+
+		list_add_tail(&zwplug->link, &disk->zone_wplugs_free_list);
+	}
+
+	disk->zone_wplugs_max_nr += max_nr_zwplugs;
+
+	return 0;
+}
+
 static void disk_free_zone_wplugs(struct gendisk *disk)
 {
 	struct blk_zone_wplug *zwplug;
@@ -1422,11 +1487,22 @@ static void disk_free_zone_wplugs(struct gendisk *disk)
 
 	/* Wait for the zone write plugs to be RCU-freed. */
 	rcu_barrier();
+
+	while (!list_empty(&disk->zone_wplugs_free_list)) {
+		zwplug = list_first_entry(&disk->zone_wplugs_free_list,
+					  struct blk_zone_wplug, link);
+		list_del_init(&zwplug->link);
+
+		kfree(zwplug);
+	}
+
+	disk->zone_wplugs_max_nr = 0;
 }
 
 void disk_init_zone_resources(struct gendisk *disk)
 {
 	spin_lock_init(&disk->zone_wplugs_lock);
+	INIT_LIST_HEAD(&disk->zone_wplugs_free_list);
 	INIT_LIST_HEAD(&disk->zone_wplugs_err_list);
 	INIT_WORK(&disk->zone_wplugs_work, disk_zone_wplugs_work);
 }
@@ -1444,6 +1520,7 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 				     unsigned int max_nr_zwplugs)
 {
 	unsigned int i;
+	int ret;
 
 	disk->zone_wplugs_hash_bits =
 		min(ilog2(max_nr_zwplugs) + 1, BLK_ZONE_MAX_WPLUG_HASH_BITS);
@@ -1457,6 +1534,15 @@ static int disk_alloc_zone_resources(struct gendisk *disk,
 	for (i = 0; i < disk_zone_wplugs_hash_size(disk); i++)
 		INIT_HLIST_HEAD(&disk->zone_wplugs_hash[i]);
 
+	ret = disk_alloc_zone_wplugs(disk, max_nr_zwplugs);
+	if (ret) {
+		disk_free_zone_wplugs(disk);
+		kfree(disk->zone_wplugs_hash);
+		disk->zone_wplugs_hash = NULL;
+		disk->zone_wplugs_hash_bits = 0;
+		return ret;
+	}
+
 	return 0;
 }
 
@@ -1484,6 +1570,7 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 {
 	struct queue_limits *lim = &disk->queue->limits;
 	unsigned int max_nr_zwplugs;
+	int ret;
 
 	/*
 	 * If the device has no limit on the maximum number of open and active
@@ -1495,8 +1582,19 @@ static int disk_revalidate_zone_resources(struct gendisk *disk,
 		max_nr_zwplugs =
 			min(BLK_ZONE_DEFAULT_MAX_NR_WPLUGS, nr_zones);
 
-	if (!disk->zone_wplugs_hash)
-		return disk_alloc_zone_resources(disk, max_nr_zwplugs);
+	if (!disk->zone_wplugs_hash) {
+		ret = disk_alloc_zone_resources(disk, max_nr_zwplugs);
+		if (ret)
+			return ret;
+	}
+
+	/* Grow the free list of zone write plugs if needed. */
+	if (disk->zone_wplugs_max_nr < max_nr_zwplugs) {
+		ret = disk_alloc_zone_wplugs(disk,
+				max_nr_zwplugs - disk->zone_wplugs_max_nr);
+		if (ret)
+			return ret;
+	}
 
 	return 0;
 }
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 6faa1abe8506..962ee0496659 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -194,9 +194,11 @@ struct gendisk {
 	unsigned int		zone_capacity;
 	unsigned long		*conv_zones_bitmap;
 	unsigned long		*seq_zones_wlock;
+	unsigned int		zone_wplugs_max_nr;
 	unsigned int            zone_wplugs_hash_bits;
 	spinlock_t              zone_wplugs_lock;
 	struct hlist_head       *zone_wplugs_hash;
+	struct list_head        zone_wplugs_free_list;
 	struct list_head        zone_wplugs_err_list;
 	struct work_struct	zone_wplugs_work;
 #endif /* CONFIG_BLK_DEV_ZONED */
-- 
2.44.0


  parent reply	other threads:[~2024-03-28  0:44 UTC|newest]

Thread overview: 109+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2024-03-28  0:43 [PATCH v3 00/30] Zone write plugging Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 01/30] block: Do not force full zone append completion in req_bio_endio() Damien Le Moal
2024-03-28  4:10   ` Christoph Hellwig
2024-03-28 18:14   ` Bart Van Assche
2024-03-28 22:43     ` Damien Le Moal
2024-03-28 23:03       ` Jens Axboe
2024-03-28  0:43 ` [PATCH v3 02/30] block: Restore sector of flush requests Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 03/30] block: Remove req_bio_endio() Damien Le Moal
2024-03-28  4:13   ` Christoph Hellwig
2024-03-28 21:28   ` Bart Van Assche
2024-03-28 22:42     ` Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 04/30] block: Introduce blk_zone_update_request_bio() Damien Le Moal
2024-03-28  4:14   ` Christoph Hellwig
2024-03-28  5:20     ` Damien Le Moal
2024-03-28  5:42       ` Christoph Hellwig
2024-03-28  5:54         ` Damien Le Moal
2024-03-28 21:31   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 05/30] block: Introduce bio_straddles_zones() and bio_offset_from_zone_start() Damien Le Moal
2024-03-28 21:32   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 06/30] block: Allow using bio_attempt_back_merge() internally Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 07/30] block: Remember zone capacity when revalidating zones Damien Le Moal
2024-03-28 21:38   ` Bart Van Assche
2024-03-28 22:40     ` Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 08/30] block: Introduce zone write plugging Damien Le Moal
2024-03-28  4:48   ` Christoph Hellwig
2024-03-28 22:20   ` Bart Van Assche
2024-03-28 22:38     ` Damien Le Moal
2024-03-29 18:20       ` Bart Van Assche
2024-03-28  0:43 ` Damien Le Moal [this message]
2024-03-28  4:30   ` [PATCH v3 09/30] block: Pre-allocate zone write plugs Christoph Hellwig
2024-03-28  5:28     ` Damien Le Moal
2024-03-28  5:46       ` Christoph Hellwig
2024-03-28  6:02         ` Damien Le Moal
2024-03-28  6:03           ` Christoph Hellwig
2024-03-28  6:18             ` Damien Le Moal
2024-03-28  6:22               ` Christoph Hellwig
2024-03-28  6:33                 ` Damien Le Moal
2024-03-28  6:38                   ` Christoph Hellwig
2024-03-28  6:51                     ` Damien Le Moal
2024-03-28  6:52                       ` Christoph Hellwig
2024-03-28  6:53                         ` Damien Le Moal
2024-03-28 22:25     ` Bart Van Assche
2024-03-28 22:29   ` Bart Van Assche
2024-03-28 22:33     ` Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 10/30] block: Fake max open zones limit when there is no limit Damien Le Moal
2024-03-28  4:49   ` Christoph Hellwig
2024-03-29 20:37   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 11/30] block: Allow zero value of max_zone_append_sectors queue limit Damien Le Moal
2024-03-28  4:49   ` Christoph Hellwig
2024-03-29 20:50   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 12/30] block: Implement zone append emulation Damien Le Moal
2024-03-28  4:50   ` Christoph Hellwig
2024-03-29 21:22   ` Bart Van Assche
2024-03-29 21:26   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 13/30] block: Allow BIO-based drivers to use blk_revalidate_disk_zones() Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 14/30] dm: Use the block layer zone append emulation Damien Le Moal
2024-03-28  0:43 ` [PATCH v3 15/30] scsi: sd: " Damien Le Moal
2024-03-28  4:50   ` Christoph Hellwig
2024-03-28 10:49   ` Johannes Thumshirn
2024-03-29 21:27   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 16/30] ublk_drv: Do not request ELEVATOR_F_ZBD_SEQ_WRITE elevator feature Damien Le Moal
2024-03-28  4:50   ` Christoph Hellwig
2024-03-29 21:28   ` Bart Van Assche
2024-03-28  0:43 ` [PATCH v3 17/30] null_blk: " Damien Le Moal
2024-03-28  4:51   ` Christoph Hellwig
2024-03-29 21:29   ` Bart Van Assche
2024-04-02  6:43   ` Chaitanya Kulkarni
2024-03-28  0:43 ` [PATCH v3 18/30] null_blk: Introduce zone_append_max_sectors attribute Damien Le Moal
2024-03-28  4:51   ` Christoph Hellwig
2024-03-29 21:35   ` Bart Van Assche
2024-03-30  0:33     ` Damien Le Moal
2024-04-02  6:44   ` Chaitanya Kulkarni
2024-03-28  0:43 ` [PATCH v3 19/30] null_blk: Introduce fua attribute Damien Le Moal
2024-03-28  4:52   ` Christoph Hellwig
2024-03-29 21:36   ` Bart Van Assche
2024-04-02  6:42   ` Chaitanya Kulkarni
2024-03-28  0:43 ` [PATCH v3 20/30] nvmet: zns: Do not reference the gendisk conv_zones_bitmap Damien Le Moal
2024-04-02  6:45   ` Chaitanya Kulkarni
2024-03-28  0:44 ` [PATCH v3 21/30] block: Remove BLK_STS_ZONE_RESOURCE Damien Le Moal
2024-03-29 21:37   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 22/30] block: Simplify blk_revalidate_disk_zones() interface Damien Le Moal
2024-03-29 21:41   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 23/30] block: mq-deadline: Remove support for zone write locking Damien Le Moal
2024-03-28  4:52   ` Christoph Hellwig
2024-03-29 21:43   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 24/30] block: Remove elevator required features Damien Le Moal
2024-03-29 21:44   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 25/30] block: Do not check zone type in blk_check_zone_append() Damien Le Moal
2024-03-29 21:45   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 26/30] block: Move zone related debugfs attribute to blk-zoned.c Damien Le Moal
2024-03-28  4:52   ` Christoph Hellwig
2024-03-29 19:00   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 27/30] block: Replace zone_wlock debugfs entry with zone_wplugs entry Damien Le Moal
2024-03-28  4:53   ` Christoph Hellwig
2024-03-29 18:54   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 28/30] block: Remove zone write locking Damien Le Moal
2024-03-29 18:57   ` Bart Van Assche
2024-03-28  0:44 ` [PATCH v3 29/30] block: Do not force select mq-deadline with CONFIG_BLK_DEV_ZONED Damien Le Moal
2024-03-28  4:53   ` Christoph Hellwig
2024-03-28  0:44 ` [PATCH v3 30/30] block: Do not special-case plugging of zone write operations Damien Le Moal
2024-03-28  4:54   ` Christoph Hellwig
2024-03-28  6:43     ` Damien Le Moal
2024-03-28  6:51       ` Christoph Hellwig
2024-03-28  6:54         ` Damien Le Moal
2024-03-29 18:58   ` Bart Van Assche
2024-03-28 23:05 ` (subset) [PATCH v3 00/30] Zone write plugging Jens Axboe
2024-03-28 23:13   ` Damien Le Moal
2024-03-28 23:27     ` Jens Axboe
2024-03-28 23:33       ` Damien Le Moal

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20240328004409.594888-10-dlemoal@kernel.org \
    --to=dlemoal@kernel.org \
    --cc=axboe@kernel.dk \
    --cc=dm-devel@lists.linux.dev \
    --cc=hch@lst.de \
    --cc=kbusch@kernel.org \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-nvme@lists.infradead.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=martin.petersen@oracle.com \
    --cc=snitzer@redhat.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.