* [PATCH v5 0/3] md/bitmap: Optimal last page size
@ 2023-02-24 18:33 Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 1/3] md: Move sb writer loop to its own function Jonathan Derrick
                   ` (3 more replies)
  0 siblings, 4 replies; 12+ messages in thread
From: Jonathan Derrick @ 2023-02-24 18:33 UTC (permalink / raw)
  To: Song Liu, linux-raid
  Cc: Reindl Harald, Xiao Ni, Christoph Hellwig, Christoph Hellwig,
	Paul Menzel, Sushma Kalakota, Jon Derrick

From: Jon Derrick <jonathan.derrick@linux.dev>

Currently the last bitmap page write sizes itself down to the logical block
size. This can reduce performance on devices whose atomic write unit is larger
than the logical block size, such as many NVMe devices with 4kB write units
and 512B logical blocks. There is usually plenty of space after the bitmap, so
using the optimal I/O size can favor speed over size.
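
As a rough sketch of the sizing arithmetic (standalone userspace C with
hypothetical byte counts, not the kernel code itself):

#include <stdio.h>

#define PAGE_SIZE 4096u
#define ROUNDUP(x, y) ((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	unsigned int store_bytes = 7680;	/* hypothetical bitmap sb+bits bytes */
	unsigned int lbs = 512;			/* logical block size */
	unsigned int opt = 4096;		/* optimal I/O size */
	unsigned int last = store_bytes & (PAGE_SIZE - 1);

	if (last == 0)
		last = PAGE_SIZE;

	printf("old last-page write: %u\n", ROUNDUP(last, lbs));	/* 3584 */
	printf("new last-page write: %u\n", ROUNDUP(last, opt));	/* 4096 */
	return 0;
}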

This was tested on an Intel/Solidigm P5520 drive with a 512B LBA format and a
4kB optimal I/O size, resulting in a > 10x IOPS increase.

See patch 3 log for results.

v4->v5: Initialized rdev (kernel test bot)
v3->v4: Fixed reviewers' concerns
v2->v3: Prep patch added and types fixed
Added helpers for optimal I/O sizes

Jon Derrick (3):
  md: Move sb writer loop to its own function
  md: Fix types in sb writer
  md: Use optimal I/O size for last bitmap page

 drivers/md/md-bitmap.c | 143 ++++++++++++++++++++++++-----------------
 1 file changed, 83 insertions(+), 60 deletions(-)

-- 
2.27.0



* [PATCH v5 1/3] md: Move sb writer loop to its own function
  2023-02-24 18:33 [PATCH v5 0/3] md/bitmap: Optimal last page size Jonathan Derrick
@ 2023-02-24 18:33 ` Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 2/3] md: Fix types in sb writer Jonathan Derrick
                   ` (2 subsequent siblings)
  3 siblings, 0 replies; 12+ messages in thread
From: Jonathan Derrick @ 2023-02-24 18:33 UTC (permalink / raw)
  To: Song Liu, linux-raid
  Cc: Reindl Harald, Xiao Ni, Christoph Hellwig, Christoph Hellwig,
	Paul Menzel, Sushma Kalakota, Jon Derrick

From: Jon Derrick <jonathan.derrick@linux.dev>

Preparatory patch for optimal I/O size calculation. Move the sb writer
loop routine into its own function for clarity.

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
---
 drivers/md/md-bitmap.c | 125 +++++++++++++++++++++--------------------
 1 file changed, 65 insertions(+), 60 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index e7cc6ba1b657..cdc61e65abae 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -209,76 +209,81 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	return NULL;
 }
 
-static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
+			   struct page *page)
 {
-	struct md_rdev *rdev;
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
+	loff_t offset = mddev->bitmap_info.offset;
+	int size = PAGE_SIZE;
+
+	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+	if (page->index == store->file_pages - 1) {
+		int last_page_size = store->bytes & (PAGE_SIZE - 1);
+
+		if (last_page_size == 0)
+			last_page_size = PAGE_SIZE;
+		size = roundup(last_page_size,
+			       bdev_logical_block_size(bdev));
+	}
+
+	/* Just make sure we aren't corrupting data or metadata */
+	if (mddev->external) {
+		/* Bitmap could be anywhere. */
+		if (rdev->sb_start + offset
+		    + (page->index * (PAGE_SIZE / SECTOR_SIZE))
+		    > rdev->data_offset &&
+		    rdev->sb_start + offset
+		    < (rdev->data_offset + mddev->dev_sectors
+		     + (PAGE_SIZE / SECTOR_SIZE)))
+			return -EINVAL;
+	} else if (offset < 0) {
+		/* DATA  BITMAP METADATA  */
+		if (offset
+		    + (long)(page->index * (PAGE_SIZE / SECTOR_SIZE))
+		    + size / SECTOR_SIZE > 0)
+			/* bitmap runs in to metadata */
+			return -EINVAL;
+
+		if (rdev->data_offset + mddev->dev_sectors
+		    > rdev->sb_start + offset)
+			/* data runs in to bitmap */
+			return -EINVAL;
+	} else if (rdev->sb_start < rdev->data_offset) {
+		/* METADATA BITMAP DATA */
+		if (rdev->sb_start + offset
+		    + page->index * (PAGE_SIZE / SECTOR_SIZE)
+		    + size / SECTOR_SIZE > rdev->data_offset)
+			/* bitmap runs in to data */
+			return -EINVAL;
+	} else {
+		/* DATA METADATA BITMAP - no problems */
+	}
 
-restart:
-	rdev = NULL;
-	while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
-		int size = PAGE_SIZE;
-		loff_t offset = mddev->bitmap_info.offset;
+	md_super_write(mddev, rdev,
+		       rdev->sb_start + offset
+		       + page->index * (PAGE_SIZE / SECTOR_SIZE),
+		       size, page);
+	return 0;
+}
 
-		bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
+static int write_sb_page(struct bitmap *bitmap, struct page *page, int wait)
+{
+	struct md_rdev *rdev;
+	struct mddev *mddev = bitmap->mddev;
+	int ret;
 
-		if (page->index == store->file_pages-1) {
-			int last_page_size = store->bytes & (PAGE_SIZE-1);
-			if (last_page_size == 0)
-				last_page_size = PAGE_SIZE;
-			size = roundup(last_page_size,
-				       bdev_logical_block_size(bdev));
-		}
-		/* Just make sure we aren't corrupting data or
-		 * metadata
-		 */
-		if (mddev->external) {
-			/* Bitmap could be anywhere. */
-			if (rdev->sb_start + offset + (page->index
-						       * (PAGE_SIZE/512))
-			    > rdev->data_offset
-			    &&
-			    rdev->sb_start + offset
-			    < (rdev->data_offset + mddev->dev_sectors
-			     + (PAGE_SIZE/512)))
-				goto bad_alignment;
-		} else if (offset < 0) {
-			/* DATA  BITMAP METADATA  */
-			if (offset
-			    + (long)(page->index * (PAGE_SIZE/512))
-			    + size/512 > 0)
-				/* bitmap runs in to metadata */
-				goto bad_alignment;
-			if (rdev->data_offset + mddev->dev_sectors
-			    > rdev->sb_start + offset)
-				/* data runs in to bitmap */
-				goto bad_alignment;
-		} else if (rdev->sb_start < rdev->data_offset) {
-			/* METADATA BITMAP DATA */
-			if (rdev->sb_start
-			    + offset
-			    + page->index*(PAGE_SIZE/512) + size/512
-			    > rdev->data_offset)
-				/* bitmap runs in to data */
-				goto bad_alignment;
-		} else {
-			/* DATA METADATA BITMAP - no problems */
+	do {
+		rdev = NULL;
+		while ((rdev = next_active_rdev(rdev, mddev)) != NULL) {
+			ret = __write_sb_page(rdev, bitmap, page);
+			if (ret)
+				return ret;
 		}
-		md_super_write(mddev, rdev,
-			       rdev->sb_start + offset
-			       + page->index * (PAGE_SIZE/512),
-			       size,
-			       page);
-	}
+	} while (wait && md_super_wait(mddev) < 0);
 
-	if (wait && md_super_wait(mddev) < 0)
-		goto restart;
 	return 0;
-
- bad_alignment:
-	return -EINVAL;
 }
 
 static void md_bitmap_file_kick(struct bitmap *bitmap);
-- 
2.27.0



* [PATCH v5 2/3] md: Fix types in sb writer
  2023-02-24 18:33 [PATCH v5 0/3] md/bitmap: Optimal last page size Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 1/3] md: Move sb writer loop to its own function Jonathan Derrick
@ 2023-02-24 18:33 ` Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page Jonathan Derrick
  2023-03-13 21:38 ` [PATCH v5 0/3] md/bitmap: Optimal last page size Song Liu
  3 siblings, 0 replies; 12+ messages in thread
From: Jonathan Derrick @ 2023-02-24 18:33 UTC (permalink / raw)
  To: Song Liu, linux-raid
  Cc: Reindl Harald, Xiao Ni, Christoph Hellwig, Christoph Hellwig,
	Paul Menzel, Sushma Kalakota, Jon Derrick

From: Jon Derrick <jonathan.derrick@linux.dev>

Page->index is a pgoff_t, and multiplying it could overflow on a 32-bit
architecture. In the sb writer it is used to calculate and verify the
sector being written, and is multiplied by a sector count. Using sector_t
promotes the arithmetic to a u64, the more appropriate type for the unit.
Additionally, the size value is converted to a sector unit in later
calculations, and is now corrected to an unsigned type.

Finally, clean up the calculations using variable aliases to improve
readability.
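
A minimal userspace sketch of the type promotion (uint32_t stands in for a
32-bit architecture's unsigned long pgoff_t, uint64_t for sector_t; the page
index is deliberately exaggerated to force the wrap):

#include <stdio.h>
#include <stdint.h>
#include <inttypes.h>

int main(void)
{
	uint32_t index = 1u << 29;	/* hypothetical, very large page->index */
	uint32_t wraps = index * (4096u / 512u);		/* wraps to 0 */
	uint64_t keeps = (uint64_t)index * (4096u / 512u);	/* 4294967296 */

	printf("32-bit math: %" PRIu32 ", sector_t math: %" PRIu64 "\n",
	       wraps, keeps);
	return 0;
}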

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
---
 drivers/md/md-bitmap.c | 35 ++++++++++++++---------------------
 1 file changed, 14 insertions(+), 21 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index cdc61e65abae..bf250a5e3a86 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -215,12 +215,13 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	struct block_device *bdev;
 	struct mddev *mddev = bitmap->mddev;
 	struct bitmap_storage *store = &bitmap->storage;
-	loff_t offset = mddev->bitmap_info.offset;
-	int size = PAGE_SIZE;
+	sector_t offset = mddev->bitmap_info.offset;
+	sector_t ps, sboff, doff;
+	unsigned int size = PAGE_SIZE;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 	if (page->index == store->file_pages - 1) {
-		int last_page_size = store->bytes & (PAGE_SIZE - 1);
+		unsigned int last_page_size = store->bytes & (PAGE_SIZE - 1);
 
 		if (last_page_size == 0)
 			last_page_size = PAGE_SIZE;
@@ -228,43 +229,35 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			       bdev_logical_block_size(bdev));
 	}
 
+	ps = page->index * PAGE_SIZE / SECTOR_SIZE;
+	sboff = rdev->sb_start + offset;
+	doff = rdev->data_offset;
+
 	/* Just make sure we aren't corrupting data or metadata */
 	if (mddev->external) {
 		/* Bitmap could be anywhere. */
-		if (rdev->sb_start + offset
-		    + (page->index * (PAGE_SIZE / SECTOR_SIZE))
-		    > rdev->data_offset &&
-		    rdev->sb_start + offset
-		    < (rdev->data_offset + mddev->dev_sectors
-		     + (PAGE_SIZE / SECTOR_SIZE)))
+		if (sboff + ps > doff &&
+		    sboff < (doff + mddev->dev_sectors + PAGE_SIZE / SECTOR_SIZE))
 			return -EINVAL;
 	} else if (offset < 0) {
 		/* DATA  BITMAP METADATA  */
-		if (offset
-		    + (long)(page->index * (PAGE_SIZE / SECTOR_SIZE))
-		    + size / SECTOR_SIZE > 0)
+		if (offset + ps + size / SECTOR_SIZE > 0)
 			/* bitmap runs in to metadata */
 			return -EINVAL;
 
-		if (rdev->data_offset + mddev->dev_sectors
-		    > rdev->sb_start + offset)
+		if (doff + mddev->dev_sectors > sboff)
 			/* data runs in to bitmap */
 			return -EINVAL;
 	} else if (rdev->sb_start < rdev->data_offset) {
 		/* METADATA BITMAP DATA */
-		if (rdev->sb_start + offset
-		    + page->index * (PAGE_SIZE / SECTOR_SIZE)
-		    + size / SECTOR_SIZE > rdev->data_offset)
+		if (sboff + ps + size / SECTOR_SIZE > doff)
 			/* bitmap runs in to data */
 			return -EINVAL;
 	} else {
 		/* DATA METADATA BITMAP - no problems */
 	}
 
-	md_super_write(mddev, rdev,
-		       rdev->sb_start + offset
-		       + page->index * (PAGE_SIZE / SECTOR_SIZE),
-		       size, page);
+	md_super_write(mddev, rdev, sboff + ps, (int) size, page);
 	return 0;
 }
 
-- 
2.27.0



* [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-02-24 18:33 [PATCH v5 0/3] md/bitmap: Optimal last page size Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 1/3] md: Move sb writer loop to its own function Jonathan Derrick
  2023-02-24 18:33 ` [PATCH v5 2/3] md: Fix types in sb writer Jonathan Derrick
@ 2023-02-24 18:33 ` Jonathan Derrick
  2023-02-27  1:56   ` Xiao Ni
  2023-03-13 21:38 ` [PATCH v5 0/3] md/bitmap: Optimal last page size Song Liu
  3 siblings, 1 reply; 12+ messages in thread
From: Jonathan Derrick @ 2023-02-24 18:33 UTC (permalink / raw)
  To: Song Liu, linux-raid
  Cc: Reindl Harald, Xiao Ni, Christoph Hellwig, Christoph Hellwig,
	Paul Menzel, Sushma Kalakota, Jon Derrick

From: Jon Derrick <jonathan.derrick@linux.dev>

If the bitmap space has enough room, size the I/O for the last bitmap
page write to the storage device's optimal I/O size. The expanded write
is checked to ensure it won't overrun the data or metadata.

The drive this was tested against shows higher latencies for sub-4k
writes due to device-side read-modify-writes of its atomic 4k write
unit. This change helps performance by sizing the last bitmap page I/O
to the device's preferred write unit, if one is reported.
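
As a standalone illustration, the same logic as the bitmap_io_size() helper
added below, driven with hypothetical sector numbers:

#include <stdio.h>

typedef unsigned long long sector_t;
#define SECTOR_SIZE 512u

/* same logic as the kernel helper in the diff below */
static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
				   sector_t start, sector_t boundary)
{
	if (io_size != opt_size &&
	    start + opt_size / SECTOR_SIZE <= boundary)
		return opt_size;
	if (start + io_size / SECTOR_SIZE <= boundary)
		return io_size;
	return 0;	/* overflows boundary */
}

int main(void)
{
	/* hypothetical: last bitmap page at sector 24, data starting at 2048 */
	printf("%u\n", bitmap_io_size(3584, 4096, 24, 2048));	/* 4096: opt fits */
	printf("%u\n", bitmap_io_size(3584, 4096, 2041, 2048));	/* 3584: only lbs-rounded size fits */
	printf("%u\n", bitmap_io_size(3584, 4096, 2044, 2048));	/* 0: bitmap would run into data */
	return 0;
}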

Example Intel/Solidigm P5520
Raid10, Chunk-size 64M, bitmap-size 57228 bits

$ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1
        --assume-clean --bitmap=internal --bitmap-chunk=64M
$ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60

Without patch:
  write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets

With patch:
  write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets

Biosnoop:
Without patch:
Time        Process        PID     Device      LBA        Size      Lat
1.410377    md0_raid10     6900    nvme0n1   W 16         4096      0.02
1.410387    md0_raid10     6900    nvme2n1   W 16         4096      0.02
1.410374    md0_raid10     6900    nvme3n1   W 16         4096      0.01
1.410381    md0_raid10     6900    nvme1n1   W 16         4096      0.02
1.410411    md0_raid10     6900    nvme1n1   W 115346512  4096      0.01
1.410418    md0_raid10     6900    nvme0n1   W 115346512  4096      0.02
1.410915    md0_raid10     6900    nvme2n1   W 24         3584      0.43 <--
1.410935    md0_raid10     6900    nvme3n1   W 24         3584      0.45 <--
1.411124    md0_raid10     6900    nvme1n1   W 24         3584      0.64 <--
1.411147    md0_raid10     6900    nvme0n1   W 24         3584      0.66 <--
1.411176    md0_raid10     6900    nvme3n1   W 2019022184 4096      0.01
1.411189    md0_raid10     6900    nvme2n1   W 2019022184 4096      0.02

With patch:
Time        Process        PID     Device      LBA        Size      Lat
5.747193    md0_raid10     727     nvme0n1   W 16         4096      0.01
5.747192    md0_raid10     727     nvme1n1   W 16         4096      0.02
5.747195    md0_raid10     727     nvme3n1   W 16         4096      0.01
5.747202    md0_raid10     727     nvme2n1   W 16         4096      0.02
5.747229    md0_raid10     727     nvme3n1   W 1196223704 4096      0.02
5.747224    md0_raid10     727     nvme0n1   W 1196223704 4096      0.01
5.747279    md0_raid10     727     nvme0n1   W 24         4096      0.01 <--
5.747279    md0_raid10     727     nvme1n1   W 24         4096      0.02 <--
5.747284    md0_raid10     727     nvme3n1   W 24         4096      0.02 <--
5.747291    md0_raid10     727     nvme2n1   W 24         4096      0.02 <--
5.747314    md0_raid10     727     nvme2n1   W 2234636712 4096      0.01
5.747317    md0_raid10     727     nvme1n1   W 2234636712 4096      0.02

Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
---
 drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
 1 file changed, 29 insertions(+), 4 deletions(-)

diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
index bf250a5e3a86..920bb68156d2 100644
--- a/drivers/md/md-bitmap.c
+++ b/drivers/md/md-bitmap.c
@@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
 	return NULL;
 }
 
+static unsigned int optimal_io_size(struct block_device *bdev,
+				    unsigned int last_page_size,
+				    unsigned int io_size)
+{
+	if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
+		return roundup(last_page_size, bdev_io_opt(bdev));
+	return io_size;
+}
+
+static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
+				   sector_t start, sector_t boundary)
+{
+	if (io_size != opt_size &&
+	    start + opt_size / SECTOR_SIZE <= boundary)
+		return opt_size;
+	if (start + io_size / SECTOR_SIZE <= boundary)
+		return io_size;
+
+	/* Overflows boundary */
+	return 0;
+}
+
 static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			   struct page *page)
 {
@@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 	sector_t offset = mddev->bitmap_info.offset;
 	sector_t ps, sboff, doff;
 	unsigned int size = PAGE_SIZE;
+	unsigned int opt_size = PAGE_SIZE;
 
 	bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
 	if (page->index == store->file_pages - 1) {
@@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 
 		if (last_page_size == 0)
 			last_page_size = PAGE_SIZE;
-		size = roundup(last_page_size,
-			       bdev_logical_block_size(bdev));
+		size = roundup(last_page_size, bdev_logical_block_size(bdev));
+		opt_size = optimal_io_size(bdev, last_page_size, size);
 	}
 
 	ps = page->index * PAGE_SIZE / SECTOR_SIZE;
@@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (offset < 0) {
 		/* DATA  BITMAP METADATA  */
-		if (offset + ps + size / SECTOR_SIZE > 0)
+		size = bitmap_io_size(size, opt_size, offset + ps, 0);
+		if (size == 0)
 			/* bitmap runs in to metadata */
 			return -EINVAL;
 
@@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
 			return -EINVAL;
 	} else if (rdev->sb_start < rdev->data_offset) {
 		/* METADATA BITMAP DATA */
-		if (sboff + ps + size / SECTOR_SIZE > doff)
+		size = bitmap_io_size(size, opt_size, sboff + ps, doff);
+		if (size == 0)
 			/* bitmap runs in to data */
 			return -EINVAL;
 	} else {
-- 
2.27.0



* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-02-24 18:33 ` [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page Jonathan Derrick
@ 2023-02-27  1:56   ` Xiao Ni
  2023-02-28 23:09     ` Jonathan Derrick
  0 siblings, 1 reply; 12+ messages in thread
From: Xiao Ni @ 2023-02-27  1:56 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

Hi Jonathan

I did a test in my environment, but I didn't see such a big
performance difference.

The first environment:
All NVMe devices have a 512B logical size, 512B physical size, and 0 optimal
size, so I rebuilt the kernel your way to report:
/sys/block/nvme0n1/queue/physical_block_size 512
/sys/block/nvme0n1/queue/optimal_io_size 4096
/sys/block/nvme0n1/queue/logical_block_size 512

without the patch set
write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets

The second environment:
The NVMe devices' opt size is already 4096, so I don't need to rebuild the kernel.
/sys/block/nvme0n1/queue/logical_block_size
/sys/block/nvme0n1/queue/physical_block_size
/sys/block/nvme0n1/queue/optimal_io_size

without the patch set
write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
with the patch set
write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets

Best Regards
Xiao

On Sat, Feb 25, 2023 at 2:34 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> From: Jon Derrick <jonathan.derrick@linux.dev>
>
> If the bitmap space has enough room, size the I/O for the last bitmap
> page write to the optimal I/O size for the storage device. The expanded
> write is checked that it won't overrun the data or metadata.
>
> The drive this was tested against has higher latencies when there are
> sub-4k writes due to device-side read-mod-writes of its atomic 4k write
> unit. This change helps increase performance by sizing the last bitmap
> page I/O for the device's preferred write unit, if it is given.
>
> Example Intel/Solidigm P5520
> Raid10, Chunk-size 64M, bitmap-size 57228 bits
>
> $ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1
>         --assume-clean --bitmap=internal --bitmap-chunk=64M
> $ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60
>
> Without patch:
>   write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets
>
> With patch:
>   write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets
>
> Biosnoop:
> Without patch:
> Time        Process        PID     Device      LBA        Size      Lat
> 1.410377    md0_raid10     6900    nvme0n1   W 16         4096      0.02
> 1.410387    md0_raid10     6900    nvme2n1   W 16         4096      0.02
> 1.410374    md0_raid10     6900    nvme3n1   W 16         4096      0.01
> 1.410381    md0_raid10     6900    nvme1n1   W 16         4096      0.02
> 1.410411    md0_raid10     6900    nvme1n1   W 115346512  4096      0.01
> 1.410418    md0_raid10     6900    nvme0n1   W 115346512  4096      0.02
> 1.410915    md0_raid10     6900    nvme2n1   W 24         3584      0.43 <--
> 1.410935    md0_raid10     6900    nvme3n1   W 24         3584      0.45 <--
> 1.411124    md0_raid10     6900    nvme1n1   W 24         3584      0.64 <--
> 1.411147    md0_raid10     6900    nvme0n1   W 24         3584      0.66 <--
> 1.411176    md0_raid10     6900    nvme3n1   W 2019022184 4096      0.01
> 1.411189    md0_raid10     6900    nvme2n1   W 2019022184 4096      0.02
>
> With patch:
> Time        Process        PID     Device      LBA        Size      Lat
> 5.747193    md0_raid10     727     nvme0n1   W 16         4096      0.01
> 5.747192    md0_raid10     727     nvme1n1   W 16         4096      0.02
> 5.747195    md0_raid10     727     nvme3n1   W 16         4096      0.01
> 5.747202    md0_raid10     727     nvme2n1   W 16         4096      0.02
> 5.747229    md0_raid10     727     nvme3n1   W 1196223704 4096      0.02
> 5.747224    md0_raid10     727     nvme0n1   W 1196223704 4096      0.01
> 5.747279    md0_raid10     727     nvme0n1   W 24         4096      0.01 <--
> 5.747279    md0_raid10     727     nvme1n1   W 24         4096      0.02 <--
> 5.747284    md0_raid10     727     nvme3n1   W 24         4096      0.02 <--
> 5.747291    md0_raid10     727     nvme2n1   W 24         4096      0.02 <--
> 5.747314    md0_raid10     727     nvme2n1   W 2234636712 4096      0.01
> 5.747317    md0_raid10     727     nvme1n1   W 2234636712 4096      0.02
>
> Reviewed-by: Christoph Hellwig <hch@lst.de>
> Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
> ---
>  drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
>  1 file changed, 29 insertions(+), 4 deletions(-)
>
> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
> index bf250a5e3a86..920bb68156d2 100644
> --- a/drivers/md/md-bitmap.c
> +++ b/drivers/md/md-bitmap.c
> @@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
>         return NULL;
>  }
>
> +static unsigned int optimal_io_size(struct block_device *bdev,
> +                                   unsigned int last_page_size,
> +                                   unsigned int io_size)
> +{
> +       if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
> +               return roundup(last_page_size, bdev_io_opt(bdev));
> +       return io_size;
> +}
> +
> +static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
> +                                  sector_t start, sector_t boundary)
> +{
> +       if (io_size != opt_size &&
> +           start + opt_size / SECTOR_SIZE <= boundary)
> +               return opt_size;
> +       if (start + io_size / SECTOR_SIZE <= boundary)
> +               return io_size;
> +
> +       /* Overflows boundary */
> +       return 0;
> +}
> +
>  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                            struct page *page)
>  {
> @@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>         sector_t offset = mddev->bitmap_info.offset;
>         sector_t ps, sboff, doff;
>         unsigned int size = PAGE_SIZE;
> +       unsigned int opt_size = PAGE_SIZE;
>
>         bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>         if (page->index == store->file_pages - 1) {
> @@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>
>                 if (last_page_size == 0)
>                         last_page_size = PAGE_SIZE;
> -               size = roundup(last_page_size,
> -                              bdev_logical_block_size(bdev));
> +               size = roundup(last_page_size, bdev_logical_block_size(bdev));
> +               opt_size = optimal_io_size(bdev, last_page_size, size);
>         }
>
>         ps = page->index * PAGE_SIZE / SECTOR_SIZE;
> @@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                         return -EINVAL;
>         } else if (offset < 0) {
>                 /* DATA  BITMAP METADATA  */
> -               if (offset + ps + size / SECTOR_SIZE > 0)
> +               size = bitmap_io_size(size, opt_size, offset + ps, 0);
> +               if (size == 0)
>                         /* bitmap runs in to metadata */
>                         return -EINVAL;
>
> @@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>                         return -EINVAL;
>         } else if (rdev->sb_start < rdev->data_offset) {
>                 /* METADATA BITMAP DATA */
> -               if (sboff + ps + size / SECTOR_SIZE > doff)
> +               size = bitmap_io_size(size, opt_size, sboff + ps, doff);
> +               if (size == 0)
>                         /* bitmap runs in to data */
>                         return -EINVAL;
>         } else {
> --
> 2.27.0
>



* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-02-27  1:56   ` Xiao Ni
@ 2023-02-28 23:09     ` Jonathan Derrick
  2023-03-01 12:36       ` Xiao Ni
  0 siblings, 1 reply; 12+ messages in thread
From: Jonathan Derrick @ 2023-02-28 23:09 UTC (permalink / raw)
  To: Xiao Ni
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

Hi Xiao

On 2/26/2023 6:56 PM, Xiao Ni wrote:
> Hi Jonathan
> 
> I did a test in my environment, but I didn't see such a big
> performance difference.
> 
> The first environment:
> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
> I used your way to rebuild the kernel
> /sys/block/nvme0n1/queue/physical_block_size 512
> /sys/block/nvme0n1/queue/optimal_io_size 4096
> cat /sys/block/nvme0n1/queue/logical_block_size 512
> 
> without the patch set
> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
> with the patch set
> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
> 
> The second environment:
> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
> /sys/block/nvme0n1/queue/logical_block_size
> /sys/block/nvme0n1/queue/physical_block_size
> /sys/block/nvme0n1/queue/optimal_io_size
> 
> without the patch set
> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
> with the patch set
> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
> 
Sounds like your devices may not have latency issues at sub-optimal sizes.
Can you provide biosnoop traces with and without patches?

Still, 'works fine for me' is generally not a reason to reject the patches.

> Best Regards
> Xiao
> 
> On Sat, Feb 25, 2023 at 2:34 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>>
>> From: Jon Derrick <jonathan.derrick@linux.dev>
>>
>> If the bitmap space has enough room, size the I/O for the last bitmap
>> page write to the optimal I/O size for the storage device. The expanded
>> write is checked that it won't overrun the data or metadata.
>>
>> The drive this was tested against has higher latencies when there are
>> sub-4k writes due to device-side read-mod-writes of its atomic 4k write
>> unit. This change helps increase performance by sizing the last bitmap
>> page I/O for the device's preferred write unit, if it is given.
>>
>> Example Intel/Solidigm P5520
>> Raid10, Chunk-size 64M, bitmap-size 57228 bits
>>
>> $ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1
>>         --assume-clean --bitmap=internal --bitmap-chunk=64M
>> $ fio --name=test --direct=1 --filename=/dev/md0 --rw=randwrite --bs=4k --runtime=60
>>
>> Without patch:
>>   write: IOPS=1676, BW=6708KiB/s (6869kB/s)(393MiB/60001msec); 0 zone resets
>>
>> With patch:
>>   write: IOPS=15.7k, BW=61.4MiB/s (64.4MB/s)(3683MiB/60001msec); 0 zone resets
>>
>> Biosnoop:
>> Without patch:
>> Time        Process        PID     Device      LBA        Size      Lat
>> 1.410377    md0_raid10     6900    nvme0n1   W 16         4096      0.02
>> 1.410387    md0_raid10     6900    nvme2n1   W 16         4096      0.02
>> 1.410374    md0_raid10     6900    nvme3n1   W 16         4096      0.01
>> 1.410381    md0_raid10     6900    nvme1n1   W 16         4096      0.02
>> 1.410411    md0_raid10     6900    nvme1n1   W 115346512  4096      0.01
>> 1.410418    md0_raid10     6900    nvme0n1   W 115346512  4096      0.02
>> 1.410915    md0_raid10     6900    nvme2n1   W 24         3584      0.43 <--
>> 1.410935    md0_raid10     6900    nvme3n1   W 24         3584      0.45 <--
>> 1.411124    md0_raid10     6900    nvme1n1   W 24         3584      0.64 <--
>> 1.411147    md0_raid10     6900    nvme0n1   W 24         3584      0.66 <--
>> 1.411176    md0_raid10     6900    nvme3n1   W 2019022184 4096      0.01
>> 1.411189    md0_raid10     6900    nvme2n1   W 2019022184 4096      0.02
>>
>> With patch:
>> Time        Process        PID     Device      LBA        Size      Lat
>> 5.747193    md0_raid10     727     nvme0n1   W 16         4096      0.01
>> 5.747192    md0_raid10     727     nvme1n1   W 16         4096      0.02
>> 5.747195    md0_raid10     727     nvme3n1   W 16         4096      0.01
>> 5.747202    md0_raid10     727     nvme2n1   W 16         4096      0.02
>> 5.747229    md0_raid10     727     nvme3n1   W 1196223704 4096      0.02
>> 5.747224    md0_raid10     727     nvme0n1   W 1196223704 4096      0.01
>> 5.747279    md0_raid10     727     nvme0n1   W 24         4096      0.01 <--
>> 5.747279    md0_raid10     727     nvme1n1   W 24         4096      0.02 <--
>> 5.747284    md0_raid10     727     nvme3n1   W 24         4096      0.02 <--
>> 5.747291    md0_raid10     727     nvme2n1   W 24         4096      0.02 <--
>> 5.747314    md0_raid10     727     nvme2n1   W 2234636712 4096      0.01
>> 5.747317    md0_raid10     727     nvme1n1   W 2234636712 4096      0.02
>>
>> Reviewed-by: Christoph Hellwig <hch@lst.de>
>> Signed-off-by: Jon Derrick <jonathan.derrick@linux.dev>
>> ---
>>  drivers/md/md-bitmap.c | 33 +++++++++++++++++++++++++++++----
>>  1 file changed, 29 insertions(+), 4 deletions(-)
>>
>> diff --git a/drivers/md/md-bitmap.c b/drivers/md/md-bitmap.c
>> index bf250a5e3a86..920bb68156d2 100644
>> --- a/drivers/md/md-bitmap.c
>> +++ b/drivers/md/md-bitmap.c
>> @@ -209,6 +209,28 @@ static struct md_rdev *next_active_rdev(struct md_rdev *rdev, struct mddev *mdde
>>         return NULL;
>>  }
>>
>> +static unsigned int optimal_io_size(struct block_device *bdev,
>> +                                   unsigned int last_page_size,
>> +                                   unsigned int io_size)
>> +{
>> +       if (bdev_io_opt(bdev) > bdev_logical_block_size(bdev))
>> +               return roundup(last_page_size, bdev_io_opt(bdev));
>> +       return io_size;
>> +}
>> +
>> +static unsigned int bitmap_io_size(unsigned int io_size, unsigned int opt_size,
>> +                                  sector_t start, sector_t boundary)
>> +{
>> +       if (io_size != opt_size &&
>> +           start + opt_size / SECTOR_SIZE <= boundary)
>> +               return opt_size;
>> +       if (start + io_size / SECTOR_SIZE <= boundary)
>> +               return io_size;
>> +
>> +       /* Overflows boundary */
>> +       return 0;
>> +}
>> +
>>  static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>>                            struct page *page)
>>  {
>> @@ -218,6 +240,7 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>>         sector_t offset = mddev->bitmap_info.offset;
>>         sector_t ps, sboff, doff;
>>         unsigned int size = PAGE_SIZE;
>> +       unsigned int opt_size = PAGE_SIZE;
>>
>>         bdev = (rdev->meta_bdev) ? rdev->meta_bdev : rdev->bdev;
>>         if (page->index == store->file_pages - 1) {
>> @@ -225,8 +248,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>>
>>                 if (last_page_size == 0)
>>                         last_page_size = PAGE_SIZE;
>> -               size = roundup(last_page_size,
>> -                              bdev_logical_block_size(bdev));
>> +               size = roundup(last_page_size, bdev_logical_block_size(bdev));
>> +               opt_size = optimal_io_size(bdev, last_page_size, size);
>>         }
>>
>>         ps = page->index * PAGE_SIZE / SECTOR_SIZE;
>> @@ -241,7 +264,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>>                         return -EINVAL;
>>         } else if (offset < 0) {
>>                 /* DATA  BITMAP METADATA  */
>> -               if (offset + ps + size / SECTOR_SIZE > 0)
>> +               size = bitmap_io_size(size, opt_size, offset + ps, 0);
>> +               if (size == 0)
>>                         /* bitmap runs in to metadata */
>>                         return -EINVAL;
>>
>> @@ -250,7 +274,8 @@ static int __write_sb_page(struct md_rdev *rdev, struct bitmap *bitmap,
>>                         return -EINVAL;
>>         } else if (rdev->sb_start < rdev->data_offset) {
>>                 /* METADATA BITMAP DATA */
>> -               if (sboff + ps + size / SECTOR_SIZE > doff)
>> +               size = bitmap_io_size(size, opt_size, sboff + ps, doff);
>> +               if (size == 0)
>>                         /* bitmap runs in to data */
>>                         return -EINVAL;
>>         } else {
>> --
>> 2.27.0
>>
> 


* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-02-28 23:09     ` Jonathan Derrick
@ 2023-03-01 12:36       ` Xiao Ni
  2023-03-01 17:56         ` Jonathan Derrick
  0 siblings, 1 reply; 12+ messages in thread
From: Xiao Ni @ 2023-03-01 12:36 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> Hi Xiao
>
> On 2/26/2023 6:56 PM, Xiao Ni wrote:
> > Hi Jonathan
> >
> > I did a test in my environment, but I didn't see such a big
> > performance difference.
> >
> > The first environment:
> > All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
> > I used your way to rebuild the kernel
> > /sys/block/nvme0n1/queue/physical_block_size 512
> > /sys/block/nvme0n1/queue/optimal_io_size 4096
> > cat /sys/block/nvme0n1/queue/logical_block_size 512
> >
> > without the patch set
> > write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
> > with the patch set
> > write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
> >
> > The second environment:
> > The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
> > /sys/block/nvme0n1/queue/logical_block_size
> > /sys/block/nvme0n1/queue/physical_block_size
> > /sys/block/nvme0n1/queue/optimal_io_size
> >
> > without the patch set
> > write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
> > with the patch set
> > write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
> >
> Sounds like your devices may not have latency issues at sub-optimal sizes.
> Can you provide biosnoop traces with and without patches?
>
> Still, 'works fine for me' is generally not a reason to reject the patches.

Yes, I can. I tried to install biosnoop on Fedora 38, but it failed.
These are the rpm packages I've installed:
bcc-tools-0.25.0-1.fc38.x86_64
bcc-0.25.0-1.fc38.x86_64
python3-bcc-0.25.0-1.fc38.noarch

Are there other packages that I need to install?

Regards
Xiao



* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-03-01 12:36       ` Xiao Ni
@ 2023-03-01 17:56         ` Jonathan Derrick
  2023-03-02  9:05           ` Xiao Ni
  0 siblings, 1 reply; 12+ messages in thread
From: Jonathan Derrick @ 2023-03-01 17:56 UTC (permalink / raw)
  To: Xiao Ni
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota



On 3/1/2023 5:36 AM, Xiao Ni wrote:
> On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>>
>> Hi Xiao
>>
>> On 2/26/2023 6:56 PM, Xiao Ni wrote:
>>> Hi Jonathan
>>>
>>> I did a test in my environment, but I didn't see such a big
>>> performance difference.
>>>
>>> The first environment:
>>> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
>>> I used your way to rebuild the kernel
>>> /sys/block/nvme0n1/queue/physical_block_size 512
>>> /sys/block/nvme0n1/queue/optimal_io_size 4096
>>> cat /sys/block/nvme0n1/queue/logical_block_size 512
>>>
>>> without the patch set
>>> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
>>> with the patch set
>>> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
>>>
>>> The second environment:
>>> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
>>> /sys/block/nvme0n1/queue/logical_block_size
>>> /sys/block/nvme0n1/queue/physical_block_size
>>> /sys/block/nvme0n1/queue/optimal_io_size
>>>
>>> without the patch set
>>> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
>>> with the patch set
>>> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
>>>
>> Sounds like your devices may not have latency issues at sub-optimal sizes.
>> Can you provide biosnoop traces with and without patches?
>>
>> Still, 'works fine for me' is generally not a reason to reject the patches.
> 
> Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
> These are the rpm packages I've installed:
> bcc-tools-0.25.0-1.fc38.x86_64
> bcc-0.25.0-1.fc38.x86_64
> python3-bcc-0.25.0-1.fc38.noarch
> 
> Are there other packages that I need to install?
> 
I've had issues with the packaged versions as well

Best to install from source:
https://github.com/iovisor/bcc/
https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source

> Regards
> Xiao
> 


* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-03-01 17:56         ` Jonathan Derrick
@ 2023-03-02  9:05           ` Xiao Ni
  2023-03-02 17:17             ` Jonathan Derrick
  0 siblings, 1 reply; 12+ messages in thread
From: Xiao Ni @ 2023-03-02  9:05 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

On Thu, Mar 2, 2023 at 1:57 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
>
>
> On 3/1/2023 5:36 AM, Xiao Ni wrote:
> > On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
> > <jonathan.derrick@linux.dev> wrote:
> >>
> >> Hi Xiao
> >>
> >> On 2/26/2023 6:56 PM, Xiao Ni wrote:
> >>> Hi Jonathan
> >>>
> >>> I did a test in my environment, but I didn't see such a big
> >>> performance difference.
> >>>
> >>> The first environment:
> >>> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
> >>> I used your way to rebuild the kernel
> >>> /sys/block/nvme0n1/queue/physical_block_size 512
> >>> /sys/block/nvme0n1/queue/optimal_io_size 4096
> >>> cat /sys/block/nvme0n1/queue/logical_block_size 512
> >>>
> >>> without the patch set
> >>> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
> >>> with the patch set
> >>> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
> >>>
> >>> The second environment:
> >>> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
> >>> /sys/block/nvme0n1/queue/logical_block_size
> >>> /sys/block/nvme0n1/queue/physical_block_size
> >>> /sys/block/nvme0n1/queue/optimal_io_size
> >>>
> >>> without the patch set
> >>> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
> >>> with the patch set
> >>> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
> >>>
> >> Sounds like your devices may not have latency issues at sub-optimal sizes.
> >> Can you provide biosnoop traces with and without patches?
> >>
> >> Still, 'works fine for me' is generally not a reason to reject the patches.
> >
> > Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
> > These are the rpm packages I've installed:
> > bcc-tools-0.25.0-1.fc38.x86_64
> > bcc-0.25.0-1.fc38.x86_64
> > python3-bcc-0.25.0-1.fc38.noarch
> >
> > Are there other packages that I need to install?
> >
> I've had issues with the packaged versions as well
>
> Best to install from source:
> https://github.com/iovisor/bcc/
> https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source
>
Hi Jonathan

I did a test without modifying phys_size and opt_size, and picked out part of
the result:

0.172142    md0_raid10     2094    nvme1n1   W 1225496264 4096      0.01
0.172145    md0_raid10     2094    nvme0n1   W 1225496264 4096      0.01
0.172161    md0_raid10     2094    nvme3n1   W 16         4096      0.01
0.172164    md0_raid10     2094    nvme2n1   W 16         4096      0.01
0.172166    md0_raid10     2094    nvme1n1   W 16         4096      0.01
0.172168    md0_raid10     2094    nvme0n1   W 16         4096      0.01
0.172178    md0_raid10     2094    nvme3n1   W 633254624  4096      0.01
0.172180    md0_raid10     2094    nvme2n1   W 633254624  4096      0.01
0.172196    md0_raid10     2094    nvme3n1   W 16         4096      0.01
0.172199    md0_raid10     2094    nvme2n1   W 16         4096      0.01
0.172201    md0_raid10     2094    nvme1n1   W 16         4096      0.01
0.172203    md0_raid10     2094    nvme0n1   W 16         4096      0.01
0.172213    md0_raid10     2094    nvme3n1   W 1060251672 4096      0.01
0.172215    md0_raid10     2094    nvme2n1   W 1060251672 4096      0.01

The last column always shows 0.01. Is that why I can't see the performance
improvement? What do you think about using SSDs or HDDs instead?

Best Regards
Xiao



* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-03-02  9:05           ` Xiao Ni
@ 2023-03-02 17:17             ` Jonathan Derrick
  2023-03-03  1:55               ` Xiao Ni
  0 siblings, 1 reply; 12+ messages in thread
From: Jonathan Derrick @ 2023-03-02 17:17 UTC (permalink / raw)
  To: Xiao Ni
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota



On 3/2/2023 2:05 AM, Xiao Ni wrote:
> On Thu, Mar 2, 2023 at 1:57 AM Jonathan Derrick
> <jonathan.derrick@linux.dev> wrote:
>>
>>
>>
>> On 3/1/2023 5:36 AM, Xiao Ni wrote:
>>> On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
>>> <jonathan.derrick@linux.dev> wrote:
>>>>
>>>> Hi Xiao
>>>>
>>>> On 2/26/2023 6:56 PM, Xiao Ni wrote:
>>>>> Hi Jonathan
>>>>>
>>>>> I did a test in my environment, but I didn't see such a big
>>>>> performance difference.
>>>>>
>>>>> The first environment:
>>>>> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
>>>>> I used your way to rebuild the kernel
>>>>> /sys/block/nvme0n1/queue/physical_block_size 512
>>>>> /sys/block/nvme0n1/queue/optimal_io_size 4096
>>>>> cat /sys/block/nvme0n1/queue/logical_block_size 512
>>>>>
>>>>> without the patch set
>>>>> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
>>>>> with the patch set
>>>>> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
>>>>>
>>>>> The second environment:
>>>>> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
>>>>> /sys/block/nvme0n1/queue/logical_block_size
>>>>> /sys/block/nvme0n1/queue/physical_block_size
>>>>> /sys/block/nvme0n1/queue/optimal_io_size
>>>>>
>>>>> without the patch set
>>>>> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
>>>>> with the patch set
>>>>> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
>>>>>
>>>> Sounds like your devices may not have latency issues at sub-optimal sizes.
>>>> Can you provide biosnoop traces with and without patches?
>>>>
>>>> Still, 'works fine for me' is generally not a reason to reject the patches.
>>>
>>> Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
>>> These are the rpm packages I've installed:
>>> bcc-tools-0.25.0-1.fc38.x86_64
>>> bcc-0.25.0-1.fc38.x86_64
>>> python3-bcc-0.25.0-1.fc38.noarch
>>>
>>> Are there other packages that I need to install?
>>>
>> I've had issues with the packaged versions as well
>>
>> Best to install from source:
>> https://github.com/iovisor/bcc/
>> https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source
>>
> Hi Jonathan
> 
> I did a test without modifying phys_size and opt_size. And I picked up a part
> of the result:
> 
> 0.172142    md0_raid10     2094    nvme1n1   W 1225496264 4096      0.01
> 0.172145    md0_raid10     2094    nvme0n1   W 1225496264 4096      0.01
> 0.172161    md0_raid10     2094    nvme3n1   W 16         4096      0.01
> 0.172164    md0_raid10     2094    nvme2n1   W 16         4096      0.01
> 0.172166    md0_raid10     2094    nvme1n1   W 16         4096      0.01
> 0.172168    md0_raid10     2094    nvme0n1   W 16         4096      0.01
> 0.172178    md0_raid10     2094    nvme3n1   W 633254624  4096      0.01
> 0.172180    md0_raid10     2094    nvme2n1   W 633254624  4096      0.01
> 0.172196    md0_raid10     2094    nvme3n1   W 16         4096      0.01
> 0.172199    md0_raid10     2094    nvme2n1   W 16         4096      0.01
> 0.172201    md0_raid10     2094    nvme1n1   W 16         4096      0.01
> 0.172203    md0_raid10     2094    nvme0n1   W 16         4096      0.01
> 0.172213    md0_raid10     2094    nvme3n1   W 1060251672 4096      0.01
> 0.172215    md0_raid10     2094    nvme2n1   W 1060251672 4096      0.01
> 
> The last column always shows 0.01. Is that the reason I can't see the
> performance
> improvement? What do you think if I use ssd or hdds?
Try reducing your mdadm --bitmap-chunk first. In the above logs, only LBA 16 is
being used, and that's the first bitmap page (and seemingly also the last). You
want to configure the array so that it has more bitmap pages; reducing the
--bitmap-chunk parameter should do that, and you may then run into the scenario
the patch addresses.
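
For example, something like (hypothetical chunk size, adjust to your array):

$ mdadm --create /dev/md0 --level=10 --raid-devices=4 /dev/nvme{0,1,2,3}n1 \
        --assume-clean --bitmap=internal --bitmap-chunk=4M

That mirrors the command from the patch 3 log, only with a smaller
--bitmap-chunk so the bitmap spans more pages.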

> 
> Best Regards
> Xiao
> 


* Re: [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page
  2023-03-02 17:17             ` Jonathan Derrick
@ 2023-03-03  1:55               ` Xiao Ni
  0 siblings, 0 replies; 12+ messages in thread
From: Xiao Ni @ 2023-03-03  1:55 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: Song Liu, linux-raid, Reindl Harald, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

On Fri, Mar 3, 2023 at 1:18 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
>
>
> On 3/2/2023 2:05 AM, Xiao Ni wrote:
> > On Thu, Mar 2, 2023 at 1:57 AM Jonathan Derrick
> > <jonathan.derrick@linux.dev> wrote:
> >>
> >>
> >>
> >> On 3/1/2023 5:36 AM, Xiao Ni wrote:
> >>> On Wed, Mar 1, 2023 at 7:10 AM Jonathan Derrick
> >>> <jonathan.derrick@linux.dev> wrote:
> >>>>
> >>>> Hi Xiao
> >>>>
> >>>> On 2/26/2023 6:56 PM, Xiao Ni wrote:
> >>>>> Hi Jonathan
> >>>>>
> >>>>> I did a test in my environment, but I didn't see such a big
> >>>>> performance difference.
> >>>>>
> >>>>> The first environment:
> >>>>> All nvme devices have 512 logical size, 512 phy size, and 0 optimal size. Then
> >>>>> I used your way to rebuild the kernel
> >>>>> /sys/block/nvme0n1/queue/physical_block_size 512
> >>>>> /sys/block/nvme0n1/queue/optimal_io_size 4096
> >>>>> cat /sys/block/nvme0n1/queue/logical_block_size 512
> >>>>>
> >>>>> without the patch set
> >>>>> write: IOPS=68.0k, BW=266MiB/s (279MB/s)(15.6GiB/60001msec); 0 zone resets
> >>>>> with the patch set
> >>>>> write: IOPS=69.1k, BW=270MiB/s (283MB/s)(15.8GiB/60001msec); 0 zone resets
> >>>>>
> >>>>> The second environment:
> >>>>> The nvme devices' opt size are 4096. So I don't need to rebuild the kernel.
> >>>>> /sys/block/nvme0n1/queue/logical_block_size
> >>>>> /sys/block/nvme0n1/queue/physical_block_size
> >>>>> /sys/block/nvme0n1/queue/optimal_io_size
> >>>>>
> >>>>> without the patch set
> >>>>> write: IOPS=51.6k, BW=202MiB/s (212MB/s)(11.8GiB/60001msec); 0 zone resets
> >>>>> with the patch set
> >>>>> write: IOPS=53.5k, BW=209MiB/s (219MB/s)(12.2GiB/60001msec); 0 zone resets
> >>>>>
> >>>> Sounds like your devices may not have latency issues at sub-optimal sizes.
> >>>> Can you provide biosnoop traces with and without patches?
> >>>>
> >>>> Still, 'works fine for me' is generally not a reason to reject the patches.
> >>>
> >>> Yes, I can. I tried to install the biosnoop in fedora38 but it failed.
> >>> These are the rpm packages I've installed:
> >>> bcc-tools-0.25.0-1.fc38.x86_64
> >>> bcc-0.25.0-1.fc38.x86_64
> >>> python3-bcc-0.25.0-1.fc38.noarch
> >>>
> >>> Are there other packages that I need to install?
> >>>
> >> I've had issues with the packaged versions as well
> >>
> >> Best to install from source:
> >> https://github.com/iovisor/bcc/
> >> https://github.com/iovisor/bcc/blob/master/INSTALL.md#fedora---source
> >>
> > Hi Jonathan
> >
> > I did a test without modifying phys_size and opt_size. And I picked up a part
> > of the result:
> >
> > 0.172142    md0_raid10     2094    nvme1n1   W 1225496264 4096      0.01
> > 0.172145    md0_raid10     2094    nvme0n1   W 1225496264 4096      0.01
> > 0.172161    md0_raid10     2094    nvme3n1   W 16         4096      0.01
> > 0.172164    md0_raid10     2094    nvme2n1   W 16         4096      0.01
> > 0.172166    md0_raid10     2094    nvme1n1   W 16         4096      0.01
> > 0.172168    md0_raid10     2094    nvme0n1   W 16         4096      0.01
> > 0.172178    md0_raid10     2094    nvme3n1   W 633254624  4096      0.01
> > 0.172180    md0_raid10     2094    nvme2n1   W 633254624  4096      0.01
> > 0.172196    md0_raid10     2094    nvme3n1   W 16         4096      0.01
> > 0.172199    md0_raid10     2094    nvme2n1   W 16         4096      0.01
> > 0.172201    md0_raid10     2094    nvme1n1   W 16         4096      0.01
> > 0.172203    md0_raid10     2094    nvme0n1   W 16         4096      0.01
> > 0.172213    md0_raid10     2094    nvme3n1   W 1060251672 4096      0.01
> > 0.172215    md0_raid10     2094    nvme2n1   W 1060251672 4096      0.01
> >
> > The last column always shows 0.01. Is that the reason I can't see the
> > performance
> > improvement? What do you think if I use ssd or hdds?
> Try reducing your mdadm's --bitmap-chunk first. In above logs, only LBA 16 is
> being used, and that's the first bitmap page (and seemingly also the last). You
> want to configure it such that you have more bitmap pages. Reducing the
> --bitmap-chunk parameter should create more bitmap pages, and you may run into
> the scenario predicted by the patch.
>

Finally, I can see the performance improvement. It's cool. Thanks for this.

The raid with 64MB bitmap size
write: IOPS=58.4k, BW=228MiB/s (239MB/s)(13.4GiB/60001msec); 0 zone
resets (without patch)
write: IOPS=69.3k, BW=271MiB/s (284MB/s)(252MiB/931msec); 0 zone
resets (with patch)

The raid with 8MB bitmap size
write: IOPS=11.6k, BW=45.4MiB/s (47.6MB/s)(2724MiB/60002msec); 0 zone
resets (without patch)
write: IOPS=43.7k, BW=171MiB/s (179MB/s)(10.0GiB/60001msec); 0 zone
resets (with patch)

Best Regards
Xiao



* Re: [PATCH v5 0/3] md/bitmap: Optimal last page size
  2023-02-24 18:33 [PATCH v5 0/3] md/bitmap: Optimal last page size Jonathan Derrick
                   ` (2 preceding siblings ...)
  2023-02-24 18:33 ` [PATCH v5 3/3] md: Use optimal I/O size for last bitmap page Jonathan Derrick
@ 2023-03-13 21:38 ` Song Liu
  3 siblings, 0 replies; 12+ messages in thread
From: Song Liu @ 2023-03-13 21:38 UTC (permalink / raw)
  To: Jonathan Derrick
  Cc: linux-raid, Reindl Harald, Xiao Ni, Christoph Hellwig,
	Christoph Hellwig, Paul Menzel, Sushma Kalakota

On Fri, Feb 24, 2023 at 10:35 AM Jonathan Derrick
<jonathan.derrick@linux.dev> wrote:
>
> From: Jon Derrick <jonathan.derrick@linux.dev>
>
> Currently the last bitmap page write will size itself down to the logical block
> size. This could cause less performance for devices which have atomic write
> units larger than the block size, such as many NVMe devices with 4kB write
> units and 512B block sizes. There is usually a large amount of space after the
> bitmap and using the optimal I/O size could favor speed over size.
>
> This was tested on an Intel/Solidigm P5520 drive with lba format 512B,
> optimal I/O size of 4kB, resulting in a > 10x IOPS increase.
>
> See patch 3 log for results.
>
> v4->v5: Initialized rdev (kernel test bot)
> v3->v4: Fixed reviewers concerns
> v2->v3: Prep patch added and types fixed
> Added helpers for optimal I/O sizes
>

Applied to md-next. Thanks!

I also added

Tested-by: Xiao Ni <xni@redhat.com>

Thanks,
Song


