* Re: [PATCH] dm-writecache: improve performance of large linear writes on SSDs
       [not found] <b0341161-1a49-d03a-d096-abf3a81d319c@petasan.org>
@ 2020-01-16 22:47 ` Maged Mokhtar
From: Maged Mokhtar @ 2020-01-16 22:47 UTC
  To: Mikulas Patocka, dm-devel



> From: Mikulas Patocka <mpatocka redhat com>
> To: Mike Snitzer <msnitzer redhat com>, Nikhil Kshirsagar <nkshirsa redhat com>
> Cc: dm-devel redhat com
> Subject: [dm-devel] [PATCH] dm-writecache: improve performance of large linear writes on SSDs
> Date: Wed, 15 Jan 2020 04:35:22 -0500 (EST)
> 
> When dm-writecache is used with an SSD as a cache device, it submits a
> separate bio for each written block. The I/Os are merged by the disk
> scheduler, but this merging degrades performance.
> 
> This patch makes dm-writecache submit larger bios: we can submit a large
> bio as long as there is consecutive free space on the cache device.
> 
> Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):
> 
> fio --bs=512k --iodepth=32 --size=400M --direct=1 --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test
> 
> block	old	new
> size	MiB/s	MiB/s
> ---------------------
> 512	181	700
> 1k	347	1256
> 2k	644	2020
> 4k	1183	2759
> 8k	1852	3333
> 16k	2469	3509
> 32k	2974	3670
> 64k	3404	3810
> 
> Signed-off-by: Mikulas Patocka <mpatocka redhat com>
> 
> ---
>  drivers/md/dm-writecache.c |   28 ++++++++++++++++++++++++----
>  1 file changed, 24 insertions(+), 4 deletions(-)
> 
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c	2020-01-14 16:11:09.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c	2020-01-14 21:42:44.000000000 +0100
> @@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
>  	wc->freelist_size++;
>  }
>  
> -static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
> +static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
>  {
>  	struct wc_entry *e;
>  
> @@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
>  		if (unlikely(!wc->current_free))
>  			return NULL;
>  		e = wc->current_free;
> +		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
> +			return NULL;
>  		next = rb_next(&e->rb_node);
>  		rb_erase(&e->rb_node, &wc->freetree);
>  		if (unlikely(!next))
> @@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
>  		if (unlikely(list_empty(&wc->freelist)))
>  			return NULL;
>  		e = container_of(wc->freelist.next, struct wc_entry, lru);
> +		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
> +			return NULL;
>  		list_del(&e->lru);
>  	}
>  	wc->freelist_size--;
> @@ -1194,7 +1198,7 @@ read_next_block:
>  					goto bio_copy;
>  				}
>  			}
> -			e = writecache_pop_from_freelist(wc);
> +			e = writecache_pop_from_freelist(wc, (sector_t)-1);
>  			if (unlikely(!e)) {
>  				writecache_wait_on_freelist(wc);
>  				continue;
> @@ -1206,9 +1210,25 @@ bio_copy:
>  			if (WC_MODE_PMEM(wc)) {
>  				bio_copy_block(wc, bio, memory_data(wc, e));
>  			} else {
> -				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
> +				unsigned bio_size = wc->block_size;
> +				sector_t start_cache_sec = cache_sector(wc, e);
> +				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
> +
> +				while (bio_size < bio->bi_iter.bi_size) {
> +					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
> +					if (!f)
> +						break;
> +					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
> +					writecache_insert_entry(wc, f);
> +					wc->uncommitted_blocks++;
> +					bio_size += wc->block_size;
> +					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
> +				}
> +
>  				bio_set_dev(bio, wc->ssd_dev->bdev);
> -				bio->bi_iter.bi_sector = cache_sector(wc, e);
> +				bio->bi_iter.bi_sector = start_cache_sec;
> +				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
> +
>  				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
>  					wc->uncommitted_blocks = 0;
>  					queue_work(wc->writeback_wq, &wc->flush_work);

The speed gain looks quite good.
One concern is whether, over time, the free list becomes fragmented,
making it harder for the current free entry to be followed by consecutive
free blocks.
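
As a rough standalone model of that concern (made-up block layout, not the
driver's data structures), the same adjacency rule merges a 64k write into
a single bio when the freelist is contiguous, but degenerates to one bio
per block once only every other block on the cache device is free:

#include <stdio.h>

#define BLOCK_SECTORS	8	/* 4k blocks, 512-byte sectors */
#define BLOCKS_PER_WRITE 16	/* a 64k write needs 16 cache blocks */

/* count how many bios the write needs, merging only adjacent free blocks */
static int count_bios(const long *free_sectors)
{
	int bios = 0, i = 0;

	while (i < BLOCKS_PER_WRITE) {
		long expected = free_sectors[i] + BLOCK_SECTORS;

		bios++;
		i++;
		/* extend the current bio while the next free block is adjacent */
		while (i < BLOCKS_PER_WRITE && free_sectors[i] == expected) {
			expected += BLOCK_SECTORS;
			i++;
		}
	}
	return bios;
}

int main(void)
{
	long contiguous[BLOCKS_PER_WRITE], fragmented[BLOCKS_PER_WRITE];

	for (int i = 0; i < BLOCKS_PER_WRITE; i++) {
		contiguous[i] = (long)i * BLOCK_SECTORS;	/* fresh cache */
		fragmented[i] = (long)i * 2 * BLOCK_SECTORS;	/* every other block free */
	}

	printf("contiguous freelist: %d bio(s)\n", count_bios(contiguous));	/* 1 */
	printf("fragmented freelist: %d bio(s)\n", count_bios(fragmented));	/* 16 */
	return 0;
}

One bio versus sixteen for the same 64k write is of course the worst case;
how close a real workload gets to it depends on how the freelist is refilled
during writeback.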


--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel


* [PATCH] dm-writecache: improve performance of large linear writes on SSDs
@ 2020-01-15  9:35 Mikulas Patocka
From: Mikulas Patocka @ 2020-01-15  9:35 UTC
  To: Mike Snitzer, Nikhil Kshirsagar; +Cc: dm-devel

When dm-writecache is used with an SSD as a cache device, it submits a
separate bio for each written block. The I/Os are merged by the disk
scheduler, but this merging degrades performance.

This patch makes dm-writecache submit larger bios: we can submit a large
bio as long as there is consecutive free space on the cache device.
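
Reduced to a standalone sketch (toy types and values; the hypothetical
pop_free() stands in for writecache_pop_from_freelist() with an expected
sector, and metadata updates, locking and the PMEM path are omitted), the
allocation rule looks like this:

#include <stdio.h>

typedef long sector_t;			/* toy stand-in for the kernel type */
#define BLOCK_SIZE	4096
#define SECTOR_SHIFT	9
#define ANY_SECTOR	((sector_t)-1)

/* cache sectors of free blocks, in the order the freelist hands them out */
static sector_t freelist[] = { 100, 108, 116, 200 };
static unsigned free_head;

/*
 * Return the next free block's cache sector, but only if it is the sector
 * the caller expects (or if the caller passed ANY_SECTOR); otherwise leave
 * the block on the freelist and report failure.
 */
static sector_t pop_free(sector_t expected)
{
	if (free_head >= sizeof(freelist) / sizeof(freelist[0]))
		return -1;
	if (expected != ANY_SECTOR && freelist[free_head] != expected)
		return -1;
	return freelist[free_head++];
}

int main(void)
{
	unsigned write_bytes = 6 * BLOCK_SIZE;		/* incoming 24k write */
	sector_t start = pop_free(ANY_SECTOR);		/* head block: any free block */
	unsigned bio_size = BLOCK_SIZE;
	sector_t next = start + (BLOCK_SIZE >> SECTOR_SHIFT);

	/* grow the bio while the freelist keeps yielding the adjacent block */
	while (bio_size < write_bytes) {
		sector_t s = pop_free(next);
		if (s < 0)
			break;
		bio_size += BLOCK_SIZE;
		next += BLOCK_SIZE >> SECTOR_SHIFT;
	}

	printf("submit %u bytes at cache sector %ld\n", bio_size, start);
	return 0;
}

With the made-up freelist above, the loop merges the blocks at sectors 100,
108 and 116 and stops at the gap before 200, so the first 12k of the 24k
write is submitted as a single bio.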

Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):

fio --bs=512k --iodepth=32 --size=400M --direct=1 --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test

block	old	new
size	MiB/s	MiB/s
---------------------
512	181	700
1k	347	1256
2k	644	2020
4k	1183	2759
8k	1852	3333
16k	2469	3509
32k	2974	3670
64k	3404	3810

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-writecache.c |   28 ++++++++++++++++++++++++----
 1 file changed, 24 insertions(+), 4 deletions(-)

Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c	2020-01-14 16:11:09.000000000 +0100
+++ linux-2.6/drivers/md/dm-writecache.c	2020-01-14 21:42:44.000000000 +0100
@@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
 	wc->freelist_size++;
 }
 
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
 {
 	struct wc_entry *e;
 
@@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
 		if (unlikely(!wc->current_free))
 			return NULL;
 		e = wc->current_free;
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		next = rb_next(&e->rb_node);
 		rb_erase(&e->rb_node, &wc->freetree);
 		if (unlikely(!next))
@@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
 		if (unlikely(list_empty(&wc->freelist)))
 			return NULL;
 		e = container_of(wc->freelist.next, struct wc_entry, lru);
+		if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+			return NULL;
 		list_del(&e->lru);
 	}
 	wc->freelist_size--;
@@ -1194,7 +1198,7 @@ read_next_block:
 					goto bio_copy;
 				}
 			}
-			e = writecache_pop_from_freelist(wc);
+			e = writecache_pop_from_freelist(wc, (sector_t)-1);
 			if (unlikely(!e)) {
 				writecache_wait_on_freelist(wc);
 				continue;
@@ -1206,9 +1210,25 @@ bio_copy:
 			if (WC_MODE_PMEM(wc)) {
 				bio_copy_block(wc, bio, memory_data(wc, e));
 			} else {
-				dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+				unsigned bio_size = wc->block_size;
+				sector_t start_cache_sec = cache_sector(wc, e);
+				sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+				while (bio_size < bio->bi_iter.bi_size) {
+					struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+					if (!f)
+						break;
+					write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
+					writecache_insert_entry(wc, f);
+					wc->uncommitted_blocks++;
+					bio_size += wc->block_size;
+					current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+				}
+
 				bio_set_dev(bio, wc->ssd_dev->bdev);
-				bio->bi_iter.bi_sector = cache_sector(wc, e);
+				bio->bi_iter.bi_sector = start_cache_sec;
+				dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
 				if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
 					wc->uncommitted_blocks = 0;
 					queue_work(wc->writeback_wq, &wc->flush_work);
