* Re: [PATCH] dm-writecache: improve performance of large linear writes on SSDs
[not found] <b0341161-1a49-d03a-d096-abf3a81d319c@petasan.org>
@ 2020-01-16 22:47 ` Maged Mokhtar
0 siblings, 0 replies; 2+ messages in thread
From: Maged Mokhtar @ 2020-01-16 22:47 UTC (permalink / raw)
To: Mikulas Patocka, dm-devel
> From: Mikulas Patocka <mpatocka redhat com>
> To: Mike Snitzer <msnitzer redhat com>, Nikhil Kshirsagar <nkshirsa
> redhat com>
> Cc: dm-devel redhat com
> Subject: [dm-devel] [PATCH] dm-writecache: improve performance of large
> linear writes on SSDs
> Date: Wed, 15 Jan 2020 04:35:22 -0500 (EST)
>
> When dm-writecache is used with SSD as a cache device, it would submit
> a separate bio for each written block. The I/Os would be merged by the
> disk scheduler, but this merging degrades performance.
>
> This patch makes dm-writecache submit larger bios - we can submit large
> bio as long as there is consecutive free space on the cache device.
>
> Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):
>
> fio --bs=512k --iodepth=32 --size=400M --direct=1
> --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test
>
> block old new
> size MiB/s MiB/s
> ---------------------
> 512 181 700
> 1k 347 1256
> 2k 644 2020
> 4k 1183 2759
> 8k 1852 3333
> 16k 2469 3509
> 32k 2974 3670
> 64k 3404 3810
>
> Signed-off-by: Mikulas Patocka <mpatocka redhat com>
>
> ---
> drivers/md/dm-writecache.c | 28 ++++++++++++++++++++++++----
> 1 file changed, 24 insertions(+), 4 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-14
> 16:11:09.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c 2020-01-14
> 21:42:44.000000000 +0100
> @@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
> wc->freelist_size++;
> }
>
> -static struct wc_entry *writecache_pop_from_freelist(struct
> dm_writecache *wc)
> +static struct wc_entry *writecache_pop_from_freelist(struct
> dm_writecache *wc, sector_t expected_sector)
> {
> struct wc_entry *e;
>
> @@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
> if (unlikely(!wc->current_free))
> return NULL;
> e = wc->current_free;
> + if (expected_sector != (sector_t)-1 &&
> unlikely(cache_sector(wc, e) != expected_sector))
> + return NULL;
> next = rb_next(&e->rb_node);
> rb_erase(&e->rb_node, &wc->freetree);
> if (unlikely(!next))
> @@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
> if (unlikely(list_empty(&wc->freelist)))
> return NULL;
> e = container_of(wc->freelist.next, struct wc_entry, lru);
> + if (expected_sector != (sector_t)-1 &&
> unlikely(cache_sector(wc, e) != expected_sector))
> + return NULL;
> list_del(&e->lru);
> }
> wc->freelist_size--;
> @@ -1194,7 +1198,7 @@ read_next_block:
> goto bio_copy;
> }
> }
> - e = writecache_pop_from_freelist(wc);
> + e = writecache_pop_from_freelist(wc, (sector_t)-1);
> if (unlikely(!e)) {
> writecache_wait_on_freelist(wc);
> continue;
> @@ -1206,9 +1210,25 @@ bio_copy:
> if (WC_MODE_PMEM(wc)) {
> bio_copy_block(wc, bio, memory_data(wc, e));
> } else {
> - dm_accept_partial_bio(bio, wc->block_size >>
> SECTOR_SHIFT);
> + unsigned bio_size = wc->block_size;
> + sector_t start_cache_sec = cache_sector(wc, e);
> + sector_t current_cache_sec = start_cache_sec +
> (bio_size >> SECTOR_SHIFT);
> +
> + while (bio_size < bio->bi_iter.bi_size) {
> + struct wc_entry *f =
> writecache_pop_from_freelist(wc, current_cache_sec);
> + if (!f)
> + break;
> + write_original_sector_seq_count(wc, f,
> bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
> + writecache_insert_entry(wc, f);
> + wc->uncommitted_blocks++;
> + bio_size += wc->block_size;
> + current_cache_sec += wc->block_size >> SECTOR_SHIFT;
> + }
> +
> bio_set_dev(bio, wc->ssd_dev->bdev);
> - bio->bi_iter.bi_sector = cache_sector(wc, e);
> + bio->bi_iter.bi_sector = start_cache_sec;
> + dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
> +
> if (unlikely(wc->uncommitted_blocks >=
> wc->autocommit_blocks)) {
> wc->uncommitted_blocks = 0;
> queue_work(wc->writeback_wq, &wc->flush_work);
The speed gain looks quite good.
One concern is that over time the free list may become fragmented,
making it harder for the current free entry to have consecutive free
blocks.
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
^ permalink raw reply [flat|nested] 2+ messages in thread
* [PATCH] dm-writecache: improve performance of large linear writes on SSDs
@ 2020-01-15 9:35 Mikulas Patocka
0 siblings, 0 replies; 2+ messages in thread
From: Mikulas Patocka @ 2020-01-15 9:35 UTC (permalink / raw)
To: Mike Snitzer, Nikhil Kshirsagar; +Cc: dm-devel
When dm-writecache is used with SSD as a cache device, it would submit
a separate bio for each written block. The I/Os would be merged by the disk
scheduler, but this merging degrades performance.
This patch makes dm-writecache submit larger bios - we can submit large
bio as long as there is consecutive free space on the cache device.
Benchmark (arm64 with 64k page size, using /dev/ram0 as a cache device):
fio --bs=512k --iodepth=32 --size=400M --direct=1 --filename=/dev/mapper/cache --rw=randwrite --numjobs=1 --name=test
block old new
size MiB/s MiB/s
---------------------
512 181 700
1k 347 1256
2k 644 2020
4k 1183 2759
8k 1852 3333
16k 2469 3509
32k 2974 3670
64k 3404 3810
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
drivers/md/dm-writecache.c | 28 ++++++++++++++++++++++++----
1 file changed, 24 insertions(+), 4 deletions(-)
Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-14 16:11:09.000000000 +0100
+++ linux-2.6/drivers/md/dm-writecache.c 2020-01-14 21:42:44.000000000 +0100
@@ -626,7 +626,7 @@ static void writecache_add_to_freelist(s
wc->freelist_size++;
}
-static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc)
+static struct wc_entry *writecache_pop_from_freelist(struct dm_writecache *wc, sector_t expected_sector)
{
struct wc_entry *e;
@@ -635,6 +635,8 @@ static struct wc_entry *writecache_pop_f
if (unlikely(!wc->current_free))
return NULL;
e = wc->current_free;
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
next = rb_next(&e->rb_node);
rb_erase(&e->rb_node, &wc->freetree);
if (unlikely(!next))
@@ -644,6 +646,8 @@ static struct wc_entry *writecache_pop_f
if (unlikely(list_empty(&wc->freelist)))
return NULL;
e = container_of(wc->freelist.next, struct wc_entry, lru);
+ if (expected_sector != (sector_t)-1 && unlikely(cache_sector(wc, e) != expected_sector))
+ return NULL;
list_del(&e->lru);
}
wc->freelist_size--;
@@ -1194,7 +1198,7 @@ read_next_block:
goto bio_copy;
}
}
- e = writecache_pop_from_freelist(wc);
+ e = writecache_pop_from_freelist(wc, (sector_t)-1);
if (unlikely(!e)) {
writecache_wait_on_freelist(wc);
continue;
@@ -1206,9 +1210,25 @@ bio_copy:
if (WC_MODE_PMEM(wc)) {
bio_copy_block(wc, bio, memory_data(wc, e));
} else {
- dm_accept_partial_bio(bio, wc->block_size >> SECTOR_SHIFT);
+ unsigned bio_size = wc->block_size;
+ sector_t start_cache_sec = cache_sector(wc, e);
+ sector_t current_cache_sec = start_cache_sec + (bio_size >> SECTOR_SHIFT);
+
+ while (bio_size < bio->bi_iter.bi_size) {
+ struct wc_entry *f = writecache_pop_from_freelist(wc, current_cache_sec);
+ if (!f)
+ break;
+ write_original_sector_seq_count(wc, f, bio->bi_iter.bi_sector + (bio_size >> SECTOR_SHIFT), wc->seq_count);
+ writecache_insert_entry(wc, f);
+ wc->uncommitted_blocks++;
+ bio_size += wc->block_size;
+ current_cache_sec += wc->block_size >> SECTOR_SHIFT;
+ }
+
bio_set_dev(bio, wc->ssd_dev->bdev);
- bio->bi_iter.bi_sector = cache_sector(wc, e);
+ bio->bi_iter.bi_sector = start_cache_sec;
+ dm_accept_partial_bio(bio, bio_size >> SECTOR_SHIFT);
+
if (unlikely(wc->uncommitted_blocks >= wc->autocommit_blocks)) {
wc->uncommitted_blocks = 0;
queue_work(wc->writeback_wq, &wc->flush_work);
^ permalink raw reply [flat|nested] 2+ messages in thread
end of thread, other threads:[~2020-01-16 22:47 UTC | newest]
Thread overview: 2+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
[not found] <b0341161-1a49-d03a-d096-abf3a81d319c@petasan.org>
2020-01-16 22:47 ` [PATCH] dm-writecache: improve performance of large linear writes on SSDs Maged Mokhtar
2020-01-15 9:35 Mikulas Patocka
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.