* [PATCH] dm writecache: SB remove seq_count
@ 2019-12-06 19:03 Maged Mokhtar
2020-01-02 15:37 ` Maged Mokhtar
0 siblings, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2019-12-06 19:03 UTC (permalink / raw)
To: mpatocka; +Cc: dm-devel
Removes seq_count from super block. Currently the super block gets
written in each commit to update the seq_count which is just used when
the target is restarted/resumed. This extra iop has a performance impact
on small block size writes which do FUA/sync on every request. A 4k sync
write currently requires 3 write ops: submitted data, metadata + super
block seq_count update, removal of seq_count update reduces required
write ops to 2.
Rebuild of seq_count at start/resumption can be done quickly by looping
through memory entry metadata within the resume() function.
Signed-off-by: Maged Mokhtar <mmokhtar@petasan.org>
---
drivers/md/dm-writecache.c | 56 ++++++++++++++++++++++++++---------
1 file changed, 42 insertions(+), 14 deletions(-)
--- a/drivers/md/dm-writecache.c 2019-12-06 03:07:53.000000000 -0800
+++ b/drivers/md/dm-writecache.c 2019-12-06 09:25:45.000000000 -0800
@@ -52,7 +52,8 @@ do { \
#endif
#define MEMORY_SUPERBLOCK_MAGIC 0x23489321
-#define MEMORY_SUPERBLOCK_VERSION 1
+#define MEMORY_SUPERBLOCK_VERSION_1 1
+#define MEMORY_SUPERBLOCK_VERSION_2 2
struct wc_memory_entry {
__le64 original_sector;
@@ -67,7 +68,6 @@ struct wc_memory_superblock {
__le32 block_size;
__le32 pad;
__le64 n_blocks;
- __le64 seq_count;
};
__le64 padding[8];
};
@@ -380,6 +380,41 @@ static uint64_t read_seq_count(struct dm
#endif
}
+static uint64_t read_last_seq_count(struct dm_writecache *wc)
+{
+ size_t b;
+ uint64_t last_seq_count = 0;
+ uint64_t seq_count;
+ __le64 empty = cpu_to_le64(-1);
+
+ if (WC_MODE_PMEM(wc)) {
+ struct wc_memory_entry wme;
+ for (b = 0; b < wc->n_blocks; b++) {
+ BUG_ON(memcpy_mcsafe(&wme, &sb(wc)->entries[b],
+ sizeof(struct wc_memory_entry)));
+ if (wme.seq_count != empty) {
+ seq_count = le64_to_cpu(wme.seq_count);
+ if (last_seq_count < seq_count)
+ last_seq_count = seq_count;
+ }
+ }
+ }
+ else {
+ struct wc_memory_entry *p = &sb(wc)->entries[0];
+ b = wc->n_blocks;
+ while (0 < b) {
+ if (p->seq_count != empty) {
+ seq_count = le64_to_cpu(p->seq_count);
+ if (last_seq_count < seq_count)
+ last_seq_count = seq_count;
+ }
+ p++;
+ b--;
+ }
+ }
+ return last_seq_count;
+}
+
static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
{
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
@@ -730,8 +765,6 @@ static void writecache_flush(struct dm_w
writecache_wait_for_ios(wc, WRITE);
wc->seq_count++;
- pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
- writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
writecache_commit_flushed(wc);
wc->overwrote_committed = false;
@@ -876,7 +909,6 @@ static void writecache_resume(struct dm_
struct dm_writecache *wc = ti->private;
size_t b;
bool need_flush = false;
- __le64 sb_seq_count;
int r;
wc_lock(wc);
@@ -894,12 +926,7 @@ static void writecache_resume(struct dm_
}
wc->freelist_size = 0;
- r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
- if (r) {
- writecache_error(wc, r, "hardware memory error when reading
superblock: %d", r);
- sb_seq_count = cpu_to_le64(0);
- }
- wc->seq_count = le64_to_cpu(sb_seq_count);
+ wc->seq_count = read_last_seq_count(wc) + 1;
#ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
for (b = 0; b < wc->n_blocks; b++) {
@@ -1757,10 +1784,9 @@ static int init_memory(struct dm_writeca
for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
- pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
+ pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
- pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
for (b = 0; b < wc->n_blocks; b++)
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
@@ -2159,11 +2185,13 @@ invalid_optional:
goto bad;
}
- if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
+ if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_1 &&
+ le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_2) {
ti->error = "Invalid version in the superblock";
r = -EINVAL;
goto bad;
}
+ pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
if (le32_to_cpu(s.block_size) != wc->block_size) {
ti->error = "Block size does not match superblock";
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2019-12-06 19:03 [PATCH] dm writecache: SB remove seq_count Maged Mokhtar
@ 2020-01-02 15:37 ` Maged Mokhtar
2020-01-08 13:54 ` Mikulas Patocka
0 siblings, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-02 15:37 UTC (permalink / raw)
To: mpatocka; +Cc: dm-devel
Any feedback on this patch please.
On 06/12/2019 21:03, Maged Mokhtar wrote:
> Removes seq_count from super block. Currently the super block gets
> written in each commit to update the seq_count which is just used when
> the target is restarted/resumed. This extra iop has a performance impact
> on small block size writes which do FUA/sync on every request. A 4k sync
> write currently requires 3 write ops: submitted data, metadata + super
> block seq_count update, removal of seq_count update reduces required
> write ops to 2.
>
> Rebuild of seq_count at start/resumption can be done quickly by looping
> through memory entry metadata within the resume() function.
>
> Signed-off-by: Maged Mokhtar <mmokhtar@petasan.org>
> ---
> drivers/md/dm-writecache.c | 56 ++++++++++++++++++++++++++---------
> 1 file changed, 42 insertions(+), 14 deletions(-)
>
> --- a/drivers/md/dm-writecache.c 2019-12-06 03:07:53.000000000 -0800
> +++ b/drivers/md/dm-writecache.c 2019-12-06 09:25:45.000000000 -0800
> @@ -52,7 +52,8 @@ do { \
> #endif
>
> #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
> -#define MEMORY_SUPERBLOCK_VERSION 1
> +#define MEMORY_SUPERBLOCK_VERSION_1 1
> +#define MEMORY_SUPERBLOCK_VERSION_2 2
>
> struct wc_memory_entry {
> __le64 original_sector;
> @@ -67,7 +68,6 @@ struct wc_memory_superblock {
> __le32 block_size;
> __le32 pad;
> __le64 n_blocks;
> - __le64 seq_count;
> };
> __le64 padding[8];
> };
> @@ -380,6 +380,41 @@ static uint64_t read_seq_count(struct dm
> #endif
> }
>
> +static uint64_t read_last_seq_count(struct dm_writecache *wc)
> +{
> + size_t b;
> + uint64_t last_seq_count = 0;
> + uint64_t seq_count;
> + __le64 empty = cpu_to_le64(-1);
> +
> + if (WC_MODE_PMEM(wc)) {
> + struct wc_memory_entry wme;
> + for (b = 0; b < wc->n_blocks; b++) {
> + BUG_ON(memcpy_mcsafe(&wme, &sb(wc)->entries[b],
> + sizeof(struct wc_memory_entry)));
> + if (wme.seq_count != empty) {
> + seq_count = le64_to_cpu(wme.seq_count);
> + if (last_seq_count < seq_count)
> + last_seq_count = seq_count;
> + }
> + }
> + }
> + else {
> + struct wc_memory_entry *p = &sb(wc)->entries[0];
> + b = wc->n_blocks;
> + while (0 < b) {
> + if (p->seq_count != empty) {
> + seq_count = le64_to_cpu(p->seq_count);
> + if (last_seq_count < seq_count)
> + last_seq_count = seq_count;
> + }
> + p++;
> + b--;
> + }
> + }
> + return last_seq_count;
> +}
> +
> static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
> {
> #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
> @@ -730,8 +765,6 @@ static void writecache_flush(struct dm_w
> writecache_wait_for_ios(wc, WRITE);
>
> wc->seq_count++;
> - pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
> - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof
> sb(wc)->seq_count);
> writecache_commit_flushed(wc);
>
> wc->overwrote_committed = false;
> @@ -876,7 +909,6 @@ static void writecache_resume(struct dm_
> struct dm_writecache *wc = ti->private;
> size_t b;
> bool need_flush = false;
> - __le64 sb_seq_count;
> int r;
>
> wc_lock(wc);
> @@ -894,12 +926,7 @@ static void writecache_resume(struct dm_
> }
> wc->freelist_size = 0;
>
> - r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count,
> sizeof(uint64_t));
> - if (r) {
> - writecache_error(wc, r, "hardware memory error when reading
> superblock: %d", r);
> - sb_seq_count = cpu_to_le64(0);
> - }
> - wc->seq_count = le64_to_cpu(sb_seq_count);
> + wc->seq_count = read_last_seq_count(wc) + 1;
>
> #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
> for (b = 0; b < wc->n_blocks; b++) {
> @@ -1757,10 +1784,9 @@ static int init_memory(struct dm_writeca
>
> for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
> pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
> - pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
> + pmem_assign(sb(wc)->version,
> cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
> pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
> pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
> - pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
>
> for (b = 0; b < wc->n_blocks; b++)
> write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
> @@ -2159,11 +2185,13 @@ invalid_optional:
> goto bad;
> }
>
> - if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
> + if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_1 &&
> + le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_2) {
> ti->error = "Invalid version in the superblock";
> r = -EINVAL;
> goto bad;
> }
> + pmem_assign(sb(wc)->version,
> cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
>
> if (le32_to_cpu(s.block_size) != wc->block_size) {
> ti->error = "Block size does not match superblock";
--
Maged Mokhtar
CEO PetaSAN
4 Emad El Deen Kamel
Cairo 11371, Egypt
www.petasan.org
+201006979931
skype: maged.mokhtar
--
dm-devel mailing list
dm-devel@redhat.com
https://www.redhat.com/mailman/listinfo/dm-devel
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-02 15:37 ` Maged Mokhtar
@ 2020-01-08 13:54 ` Mikulas Patocka
2020-01-08 17:04 ` Mikulas Patocka
2020-01-13 20:02 ` Maged Mokhtar
0 siblings, 2 replies; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-08 13:54 UTC (permalink / raw)
To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer
[-- Attachment #1: Type: TEXT/PLAIN, Size: 6775 bytes --]
Hi
On Thu, 2 Jan 2020, Maged Mokhtar wrote:
> Any feedback on this patch please.
This will definitely not work for persistent memory - it could corrupt
data if a crash happens. The CPU can flush data in arbitrary order and it
may happen that the seq count is flushed before the pertaining data.
As for SSD mode - we could avoid updating the refcount in the superblock,
but it wouldn't be much helpful.
I.e. normally, commit is done this way:
1. submit data writes
2. submit metadata writes
3. flush disk cache
4. submit the write of superblock with increased seq_count
5. flush disk cache
If we wanted to avoid writing the seq_count, we would need to change it
to:
1. submit data writes
2. flush disk cache
3. submit metadata writes
4. flush disk cache
- i.e. it still needs two disk cache flushes per one commit request - and
it is not much better than the existing solution.
Mikulas
> On 06/12/2019 21:03, Maged Mokhtar wrote:
> > Removes seq_count from super block. Currently the super block gets written
> > in each commit to update the seq_count which is just used when the target is
> > restarted/resumed. This extra iop has a performance impact on small block
> > size writes which do FUA/sync on every request. A 4k sync write currently
> > requires 3 write ops: submitted data, metadata + super block seq_count
> > update, removal of seq_count update reduces required write ops to 2.
> >
> > Rebuild of seq_count at start/resumption can be done quickly by looping
> > through memory entry metadata within the resume() function.
> >
> > Signed-off-by: Maged Mokhtar <mmokhtar@petasan.org>
> > ---
> > drivers/md/dm-writecache.c | 56 ++++++++++++++++++++++++++---------
> > 1 file changed, 42 insertions(+), 14 deletions(-)
> >
> > --- a/drivers/md/dm-writecache.c 2019-12-06 03:07:53.000000000 -0800
> > +++ b/drivers/md/dm-writecache.c 2019-12-06 09:25:45.000000000 -0800
> > @@ -52,7 +52,8 @@ do { \
> > #endif
> >
> > #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
> > -#define MEMORY_SUPERBLOCK_VERSION 1
> > +#define MEMORY_SUPERBLOCK_VERSION_1 1
> > +#define MEMORY_SUPERBLOCK_VERSION_2 2
> >
> > struct wc_memory_entry {
> > __le64 original_sector;
> > @@ -67,7 +68,6 @@ struct wc_memory_superblock {
> > __le32 block_size;
> > __le32 pad;
> > __le64 n_blocks;
> > - __le64 seq_count;
> > };
> > __le64 padding[8];
> > };
> > @@ -380,6 +380,41 @@ static uint64_t read_seq_count(struct dm
> > #endif
> > }
> >
> > +static uint64_t read_last_seq_count(struct dm_writecache *wc)
> > +{
> > + size_t b;
> > + uint64_t last_seq_count = 0;
> > + uint64_t seq_count;
> > + __le64 empty = cpu_to_le64(-1);
> > +
> > + if (WC_MODE_PMEM(wc)) {
> > + struct wc_memory_entry wme;
> > + for (b = 0; b < wc->n_blocks; b++) {
> > + BUG_ON(memcpy_mcsafe(&wme, &sb(wc)->entries[b],
> > + sizeof(struct wc_memory_entry)));
> > + if (wme.seq_count != empty) {
> > + seq_count = le64_to_cpu(wme.seq_count);
> > + if (last_seq_count < seq_count)
> > + last_seq_count = seq_count;
> > + }
> > + }
> > + }
> > + else {
> > + struct wc_memory_entry *p = &sb(wc)->entries[0];
> > + b = wc->n_blocks;
> > + while (0 < b) {
> > + if (p->seq_count != empty) {
> > + seq_count = le64_to_cpu(p->seq_count);
> > + if (last_seq_count < seq_count)
> > + last_seq_count = seq_count;
> > + }
> > + p++;
> > + b--;
> > + }
> > + }
> > + return last_seq_count;
> > +}
> > +
> > static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
> > {
> > #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
> > @@ -730,8 +765,6 @@ static void writecache_flush(struct dm_w
> > writecache_wait_for_ios(wc, WRITE);
> >
> > wc->seq_count++;
> > - pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
> > - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof
> > sb(wc)->seq_count);
> > writecache_commit_flushed(wc);
> >
> > wc->overwrote_committed = false;
> > @@ -876,7 +909,6 @@ static void writecache_resume(struct dm_
> > struct dm_writecache *wc = ti->private;
> > size_t b;
> > bool need_flush = false;
> > - __le64 sb_seq_count;
> > int r;
> >
> > wc_lock(wc);
> > @@ -894,12 +926,7 @@ static void writecache_resume(struct dm_
> > }
> > wc->freelist_size = 0;
> >
> > - r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
> > - if (r) {
> > - writecache_error(wc, r, "hardware memory error when reading
> > superblock: %d", r);
> > - sb_seq_count = cpu_to_le64(0);
> > - }
> > - wc->seq_count = le64_to_cpu(sb_seq_count);
> > + wc->seq_count = read_last_seq_count(wc) + 1;
> >
> > #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
> > for (b = 0; b < wc->n_blocks; b++) {
> > @@ -1757,10 +1784,9 @@ static int init_memory(struct dm_writeca
> >
> > for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
> > pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
> > - pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
> > + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
> > pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
> > pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
> > - pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
> >
> > for (b = 0; b < wc->n_blocks; b++)
> > write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
> > @@ -2159,11 +2185,13 @@ invalid_optional:
> > goto bad;
> > }
> >
> > - if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
> > + if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_1 &&
> > + le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_2) {
> > ti->error = "Invalid version in the superblock";
> > r = -EINVAL;
> > goto bad;
> > }
> > + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
> >
> > if (le32_to_cpu(s.block_size) != wc->block_size) {
> > ti->error = "Block size does not match superblock";
>
> --
> Maged Mokhtar
> CEO PetaSAN
> 4 Emad El Deen Kamel
> Cairo 11371, Egypt
> www.petasan.org
> +201006979931
> skype: maged.mokhtar
>
[-- Attachment #2: Type: text/plain, Size: 0 bytes --]
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-08 13:54 ` Mikulas Patocka
@ 2020-01-08 17:04 ` Mikulas Patocka
2020-01-08 19:14 ` Maged Mokhtar
2020-01-13 19:35 ` Maged Mokhtar
2020-01-13 20:02 ` Maged Mokhtar
1 sibling, 2 replies; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-08 17:04 UTC (permalink / raw)
To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer
BTW. I would be interested if this patch improves performance for you.
Could you test it?
(you also need my previous patch posted here
https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )
Mikulas
dm-writecache: use REQ_FUA when writing the superblock
When writing the superblock, it may be better to submit just one I/O with
the FUA bit set instead of two I/Os.
Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
---
drivers/md/dm-writecache.c | 29 +++++++++++++++--------------
1 file changed, 15 insertions(+), 14 deletions(-)
Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-08 17:56:41.000000000 +0100
+++ linux-2.6/drivers/md/dm-writecache.c 2020-01-08 17:56:49.000000000 +0100
@@ -448,7 +448,7 @@ static void writecache_wait_for_ios(stru
!atomic_read(&wc->bio_in_progress[direction]));
}
-static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
+static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
{
struct dm_io_region region;
struct dm_io_request req;
@@ -479,7 +479,7 @@ static void ssd_commit_flushed(struct dm
region.sector += wc->start_sector;
atomic_inc(&endio.count);
req.bi_op = REQ_OP_WRITE;
- req.bi_op_flags = REQ_SYNC;
+ req.bi_op_flags = REQ_SYNC | (use_fua ? REQ_FUA : 0);
req.mem.type = DM_IO_VMA;
req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
req.client = wc->dm_io;
@@ -497,17 +497,18 @@ static void ssd_commit_flushed(struct dm
if (wait_for_ios)
writecache_wait_for_ios(wc, WRITE);
- writecache_disk_flush(wc, wc->ssd_dev);
+ if (!use_fua)
+ writecache_disk_flush(wc, wc->ssd_dev);
memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
}
-static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
+static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
{
if (WC_MODE_PMEM(wc))
wmb();
else
- ssd_commit_flushed(wc, wait_for_ios);
+ ssd_commit_flushed(wc, wait_for_ios, use_fua);
}
static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
@@ -727,12 +728,12 @@ static void writecache_flush(struct dm_w
e = e2;
cond_resched();
}
- writecache_commit_flushed(wc, true);
+ writecache_commit_flushed(wc, true, false);
wc->seq_count++;
pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, true);
wc->overwrote_committed = false;
@@ -756,7 +757,7 @@ static void writecache_flush(struct dm_w
}
if (need_flush_after_free)
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
}
static void writecache_flush_work(struct work_struct *work)
@@ -809,7 +810,7 @@ static void writecache_discard(struct dm
}
if (discarded_something)
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
}
static bool writecache_wait_for_writeback(struct dm_writecache *wc)
@@ -958,7 +959,7 @@ erase_this:
if (need_flush) {
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
}
wc_unlock(wc);
@@ -1342,7 +1343,7 @@ static void __writecache_endio_pmem(stru
wc->writeback_size--;
n_walked++;
if (unlikely(n_walked >= ENDIO_LATENCY)) {
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
wc_unlock(wc);
wc_lock(wc);
n_walked = 0;
@@ -1423,7 +1424,7 @@ pop_from_list:
writecache_wait_for_ios(wc, READ);
}
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
wc_unlock(wc);
}
@@ -1766,10 +1767,10 @@ static int init_memory(struct dm_writeca
write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
writecache_flush_all_metadata(wc);
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
- writecache_commit_flushed(wc, false);
+ writecache_commit_flushed(wc, false, false);
return 0;
}
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-08 17:04 ` Mikulas Patocka
@ 2020-01-08 19:14 ` Maged Mokhtar
2020-01-13 19:35 ` Maged Mokhtar
1 sibling, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-08 19:14 UTC (permalink / raw)
To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer
Yes sure. I will send you something early next week.
/Maged
On 08/01/2020 19:04, Mikulas Patocka wrote:
> BTW. I would be interested if this patch improves performance for you.
> Could you test it?
>
> (you also need my previous patch posted here
> https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )
>
> Mikulas
>
>
>
>
> dm-writecache: use REQ_FUA when writing the superblock
>
> When writing the superblock, it may be better to submit just one I/O with
> the FUA bit set instead of two I/Os.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>
> ---
> drivers/md/dm-writecache.c | 29 +++++++++++++++--------------
> 1 file changed, 15 insertions(+), 14 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-08 17:56:41.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c 2020-01-08 17:56:49.000000000 +0100
> @@ -448,7 +448,7 @@ static void writecache_wait_for_ios(stru
> !atomic_read(&wc->bio_in_progress[direction]));
> }
>
> -static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
> {
> struct dm_io_region region;
> struct dm_io_request req;
> @@ -479,7 +479,7 @@ static void ssd_commit_flushed(struct dm
> region.sector += wc->start_sector;
> atomic_inc(&endio.count);
> req.bi_op = REQ_OP_WRITE;
> - req.bi_op_flags = REQ_SYNC;
> + req.bi_op_flags = REQ_SYNC | (use_fua ? REQ_FUA : 0);
> req.mem.type = DM_IO_VMA;
> req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
> req.client = wc->dm_io;
> @@ -497,17 +497,18 @@ static void ssd_commit_flushed(struct dm
> if (wait_for_ios)
> writecache_wait_for_ios(wc, WRITE);
>
> - writecache_disk_flush(wc, wc->ssd_dev);
> + if (!use_fua)
> + writecache_disk_flush(wc, wc->ssd_dev);
>
> memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
> }
>
> -static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
> {
> if (WC_MODE_PMEM(wc))
> wmb();
> else
> - ssd_commit_flushed(wc, wait_for_ios);
> + ssd_commit_flushed(wc, wait_for_ios, use_fua);
> }
>
> static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
> @@ -727,12 +728,12 @@ static void writecache_flush(struct dm_w
> e = e2;
> cond_resched();
> }
> - writecache_commit_flushed(wc, true);
> + writecache_commit_flushed(wc, true, false);
>
> wc->seq_count++;
> pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
> writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, true);
>
> wc->overwrote_committed = false;
>
> @@ -756,7 +757,7 @@ static void writecache_flush(struct dm_w
> }
>
> if (need_flush_after_free)
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> static void writecache_flush_work(struct work_struct *work)
> @@ -809,7 +810,7 @@ static void writecache_discard(struct dm
> }
>
> if (discarded_something)
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> static bool writecache_wait_for_writeback(struct dm_writecache *wc)
> @@ -958,7 +959,7 @@ erase_this:
>
> if (need_flush) {
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> wc_unlock(wc);
> @@ -1342,7 +1343,7 @@ static void __writecache_endio_pmem(stru
> wc->writeback_size--;
> n_walked++;
> if (unlikely(n_walked >= ENDIO_LATENCY)) {
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> wc_unlock(wc);
> wc_lock(wc);
> n_walked = 0;
> @@ -1423,7 +1424,7 @@ pop_from_list:
> writecache_wait_for_ios(wc, READ);
> }
>
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
>
> wc_unlock(wc);
> }
> @@ -1766,10 +1767,10 @@ static int init_memory(struct dm_writeca
> write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
>
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
> writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
>
> return 0;
> }
>
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-08 17:04 ` Mikulas Patocka
2020-01-08 19:14 ` Maged Mokhtar
@ 2020-01-13 19:35 ` Maged Mokhtar
1 sibling, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-13 19:35 UTC (permalink / raw)
To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer
On 08/01/2020 19:04, Mikulas Patocka wrote:
> BTW. I would be interested if this patch improves performance for you.
> Could you test it?
>
> (you also need my previous patch posted here
> https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )
>
> Mikulas
>
>
>
>
> dm-writecache: use REQ_FUA when writing the superblock
>
> When writing the superblock, it may be better to submit just one I/O with
> the FUA bit set instead of two I/Os.
>
> Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
>
> ---
> drivers/md/dm-writecache.c | 29 +++++++++++++++--------------
> 1 file changed, 15 insertions(+), 14 deletions(-)
>
> Index: linux-2.6/drivers/md/dm-writecache.c
> ===================================================================
> --- linux-2.6.orig/drivers/md/dm-writecache.c 2020-01-08 17:56:41.000000000 +0100
> +++ linux-2.6/drivers/md/dm-writecache.c 2020-01-08 17:56:49.000000000 +0100
> @@ -448,7 +448,7 @@ static void writecache_wait_for_ios(stru
> !atomic_read(&wc->bio_in_progress[direction]));
> }
>
> -static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> +static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
> {
> struct dm_io_region region;
> struct dm_io_request req;
> @@ -479,7 +479,7 @@ static void ssd_commit_flushed(struct dm
> region.sector += wc->start_sector;
> atomic_inc(&endio.count);
> req.bi_op = REQ_OP_WRITE;
> - req.bi_op_flags = REQ_SYNC;
> + req.bi_op_flags = REQ_SYNC | (use_fua ? REQ_FUA : 0);
> req.mem.type = DM_IO_VMA;
> req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
> req.client = wc->dm_io;
> @@ -497,17 +497,18 @@ static void ssd_commit_flushed(struct dm
> if (wait_for_ios)
> writecache_wait_for_ios(wc, WRITE);
>
> - writecache_disk_flush(wc, wc->ssd_dev);
> + if (!use_fua)
> + writecache_disk_flush(wc, wc->ssd_dev);
>
> memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
> }
>
> -static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
> +static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
> {
> if (WC_MODE_PMEM(wc))
> wmb();
> else
> - ssd_commit_flushed(wc, wait_for_ios);
> + ssd_commit_flushed(wc, wait_for_ios, use_fua);
> }
>
> static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
> @@ -727,12 +728,12 @@ static void writecache_flush(struct dm_w
> e = e2;
> cond_resched();
> }
> - writecache_commit_flushed(wc, true);
> + writecache_commit_flushed(wc, true, false);
>
> wc->seq_count++;
> pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
> writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, true);
>
> wc->overwrote_committed = false;
>
> @@ -756,7 +757,7 @@ static void writecache_flush(struct dm_w
> }
>
> if (need_flush_after_free)
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> static void writecache_flush_work(struct work_struct *work)
> @@ -809,7 +810,7 @@ static void writecache_discard(struct dm
> }
>
> if (discarded_something)
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> static bool writecache_wait_for_writeback(struct dm_writecache *wc)
> @@ -958,7 +959,7 @@ erase_this:
>
> if (need_flush) {
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> }
>
> wc_unlock(wc);
> @@ -1342,7 +1343,7 @@ static void __writecache_endio_pmem(stru
> wc->writeback_size--;
> n_walked++;
> if (unlikely(n_walked >= ENDIO_LATENCY)) {
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> wc_unlock(wc);
> wc_lock(wc);
> n_walked = 0;
> @@ -1423,7 +1424,7 @@ pop_from_list:
> writecache_wait_for_ios(wc, READ);
> }
>
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
>
> wc_unlock(wc);
> }
> @@ -1766,10 +1767,10 @@ static int init_memory(struct dm_writeca
> write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
>
> writecache_flush_all_metadata(wc);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
> pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
> writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
> - writecache_commit_flushed(wc, false);
> + writecache_commit_flushed(wc, false, false);
>
> return 0;
> }
>
Hi Mikulas
On a test disk, performance is same with or without patch.
Using fio 4k rand write iops:
12 K iops with sync/fua = 0
2.7K iops with sync/fua = 1
------- Without patch ----------
fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k
--direct=1 --sync=0 --runtime=10 --time_based --numjobs=16 --iodepth=1
--group_reporting
Run status group 0 (all jobs):
WRITE: bw=46.9MiB/s (49.2MB/s), 46.9MiB/s-46.9MiB/s
(49.2MB/s-49.2MB/s), io=469MiB (492MB), run=10005-10005msec
fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k
--direct=1 --sync=1 --runtime=10 --time_based --numjobs=16 --iodepth=1
--group_reporting
Run status group 0 (all jobs):
WRITE: bw=10.7MiB/s (11.2MB/s), 10.7MiB/s-10.7MiB/s
(11.2MB/s-11.2MB/s), io=107MiB (112MB), run=10007-10007msec
------- With patch ----------
fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k
--direct=1 --sync=0 --runtime=10 --time_based --numjobs=16 --iodepth=1
--group_reporting
Run status group 0 (all jobs):
WRITE: bw=46.8MiB/s (49.0MB/s), 46.8MiB/s-46.8MiB/s
(49.0MB/s-49.0MB/s), io=468MiB (491MB), run=10004-10004msec
fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k
--direct=1 --sync=1 --runtime=10 --time_based --numjobs=16 --iodepth=1
--group_reporting
Run status group 0 (all jobs):
WRITE: bw=10.8MiB/s (11.3MB/s), 10.8MiB/s-10.8MiB/s
(11.3MB/s-11.3MB/s), io=108MiB (113MB), run=10005-10005msec
/Maged
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-08 13:54 ` Mikulas Patocka
2020-01-08 17:04 ` Mikulas Patocka
@ 2020-01-13 20:02 ` Maged Mokhtar
2020-01-14 14:50 ` Mikulas Patocka
1 sibling, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-13 20:02 UTC (permalink / raw)
To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer
> Hi
>
>
> On Thu, 2 Jan 2020, Maged Mokhtar wrote:
>
>> Any feedback on this patch please.
>
> This will definitely not work for persistent memory - it could corrupt
> data if a crash happens. The CPU can flush data in arbitrary order and it
> may happen that the seq count is flushed before the pertaining data.
>
> As for SSD mode - we could avoid updating the refcount in the superblock,
> but it wouldn't be much helpful.
>
> I.e. normally, commit is done this way:
> 1. submit data writes
> 2. submit metadata writes
> 3. flush disk cache
> 4. submit the write of superblock with increased seq_count
> 5. flush disk cache
>
> If we wanted to avoid writing the seq_count, we would need to change it
> to:
> 1. submit data writes
> 2. flush disk cache
> 3. submit metadata writes
> 4. flush disk cache
>
> - i.e. it still needs two disk cache flushes per one commit request - and
> it is not much better than the existing solution.
>
> Mikulas
>
Hi Mikulas,
I appreciate your review. For SSD mode, I see the advantages of SB
writes for handling crash recovery and agree with what you say. Note
however that after a crash a proper client should not assume the data is
valid on a device, only at the point it last issued a successful flush
should the data be consistent, after this it should not assume so. A
filesystem/db should handle journals/transaction at a higher level than
the device. But again anything we can do on the device/target to make
things more consistent, the better, so i agree there.
There is also a limit to what the current crash recovery code can do, as i
understand it if you have metadata already committed, their seq count is
not incremented for new io on the same blocks, the crash recovery code
will therefore not detect or recover cases where new data is written to
existing 4k blocks at the time of crash, some blocks will end up with
new data, others will not. Again this is my understanding so i could be
wrong.
I think if crash consistency needs to be enhanced, it should take into
account that most consumer/non-enterprise SSDs do not offer power loss
protection. For many such devices power loss can corrupt data that is
already written as they commit data in larger chunks via a
read/modify/erase/write cycle. It is particularly bad for metadata as it
could affect many data blocks. Maybe it could be good to have metadata
writes via transactions or journaling, dm-cache and thin provisioning do
something like this i think.
i also think your suggestion of:
> If we wanted to avoid writing the seq_count, we would need to change it
> to:
> 1. submit data writes
> 2. flush disk cache
> 3. submit metadata writes
> 4. flush disk cache
could be better in terms of prolonging SSD lifetime, as currently the
superblock gets much higher write frequency.
/Maged
>> On 06/12/2019 21:03, Maged Mokhtar wrote:
>>> Removes seq_count from super block. Currently the super block gets written
>>> in each commit to update the seq_count which is just used when the target is
>>> restarted/resumed. This extra iop has a performance impact on small block
>>> size writes which do FUA/sync on every request. A 4k sync write currently
>>> requires 3 write ops: submitted data, metadata + super block seq_count
>>> update, removal of seq_count update reduces required write ops to 2.
>>>
>>> Rebuild of seq_count at start/resumption can be done quickly by looping
>>> through memory entry metadata within the resume() function.
>>>
>>> Signed-off-by: Maged Mokhtar <mmokhtar@petasan.org>
>>> ---
>>> drivers/md/dm-writecache.c | 56 ++++++++++++++++++++++++++---------
>>> 1 file changed, 42 insertions(+), 14 deletions(-)
>>>
>>> --- a/drivers/md/dm-writecache.c 2019-12-06 03:07:53.000000000 -0800
>>> +++ b/drivers/md/dm-writecache.c 2019-12-06 09:25:45.000000000 -0800
>>> @@ -52,7 +52,8 @@ do { \
>>> #endif
>>>
>>> #define MEMORY_SUPERBLOCK_MAGIC 0x23489321
>>> -#define MEMORY_SUPERBLOCK_VERSION 1
>>> +#define MEMORY_SUPERBLOCK_VERSION_1 1
>>> +#define MEMORY_SUPERBLOCK_VERSION_2 2
>>>
>>> struct wc_memory_entry {
>>> __le64 original_sector;
>>> @@ -67,7 +68,6 @@ struct wc_memory_superblock {
>>> __le32 block_size;
>>> __le32 pad;
>>> __le64 n_blocks;
>>> - __le64 seq_count;
>>> };
>>> __le64 padding[8];
>>> };
>>> @@ -380,6 +380,41 @@ static uint64_t read_seq_count(struct dm
>>> #endif
>>> }
>>>
>>> +static uint64_t read_last_seq_count(struct dm_writecache *wc)
>>> +{
>>> + size_t b;
>>> + uint64_t last_seq_count = 0;
>>> + uint64_t seq_count;
>>> + __le64 empty = cpu_to_le64(-1);
>>> +
>>> + if (WC_MODE_PMEM(wc)) {
>>> + struct wc_memory_entry wme;
>>> + for (b = 0; b < wc->n_blocks; b++) {
>>> + BUG_ON(memcpy_mcsafe(&wme, &sb(wc)->entries[b],
>>> + sizeof(struct wc_memory_entry)));
>>> + if (wme.seq_count != empty) {
>>> + seq_count = le64_to_cpu(wme.seq_count);
>>> + if (last_seq_count < seq_count)
>>> + last_seq_count = seq_count;
>>> + }
>>> + }
>>> + }
>>> + else {
>>> + struct wc_memory_entry *p = &sb(wc)->entries[0];
>>> + b = wc->n_blocks;
>>> + while (0 < b) {
>>> + if (p->seq_count != empty) {
>>> + seq_count = le64_to_cpu(p->seq_count);
>>> + if (last_seq_count < seq_count)
>>> + last_seq_count = seq_count;
>>> + }
>>> + p++;
>>> + b--;
>>> + }
>>> + }
>>> + return last_seq_count;
>>> +}
>>> +
>>> static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
>>> {
>>> #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
>>> @@ -730,8 +765,6 @@ static void writecache_flush(struct dm_w
>>> writecache_wait_for_ios(wc, WRITE);
>>>
>>> wc->seq_count++;
>>> - pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
>>> - writecache_flush_region(wc, &sb(wc)->seq_count, sizeof
>>> sb(wc)->seq_count);
>>> writecache_commit_flushed(wc);
>>>
>>> wc->overwrote_committed = false;
>>> @@ -876,7 +909,6 @@ static void writecache_resume(struct dm_
>>> struct dm_writecache *wc = ti->private;
>>> size_t b;
>>> bool need_flush = false;
>>> - __le64 sb_seq_count;
>>> int r;
>>>
>>> wc_lock(wc);
>>> @@ -894,12 +926,7 @@ static void writecache_resume(struct dm_
>>> }
>>> wc->freelist_size = 0;
>>>
>>> - r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
>>> - if (r) {
>>> - writecache_error(wc, r, "hardware memory error when reading
>>> superblock: %d", r);
>>> - sb_seq_count = cpu_to_le64(0);
>>> - }
>>> - wc->seq_count = le64_to_cpu(sb_seq_count);
>>> + wc->seq_count = read_last_seq_count(wc) + 1;
>>>
>>> #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
>>> for (b = 0; b < wc->n_blocks; b++) {
>>> @@ -1757,10 +1784,9 @@ static int init_memory(struct dm_writeca
>>>
>>> for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
>>> pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
>>> - pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
>>> + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
>>> pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
>>> pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
>>> - pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));
>>>
>>> for (b = 0; b < wc->n_blocks; b++)
>>> write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
>>> @@ -2159,11 +2185,13 @@ invalid_optional:
>>> goto bad;
>>> }
>>>
>>> - if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
>>> + if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_1 &&
>>> + le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_2) {
>>> ti->error = "Invalid version in the superblock";
>>> r = -EINVAL;
>>> goto bad;
>>> }
>>> + pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
>>>
>>> if (le32_to_cpu(s.block_size) != wc->block_size) {
>>> ti->error = "Block size does not match superblock";
>>
>> --
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-13 20:02 ` Maged Mokhtar
@ 2020-01-14 14:50 ` Mikulas Patocka
2020-01-15 19:48 ` Maged Mokhtar
0 siblings, 1 reply; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-14 14:50 UTC (permalink / raw)
To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer
On Mon, 13 Jan 2020, Maged Mokhtar wrote:
>
>
> > Hi
> >
> >
> > On Thu, 2 Jan 2020, Maged Mokhtar wrote:
> >
> > > Any feedback on this patch please.
> >
> > This will definitely not work for persistent memory - it could corrupt
> > data if a crash happens. The CPU can flush data in arbitrary order and it
> > may happen that the seq count is flushed before the pertaining data.
> >
> > As for SSD mode - we could avoid updating the refcount in the superblock,
> > but it wouldn't be much helpful.
> >
> > I.e. normally, commit is done this way:
> > 1. submit data writes
> > 2. submit metadata writes
> > 3. flush disk cache
> > 4. submit the write of superblock with increased seq_count
> > 5. flush disk cache
> >
> > If we wanted to avoid writing the seq_count, we would need to change it
> > to:
> > 1. submit data writes
> > 2. flush disk cache
> > 3. submit metadata writes
> > 4. flush disk cache
> >
> > - i.e. it still needs two disk cache flushes per one commit request - and
> > it is not much better than the existing solution.
> >
> > Mikulas
> >
>
>
> Hi Mikulas,
>
> I appreciate your review. For SSD mode, I see the advantages of SB writes for
> handling crash recovery and agree with what you say. Note however that after a
> crash a proper client should not assume the data is valid on a device, only at
> the point it last issued a successful flush should the data be consistent,
Yes.
> after this it should not assume so. A filesystem/db should handle
> journals/transaction at a higher level than the device. But again anything we
> can do on the device/target to make things more consistent, the better, so i
> agree there.
>
> There is also limit to what the current crash recovery code can do, as i
> understand it if you have metadata already committed, their seq count is not
> incremented for new io on the same blocks, the crash recovery code will
> therefore not detect or recover cases where new data is written to existing 4k
> blocks at the time of crash, some blocks will end up with new data, others
> will not. Again this is my understanding so i could be wrong.
If the userspace writes some block, it is unspecified if the block will
contain old data or new data after a crash. (the SCSI standard at some
point even specified that the written block may contain arbitrary data -
not just old or new).
> I think if crash consistency needs to be enhanced, it should take into account
> that most consumer/non-enterprise SSDs do not offer power loss protection. For
Both SATA and SCSI standard have command that flushes the cache in the
disk or SSD. If the SSD ignores this command, it is broken.
> many such devices power loss can corrupt data that is already written as they
> commit data in larger chunks via a read/modify/erase/write cycle. It is
> particularly bad for metadata as it could affect many data blocks. Maybe it
> could be good to have metadata writes via transactions or journaling, dm-cache
> and thin provisioning do something like this i think.
Both dm-cache and dm-writecache use the flush command to write the device
cache.
> i also think your suggestion of:
> > If we wanted to avoid writing the seq_count, we would need to change it
> > to:
> > 1. submit data writes
> > 2. flush disk cache
> > 3. submit metadata writes
> > 4. flush disk cache
>
> could be better in terms of prolonging SSD lifetime, as currently the
> superblock gets much higher write frequency.
The SSDs allocate new blocks with each write, so it doesn't matter that we
write the same block multiple times. With real persistent memory, it may
be an issue.
> /Maged
Mikulas
^ permalink raw reply [flat|nested] 9+ messages in thread
* Re: [PATCH] dm writecache: SB remove seq_count
2020-01-14 14:50 ` Mikulas Patocka
@ 2020-01-15 19:48 ` Maged Mokhtar
0 siblings, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-15 19:48 UTC (permalink / raw)
To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer
On 14/01/2020 16:50, Mikulas Patocka wrote:
>
>
> On Mon, 13 Jan 2020, Maged Mokhtar wrote:
>
>>
>>
>>> Hi
>>>
>>>
>>> On Thu, 2 Jan 2020, Maged Mokhtar wrote:
>>>
>>>> Any feedback on this patch please.
>>>
>>> This will definitely not work for persistent memory - it could corrupt
>>> data if a crash happens. The CPU can flush data in arbitrary order and it
>>> may happen that the seq count is flushed before the pertaining data.
>>>
>>> As for SSD mode - we could avoid updating the refcount in the superblock,
>>> but it wouldn't be much helpful.
>>>
>>> I.e. normally, commit is done this way:
>>> 1. submit data writes
>>> 2. submit metadata writes
>>> 3. flush disk cache
>>> 4. submit the write of superblock with increased seq_count
>>> 5. flush disk cache
>>>
>>> If we wanted to avoid writing the seq_count, we would need to change it
>>> to:
>>> 1. submit data writes
>>> 2. flush disk cache
>>> 3. submit metadata writes
>>> 4. flush disk cache
>>>
>>> - i.e. it still needs two disk cache flushes per one commit request - and
>>> it is not much better than the existing solution.
>>>
>>> Mikulas
>>>
>>
>>
>> Hi Mikulas,
>>
>> I appreciate your review. For SSD mode, I see the advantages of SB writes for
>> handling crash recovery and agree with what you say. Note however that after a
>> crash a proper client should not assume the data is valid on a device, only at
>> the point it last issued a successful flush should the data be consistent,
>
> Yes.
>
>> after this it should not assume so. A filesystem/db should handle
>> journals/transaction at a higher level than the device. But again anything we
>> can do on the device/target to make things more consistent, the better, so i
>> agree there.
>>
>> There is also limit to what the current crash recovery code can do, as i
>> understand it if you have metadata already committed, their seq count is not
>> incremented for new io on the same blocks, the crash recovery code will
>> therefore not detect or recover cases where new data is written to existing 4k
>> blocks at the time of crash, some blocks will end up with new data, others
>> will not. Again this is my understanding so i could be wrong.
>
> If the userspace writes some block, it is unspecified if the block will
> contain old data or new data after a crash. (the SCSI standard at some
> point even specified that the written block may contain arbitrary data -
> not just old or new).
>
>> I think if crash consistency needs to be enhanced, it should take into account
>> that most consumer/non-enterprise SSDs do not offer power loss protection. For
>
> Both SATA and SCSI standard have command that flushes the cache in the
> disk or SSD. If the SSD ignores this command, it is broken.
>
>> many such devices power loss can corrupt data that is already written as they
>> commit data in larger chunks via a read/modify/erase/write cycle. It is
>> particularly bad for metadata as it could affect many data blocks. Maybe it
>> could be good to have metadata writes via transactions or journaling, dm-cache
>> and thin provisioning do something like this i think.
>
> Both dm-cache and dm-writecache use the flush command to write the device
> cache.
>
>> i also think your suggestion of:
>>> If we wanted to avoid writing the seq_count, we would need to change it
>>> to:
>>> 1. submit data writes
>>> 2. flush disk cache
>>> 3. submit metadata writes
>>> 4. flush disk cache
>>
>> could be better in terms of prolonging SSD lifetime, as currently the
>> superblock gets much higher write frequency.
>
> The SSDs allocate new blocks with each write, so it doesn't matter that we
> write the same block multiple times. With real persistent memory, it may
> be an issue.
>
>> /Maged
>
> Mikulas
>
Hi Mikulas,
If writing the superblock still does not provide full consistency for
data written during a crash, then is it really worth doing?
On the other hand, i was suggesting that there could be a more robust
metadata crash recovery mechanism, something along the lines of metadata
transactions via dm-transaction-manager or some journaling of metadata,
like your own implementation in dm-integrity. I was referring to
consumer grade SSDs that may be subject of data corruption during power
loss, this is not the same as not having flush support, maybe having
more robust metadata writing will benefit these devices.
> The SSDs allocate new blocks with each write, ..
Yes, but if the device does not have free blocks and/or the writer does
not issue trim/discard i presume it will write on same block.
/Maged
^ permalink raw reply [flat|nested] 9+ messages in thread
end of thread, other threads:[~2020-01-15 19:48 UTC | newest]
Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-12-06 19:03 [PATCH] dm writecache: SB remove seq_count Maged Mokhtar
2020-01-02 15:37 ` Maged Mokhtar
2020-01-08 13:54 ` Mikulas Patocka
2020-01-08 17:04 ` Mikulas Patocka
2020-01-08 19:14 ` Maged Mokhtar
2020-01-13 19:35 ` Maged Mokhtar
2020-01-13 20:02 ` Maged Mokhtar
2020-01-14 14:50 ` Mikulas Patocka
2020-01-15 19:48 ` Maged Mokhtar
This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.