* [PATCH] dm writecache: SB remove seq_count
@ 2019-12-06 19:03 Maged Mokhtar
  2020-01-02 15:37 ` Maged Mokhtar
  0 siblings, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2019-12-06 19:03 UTC (permalink / raw)
  To: mpatocka; +Cc: dm-devel

Remove seq_count from the superblock. Currently the superblock is written 
on every commit to update seq_count, which is only used when the target 
is restarted/resumed. This extra I/O hurts performance for small block 
size writes that issue FUA/sync on every request: a 4k sync write 
currently requires 3 write ops (the submitted data, the metadata, and the 
superblock seq_count update); removing the seq_count update reduces the 
required write ops to 2.

The seq_count can be rebuilt quickly at start/resume by looping over the 
memory entry metadata in the resume() function.

Signed-off-by: Maged Mokhtar <mmokhtar@petasan.org>
---
  drivers/md/dm-writecache.c |   56 ++++++++++++++++++++++++++---------
  1 file changed, 42 insertions(+), 14 deletions(-)

--- a/drivers/md/dm-writecache.c	2019-12-06 03:07:53.000000000 -0800
+++ b/drivers/md/dm-writecache.c	2019-12-06 09:25:45.000000000 -0800
@@ -52,7 +52,8 @@ do {								\
  #endif

  #define MEMORY_SUPERBLOCK_MAGIC		0x23489321
-#define MEMORY_SUPERBLOCK_VERSION	1
+#define MEMORY_SUPERBLOCK_VERSION_1	1
+#define MEMORY_SUPERBLOCK_VERSION_2	2

  struct wc_memory_entry {
  	__le64 original_sector;
@@ -67,7 +68,6 @@ struct wc_memory_superblock {
  			__le32 block_size;
  			__le32 pad;
  			__le64 n_blocks;
-			__le64 seq_count;
  		};
  		__le64 padding[8];
  	};
@@ -380,6 +380,41 @@ static uint64_t read_seq_count(struct dm
  #endif
  }

+static uint64_t read_last_seq_count(struct dm_writecache *wc)
+{
+	size_t b;
+	uint64_t last_seq_count = 0;
+	uint64_t seq_count;
+	__le64 empty = cpu_to_le64(-1);
+
+	if (WC_MODE_PMEM(wc)) {
+		struct wc_memory_entry wme;
+		for (b = 0; b < wc->n_blocks; b++) {
+			BUG_ON(memcpy_mcsafe(&wme, &sb(wc)->entries[b],
+				sizeof(struct wc_memory_entry)));
+			if (wme.seq_count != empty) {
+				seq_count = le64_to_cpu(wme.seq_count);
+				if (last_seq_count < seq_count)
+					last_seq_count = seq_count;
+			}
+		}
+	}
+	else {
+		struct wc_memory_entry *p = &sb(wc)->entries[0];
+		b = wc->n_blocks;
+		while (0 < b) {
+			if (p->seq_count != empty) {	
+				seq_count = le64_to_cpu(p->seq_count);
+				if (last_seq_count < seq_count)
+					last_seq_count = seq_count;
+			}
+			p++;		
+			b--;
+		}
+	}
+	return last_seq_count;
+}
+
  static void clear_seq_count(struct dm_writecache *wc, struct wc_entry *e)
  {
  #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
@@ -730,8 +765,6 @@ static void writecache_flush(struct dm_w
  		writecache_wait_for_ios(wc, WRITE);

  	wc->seq_count++;
-	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
-	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
  	writecache_commit_flushed(wc);

  	wc->overwrote_committed = false;
@@ -876,7 +909,6 @@ static void writecache_resume(struct dm_
  	struct dm_writecache *wc = ti->private;
  	size_t b;
  	bool need_flush = false;
-	__le64 sb_seq_count;
  	int r;

  	wc_lock(wc);
@@ -894,12 +926,7 @@ static void writecache_resume(struct dm_
  	}
  	wc->freelist_size = 0;

-	r = memcpy_mcsafe(&sb_seq_count, &sb(wc)->seq_count, sizeof(uint64_t));
-	if (r) {
-		writecache_error(wc, r, "hardware memory error when reading superblock: %d", r);
-		sb_seq_count = cpu_to_le64(0);
-	}
-	wc->seq_count = le64_to_cpu(sb_seq_count);
+	wc->seq_count = read_last_seq_count(wc) + 1;

  #ifdef DM_WRITECACHE_HANDLE_HARDWARE_ERRORS
  	for (b = 0; b < wc->n_blocks; b++) {
@@ -1757,10 +1784,9 @@ static int init_memory(struct dm_writeca

  	for (b = 0; b < ARRAY_SIZE(sb(wc)->padding); b++)
  		pmem_assign(sb(wc)->padding[b], cpu_to_le64(0));
-	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION));
+	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));
  	pmem_assign(sb(wc)->block_size, cpu_to_le32(wc->block_size));
  	pmem_assign(sb(wc)->n_blocks, cpu_to_le64(wc->n_blocks));
-	pmem_assign(sb(wc)->seq_count, cpu_to_le64(0));

  	for (b = 0; b < wc->n_blocks; b++)
  		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
@@ -2159,11 +2185,13 @@ invalid_optional:
  		goto bad;
  	}

-	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION) {
+	if (le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_1 &&
+		le32_to_cpu(s.version) != MEMORY_SUPERBLOCK_VERSION_2) {
  		ti->error = "Invalid version in the superblock";
  		r = -EINVAL;
  		goto bad;
  	}
+	pmem_assign(sb(wc)->version, cpu_to_le32(MEMORY_SUPERBLOCK_VERSION_2));

  	if (le32_to_cpu(s.block_size) != wc->block_size) {
  		ti->error = "Block size does not match superblock";


* Re: [PATCH] dm writecache: SB remove seq_count
  2019-12-06 19:03 [PATCH] dm writecache: SB remove seq_count Maged Mokhtar
@ 2020-01-02 15:37 ` Maged Mokhtar
  2020-01-08 13:54   ` Mikulas Patocka
  0 siblings, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-02 15:37 UTC (permalink / raw)
  To: mpatocka; +Cc: dm-devel

Any feedback on this patch, please?


On 06/12/2019 21:03, Maged Mokhtar wrote:
> [original patch quoted in full; trimmed]

-- 
Maged Mokhtar
CEO PetaSAN
4 Emad El Deen Kamel
Cairo 11371, Egypt
www.petasan.org
+201006979931
skype: maged.mokhtar




* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-02 15:37 ` Maged Mokhtar
@ 2020-01-08 13:54   ` Mikulas Patocka
  2020-01-08 17:04     ` Mikulas Patocka
  2020-01-13 20:02     ` Maged Mokhtar
  0 siblings, 2 replies; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-08 13:54 UTC (permalink / raw)
  To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer


Hi


On Thu, 2 Jan 2020, Maged Mokhtar wrote:

> Any feedback on this patch, please?

This will definitely not work for persistent memory - it could corrupt 
data if a crash happens. The CPU can flush data in arbitrary order, so 
the seq count may be flushed before the data it refers to.

As for SSD mode - we could avoid updating the seq_count in the 
superblock, but it wouldn't help much.

I.e. normally, commit is done this way:
1. submit data writes
2. submit metadata writes
3. flush disk cache
4. submit the write of superblock with increased seq_count
5. flush disk cache

If we wanted to avoid writing the seq_count, we would need to change it 
to:
1. submit data writes
2. flush disk cache
3. submit metadata writes
4. flush disk cache

- i.e. it still needs two disk cache flushes per commit request, and that 
is not much better than the existing solution.
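
To make the ordering concrete, here is a minimal sketch of the two 
schemes; submit_data_writes(), submit_metadata_writes(), 
submit_superblock_write() and flush_disk_cache() are placeholder names 
for illustration, not actual dm-writecache functions:

/* Current scheme: seq_count lives in the superblock. */
static void commit_current(struct dm_writecache *wc)
{
	submit_data_writes(wc);		/* 1. data */
	submit_metadata_writes(wc);	/* 2. per-entry metadata */
	flush_disk_cache(wc);		/* 3. make 1 and 2 durable */
	submit_superblock_write(wc);	/* 4. superblock, increased seq_count */
	flush_disk_cache(wc);		/* 5. make the superblock durable */
}

/* Without the superblock seq_count: the data must become durable
 * before the metadata that points to it is written. */
static void commit_without_sb(struct dm_writecache *wc)
{
	submit_data_writes(wc);		/* 1. data */
	flush_disk_cache(wc);		/* 2. make the data durable */
	submit_metadata_writes(wc);	/* 3. per-entry metadata */
	flush_disk_cache(wc);		/* 4. make the metadata durable */
}

Either way a commit costs two disk cache flushes, which is why dropping 
the superblock write alone buys little.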

Mikulas

> On 06/12/2019 21:03, Maged Mokhtar wrote:
> > [original patch quoted in full; trimmed]





* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-08 13:54   ` Mikulas Patocka
@ 2020-01-08 17:04     ` Mikulas Patocka
  2020-01-08 19:14       ` Maged Mokhtar
  2020-01-13 19:35       ` Maged Mokhtar
  2020-01-13 20:02     ` Maged Mokhtar
  1 sibling, 2 replies; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-08 17:04 UTC (permalink / raw)
  To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer

BTW. I would be interested if this patch improves performance for you. 
Could you test it?

(you also need my previous patch posted here 
https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )

Mikulas




dm-writecache: use REQ_FUA when writing the superblock

When writing the superblock, it may be better to submit just one I/O with
the FUA bit set instead of two I/Os.
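
Schematically, the change in ssd_commit_flushed() below does this (a 
sketch only, using the dm_io request fields as they appear in the patch):

	/* before: superblock write with REQ_SYNC, then a separate cache flush */
	req.bi_op_flags = REQ_SYNC;
	...
	writecache_disk_flush(wc, wc->ssd_dev);

	/* after: REQ_FUA makes the superblock write itself durable,
	 * so the separate flush is skipped for this write */
	req.bi_op_flags = REQ_SYNC | REQ_FUA;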

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>

---
 drivers/md/dm-writecache.c |   29 +++++++++++++++--------------
 1 file changed, 15 insertions(+), 14 deletions(-)

Index: linux-2.6/drivers/md/dm-writecache.c
===================================================================
--- linux-2.6.orig/drivers/md/dm-writecache.c	2020-01-08 17:56:41.000000000 +0100
+++ linux-2.6/drivers/md/dm-writecache.c	2020-01-08 17:56:49.000000000 +0100
@@ -448,7 +448,7 @@ static void writecache_wait_for_ios(stru
 		   !atomic_read(&wc->bio_in_progress[direction]));
 }
 
-static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
+static void ssd_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
 {
 	struct dm_io_region region;
 	struct dm_io_request req;
@@ -479,7 +479,7 @@ static void ssd_commit_flushed(struct dm
 		region.sector += wc->start_sector;
 		atomic_inc(&endio.count);
 		req.bi_op = REQ_OP_WRITE;
-		req.bi_op_flags = REQ_SYNC;
+		req.bi_op_flags = REQ_SYNC | (use_fua ? REQ_FUA : 0);
 		req.mem.type = DM_IO_VMA;
 		req.mem.ptr.vma = (char *)wc->memory_map + (size_t)i * BITMAP_GRANULARITY;
 		req.client = wc->dm_io;
@@ -497,17 +497,18 @@ static void ssd_commit_flushed(struct dm
 	if (wait_for_ios)
 		writecache_wait_for_ios(wc, WRITE);
 
-	writecache_disk_flush(wc, wc->ssd_dev);
+	if (!use_fua)
+		writecache_disk_flush(wc, wc->ssd_dev);
 
 	memset(wc->dirty_bitmap, 0, wc->dirty_bitmap_size);
 }
 
-static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios)
+static void writecache_commit_flushed(struct dm_writecache *wc, bool wait_for_ios, bool use_fua)
 {
 	if (WC_MODE_PMEM(wc))
 		wmb();
 	else
-		ssd_commit_flushed(wc, wait_for_ios);
+		ssd_commit_flushed(wc, wait_for_ios, use_fua);
 }
 
 static void writecache_disk_flush(struct dm_writecache *wc, struct dm_dev *dev)
@@ -727,12 +728,12 @@ static void writecache_flush(struct dm_w
 		e = e2;
 		cond_resched();
 	}
-	writecache_commit_flushed(wc, true);
+	writecache_commit_flushed(wc, true, false);
 
 	wc->seq_count++;
 	pmem_assign(sb(wc)->seq_count, cpu_to_le64(wc->seq_count));
 	writecache_flush_region(wc, &sb(wc)->seq_count, sizeof sb(wc)->seq_count);
-	writecache_commit_flushed(wc, false);
+	writecache_commit_flushed(wc, false, true);
 
 	wc->overwrote_committed = false;
 
@@ -756,7 +757,7 @@ static void writecache_flush(struct dm_w
 	}
 
 	if (need_flush_after_free)
-		writecache_commit_flushed(wc, false);
+		writecache_commit_flushed(wc, false, false);
 }
 
 static void writecache_flush_work(struct work_struct *work)
@@ -809,7 +810,7 @@ static void writecache_discard(struct dm
 	}
 
 	if (discarded_something)
-		writecache_commit_flushed(wc, false);
+		writecache_commit_flushed(wc, false, false);
 }
 
 static bool writecache_wait_for_writeback(struct dm_writecache *wc)
@@ -958,7 +959,7 @@ erase_this:
 
 	if (need_flush) {
 		writecache_flush_all_metadata(wc);
-		writecache_commit_flushed(wc, false);
+		writecache_commit_flushed(wc, false, false);
 	}
 
 	wc_unlock(wc);
@@ -1342,7 +1343,7 @@ static void __writecache_endio_pmem(stru
 			wc->writeback_size--;
 			n_walked++;
 			if (unlikely(n_walked >= ENDIO_LATENCY)) {
-				writecache_commit_flushed(wc, false);
+				writecache_commit_flushed(wc, false, false);
 				wc_unlock(wc);
 				wc_lock(wc);
 				n_walked = 0;
@@ -1423,7 +1424,7 @@ pop_from_list:
 			writecache_wait_for_ios(wc, READ);
 		}
 
-		writecache_commit_flushed(wc, false);
+		writecache_commit_flushed(wc, false, false);
 
 		wc_unlock(wc);
 	}
@@ -1766,10 +1767,10 @@ static int init_memory(struct dm_writeca
 		write_original_sector_seq_count(wc, &wc->entries[b], -1, -1);
 
 	writecache_flush_all_metadata(wc);
-	writecache_commit_flushed(wc, false);
+	writecache_commit_flushed(wc, false, false);
 	pmem_assign(sb(wc)->magic, cpu_to_le32(MEMORY_SUPERBLOCK_MAGIC));
 	writecache_flush_region(wc, &sb(wc)->magic, sizeof sb(wc)->magic);
-	writecache_commit_flushed(wc, false);
+	writecache_commit_flushed(wc, false, false);
 
 	return 0;
 }


* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-08 17:04     ` Mikulas Patocka
@ 2020-01-08 19:14       ` Maged Mokhtar
  2020-01-13 19:35       ` Maged Mokhtar
  1 sibling, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-08 19:14 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer

Yes, sure. I will send you something early next week.

/Maged

On 08/01/2020 19:04, Mikulas Patocka wrote:
> BTW. I would be interested if this patch improves performance for you.
> Could you test it?
> 
> (you also need my previous patch posted here
> https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )
> 
> Mikulas
> 
> [dm-writecache REQ_FUA patch quoted in full; trimmed]


* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-08 17:04     ` Mikulas Patocka
  2020-01-08 19:14       ` Maged Mokhtar
@ 2020-01-13 19:35       ` Maged Mokhtar
  1 sibling, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-13 19:35 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer



On 08/01/2020 19:04, Mikulas Patocka wrote:
> BTW. I would be interested if this patch improves performance for you.
> Could you test it?
> 
> (you also need my previous patch posted here
> https://www.redhat.com/archives/dm-devel/2020-January/msg00027.html )
> 
> Mikulas
> 
> [dm-writecache REQ_FUA patch quoted in full; trimmed]

Hi Mikulas

On a test disk, performance is the same with or without the patch.
fio 4k random write IOPS:
12K IOPS with sync/fua = 0
2.7K IOPS with sync/fua = 1

------- Without patch ----------

fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k --direct=1 --sync=0 --runtime=10 --time_based --numjobs=16 --iodepth=1 --group_reporting

Run status group 0 (all jobs):
   WRITE: bw=46.9MiB/s (49.2MB/s), 46.9MiB/s-46.9MiB/s (49.2MB/s-49.2MB/s), io=469MiB (492MB), run=10005-10005msec

fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k --direct=1 --sync=1 --runtime=10 --time_based --numjobs=16 --iodepth=1 --group_reporting

Run status group 0 (all jobs):
   WRITE: bw=10.7MiB/s (11.2MB/s), 10.7MiB/s-10.7MiB/s (11.2MB/s-11.2MB/s), io=107MiB (112MB), run=10007-10007msec


------- With patch ----------

fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k --direct=1 --sync=0 --runtime=10 --time_based --numjobs=16 --iodepth=1 --group_reporting

Run status group 0 (all jobs):
   WRITE: bw=46.8MiB/s (49.0MB/s), 46.8MiB/s-46.8MiB/s (49.0MB/s-49.0MB/s), io=468MiB (491MB), run=10004-10004msec

fio --name=4kiops --filename=/dev/vg1/slow --rw=randwrite --bs=4k --direct=1 --sync=1 --runtime=10 --time_based --numjobs=16 --iodepth=1 --group_reporting

Run status group 0 (all jobs):
   WRITE: bw=10.8MiB/s (11.3MB/s), 10.8MiB/s-10.8MiB/s (11.3MB/s-11.3MB/s), io=108MiB (113MB), run=10005-10005msec

/Maged


* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-08 13:54   ` Mikulas Patocka
  2020-01-08 17:04     ` Mikulas Patocka
@ 2020-01-13 20:02     ` Maged Mokhtar
  2020-01-14 14:50       ` Mikulas Patocka
  1 sibling, 1 reply; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-13 20:02 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer



> Hi
> 
> 
> On Thu, 2 Jan 2020, Maged Mokhtar wrote:
> 
>> Any feedback on this patch, please?
> 
> This will definitely not work for persistent memory - it could corrupt
> data if a crash happens. The CPU can flush data in arbitrary order and it
> may happen that the seq count is flushed before the pertaining data.
> 
> As for SSD mode - we could avoid updating the seq_count in the superblock,
> but it wouldn't help much.
> 
> I.e. normally, commit is done this way:
> 1. submit data writes
> 2. submit metadata writes
> 3. flush disk cache
> 4. submit the write of superblock with increased seq_count
> 5. flush disk cache
> 
> If we wanted to avoid writing the seq_count, we would need to change it
> to:
> 1. submit data writes
> 2. flush disk cache
> 3. submit metadata writes
> 4. flush disk cache
> 
> - i.e. it still needs two disk cache flushes per commit request, and that
> is not much better than the existing solution.
> 
> Mikulas
> 


Hi Mikulas,

I appreciate your review. For SSD mode, I see the advantages of the 
superblock writes for handling crash recovery and agree with what you 
say. Note, however, that after a crash a proper client should not assume 
the data on a device is valid; the data is only guaranteed consistent up 
to the point of the last successful flush it issued, and nothing after 
that can be assumed. A filesystem/db should handle journals/transactions 
at a higher level than the device. But again, anything we can do on the 
device/target to make things more consistent, the better, so I agree there.

There is also a limit to what the current crash recovery code can do. As 
I understand it, if metadata is already committed, its seq count is not 
incremented for new I/O on the same blocks; the crash recovery code will 
therefore not detect or recover cases where new data is written to 
existing 4k blocks at the time of the crash: some blocks will end up with 
new data and others will not. Again, this is my understanding, so I could 
be wrong.

I think if crash consistency needs to be enhanced, it should take into 
account that most consumer/non-enterprise SSDs do not offer power loss 
protection. For many such devices power loss can corrupt data that is 
already written, as they commit data in larger chunks via a 
read/modify/erase/write cycle. It is particularly bad for metadata, as it 
could affect many data blocks. Maybe it would be good to do metadata 
writes via transactions or journaling; dm-cache and thin provisioning do 
something like this, I think.
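
A journaled metadata commit along those lines might look like the sketch 
below; all of the names (journal_slot, next_journal_slot, journal_write 
and so on) are hypothetical placeholders, not existing dm-writecache code:

/* Hypothetical two-phase metadata commit: persist a journal copy
 * first, then apply the metadata in place. A torn in-place write
 * can then be repaired from the journal copy on resume. */
static void commit_metadata_journaled(struct dm_writecache *wc)
{
	struct journal_slot *js = next_journal_slot(wc);

	copy_dirty_metadata(wc, js->buf);	/* phase 1: journal copy */
	js->csum = checksum(js->buf, js->len);
	journal_write(wc, js);
	flush_disk_cache(wc);			/* journal copy is now durable */

	write_metadata_in_place(wc);		/* phase 2: apply in place */
	flush_disk_cache(wc);			/* in-place copy is now durable */
	mark_journal_slot_free(wc, js);
}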

I also think your suggestion of:
 > If we wanted to avoid writing the seq_count, we would need to change it
 > to:
 > 1. submit data writes
 > 2. flush disk cache
 > 3. submit metadata writes
 > 4. flush disk cache

could be better in terms of prolonging SSD lifetime, as currently the 
superblock gets a much higher write frequency.

/Maged






* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-13 20:02     ` Maged Mokhtar
@ 2020-01-14 14:50       ` Mikulas Patocka
  2020-01-15 19:48         ` Maged Mokhtar
  0 siblings, 1 reply; 9+ messages in thread
From: Mikulas Patocka @ 2020-01-14 14:50 UTC (permalink / raw)
  To: Maged Mokhtar; +Cc: dm-devel, Mike Snitzer



On Mon, 13 Jan 2020, Maged Mokhtar wrote:

> 
> > [earlier reply quoted in full; trimmed]
> 
> Hi Mikulas,
> 
> I appreciate your review. For SSD mode, I see the advantages of the
> superblock writes for handling crash recovery and agree with what you say.
> Note, however, that after a crash a proper client should not assume the
> data on a device is valid; the data is only guaranteed consistent up to
> the point of the last successful flush it issued,

Yes.

> and nothing after that can be assumed. A filesystem/db should handle
> journals/transactions at a higher level than the device. But again,
> anything we can do on the device/target to make things more consistent,
> the better, so I agree there.
> 
> There is also a limit to what the current crash recovery code can do. As
> I understand it, if metadata is already committed, its seq count is not
> incremented for new I/O on the same blocks; the crash recovery code will
> therefore not detect or recover cases where new data is written to
> existing 4k blocks at the time of the crash: some blocks will end up with
> new data and others will not. Again, this is my understanding, so I could
> be wrong.

If userspace writes some block, it is unspecified whether the block will 
contain old data or new data after a crash. (The SCSI standard at some 
point even specified that the written block may contain arbitrary data, 
not just old or new.)

> I think if crash consistency needs to be enhanced, it should take into account
> that most consumer/non-enterprise SSDs do not offer power loss protection. For

Both the SATA and SCSI standards have a command that flushes the cache in 
the disk or SSD. If an SSD ignores this command, it is broken.

> many such devices power loss can corrupt data that is already written, as
> they commit data in larger chunks via a read/modify/erase/write cycle. It
> is particularly bad for metadata, as it could affect many data blocks.
> Maybe it would be good to do metadata writes via transactions or
> journaling; dm-cache and thin provisioning do something like this, I think.

Both dm-cache and dm-writecache use the flush command to flush the device 
cache.

> I also think your suggestion of:
> > If we wanted to avoid writing the seq_count, we would need to change it
> > to:
> > 1. submit data writes
> > 2. flush disk cache
> > 3. submit metadata writes
> > 4. flush disk cache
> 
> could be better in terms of prolonging SSD lifetime, as currently the
> superblock gets a much higher write frequency.

SSDs allocate new blocks with each write, so it doesn't matter that we 
write the same logical block multiple times. With real persistent memory, 
it may be an issue.

> /Maged

Mikulas


* Re: [PATCH] dm writecache: SB remove seq_count
  2020-01-14 14:50       ` Mikulas Patocka
@ 2020-01-15 19:48         ` Maged Mokhtar
  0 siblings, 0 replies; 9+ messages in thread
From: Maged Mokhtar @ 2020-01-15 19:48 UTC (permalink / raw)
  To: Mikulas Patocka; +Cc: dm-devel, Mike Snitzer



On 14/01/2020 16:50, Mikulas Patocka wrote:
> [previous reply quoted in full; trimmed]

Hi Mikulas,

If writing the superblock still does not provide full consistency for 
data written during a crash, then is it really worth doing?

On the other hand, I was suggesting that there could be more robust 
metadata crash recovery, something along the lines of metadata 
transactions via the dm-transaction-manager, or some journaling of 
metadata like your own implementation in dm-integrity. I was referring to 
consumer grade SSDs that may be subject to data corruption during power 
loss; this is not the same as not having flush support. Maybe more robust 
metadata writing would benefit these devices.

 > SSDs allocate new blocks with each write, ..
Yes, but if the device does not have free blocks and/or the writer does 
not issue trim/discard, I presume it will write to the same block.

/Maged

