* [PATCH v2] nilfs2: improve the performance of fdatasync()
@ 2014-09-23  8:46 Andreas Rohner
       [not found] ` <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rohner @ 2014-09-23  8:46 UTC (permalink / raw)
  To: linux-nilfs-u79uwXL29TY76Z2rM5mHXA; +Cc: Andreas Rohner

Support for fdatasync() has been implemented in NILFS2 for a long time,
but whenever the corresponding inode is dirty the implementation falls
back to a full-fledged sync(). Since every write operation has to update
the modification time of the file, the inode will almost always be dirty
and fdatasync() will fall back to sync() most of the time. But this
fallback is only necessary for a change of the file size, not for a
change of the various timestamps.

This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
those two situations.

 * If it is set, the file size was changed and a full sync is necessary.
 * If it is not set, only the timestamps were updated and fdatasync()
   can go ahead.

There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
exactly the same semantics. Unfortunately it cannot be used directly,
because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
flags when inodes are written out. As a result, the VFS writeback thread
can clear I_DIRTY_DATASYNC at any time without notifying NILFS2, so
I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
nilfs_update_inode().

Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
---
 fs/nilfs2/inode.c   | 13 +++++++------
 fs/nilfs2/nilfs.h   | 14 +++++++++++---
 fs/nilfs2/segment.c |  4 ++--
 3 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 6252b17..67f2112 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -125,7 +125,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff,
 			nilfs_transaction_abort(inode->i_sb);
 			goto out;
 		}
-		nilfs_mark_inode_dirty(inode);
+		nilfs_mark_inode_dirty_sync(inode);
 		nilfs_transaction_commit(inode->i_sb); /* never fails */
 		/* Error handling should be detailed */
 		set_buffer_new(bh_result);
@@ -667,7 +667,7 @@ void nilfs_write_inode_common(struct inode *inode,
 	   for substitutions of appended fields */
 }
 
-void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
+void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
 {
 	ino_t ino = inode->i_ino;
 	struct nilfs_inode_info *ii = NILFS_I(inode);
@@ -678,7 +678,8 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh)
 
 	if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
 		memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
-	set_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+	if (flags & I_DIRTY_DATASYNC)
+		set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 
 	nilfs_write_inode_common(inode, raw_inode, 0);
 		/* XXX: call with has_bmap = 0 is a workaround to avoid
@@ -934,7 +935,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
 	return 0;
 }
 
-int nilfs_mark_inode_dirty(struct inode *inode)
+int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
 {
 	struct buffer_head *ibh;
 	int err;
@@ -945,7 +946,7 @@ int nilfs_mark_inode_dirty(struct inode *inode)
 			      "failed to reget inode block.\n");
 		return err;
 	}
-	nilfs_update_inode(inode, ibh);
+	nilfs_update_inode(inode, ibh, flags);
 	mark_buffer_dirty(ibh);
 	nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
 	brelse(ibh);
@@ -978,7 +979,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags)
 		return;
 	}
 	nilfs_transaction_begin(inode->i_sb, &ti, 0);
-	nilfs_mark_inode_dirty(inode);
+	__nilfs_mark_inode_dirty(inode, flags);
 	nilfs_transaction_commit(inode->i_sb); /* never fails */
 }
 
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index 0696161..91093cd 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -104,7 +104,7 @@ enum {
 					   constructor */
 	NILFS_I_COLLECTED,		/* All dirty blocks are collected */
 	NILFS_I_UPDATED,		/* The file has been written back */
-	NILFS_I_INODE_DIRTY,		/* write_inode is requested */
+	NILFS_I_INODE_SYNC,		/* dsync is not allowed for inode */
 	NILFS_I_BMAP,			/* has bmap and btnode_cache */
 	NILFS_I_GCINODE,		/* inode for GC, on memory only */
 };
@@ -273,7 +273,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
 			 unsigned long ino);
 extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
 				       unsigned long ino, __u64 cno);
-extern void nilfs_update_inode(struct inode *, struct buffer_head *);
+extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
 extern void nilfs_truncate(struct inode *);
 extern void nilfs_evict_inode(struct inode *);
 extern int nilfs_setattr(struct dentry *, struct iattr *);
@@ -282,10 +282,18 @@ int nilfs_permission(struct inode *inode, int mask);
 int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh);
 extern int nilfs_inode_dirty(struct inode *);
 int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty);
-extern int nilfs_mark_inode_dirty(struct inode *);
+extern int __nilfs_mark_inode_dirty(struct inode *, int);
 extern void nilfs_dirty_inode(struct inode *, int flags);
 int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
 		 __u64 start, __u64 len);
+static inline int nilfs_mark_inode_dirty(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY);
+}
+static inline int nilfs_mark_inode_dirty_sync(struct inode *inode)
+{
+	return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC);
+}
 
 /* super.c */
 extern struct inode *nilfs_alloc_inode(struct super_block *);
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index a1a1916..6e6690e 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -930,7 +930,7 @@ static void nilfs_drop_collected_inodes(struct list_head *head)
 		if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state))
 			continue;
 
-		clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state);
+		clear_bit(NILFS_I_INODE_SYNC, &ii->i_state);
 		set_bit(NILFS_I_UPDATED, &ii->i_state);
 	}
 }
@@ -2194,7 +2194,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode,
 	nilfs_transaction_lock(sb, &ti, 0);
 
 	ii = NILFS_I(inode);
-	if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) ||
+	if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) ||
 	    nilfs_test_opt(nilfs, STRICT_ORDER) ||
 	    test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) ||
 	    nilfs_discontinued(nilfs)) {
-- 
2.1.0


* Re: [PATCH v2] nilfs2: improve the performance of fdatasync()
       [not found] ` <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
@ 2014-09-23 10:50   ` Ryusuke Konishi
       [not found]     ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ryusuke Konishi @ 2014-09-23 10:50 UTC (permalink / raw)
  To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote:
> Support for fdatasync() has been implemented in NILFS2 for a long time,
> but whenever the corresponding inode is dirty the implementation falls
> back to a full-flegded sync(). Since every write operation has to update
> the modification time of the file, the inode will almost always be dirty
> and fdatasync() will fall back to sync() most of the time. But this
> fallback is only necessary for a change of the file size and not for
> a change of the various timestamps.
> 
> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
> those two situations.
> 
>  * If it is set the file size was changed and a full sync is necessary.
>  * If it is not set then only the timestamps were updated and
>    fdatasync() can go ahead.
> 
> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
> the exact same semantics. Unfortunately it cannot be used directly,
> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
> flags when inodes are written out. So the VFS writeback thread can
> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So
> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
> nilfs_update_inode().
> 
> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>

I have now sent this to Andrew.

The datasync segments that this patch creates more frequently will
trigger rollforward recovery after a crash or a power failure.

So, please also test that recovery works properly after fdatasync()
and a reset.  The situation can be simulated, for example, by using
"reboot -nfh":

 # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
 # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
 # reboot -nfh

We can use the dumpseg command to confirm that the datasync segment was
actually made, and to see how the recovery went after mount.
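
For example, something along these lines (segment numbers will differ;
lssu shows which segments are in use):

 # mount -t nilfs2 /dev/sdb /nilfs   # rollforward recovery runs here
 # dumpseg /dev/sdb 0 1              # inspect the recovered segments
 # md5sum /nilfs/test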

Thanks,
Ryusuke Konishi

* Re: [PATCH v2] nilfs2: improve the performance of fdatasync()
       [not found]     ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
@ 2014-09-23 11:11       ` Andreas Rohner
  2014-09-23 12:17       ` Andreas Rohner
  1 sibling, 0 replies; 10+ messages in thread
From: Andreas Rohner @ 2014-09-23 11:11 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On 2014-09-23 12:50, Ryusuke Konishi wrote:
> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote:
>> Support for fdatasync() has been implemented in NILFS2 for a long time,
>> but whenever the corresponding inode is dirty the implementation falls
>> back to a full-flegded sync(). Since every write operation has to update
>> the modification time of the file, the inode will almost always be dirty
>> and fdatasync() will fall back to sync() most of the time. But this
>> fallback is only necessary for a change of the file size and not for
>> a change of the various timestamps.
>>
>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
>> those two situations.
>>
>>  * If it is set the file size was changed and a full sync is necessary.
>>  * If it is not set then only the timestamps were updated and
>>    fdatasync() can go ahead.
>>
>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
>> the exact same semantics. Unfortunately it cannot be used directly,
>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
>> flags when inodes are written out. So the VFS writeback thread can
>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So
>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
>> nilfs_update_inode().
>>
>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
> 
> I now sent this to Andrew.
> 
> The datasync segments that this patch creates more frequently, will
> cause rollforward recovery after a crash or a power failure.
> 
> So, please test also that the recovery works properly for fdatasync()
> and reset.  The situation can be simulated, for example, by using
> "reboot -nfh":
> 
>  # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
>  # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
>  # reboot -nfh

Very nice script to test this!

> We can use dumpseg command to confirm that the datasync segment is
> actually made or how recovery has done after mount.

I already tested this before I sent in my patch. I used a virtual
machine and just killed the process after the fdatasync(). After the
rollforward, NILFS reported the correct number of salvaged blocks and
the md5sum of the file was correct.

I will test it again with dumpseg the way you suggested.

br,
Andreas Rohner

* Re: [PATCH v2] nilfs2: improve the performance of fdatasync()
       [not found]     ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
  2014-09-23 11:11       ` Andreas Rohner
@ 2014-09-23 12:17       ` Andreas Rohner
       [not found]         ` <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org>
  1 sibling, 1 reply; 10+ messages in thread
From: Andreas Rohner @ 2014-09-23 12:17 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On 2014-09-23 12:50, Ryusuke Konishi wrote:
> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote:
>> Support for fdatasync() has been implemented in NILFS2 for a long time,
>> but whenever the corresponding inode is dirty the implementation falls
>> back to a full-flegded sync(). Since every write operation has to update
>> the modification time of the file, the inode will almost always be dirty
>> and fdatasync() will fall back to sync() most of the time. But this
>> fallback is only necessary for a change of the file size and not for
>> a change of the various timestamps.
>>
>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
>> those two situations.
>>
>>  * If it is set the file size was changed and a full sync is necessary.
>>  * If it is not set then only the timestamps were updated and
>>    fdatasync() can go ahead.
>>
>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
>> the exact same semantics. Unfortunately it cannot be used directly,
>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
>> flags when inodes are written out. So the VFS writeback thread can
>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So
>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
>> nilfs_update_inode().
>>
>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
> 
> I now sent this to Andrew.
> 
> The datasync segments that this patch creates more frequently, will
> cause rollforward recovery after a crash or a power failure.
> 
> So, please test also that the recovery works properly for fdatasync()
> and reset.  The situation can be simulated, for example, by using
> "reboot -nfh":
> 
>  # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
>  # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
>  # reboot -nfh
> 
> We can use dumpseg command to confirm that the datasync segment is
> actually made or how recovery has done after mount.

I tested it using your script, but I duplicated the second line twice
with different values for seek and added an md5sum at the end. So in
total 6 blocks were written with fdatasync().
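
Roughly the following (the two additional seek values are reconstructed
from the blkoff values in the dumpseg output below):

 # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
 # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
 # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=60 conv=fdatasync,notrunc,nocreat  # inferred from blkoff 120
 # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=70 conv=fdatasync,notrunc,nocreat  # inferred from blkoff 140
 # md5sum /nilfs/test
 # reboot -nfh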

The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8

The last lines of the output of dumpseg after reboot:

  partial segment: blocknr = 26, nblocks = 3
    creation time = 2014-09-23 12:02:56
    nfinfo = 1
    finfo
      ino = 12, cno = 3, nblocks = 2, ndatblk = 2
        vblocknr = 16385, blkoff = 100, blocknr = 27
        vblocknr = 16386, blkoff = 101, blocknr = 28
  partial segment: blocknr = 29, nblocks = 3
    creation time = 2014-09-23 12:02:56
    nfinfo = 1
    finfo
      ino = 12, cno = 3, nblocks = 2, ndatblk = 2
        vblocknr = 16387, blkoff = 120, blocknr = 30
        vblocknr = 16389, blkoff = 121, blocknr = 31
  partial segment: blocknr = 32, nblocks = 3
    creation time = 2014-09-23 12:02:56
    nfinfo = 1
    finfo
      ino = 12, cno = 3, nblocks = 2, ndatblk = 2
        vblocknr = 16390, blkoff = 140, blocknr = 33
        vblocknr = 16391, blkoff = 141, blocknr = 34

The output of dmesg for the rollforward:

[  110.701337] NILFS warning: mounting unchecked fs
[  110.833196] NILFS (device sdb): salvaged 6 blocks
[  110.837311] segctord starting. Construction interval = 5 seconds, CP
frequency < 30 seconds
[  110.878959] NILFS: recovery complete.
[  110.882674] segctord starting. Construction interval = 5 seconds, CP
frequency < 30 seconds

The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8

Works like a charm :)

br,
Andreas Rohner


* Re: [PATCH v2] nilfs2: improve the performance of fdatasync()
       [not found]         ` <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org>
@ 2014-09-23 12:47           ` Ryusuke Konishi
       [not found]             ` <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ryusuke Konishi @ 2014-09-23 12:47 UTC (permalink / raw)
  To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On Tue, 23 Sep 2014 14:17:05 +0200, Andreas Rohner wrote:
> On 2014-09-23 12:50, Ryusuke Konishi wrote:
>> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote:
>>> Support for fdatasync() has been implemented in NILFS2 for a long time,
>>> but whenever the corresponding inode is dirty the implementation falls
>>> back to a full-flegded sync(). Since every write operation has to update
>>> the modification time of the file, the inode will almost always be dirty
>>> and fdatasync() will fall back to sync() most of the time. But this
>>> fallback is only necessary for a change of the file size and not for
>>> a change of the various timestamps.
>>>
>>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
>>> those two situations.
>>>
>>>  * If it is set the file size was changed and a full sync is necessary.
>>>  * If it is not set then only the timestamps were updated and
>>>    fdatasync() can go ahead.
>>>
>>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
>>> the exact same semantics. Unfortunately it cannot be used directly,
>>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
>>> flags when inodes are written out. So the VFS writeback thread can
>>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So
>>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
>>> nilfs_update_inode().
>>>
>>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
>> 
>> I now sent this to Andrew.
>> 
>> The datasync segments that this patch creates more frequently, will
>> cause rollforward recovery after a crash or a power failure.
>> 
>> So, please test also that the recovery works properly for fdatasync()
>> and reset.  The situation can be simulated, for example, by using
>> "reboot -nfh":
>> 
>>  # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
>>  # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
>>  # reboot -nfh
>> 
>> We can use dumpseg command to confirm that the datasync segment is
>> actually made or how recovery has done after mount.
> 
> I tested it using your script, but I duplicated the second line twice
> with different values for seek and added a md5sum at the end. So in
> total 6 blocks were written with fdatasync().
> 
> The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8
> 
> The last lines of the output of dumpseg after reboot:
> 
>   partial segment: blocknr = 26, nblocks = 3
>     creation time = 2014-09-23 12:02:56
>     nfinfo = 1
>     finfo
>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>         vblocknr = 16385, blkoff = 100, blocknr = 27
>         vblocknr = 16386, blkoff = 101, blocknr = 28
>   partial segment: blocknr = 29, nblocks = 3
>     creation time = 2014-09-23 12:02:56
>     nfinfo = 1
>     finfo
>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>         vblocknr = 16387, blkoff = 120, blocknr = 30
>         vblocknr = 16389, blkoff = 121, blocknr = 31
>   partial segment: blocknr = 32, nblocks = 3
>     creation time = 2014-09-23 12:02:56
>     nfinfo = 1
>     finfo
>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>         vblocknr = 16390, blkoff = 140, blocknr = 33
>         vblocknr = 16391, blkoff = 141, blocknr = 34
> 
> The output of dmesg for the rollforward:
> 
> [  110.701337] NILFS warning: mounting unchecked fs
> [  110.833196] NILFS (device sdb): salvaged 6 blocks
> [  110.837311] segctord starting. Construction interval = 5 seconds, CP
> frequency < 30 seconds
> [  110.878959] NILFS: recovery complete.
> [  110.882674] segctord starting. Construction interval = 5 seconds, CP
> frequency < 30 seconds
> 
> The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8
> 
> Works like a charm :)

Thank you, it looks perfect so far.


By the way, if you are interested in improving this sort of bad
implementation, please consider improving the inode allocator that we
can see at nilfs_ifile_create_inode().

It always searches for a free inode starting from ino=0.  It doesn't
use the knowledge of the last allocated inode number (inumber) nor any
locality of close-knit inodes such as a file and the directory that
contains it.

A simple strategy is to start finding a free inode from (inumber of
the parent directory) + 1, but this may not work efficiently if the
namespace has multiple active directories, and it requires that the
inumbers of directories are suitably dispersed.  On the other hand, it
increases the number of disk reads, and it also increases the number of
inode blocks to be written out if inodes are allocated too discretely.

The optimal strategy may differ from that of other file systems
because inode blocks are not allocated to static places in nilfs.  For
example, it may be better to gather the inodes of frequently accessed
directories into the first valid inode block (on ifile) for nilfs.

Regards,
Ryusuke Konishi

* Re: [PATCH v2] nilfs2: improve the performance of fdatasync()
       [not found]             ` <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
@ 2014-09-23 14:21               ` Andreas Rohner
       [not found]                 ` <542181ED.606-hi6Y0CQ0nG0@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rohner @ 2014-09-23 14:21 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On 2014-09-23 14:47, Ryusuke Konishi wrote:
> On Tue, 23 Sep 2014 14:17:05 +0200, Andreas Rohner wrote:
>> On 2014-09-23 12:50, Ryusuke Konishi wrote:
>>> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote:
>>>> Support for fdatasync() has been implemented in NILFS2 for a long time,
>>>> but whenever the corresponding inode is dirty the implementation falls
>>>> back to a full-flegded sync(). Since every write operation has to update
>>>> the modification time of the file, the inode will almost always be dirty
>>>> and fdatasync() will fall back to sync() most of the time. But this
>>>> fallback is only necessary for a change of the file size and not for
>>>> a change of the various timestamps.
>>>>
>>>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between
>>>> those two situations.
>>>>
>>>>  * If it is set the file size was changed and a full sync is necessary.
>>>>  * If it is not set then only the timestamps were updated and
>>>>    fdatasync() can go ahead.
>>>>
>>>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with
>>>> the exact same semantics. Unfortunately it cannot be used directly,
>>>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS
>>>> flags when inodes are written out. So the VFS writeback thread can
>>>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So
>>>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in
>>>> nilfs_update_inode().
>>>>
>>>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>
>>>
>>> I now sent this to Andrew.
>>>
>>> The datasync segments that this patch creates more frequently, will
>>> cause rollforward recovery after a crash or a power failure.
>>>
>>> So, please test also that the recovery works properly for fdatasync()
>>> and reset.  The situation can be simulated, for example, by using
>>> "reboot -nfh":
>>>
>>>  # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999
>>>  # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat
>>>  # reboot -nfh
>>>
>>> We can use dumpseg command to confirm that the datasync segment is
>>> actually made or how recovery has done after mount.
>>
>> I tested it using your script, but I duplicated the second line twice
>> with different values for seek and added a md5sum at the end. So in
>> total 6 blocks were written with fdatasync().
>>
>> The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8
>>
>> The last lines of the output of dumpseg after reboot:
>>
>>   partial segment: blocknr = 26, nblocks = 3
>>     creation time = 2014-09-23 12:02:56
>>     nfinfo = 1
>>     finfo
>>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>>         vblocknr = 16385, blkoff = 100, blocknr = 27
>>         vblocknr = 16386, blkoff = 101, blocknr = 28
>>   partial segment: blocknr = 29, nblocks = 3
>>     creation time = 2014-09-23 12:02:56
>>     nfinfo = 1
>>     finfo
>>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>>         vblocknr = 16387, blkoff = 120, blocknr = 30
>>         vblocknr = 16389, blkoff = 121, blocknr = 31
>>   partial segment: blocknr = 32, nblocks = 3
>>     creation time = 2014-09-23 12:02:56
>>     nfinfo = 1
>>     finfo
>>       ino = 12, cno = 3, nblocks = 2, ndatblk = 2
>>         vblocknr = 16390, blkoff = 140, blocknr = 33
>>         vblocknr = 16391, blkoff = 141, blocknr = 34
>>
>> The output of dmesg for the rollforward:
>>
>> [  110.701337] NILFS warning: mounting unchecked fs
>> [  110.833196] NILFS (device sdb): salvaged 6 blocks
>> [  110.837311] segctord starting. Construction interval = 5 seconds, CP
>> frequency < 30 seconds
>> [  110.878959] NILFS: recovery complete.
>> [  110.882674] segctord starting. Construction interval = 5 seconds, CP
>> frequency < 30 seconds
>>
>> The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8
>>
>> Works like a charm :)
> 
> Thank you, it looks perfect so far.

You're welcome :)

> 
> By the way, if you are interested in improving this sort of bad
> implemetation, please consider improving inode allocator that we can
> see at nilfs_ifile_create_inode().
> 
> It always searches free inode from ino=0.  It doesn't use the
> knowledge of the last allocated inode number (inumber) nor any
> locality of close-knit inodes such as a file and the directory that
> contains it.
> 
> A simple strategy is to start finding a free inode from (inumber of
> the parent directory) + 1, but this may not work efficiently if the
> namespace has multiple active directories, and requires that inumbers
> of directories are suitably dispersed.  On the other hands, it
> increases the number of disk read and also increases the number of
> inode blocks to be written out if inodes are allocated too discretely.
> 
> The optimal strategy may differ from that of other file systems
> because inode blocks are not allocated to static places in nilfs.  For
> example, it may be better if we gather inodes of frequently accessed
> directories into the first valid inode block (on ifile) for nilfs.

Sure, I'll have a look at it, but this seems to be a hard problem.

Since one inode has 128 bytes, a typical block of 4096 bytes contains 32
inodes. We could just allocate every directory inode into an empty block
with 31 free slots. Then any subsequent file inode allocation would
first search the 31 slots of the parent directory and, if they are full,
fall back to a search starting with ino 0.

This way, if a directory has fewer than 32 files, all its inodes can be
read in with a single block. If a directory has more than 32 files, its
inodes will spill over into the slots of other directories.

But I am not sure if this strategy would pay off.

br,
Andreas Rohner

* improve inode allocation (was Re: [PATCH v2] nilfs2: improve the performance of fdatasync())
       [not found]                 ` <542181ED.606-hi6Y0CQ0nG0@public.gmane.org>
@ 2014-09-23 16:35                   ` Ryusuke Konishi
       [not found]                     ` <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ryusuke Konishi @ 2014-09-23 16:35 UTC (permalink / raw)
  To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote:
> On 2014-09-23 14:47, Ryusuke Konishi wrote:
>> By the way, if you are interested in improving this sort of bad
>> implemetation, please consider improving inode allocator that we can
>> see at nilfs_ifile_create_inode().
>> 
>> It always searches free inode from ino=0.  It doesn't use the
>> knowledge of the last allocated inode number (inumber) nor any
>> locality of close-knit inodes such as a file and the directory that
>> contains it.
>> 
>> A simple strategy is to start finding a free inode from (inumber of
>> the parent directory) + 1, but this may not work efficiently if the
>> namespace has multiple active directories, and requires that inumbers
>> of directories are suitably dispersed.  On the other hands, it
>> increases the number of disk read and also increases the number of
>> inode blocks to be written out if inodes are allocated too discretely.
>> 
>> The optimal strategy may differ from that of other file systems
>> because inode blocks are not allocated to static places in nilfs.  For
>> example, it may be better if we gather inodes of frequently accessed
>> directories into the first valid inode block (on ifile) for nilfs.
> 
> Sure I'll have a look at it, but this seems to be a hard problem.
> 
> Since one inode has 128 bytes a typical block of 4096 contains 32
> inodes. We could just allocate every directory inode into an empty block
> with 31 free slots. Then any subsequent file inode allocation would
> first search the 31 slots of the parent directory and if they are full,
> fallback to a search starting with ino 0.

We can utilize several characteristics of metadata files for this
problem:

- They support read-ahead.  When the ifile reads an inode block, we
  can expect that several subsequent blocks will be loaded into the
  page cache in the background.

- The B-tree of NILFS is efficient at holding sparse blocks.  This
  means that putting close-knit 32 * n inodes far from offset=0 is not
  so bad.

- The ifile can now have private variables in the nilfs_ifile_info
  (on-memory) struct.  They are available to store context information
  for the allocator without compatibility issues.

- We can also use the nilfs_inode_info struct of directories to store
  directory-based allocator context without losing compatibility.

- The only caller of nilfs_ifile_create_inode() is nilfs_new_inode(),
  and this function knows the inode of the parent directory.

> This way if a directory has less than 32 files, all its inodes can be
> read in with one single block. If a directory has more than 32 files its
> inodes will spill over into the slots of other directories.
> 
> But I am not sure if this strategy would pay off.

Yes, for small namespaces, the current implementation may be enough.
We should first decide how to evaluate the effect of the algorithm.
It may be the scalability of the namespace.

Regards,
Ryusuke Konishi

* Re: improve inode allocation (was Re: [PATCH v2] nilfs2: improve the performance of fdatasync())
       [not found]                     ` <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
@ 2014-09-24  8:01                       ` Andreas Rohner
       [not found]                         ` <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Andreas Rohner @ 2014-09-24  8:01 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On 2014-09-23 18:35, Ryusuke Konishi wrote:
> On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote:
>> On 2014-09-23 14:47, Ryusuke Konishi wrote:
>>> By the way, if you are interested in improving this sort of bad
>>> implemetation, please consider improving inode allocator that we can
>>> see at nilfs_ifile_create_inode().
>>>
>>> It always searches free inode from ino=0.  It doesn't use the
>>> knowledge of the last allocated inode number (inumber) nor any
>>> locality of close-knit inodes such as a file and the directory that
>>> contains it.
>>>
>>> A simple strategy is to start finding a free inode from (inumber of
>>> the parent directory) + 1, but this may not work efficiently if the
>>> namespace has multiple active directories, and requires that inumbers
>>> of directories are suitably dispersed.  On the other hands, it
>>> increases the number of disk read and also increases the number of
>>> inode blocks to be written out if inodes are allocated too discretely.
>>>
>>> The optimal strategy may differ from that of other file systems
>>> because inode blocks are not allocated to static places in nilfs.  For
>>> example, it may be better if we gather inodes of frequently accessed
>>> directories into the first valid inode block (on ifile) for nilfs.
>>
>> Sure I'll have a look at it, but this seems to be a hard problem.
>>
>> Since one inode has 128 bytes a typical block of 4096 contains 32
>> inodes. We could just allocate every directory inode into an empty block
>> with 31 free slots. Then any subsequent file inode allocation would
>> first search the 31 slots of the parent directory and if they are full,
>> fallback to a search starting with ino 0.
> 
> We can utilize several characteristics of metadata files for this
> problem:
> 
> - It supports read ahead feature.  when ifile reads an inode block, we
>   can expect that several subsequent blocks will be loaded to page
>   cache in the background.
> 
> - B-tree of NILFS is efficient to hold sparse blocks.  This means that
>   putting close-knit 32 * n inodes far from offset=0 is not so bad.
> 
> - ifile now can have private variables in nilfs_ifile_info (on-memory)
>   struct.  They are available to store context information of
>   allocator without compatibility issue.
> 
> - We can also use nilfs_inode_info struct of directories to store
>   directory-based context of allocator without losing compatibility.
> 
> - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and
>   this function knows the inode of the parent directory.

Then the only problem is how to efficiently allocate the directories. We
could do something similar to the Orlov allocator used by the ext2/3/4
file systems:

1. We spread out first-level directories. Each one gets a full bitmap
   block (or half a bitmap block).
2. For the other directories we try to choose the bitmap block of the
   parent, unless its number of free inodes is below a certain
   threshold. Within this bitmap block the directories should also be
   spread out.

File inodes just start a linear search at the parent's inode if there
is enough space left in the bitmap.
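
To make the idea concrete, here is a stand-alone toy model of the policy
(plain user-space C, not NILFS2 code; it ignores the first-level and
threshold refinements above and just models "directories get an empty
block, files try the parent's block first"):

#include <stdio.h>

#define SLOTS_PER_BLOCK 32      /* 4096-byte block / 128-byte inode */
#define NBLOCKS 1024            /* toy ifile size */

static unsigned char used[NBLOCKS * SLOTS_PER_BLOCK];  /* toy bitmap */

/* linear scan for a free slot in [from, to) */
static long alloc_in_range(long from, long to)
{
        long i;

        for (i = from; i < to; i++) {
                if (!used[i]) {
                        used[i] = 1;
                        return i;
                }
        }
        return -1;
}

/* directory: prefer a completely empty block so its files can follow it */
static long alloc_dir_ino(void)
{
        long b, i;

        for (b = 0; b < NBLOCKS; b++) {
                for (i = 0; i < SLOTS_PER_BLOCK; i++)
                        if (used[b * SLOTS_PER_BLOCK + i])
                                break;
                if (i == SLOTS_PER_BLOCK)
                        return alloc_in_range(b * SLOTS_PER_BLOCK,
                                        (b + 1) * SLOTS_PER_BLOCK);
        }
        /* no empty block left: plain scan from ino 0 */
        return alloc_in_range(0, NBLOCKS * SLOTS_PER_BLOCK);
}

/* regular file: try the parent directory's block first, then scan from 0 */
static long alloc_file_ino(long parent_ino)
{
        long block = parent_ino / SLOTS_PER_BLOCK;
        long ino = alloc_in_range(block * SLOTS_PER_BLOCK,
                        (block + 1) * SLOTS_PER_BLOCK);

        return ino >= 0 ? ino : alloc_in_range(0, NBLOCKS * SLOTS_PER_BLOCK);
}

int main(void)
{
        long dir = alloc_dir_ino();     /* lands in an empty block */

        printf("dir  -> ino %ld\n", dir);
        printf("file -> ino %ld\n", alloc_file_ino(dir));
        printf("file -> ino %ld\n", alloc_file_ino(dir));
        return 0;
}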

>> This way if a directory has less than 32 files, all its inodes can be
>> read in with one single block. If a directory has more than 32 files its
>> inodes will spill over into the slots of other directories.
>>
>> But I am not sure if this strategy would pay off.
> 
> Yes, for small namespaces, the current implementation may be enough.
> We should first decide how we evaluate the effect of the algorithm.
> It may be the scalability of namespace.

It will be very difficult to measure the time accurately. I would
suggest simply counting the number of reads and writes on the device.
This can easily be done:

mkfs.nilfs2 /dev/sdb

cat /proc/diskstats > rw_before.txt

do_tests

extract_kernel_sources

...

find /mnt

cat /proc/diskstats > rw_after.txt

The algorithm with fewer writes and reads wins.

I am still not convinced that all of this will pay off, but I will try a
few things and see if it works.

br,
Andreas Rohner


* Re: improve inode allocation
       [not found]                         ` <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org>
@ 2014-09-24 15:01                           ` Ryusuke Konishi
       [not found]                             ` <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
  0 siblings, 1 reply; 10+ messages in thread
From: Ryusuke Konishi @ 2014-09-24 15:01 UTC (permalink / raw)
  To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On Wed, 24 Sep 2014 10:01:05 +0200, Andreas Rohner wrote:
> On 2014-09-23 18:35, Ryusuke Konishi wrote:
>> On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote:
>>> On 2014-09-23 14:47, Ryusuke Konishi wrote:
>>>> By the way, if you are interested in improving this sort of bad
>>>> implemetation, please consider improving inode allocator that we can
>>>> see at nilfs_ifile_create_inode().
>>>>
>>>> It always searches free inode from ino=0.  It doesn't use the
>>>> knowledge of the last allocated inode number (inumber) nor any
>>>> locality of close-knit inodes such as a file and the directory that
>>>> contains it.
>>>>
>>>> A simple strategy is to start finding a free inode from (inumber of
>>>> the parent directory) + 1, but this may not work efficiently if the
>>>> namespace has multiple active directories, and requires that inumbers
>>>> of directories are suitably dispersed.  On the other hands, it
>>>> increases the number of disk read and also increases the number of
>>>> inode blocks to be written out if inodes are allocated too discretely.
>>>>
>>>> The optimal strategy may differ from that of other file systems
>>>> because inode blocks are not allocated to static places in nilfs.  For
>>>> example, it may be better if we gather inodes of frequently accessed
>>>> directories into the first valid inode block (on ifile) for nilfs.
>>>
>>> Sure I'll have a look at it, but this seems to be a hard problem.
>>>
>>> Since one inode has 128 bytes a typical block of 4096 contains 32
>>> inodes. We could just allocate every directory inode into an empty block
>>> with 31 free slots. Then any subsequent file inode allocation would
>>> first search the 31 slots of the parent directory and if they are full,
>>> fallback to a search starting with ino 0.
>> 
>> We can utilize several characteristics of metadata files for this
>> problem:
>> 
>> - It supports read ahead feature.  when ifile reads an inode block, we
>>   can expect that several subsequent blocks will be loaded to page
>>   cache in the background.
>> 
>> - B-tree of NILFS is efficient to hold sparse blocks.  This means that
>>   putting close-knit 32 * n inodes far from offset=0 is not so bad.
>> 
>> - ifile now can have private variables in nilfs_ifile_info (on-memory)
>>   struct.  They are available to store context information of
>>   allocator without compatibility issue.
>> 
>> - We can also use nilfs_inode_info struct of directories to store
>>   directory-based context of allocator without losing compatibility.
>> 
>> - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and
>>   this function knows the inode of the parent directory.
> 
> Then the only problem is how to efficiently allocate the directories. We
> could do something similar to the Orlov allocator used by the ext2/3/4
> file systems:
> 
> 1. We spread first level directories. Every one gets a full bitmap
>    block (or half a bitmap block)
> 2. For the other directories we will try to choose the bitmap block of
>    the parent unless the number of free inodes is below a certain
>    threshold. Within this bitmap block the directories should also
>    spread out.

In my understanding, the basic strategy of the Orlov allocator is to
physically spread out subtrees over cylinder groups.  This strategy is
effective for ext2/ext3/ext4 to mitigate overheads which come from
disk seeks.  The strategy increases the locality of data and metadata
and that of a parent directory and its child nodes, but the same
thing isn't always true for nilfs because the real block allocation of
the ifile and other files, including directories, is virtualized and
doesn't reflect the underlying physics (e.g. the relation between LBA
and seek time) as is.

I think strategy 1 above doesn't make sense here, unlike in ext2/3/4.

> File inodes will just start a linear search at the parents inode if
> there is enough space left in the bitmap.
> 
>>> This way if a directory has less than 32 files, all its inodes can be
>>> read in with one single block. If a directory has more than 32 files its
>>> inodes will spill over into the slots of other directories.
>>>
>>> But I am not sure if this strategy would pay off.
>> 
>> Yes, for small namespaces, the current implementation may be enough.
>> We should first decide how we evaluate the effect of the algorithm.
>> It may be the scalability of namespace.
> 
> It will be very difficult to measure the time accurately. I would
> suggest to simply count the number of reads and writes on the device.
> This can be easily done:
> 
> mkfs.nilfs2 /dev/sdb
> 
> cat /proc/diskstats > rw_before.txt
> 
> do_tests
> 
> extract_kernel_sources
> 
> ...
> 
> find /mnt
> 
> cat /proc/diskstats > rw_after.txt
> 
> The algorithm with fewer writes and reads wins.
> 
> I am still not convinced that all of this will pay off, but I will try a
> few things and see if it works.

How about measuring the following?

(1) Latency of inode allocation and deletion in a file system which
    holds millions of inodes.

    (Can you estimate the cost of the bitmap scan per block and
     its relation to the number of inodes?)

(2) Time to traverse a real model directory tree such as a full UNIX
    system tree or the data storage of an actually used samba server.

What do you think?

Regards,
Ryusuke Konishi

> br,
> Andreas Rohner
> 

* Re: improve inode allocation
       [not found]                             ` <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>
@ 2014-09-24 16:18                               ` Andreas Rohner
  0 siblings, 0 replies; 10+ messages in thread
From: Andreas Rohner @ 2014-09-24 16:18 UTC (permalink / raw)
  To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA

On 2014-09-24 17:01, Ryusuke Konishi wrote:
> On Wed, 24 Sep 2014 10:01:05 +0200, Andreas Rohner wrote:
>> On 2014-09-23 18:35, Ryusuke Konishi wrote:
>>> On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote:
>>>> On 2014-09-23 14:47, Ryusuke Konishi wrote:
>>>>> By the way, if you are interested in improving this sort of bad
>>>>> implemetation, please consider improving inode allocator that we can
>>>>> see at nilfs_ifile_create_inode().
>>>>>
>>>>> It always searches free inode from ino=0.  It doesn't use the
>>>>> knowledge of the last allocated inode number (inumber) nor any
>>>>> locality of close-knit inodes such as a file and the directory that
>>>>> contains it.
>>>>>
>>>>> A simple strategy is to start finding a free inode from (inumber of
>>>>> the parent directory) + 1, but this may not work efficiently if the
>>>>> namespace has multiple active directories, and requires that inumbers
>>>>> of directories are suitably dispersed.  On the other hands, it
>>>>> increases the number of disk read and also increases the number of
>>>>> inode blocks to be written out if inodes are allocated too discretely.
>>>>>
>>>>> The optimal strategy may differ from that of other file systems
>>>>> because inode blocks are not allocated to static places in nilfs.  For
>>>>> example, it may be better if we gather inodes of frequently accessed
>>>>> directories into the first valid inode block (on ifile) for nilfs.
>>>>
>>>> Sure I'll have a look at it, but this seems to be a hard problem.
>>>>
>>>> Since one inode has 128 bytes a typical block of 4096 contains 32
>>>> inodes. We could just allocate every directory inode into an empty block
>>>> with 31 free slots. Then any subsequent file inode allocation would
>>>> first search the 31 slots of the parent directory and if they are full,
>>>> fallback to a search starting with ino 0.
>>>
>>> We can utilize several characteristics of metadata files for this
>>> problem:
>>>
>>> - It supports read ahead feature.  when ifile reads an inode block, we
>>>   can expect that several subsequent blocks will be loaded to page
>>>   cache in the background.
>>>
>>> - B-tree of NILFS is efficient to hold sparse blocks.  This means that
>>>   putting close-knit 32 * n inodes far from offset=0 is not so bad.
>>>
>>> - ifile now can have private variables in nilfs_ifile_info (on-memory)
>>>   struct.  They are available to store context information of
>>>   allocator without compatibility issue.
>>>
>>> - We can also use nilfs_inode_info struct of directories to store
>>>   directory-based context of allocator without losing compatibility.
>>>
>>> - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and
>>>   this function knows the inode of the parent directory.
>>
>> Then the only problem is how to efficiently allocate the directories. We
>> could do something similar to the Orlov allocator used by the ext2/3/4
>> file systems:
>>
>> 1. We spread first level directories. Every one gets a full bitmap
>>    block (or half a bitmap block)
>> 2. For the other directories we will try to choose the bitmap block of
>>    the parent unless the number of free inodes is below a certain
>>    threshold. Within this bitmap block the directories should also
>>    spread out.
> 
> In my understanding, the basic strategy of the Orlov allocator is to
> physically spead out subtrees over cylinder groups.  This strategy is
> effective for ext2/ext3/ext4 to mitigate overheads which come from
> disk seeks.  The strategy increases the locality of data and metadata
> and that of a parent directory and its childs nodes, but the same
> thing isn't always true for nilfs because real block allocation of
> ifile and other files including directories is virtualized and doesn't
> reflect underlying phyiscs (e.g. relation between LBA and seek
> time) as is.
> 
> I think the strategy 1 above doesn't make sense unlike ext2/3/4.

I know that the ifile is a sparse file and its blocks can end up anywhere
on disk, independent of the offset in the ifile. I just thought it might
be a good idea to give top-level directories more room to grow. But you
are probably right, and it makes no sense for nilfs...

>> File inodes will just start a linear search at the parents inode if
>> there is enough space left in the bitmap.
>>
>>>> This way if a directory has less than 32 files, all its inodes can be
>>>> read in with one single block. If a directory has more than 32 files its
>>>> inodes will spill over into the slots of other directories.
>>>>
>>>> But I am not sure if this strategy would pay off.
>>>
>>> Yes, for small namespaces, the current implementation may be enough.
>>> We should first decide how we evaluate the effect of the algorithm.
>>> It may be the scalability of namespace.
>>
>> It will be very difficult to measure the time accurately. I would
>> suggest to simply count the number of reads and writes on the device.
>> This can be easily done:
>>
>> mkfs.nilfs2 /dev/sdb
>>
>> cat /proc/diskstats > rw_before.txt
>>
>> do_tests
>>
>> extract_kernel_sources
>>
>> ...
>>
>> find /mnt
>>
>> cat /proc/diskstats > rw_after.txt
>>
>> The algorithm with fewer writes and reads wins.
>>
>> I am still not convinced that all of this will pay off, but I will try a
>> few things and see if it works.
> 
> How about measuring the following performance?
> 
> (1) Latency of inode allocation and deletion in a file system which
>     holds millions of inodes.
> 
>     (Can you estimate the cost of bitmap scan per block and
>      its relation with the number of inodes ?)
> 
> (2) Time to traverse a real model directory tree such as a full UNIX
>     system tree or data storage of an actually-used samba server.
> 
> How do you think ?

Yes, I agree. We definitely need a test case with millions of inodes
that measures allocation latency and traversal time.

But I think that even with millions of inodes both algorithms will be
very fast, so any time measurement will contain a lot of variability
from sources we cannot control, and it will be hard to measure a
difference. That is why I suggested measuring the number of blocks read
from and written to disk, which is easy, repeatable and a good indicator
of the performance of the algorithm.
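
A combined run could look roughly like this (the device, paths and file
counts are of course just placeholders):

mkfs.nilfs2 /dev/sdb && mount -t nilfs2 /dev/sdb /mnt

cat /proc/diskstats > rw_before.txt

# allocation latency: e.g. 100 directories with 10000 empty files each
time bash -c 'for d in $(seq 100); do mkdir /mnt/d$d; for f in $(seq 10000); do : > /mnt/d$d/f$f; done; done'

sync && echo 3 > /proc/sys/vm/drop_caches

# traversal time with cold caches
time find /mnt > /dev/null

cat /proc/diskstats > rw_after.txt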

Regards,
Andreas Rohner
