* [PATCH v2] nilfs2: improve the performance of fdatasync() @ 2014-09-23 8:46 Andreas Rohner [not found] ` <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Andreas Rohner @ 2014-09-23 8:46 UTC (permalink / raw) To: linux-nilfs-u79uwXL29TY76Z2rM5mHXA; +Cc: Andreas Rohner Support for fdatasync() has been implemented in NILFS2 for a long time, but whenever the corresponding inode is dirty the implementation falls back to a full-fledged sync(). Since every write operation has to update the modification time of the file, the inode will almost always be dirty and fdatasync() will fall back to sync() most of the time. But this fallback is only necessary for a change of the file size and not for a change of the various timestamps. This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between those two situations. * If it is set the file size was changed and a full sync is necessary. * If it is not set then only the timestamps were updated and fdatasync() can go ahead. There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with the exact same semantics. Unfortunately it cannot be used directly, because NILFS2 doesn't implement write_inode() and doesn't clear the VFS flags when inodes are written out. So the VFS writeback thread can clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in nilfs_update_inode(). 
Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> --- fs/nilfs2/inode.c | 13 +++++++------ fs/nilfs2/nilfs.h | 14 +++++++++++--- fs/nilfs2/segment.c | 4 ++-- 3 files changed, 20 insertions(+), 11 deletions(-) diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 6252b17..67f2112 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -125,7 +125,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_abort(inode->i_sb); goto out; } - nilfs_mark_inode_dirty(inode); + nilfs_mark_inode_dirty_sync(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); @@ -667,7 +667,7 @@ void nilfs_write_inode_common(struct inode *inode, for substitutions of appended fields */ } -void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) { ino_t ino = inode->i_ino; struct nilfs_inode_info *ii = NILFS_I(inode); @@ -678,7 +678,8 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); - set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + if (flags & I_DIRTY_DATASYNC) + set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode, 0); /* XXX: call with has_bmap = 0 is a workaround to avoid @@ -934,7 +935,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) return 0; } -int nilfs_mark_inode_dirty(struct inode *inode) +int __nilfs_mark_inode_dirty(struct inode *inode, int flags) { struct buffer_head *ibh; int err; @@ -945,7 +946,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) "failed to reget inode block.\n"); return err; } - nilfs_update_inode(inode, ibh); + nilfs_update_inode(inode, ibh, flags); mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); brelse(ibh); @@ -978,7 +979,7 @@ void 
nilfs_dirty_inode(struct inode *inode, int flags) return; } nilfs_transaction_begin(inode->i_sb, &ti, 0); - nilfs_mark_inode_dirty(inode); + __nilfs_mark_inode_dirty(inode, flags); nilfs_transaction_commit(inode->i_sb); /* never fails */ } diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 0696161..91093cd 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -104,7 +104,7 @@ enum { constructor */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ - NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ }; @@ -273,7 +273,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); -extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); @@ -282,10 +282,18 @@ int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); -extern int nilfs_mark_inode_dirty(struct inode *); +extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +static inline int nilfs_mark_inode_dirty(struct inode *inode) +{ + return __nilfs_mark_inode_dirty(inode, I_DIRTY); +} +static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) +{ + return 
__nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC); +} /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a1a1916..6e6690e 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -930,7 +930,7 @@ static void nilfs_drop_collected_inodes(struct list_head *head) if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) continue; - clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + clear_bit(NILFS_I_INODE_SYNC, &ii->i_state); set_bit(NILFS_I_UPDATED, &ii->i_state); } } @@ -2194,7 +2194,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, nilfs_transaction_lock(sb, &ti, 0); ii = NILFS_I(inode); - if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) || nilfs_test_opt(nilfs, STRICT_ORDER) || test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || nilfs_discontinued(nilfs)) { -- 2.1.0 -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply related [flat|nested] 10+ messages in thread
[parent not found: <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org>]
* Re: [PATCH v2] nilfs2: improve the performance of fdatasync() [not found] ` <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> @ 2014-09-23 10:50 ` Ryusuke Konishi [not found] ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Ryusuke Konishi @ 2014-09-23 10:50 UTC (permalink / raw) To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote: > Support for fdatasync() has been implemented in NILFS2 for a long time, > but whenever the corresponding inode is dirty the implementation falls > back to a full-flegded sync(). Since every write operation has to update > the modification time of the file, the inode will almost always be dirty > and fdatasync() will fall back to sync() most of the time. But this > fallback is only necessary for a change of the file size and not for > a change of the various timestamps. > > This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between > those two situations. > > * If it is set the file size was changed and a full sync is necessary. > * If it is not set then only the timestamps were updated and > fdatasync() can go ahead. > > There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with > the exact same semantics. Unfortunately it cannot be used directly, > because NILFS2 doesn't implement write_inode() and doesn't clear the VFS > flags when inodes are written out. So the VFS writeback thread can > clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So > I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in > nilfs_update_inode(). > > Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> I now sent this to Andrew. The datasync segments that this patch creates more frequently, will cause rollforward recovery after a crash or a power failure. 
So, please test also that the recovery works properly for fdatasync() and reset. The situation can be simulated, for example, by using "reboot -nfh": # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999 # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat # reboot -nfh We can use dumpseg command to confirm that the datasync segment is actually made or how recovery has done after mount. Thanks, Ryusuke Konishi -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>]
* Re: [PATCH v2] nilfs2: improve the performance of fdatasync() [not found] ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> @ 2014-09-23 11:11 ` Andreas Rohner 2014-09-23 12:17 ` Andreas Rohner 1 sibling, 0 replies; 10+ messages in thread From: Andreas Rohner @ 2014-09-23 11:11 UTC (permalink / raw) To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On 2014-09-23 12:50, Ryusuke Konishi wrote: > On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote: >> Support for fdatasync() has been implemented in NILFS2 for a long time, >> but whenever the corresponding inode is dirty the implementation falls >> back to a full-flegded sync(). Since every write operation has to update >> the modification time of the file, the inode will almost always be dirty >> and fdatasync() will fall back to sync() most of the time. But this >> fallback is only necessary for a change of the file size and not for >> a change of the various timestamps. >> >> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between >> those two situations. >> >> * If it is set the file size was changed and a full sync is necessary. >> * If it is not set then only the timestamps were updated and >> fdatasync() can go ahead. >> >> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with >> the exact same semantics. Unfortunately it cannot be used directly, >> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS >> flags when inodes are written out. So the VFS writeback thread can >> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So >> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in >> nilfs_update_inode(). >> >> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> > > I now sent this to Andrew. > > The datasync segments that this patch creates more frequently, will > cause rollforward recovery after a crash or a power failure. 
> > So, please test also that the recovery works properly for fdatasync() > and reset. The situation can be simulated, for example, by using > "reboot -nfh": > > # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999 > # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat > # reboot -nfh Very nice script to test this! > We can use dumpseg command to confirm that the datasync segment is > actually made or how recovery has done after mount. I already tested this before I sent in my patch. I used a virtual machine and just killed the process after the fdatasync(). After the rollforward NILFS reports the correct number of blocks salvaged and the md5sum of the file is correct. I will test it again with dumpseg the way you suggested. br, Andreas Rohner -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
* Re: [PATCH v2] nilfs2: improve the performance of fdatasync() [not found] ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 2014-09-23 11:11 ` Andreas Rohner @ 2014-09-23 12:17 ` Andreas Rohner [not found] ` <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org> 1 sibling, 1 reply; 10+ messages in thread From: Andreas Rohner @ 2014-09-23 12:17 UTC (permalink / raw) To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On 2014-09-23 12:50, Ryusuke Konishi wrote: > On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote: >> Support for fdatasync() has been implemented in NILFS2 for a long time, >> but whenever the corresponding inode is dirty the implementation falls >> back to a full-flegded sync(). Since every write operation has to update >> the modification time of the file, the inode will almost always be dirty >> and fdatasync() will fall back to sync() most of the time. But this >> fallback is only necessary for a change of the file size and not for >> a change of the various timestamps. >> >> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between >> those two situations. >> >> * If it is set the file size was changed and a full sync is necessary. >> * If it is not set then only the timestamps were updated and >> fdatasync() can go ahead. >> >> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with >> the exact same semantics. Unfortunately it cannot be used directly, >> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS >> flags when inodes are written out. So the VFS writeback thread can >> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So >> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in >> nilfs_update_inode(). >> >> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> > > I now sent this to Andrew. 
> > The datasync segments that this patch creates more frequently, will > cause rollforward recovery after a crash or a power failure. > > So, please test also that the recovery works properly for fdatasync() > and reset. The situation can be simulated, for example, by using > "reboot -nfh": > > # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999 > # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat > # reboot -nfh > > We can use dumpseg command to confirm that the datasync segment is > actually made or how recovery has done after mount. I tested it using your script, but I duplicated the second line twice with different values for seek and added a md5sum at the end. So in total 6 blocks were written with fdatasync(). The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8 The last lines of the output of dumpseg after reboot: partial segment: blocknr = 26, nblocks = 3 creation time = 2014-09-23 12:02:56 nfinfo = 1 finfo ino = 12, cno = 3, nblocks = 2, ndatblk = 2 vblocknr = 16385, blkoff = 100, blocknr = 27 vblocknr = 16386, blkoff = 101, blocknr = 28 partial segment: blocknr = 29, nblocks = 3 creation time = 2014-09-23 12:02:56 nfinfo = 1 finfo ino = 12, cno = 3, nblocks = 2, ndatblk = 2 vblocknr = 16387, blkoff = 120, blocknr = 30 vblocknr = 16389, blkoff = 121, blocknr = 31 partial segment: blocknr = 32, nblocks = 3 creation time = 2014-09-23 12:02:56 nfinfo = 1 finfo ino = 12, cno = 3, nblocks = 2, ndatblk = 2 vblocknr = 16390, blkoff = 140, blocknr = 33 vblocknr = 16391, blkoff = 141, blocknr = 34 The output of dmesg for the rollforward: [ 110.701337] NILFS warning: mounting unchecked fs [ 110.833196] NILFS (device sdb): salvaged 6 blocks [ 110.837311] segctord starting. Construction interval = 5 seconds, CP frequency < 30 seconds [ 110.878959] NILFS: recovery complete. [ 110.882674] segctord starting. 
Construction interval = 5 seconds, CP frequency < 30 seconds The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8 Works like a charm :) br, Andreas Rohner -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org>]
* Re: [PATCH v2] nilfs2: improve the performance of fdatasync() [not found] ` <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org> @ 2014-09-23 12:47 ` Ryusuke Konishi [not found] ` <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Ryusuke Konishi @ 2014-09-23 12:47 UTC (permalink / raw) To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On Tue, 23 Sep 2014 14:17:05 +0200, Andreas Rohner wrote: > On 2014-09-23 12:50, Ryusuke Konishi wrote: >> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote: >>> Support for fdatasync() has been implemented in NILFS2 for a long time, >>> but whenever the corresponding inode is dirty the implementation falls >>> back to a full-flegded sync(). Since every write operation has to update >>> the modification time of the file, the inode will almost always be dirty >>> and fdatasync() will fall back to sync() most of the time. But this >>> fallback is only necessary for a change of the file size and not for >>> a change of the various timestamps. >>> >>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between >>> those two situations. >>> >>> * If it is set the file size was changed and a full sync is necessary. >>> * If it is not set then only the timestamps were updated and >>> fdatasync() can go ahead. >>> >>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with >>> the exact same semantics. Unfortunately it cannot be used directly, >>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS >>> flags when inodes are written out. So the VFS writeback thread can >>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So >>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in >>> nilfs_update_inode(). >>> >>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> >> >> I now sent this to Andrew. 
>> >> The datasync segments that this patch creates more frequently, will >> cause rollforward recovery after a crash or a power failure. >> >> So, please test also that the recovery works properly for fdatasync() >> and reset. The situation can be simulated, for example, by using >> "reboot -nfh": >> >> # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999 >> # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat >> # reboot -nfh >> >> We can use dumpseg command to confirm that the datasync segment is >> actually made or how recovery has done after mount. > > I tested it using your script, but I duplicated the second line twice > with different values for seek and added a md5sum at the end. So in > total 6 blocks were written with fdatasync(). > > The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8 > > The last lines of the output of dumpseg after reboot: > > partial segment: blocknr = 26, nblocks = 3 > creation time = 2014-09-23 12:02:56 > nfinfo = 1 > finfo > ino = 12, cno = 3, nblocks = 2, ndatblk = 2 > vblocknr = 16385, blkoff = 100, blocknr = 27 > vblocknr = 16386, blkoff = 101, blocknr = 28 > partial segment: blocknr = 29, nblocks = 3 > creation time = 2014-09-23 12:02:56 > nfinfo = 1 > finfo > ino = 12, cno = 3, nblocks = 2, ndatblk = 2 > vblocknr = 16387, blkoff = 120, blocknr = 30 > vblocknr = 16389, blkoff = 121, blocknr = 31 > partial segment: blocknr = 32, nblocks = 3 > creation time = 2014-09-23 12:02:56 > nfinfo = 1 > finfo > ino = 12, cno = 3, nblocks = 2, ndatblk = 2 > vblocknr = 16390, blkoff = 140, blocknr = 33 > vblocknr = 16391, blkoff = 141, blocknr = 34 > > The output of dmesg for the rollforward: > > [ 110.701337] NILFS warning: mounting unchecked fs > [ 110.833196] NILFS (device sdb): salvaged 6 blocks > [ 110.837311] segctord starting. Construction interval = 5 seconds, CP > frequency < 30 seconds > [ 110.878959] NILFS: recovery complete. > [ 110.882674] segctord starting. 
Construction interval = 5 seconds, CP > frequency < 30 seconds > > The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8 > > Works like a charm :) Thank you, it looks perfect so far. By the way, if you are interested in improving this sort of bad implementation, please consider improving inode allocator that we can see at nilfs_ifile_create_inode(). It always searches free inode from ino=0. It doesn't use the knowledge of the last allocated inode number (inumber) nor any locality of close-knit inodes such as a file and the directory that contains it. A simple strategy is to start finding a free inode from (inumber of the parent directory) + 1, but this may not work efficiently if the namespace has multiple active directories, and requires that inumbers of directories are suitably dispersed. On the other hand, it increases the number of disk reads and also increases the number of inode blocks to be written out if inodes are allocated too discretely. The optimal strategy may differ from that of other file systems because inode blocks are not allocated to static places in nilfs. For example, it may be better if we gather inodes of frequently accessed directories into the first valid inode block (on ifile) for nilfs. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>]
* Re: [PATCH v2] nilfs2: improve the performance of fdatasync() [not found] ` <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> @ 2014-09-23 14:21 ` Andreas Rohner [not found] ` <542181ED.606-hi6Y0CQ0nG0@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Andreas Rohner @ 2014-09-23 14:21 UTC (permalink / raw) To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On 2014-09-23 14:47, Ryusuke Konishi wrote: > On Tue, 23 Sep 2014 14:17:05 +0200, Andreas Rohner wrote: >> On 2014-09-23 12:50, Ryusuke Konishi wrote: >>> On Tue, 23 Sep 2014 10:46:58 +0200, Andreas Rohner wrote: >>>> Support for fdatasync() has been implemented in NILFS2 for a long time, >>>> but whenever the corresponding inode is dirty the implementation falls >>>> back to a full-flegded sync(). Since every write operation has to update >>>> the modification time of the file, the inode will almost always be dirty >>>> and fdatasync() will fall back to sync() most of the time. But this >>>> fallback is only necessary for a change of the file size and not for >>>> a change of the various timestamps. >>>> >>>> This patch adds a new flag NILFS_I_INODE_SYNC to differentiate between >>>> those two situations. >>>> >>>> * If it is set the file size was changed and a full sync is necessary. >>>> * If it is not set then only the timestamps were updated and >>>> fdatasync() can go ahead. >>>> >>>> There is already a similar flag I_DIRTY_DATASYNC on the VFS layer with >>>> the exact same semantics. Unfortunately it cannot be used directly, >>>> because NILFS2 doesn't implement write_inode() and doesn't clear the VFS >>>> flags when inodes are written out. So the VFS writeback thread can >>>> clear I_DIRTY_DATASYNC at any time without notifying NILFS2. So >>>> I_DIRTY_DATASYNC has to be mapped onto NILFS_I_INODE_SYNC in >>>> nilfs_update_inode(). 
>>>> >>>> Signed-off-by: Andreas Rohner <andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> >>> >>> I now sent this to Andrew. >>> >>> The datasync segments that this patch creates more frequently, will >>> cause rollforward recovery after a crash or a power failure. >>> >>> So, please test also that the recovery works properly for fdatasync() >>> and reset. The situation can be simulated, for example, by using >>> "reboot -nfh": >>> >>> # dd if=/dev/zero of=/nilfs/test bs=4k count=1 seek=9999 >>> # dd if=/dev/urandom of=/nilfs/test bs=8k count=1 seek=50 conv=fdatasync,notrunc,nocreat >>> # reboot -nfh >>> >>> We can use dumpseg command to confirm that the datasync segment is >>> actually made or how recovery has done after mount. >> >> I tested it using your script, but I duplicated the second line twice >> with different values for seek and added a md5sum at the end. So in >> total 6 blocks were written with fdatasync(). >> >> The checksum before the reboot was: 66500bd6c7a1f89ed860cd7203f5c6e8 >> >> The last lines of the output of dumpseg after reboot: >> >> partial segment: blocknr = 26, nblocks = 3 >> creation time = 2014-09-23 12:02:56 >> nfinfo = 1 >> finfo >> ino = 12, cno = 3, nblocks = 2, ndatblk = 2 >> vblocknr = 16385, blkoff = 100, blocknr = 27 >> vblocknr = 16386, blkoff = 101, blocknr = 28 >> partial segment: blocknr = 29, nblocks = 3 >> creation time = 2014-09-23 12:02:56 >> nfinfo = 1 >> finfo >> ino = 12, cno = 3, nblocks = 2, ndatblk = 2 >> vblocknr = 16387, blkoff = 120, blocknr = 30 >> vblocknr = 16389, blkoff = 121, blocknr = 31 >> partial segment: blocknr = 32, nblocks = 3 >> creation time = 2014-09-23 12:02:56 >> nfinfo = 1 >> finfo >> ino = 12, cno = 3, nblocks = 2, ndatblk = 2 >> vblocknr = 16390, blkoff = 140, blocknr = 33 >> vblocknr = 16391, blkoff = 141, blocknr = 34 >> >> The output of dmesg for the rollforward: >> >> [ 110.701337] NILFS warning: mounting unchecked fs >> [ 110.833196] NILFS (device sdb): salvaged 6 blocks >> [ 110.837311] 
segctord starting. Construction interval = 5 seconds, CP >> frequency < 30 seconds >> [ 110.878959] NILFS: recovery complete. >> [ 110.882674] segctord starting. Construction interval = 5 seconds, CP >> frequency < 30 seconds >> >> The checksum after rollforward: 66500bd6c7a1f89ed860cd7203f5c6e8 >> >> Works like a charm :) > > Thank you, it looks perfect so far. You're welcome :) > > By the way, if you are interested in improving this sort of bad > implemetation, please consider improving inode allocator that we can > see at nilfs_ifile_create_inode(). > > It always searches free inode from ino=0. It doesn't use the > knowledge of the last allocated inode number (inumber) nor any > locality of close-knit inodes such as a file and the directory that > contains it. > > A simple strategy is to start finding a free inode from (inumber of > the parent directory) + 1, but this may not work efficiently if the > namespace has multiple active directories, and requires that inumbers > of directories are suitably dispersed. On the other hands, it > increases the number of disk read and also increases the number of > inode blocks to be written out if inodes are allocated too discretely. > > The optimal strategy may differ from that of other file systems > because inode blocks are not allocated to static places in nilfs. For > example, it may be better if we gather inodes of frequently accessed > directories into the first valid inode block (on ifile) for nilfs. Sure I'll have a look at it, but this seems to be a hard problem. Since one inode has 128 bytes a typical block of 4096 contains 32 inodes. We could just allocate every directory inode into an empty block with 31 free slots. Then any subsequent file inode allocation would first search the 31 slots of the parent directory and if they are full, fallback to a search starting with ino 0. This way if a directory has less than 32 files, all its inodes can be read in with one single block. 
If a directory has more than 32 files its inodes will spill over into the slots of other directories. But I am not sure if this strategy would pay off. br, Andreas Rohner -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <542181ED.606-hi6Y0CQ0nG0@public.gmane.org>]
* improve inode allocation (was Re: [PATCH v2] nilfs2: improve the performance of fdatasync()) [not found] ` <542181ED.606-hi6Y0CQ0nG0@public.gmane.org> @ 2014-09-23 16:35 ` Ryusuke Konishi [not found] ` <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Ryusuke Konishi @ 2014-09-23 16:35 UTC (permalink / raw) To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote: > On 2014-09-23 14:47, Ryusuke Konishi wrote: >> By the way, if you are interested in improving this sort of bad >> implemetation, please consider improving inode allocator that we can >> see at nilfs_ifile_create_inode(). >> >> It always searches free inode from ino=0. It doesn't use the >> knowledge of the last allocated inode number (inumber) nor any >> locality of close-knit inodes such as a file and the directory that >> contains it. >> >> A simple strategy is to start finding a free inode from (inumber of >> the parent directory) + 1, but this may not work efficiently if the >> namespace has multiple active directories, and requires that inumbers >> of directories are suitably dispersed. On the other hands, it >> increases the number of disk read and also increases the number of >> inode blocks to be written out if inodes are allocated too discretely. >> >> The optimal strategy may differ from that of other file systems >> because inode blocks are not allocated to static places in nilfs. For >> example, it may be better if we gather inodes of frequently accessed >> directories into the first valid inode block (on ifile) for nilfs. > > Sure I'll have a look at it, but this seems to be a hard problem. > > Since one inode has 128 bytes a typical block of 4096 contains 32 > inodes. We could just allocate every directory inode into an empty block > with 31 free slots. 
Then any subsequent file inode allocation would > first search the 31 slots of the parent directory and if they are full, > fallback to a search starting with ino 0. We can utilize several characteristics of metadata files for this problem: - It supports read ahead feature. when ifile reads an inode block, we can expect that several subsequent blocks will be loaded to page cache in the background. - B-tree of NILFS is efficient to hold sparse blocks. This means that putting close-knit 32 * n inodes far from offset=0 is not so bad. - ifile now can have private variables in nilfs_ifile_info (on-memory) struct. They are available to store context information of allocator without compatibility issue. - We can also use nilfs_inode_info struct of directories to store directory-based context of allocator without losing compatibility. - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and this function knows the inode of the parent directory. > This way if a directory has less than 32 files, all its inodes can be > read in with one single block. If a directory has more than 32 files its > inodes will spill over into the slots of other directories. > > But I am not sure if this strategy would pay off. Yes, for small namespaces, the current implementation may be enough. We should first decide how we evaluate the effect of the algorithm. It may be the scalability of namespace. Regards, Ryusuke Konishi -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>]
* Re: improve inode allocation (was Re: [PATCH v2] nilfs2: improve the performance of fdatasync()) [not found] ` <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> @ 2014-09-24 8:01 ` Andreas Rohner [not found] ` <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Andreas Rohner @ 2014-09-24 8:01 UTC (permalink / raw) To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On 2014-09-23 18:35, Ryusuke Konishi wrote: > On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote: >> On 2014-09-23 14:47, Ryusuke Konishi wrote: >>> By the way, if you are interested in improving this sort of bad >>> implemetation, please consider improving inode allocator that we can >>> see at nilfs_ifile_create_inode(). >>> >>> It always searches free inode from ino=0. It doesn't use the >>> knowledge of the last allocated inode number (inumber) nor any >>> locality of close-knit inodes such as a file and the directory that >>> contains it. >>> >>> A simple strategy is to start finding a free inode from (inumber of >>> the parent directory) + 1, but this may not work efficiently if the >>> namespace has multiple active directories, and requires that inumbers >>> of directories are suitably dispersed. On the other hands, it >>> increases the number of disk read and also increases the number of >>> inode blocks to be written out if inodes are allocated too discretely. >>> >>> The optimal strategy may differ from that of other file systems >>> because inode blocks are not allocated to static places in nilfs. For >>> example, it may be better if we gather inodes of frequently accessed >>> directories into the first valid inode block (on ifile) for nilfs. >> >> Sure I'll have a look at it, but this seems to be a hard problem. >> >> Since one inode has 128 bytes a typical block of 4096 contains 32 >> inodes. We could just allocate every directory inode into an empty block >> with 31 free slots. 
Then any subsequent file inode allocation would >> first search the 31 slots of the parent directory and if they are full, >> fallback to a search starting with ino 0. > > We can utilize several characteristics of metadata files for this > problem: > > - It supports read ahead feature. when ifile reads an inode block, we > can expect that several subsequent blocks will be loaded to page > cache in the background. > > - B-tree of NILFS is efficient to hold sparse blocks. This means that > putting close-knit 32 * n inodes far from offset=0 is not so bad. > > - ifile now can have private variables in nilfs_ifile_info (on-memory) > struct. They are available to store context information of > allocator without compatibility issue. > > - We can also use nilfs_inode_info struct of directories to store > directory-based context of allocator without losing compatibility. > > - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and > this function knows the inode of the parent directory. Then the only problem is how to efficiently allocate the directories. We could do something similar to the Orlov allocator used by the ext2/3/4 file systems: 1. We spread first level directories. Every one gets a full bitmap block (or half a bitmap block) 2. For the other directories we will try to choose the bitmap block of the parent unless the number of free inodes is below a certain threshold. Within this bitmap block the directories should also spread out. File inodes will just start a linear search at the parents inode if there is enough space left in the bitmap. >> This way if a directory has less than 32 files, all its inodes can be >> read in with one single block. If a directory has more than 32 files its >> inodes will spill over into the slots of other directories. >> >> But I am not sure if this strategy would pay off. > > Yes, for small namespaces, the current implementation may be enough. > We should first decide how we evaluate the effect of the algorithm. 
> It may be the scalability of namespace. It will be very difficult to measure the time accurately. I would suggest to simply count the number of reads and writes on the device. This can be easily done: mkfs.nilfs2 /dev/sdb cat /proc/diskstats > rw_before.txt do_tests extract_kernel_sources ... find /mnt cat /proc/diskstats > rw_after.txt The algorithm with fewer writes and reads wins. I am still not convinced that all of this will pay off, but I will try a few things and see if it works. br, Andreas Rohner -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org>]
* Re: improve inode allocation [not found] ` <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org> @ 2014-09-24 15:01 ` Ryusuke Konishi [not found] ` <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 0 siblings, 1 reply; 10+ messages in thread From: Ryusuke Konishi @ 2014-09-24 15:01 UTC (permalink / raw) To: Andreas Rohner; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On Wed, 24 Sep 2014 10:01:05 +0200, Andreas Rohner wrote: > On 2014-09-23 18:35, Ryusuke Konishi wrote: >> On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote: >>> On 2014-09-23 14:47, Ryusuke Konishi wrote: >>>> By the way, if you are interested in improving this sort of bad >>>> implemetation, please consider improving inode allocator that we can >>>> see at nilfs_ifile_create_inode(). >>>> >>>> It always searches free inode from ino=0. It doesn't use the >>>> knowledge of the last allocated inode number (inumber) nor any >>>> locality of close-knit inodes such as a file and the directory that >>>> contains it. >>>> >>>> A simple strategy is to start finding a free inode from (inumber of >>>> the parent directory) + 1, but this may not work efficiently if the >>>> namespace has multiple active directories, and requires that inumbers >>>> of directories are suitably dispersed. On the other hands, it >>>> increases the number of disk read and also increases the number of >>>> inode blocks to be written out if inodes are allocated too discretely. >>>> >>>> The optimal strategy may differ from that of other file systems >>>> because inode blocks are not allocated to static places in nilfs. For >>>> example, it may be better if we gather inodes of frequently accessed >>>> directories into the first valid inode block (on ifile) for nilfs. >>> >>> Sure I'll have a look at it, but this seems to be a hard problem. >>> >>> Since one inode has 128 bytes a typical block of 4096 contains 32 >>> inodes. 
We could just allocate every directory inode into an empty block >>> with 31 free slots. Then any subsequent file inode allocation would >>> first search the 31 slots of the parent directory and if they are full, >>> fallback to a search starting with ino 0. >> >> We can utilize several characteristics of metadata files for this >> problem: >> >> - It supports read ahead feature. when ifile reads an inode block, we >> can expect that several subsequent blocks will be loaded to page >> cache in the background. >> >> - B-tree of NILFS is efficient to hold sparse blocks. This means that >> putting close-knit 32 * n inodes far from offset=0 is not so bad. >> >> - ifile now can have private variables in nilfs_ifile_info (on-memory) >> struct. They are available to store context information of >> allocator without compatibility issue. >> >> - We can also use nilfs_inode_info struct of directories to store >> directory-based context of allocator without losing compatibility. >> >> - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and >> this function knows the inode of the parent directory. > > Then the only problem is how to efficiently allocate the directories. We > could do something similar to the Orlov allocator used by the ext2/3/4 > file systems: > > 1. We spread first level directories. Every one gets a full bitmap > block (or half a bitmap block) > 2. For the other directories we will try to choose the bitmap block of > the parent unless the number of free inodes is below a certain > threshold. Within this bitmap block the directories should also > spread out. In my understanding, the basic strategy of the Orlov allocator is to physically spead out subtrees over cylinder groups. This strategy is effective for ext2/ext3/ext4 to mitigate overheads which come from disk seeks. 
The strategy increases the locality of data and metadata and that of a parent directory and its child nodes, but the same thing isn't always true for nilfs because real block allocation of ifile and other files including directories is virtualized and doesn't reflect underlying physics (e.g. relation between LBA and seek time) as is. I think the strategy 1 above doesn't make sense unlike ext2/3/4. > File inodes will just start a linear search at the parents inode if > there is enough space left in the bitmap. > >>> This way if a directory has less than 32 files, all its inodes can be >>> read in with one single block. If a directory has more than 32 files its >>> inodes will spill over into the slots of other directories. >>> >>> But I am not sure if this strategy would pay off. >> >> Yes, for small namespaces, the current implementation may be enough. >> We should first decide how we evaluate the effect of the algorithm. >> It may be the scalability of namespace. > > It will be very difficult to measure the time accurately. I would > suggest to simply count the number of reads and writes on the device. > This can be easily done: > > mkfs.nilfs2 /dev/sdb > > cat /proc/diskstats > rw_before.txt > > do_tests > > extract_kernel_sources > > ... > > find /mnt > > cat /proc/diskstats > rw_after.txt > > The algorithm with fewer writes and reads wins. > > I am still not convinced that all of this will pay off, but I will try a > few things and see if it works. How about measuring the following performance? (1) Latency of inode allocation and deletion in a file system which holds millions of inodes. (Can you estimate the cost of bitmap scan per block and its relation with the number of inodes ?) (2) Time to traverse a real model directory tree such as a full UNIX system tree or data storage of an actually-used samba server. How do you think ? 
Regards, Ryusuke Konishi > br, > Andreas Rohner > > -- > To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in > the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org > More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
[parent not found: <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org>]
* Re: improve inode allocation [not found] ` <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> @ 2014-09-24 16:18 ` Andreas Rohner 0 siblings, 0 replies; 10+ messages in thread From: Andreas Rohner @ 2014-09-24 16:18 UTC (permalink / raw) To: Ryusuke Konishi; +Cc: linux-nilfs-u79uwXL29TY76Z2rM5mHXA On 2014-09-24 17:01, Ryusuke Konishi wrote: > On Wed, 24 Sep 2014 10:01:05 +0200, Andreas Rohner wrote: >> On 2014-09-23 18:35, Ryusuke Konishi wrote: >>> On Tue, 23 Sep 2014 16:21:33 +0200, Andreas Rohner wrote: >>>> On 2014-09-23 14:47, Ryusuke Konishi wrote: >>>>> By the way, if you are interested in improving this sort of bad >>>>> implemetation, please consider improving inode allocator that we can >>>>> see at nilfs_ifile_create_inode(). >>>>> >>>>> It always searches free inode from ino=0. It doesn't use the >>>>> knowledge of the last allocated inode number (inumber) nor any >>>>> locality of close-knit inodes such as a file and the directory that >>>>> contains it. >>>>> >>>>> A simple strategy is to start finding a free inode from (inumber of >>>>> the parent directory) + 1, but this may not work efficiently if the >>>>> namespace has multiple active directories, and requires that inumbers >>>>> of directories are suitably dispersed. On the other hands, it >>>>> increases the number of disk read and also increases the number of >>>>> inode blocks to be written out if inodes are allocated too discretely. >>>>> >>>>> The optimal strategy may differ from that of other file systems >>>>> because inode blocks are not allocated to static places in nilfs. For >>>>> example, it may be better if we gather inodes of frequently accessed >>>>> directories into the first valid inode block (on ifile) for nilfs. >>>> >>>> Sure I'll have a look at it, but this seems to be a hard problem. >>>> >>>> Since one inode has 128 bytes a typical block of 4096 contains 32 >>>> inodes. 
We could just allocate every directory inode into an empty block >>>> with 31 free slots. Then any subsequent file inode allocation would >>>> first search the 31 slots of the parent directory and if they are full, >>>> fallback to a search starting with ino 0. >>> >>> We can utilize several characteristics of metadata files for this >>> problem: >>> >>> - It supports read ahead feature. when ifile reads an inode block, we >>> can expect that several subsequent blocks will be loaded to page >>> cache in the background. >>> >>> - B-tree of NILFS is efficient to hold sparse blocks. This means that >>> putting close-knit 32 * n inodes far from offset=0 is not so bad. >>> >>> - ifile now can have private variables in nilfs_ifile_info (on-memory) >>> struct. They are available to store context information of >>> allocator without compatibility issue. >>> >>> - We can also use nilfs_inode_info struct of directories to store >>> directory-based context of allocator without losing compatibility. >>> >>> - Only caller of nilfs_ifile_create_inode() is nilfs_new_inode(), and >>> this function knows the inode of the parent directory. >> >> Then the only problem is how to efficiently allocate the directories. We >> could do something similar to the Orlov allocator used by the ext2/3/4 >> file systems: >> >> 1. We spread first level directories. Every one gets a full bitmap >> block (or half a bitmap block) >> 2. For the other directories we will try to choose the bitmap block of >> the parent unless the number of free inodes is below a certain >> threshold. Within this bitmap block the directories should also >> spread out. > > In my understanding, the basic strategy of the Orlov allocator is to > physically spead out subtrees over cylinder groups. This strategy is > effective for ext2/ext3/ext4 to mitigate overheads which come from > disk seeks. 
The strategy increases the locality of data and metadata > and that of a parent directory and its childs nodes, but the same > thing isn't always true for nilfs because real block allocation of > ifile and other files including directories is virtualized and doesn't > reflect underlying phyiscs (e.g. relation between LBA and seek > time) as is. > > I think the strategy 1 above doesn't make sense unlike ext2/3/4. I know that it is a sparse file and the blocks can end up anywhere on disk, independent of the offset in the ifile. I just thought it may be a good idea to give top level directories more room to grow. But you are probably right and it makes no sense for nilfs... >> File inodes will just start a linear search at the parents inode if >> there is enough space left in the bitmap. >> >>>> This way if a directory has less than 32 files, all its inodes can be >>>> read in with one single block. If a directory has more than 32 files its >>>> inodes will spill over into the slots of other directories. >>>> >>>> But I am not sure if this strategy would pay off. >>> >>> Yes, for small namespaces, the current implementation may be enough. >>> We should first decide how we evaluate the effect of the algorithm. >>> It may be the scalability of namespace. >> >> It will be very difficult to measure the time accurately. I would >> suggest to simply count the number of reads and writes on the device. >> This can be easily done: >> >> mkfs.nilfs2 /dev/sdb >> >> cat /proc/diskstats > rw_before.txt >> >> do_tests >> >> extract_kernel_sources >> >> ... >> >> find /mnt >> >> cat /proc/diskstats > rw_after.txt >> >> The algorithm with fewer writes and reads wins. >> >> I am still not convinced that all of this will pay off, but I will try a >> few things and see if it works. > > How about measuring the following performance? > > (1) Latency of inode allocation and deletion in a file system which > holds millions of inodes. 
> > (Can you estimate the cost of bitmap scan per block and > its relation with the number of inodes ?) > > (2) Time to traverse a real model directory tree such as a full UNIX > system tree or data storage of an actually-used samba server. > > How do you think ? Yes I agree. We definitely need some test case with millions of inodes and test for allocation latency and traversal time. But I think that even with millions of inodes both algorithms will be very fast. So any time measurement will contain a lot of variability from sources we cannot control. So it will be hard to measure a difference. That is why I suggested to measure the number of blocks read and written to disk, which is easy, repeatable and a good indicator of the performance of the algorithm. Regards, Andreas Rohner -- To unsubscribe from this list: send the line "unsubscribe linux-nilfs" in the body of a message to majordomo-u79uwXL29TY76Z2rM5mHXA@public.gmane.org More majordomo info at http://vger.kernel.org/majordomo-info.html ^ permalink raw reply [flat|nested] 10+ messages in thread
end of thread, other threads:[~2014-09-24 16:18 UTC | newest] Thread overview: 10+ messages (download: mbox.gz / follow: Atom feed) -- links below jump to the message on this page -- 2014-09-23 8:46 [PATCH v2] nilfs2: improve the performance of fdatasync() Andreas Rohner [not found] ` <1411462018-5780-1-git-send-email-andreas.rohner-hi6Y0CQ0nG0@public.gmane.org> 2014-09-23 10:50 ` Ryusuke Konishi [not found] ` <20140923.195012.716508926019147354.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 2014-09-23 11:11 ` Andreas Rohner 2014-09-23 12:17 ` Andreas Rohner [not found] ` <542164C1.7090504-hi6Y0CQ0nG0@public.gmane.org> 2014-09-23 12:47 ` Ryusuke Konishi [not found] ` <20140923.214701.237540042662663531.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 2014-09-23 14:21 ` Andreas Rohner [not found] ` <542181ED.606-hi6Y0CQ0nG0@public.gmane.org> 2014-09-23 16:35 ` improve inode allocation (was Re: [PATCH v2] nilfs2: improve the performance of fdatasync()) Ryusuke Konishi [not found] ` <20140924.013505.1831094490963391096.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 2014-09-24 8:01 ` Andreas Rohner [not found] ` <54227A41.2090509-hi6Y0CQ0nG0@public.gmane.org> 2014-09-24 15:01 ` improve inode allocation Ryusuke Konishi [not found] ` <20140925.000102.524843255498407534.konishi.ryusuke-Zyj7fXuS5i5L9jVzuh4AOg@public.gmane.org> 2014-09-24 16:18 ` Andreas Rohner
This is an external index of several public inboxes; see the mirroring instructions for how to clone and mirror all data and code used by this external index.