linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [RFC] Ext3 nanosecond timestamps in big inodes
@ 2005-01-14 21:16 Andreas Gruenbacher
  2005-01-16  4:37 ` Andi Kleen
  2005-01-16  5:46 ` Andreas Dilger
  0 siblings, 2 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2005-01-14 21:16 UTC (permalink / raw)
  To: linux-fsdevel, linux-kernel; +Cc: Alex Tomas, Andrew Tridgell

Hello,

this is a spin-off of an old patch by Alex Tomas <alex@clusterfs.com>:
Alex originally had nanosecond timestamps in his original patch; here is
a rejuvenated version. Please tell me what you think. Alex also added a
create timestamp in his original patch. Do we actually need that?

Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits 
currently are zeroed out implicitly. We could later use them remaining two 
bits for years beyond 2038.

Index: linux-2.6.11-rc1-mm1/fs/ext3/inode.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/inode.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/inode.c
@@ -628,7 +628,7 @@ static int ext3_splice_branch(handle_t *
 
 	/* We are done with atomic stuff, now do the rest of housekeeping */
 
-	inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_ctime = ext3_current_time(inode);
 	ext3_mark_inode_dirty(handle, inode);
 
 	/* had we spliced it onto indirect block? */
@@ -2201,7 +2201,7 @@ do_indirects:
 			;
 	}
 	up(&ei->truncate_sem);
-	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_ctime = ext3_current_time(inode);
 	ext3_mark_inode_dirty(handle, inode);
 
 	/* In a multi-transaction truncate, we only make the final
@@ -2501,6 +2501,14 @@ void ext3_read_inode(struct inode * inod
 				ei->i_extra_isize;
 		if (le32_to_cpu(*magic))
 			 ei->i_state |= EXT3_STATE_XATTR;
+		if (ei->i_extra_isize >= 2*sizeof(__le16) + 3*sizeof(__le32)) {
+			inode->i_atime.tv_nsec =
+				le32_to_cpu(raw_inode->i_atime_nsec);
+			inode->i_ctime.tv_nsec =
+				le32_to_cpu(raw_inode->i_ctime_nsec);
+			inode->i_mtime.tv_nsec =
+				le32_to_cpu(raw_inode->i_mtime_nsec);
+		}
 	}
 
 	if (S_ISREG(inode->i_mode)) {
@@ -2638,8 +2646,17 @@ static int ext3_do_update_inode(handle_t
 	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
 		raw_inode->i_block[block] = ei->i_data[block];
 
-	if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
+	if (ei->i_extra_isize) {
 		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
+		if (ei->i_extra_isize > 2*sizeof(__le16) + 3*sizeof(__le32)) {
+			raw_inode->i_atime_nsec =
+				cpu_to_le32(inode->i_atime.tv_nsec);
+			raw_inode->i_ctime_nsec =
+				cpu_to_le32(inode->i_ctime.tv_nsec);
+			raw_inode->i_mtime_nsec =
+				cpu_to_le32(inode->i_mtime.tv_nsec);
+		}
+	}
 
 	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
 	rc = ext3_journal_dirty_metadata(handle, bh);
Index: linux-2.6.11-rc1-mm1/fs/ext3/super.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/super.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/super.c
@@ -1402,6 +1402,8 @@ static int ext3_fill_super (struct super
 				sbi->s_inode_size);
 			goto failed_mount;
 		}
+		if (sbi->s_inode_size > EXT3_GOOD_OLD_INODE_SIZE)
+			sb->s_time_gran = 1;
 	}
 	sbi->s_frag_size = EXT3_MIN_FRAG_SIZE <<
 				   le32_to_cpu(es->s_log_frag_size);
Index: linux-2.6.11-rc1-mm1/fs/ext3/ialloc.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/ialloc.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/ialloc.c
@@ -558,7 +558,7 @@ got:
 	/* This is the optimal IO size (for stat), not the fs block size */
 	inode->i_blksize = PAGE_SIZE;
 	inode->i_blocks = 0;
-	inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_mtime = inode->i_atime = inode->i_ctime = ext3_current_time(inode);
 
 	memset(ei->i_data, 0, sizeof(ei->i_data));
 	ei->i_next_alloc_block = 0;
Index: linux-2.6.11-rc1-mm1/fs/ext3/xattr.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/xattr.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/xattr.c
@@ -1001,7 +1001,7 @@ ext3_xattr_set_handle(handle_t *handle, 
 		}
 	}
 	if (!error) {
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext3_current_time(inode);
 		error = ext3_mark_iloc_dirty(handle, inode, &is.iloc);
 		/*
 		 * The bh is consumed by ext3_mark_iloc_dirty, even with
Index: linux-2.6.11-rc1-mm1/fs/ext3/ioctl.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/ioctl.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/ioctl.c
@@ -87,7 +87,7 @@ int ext3_ioctl (struct inode * inode, st
 		ei->i_flags = flags;
 
 		ext3_set_inode_flags(inode);
-		inode->i_ctime = CURRENT_TIME_SEC;
+		inode->i_ctime = ext3_current_time(inode);
 
 		err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 flags_err:
@@ -121,7 +121,7 @@ flags_err:
 			return PTR_ERR(handle);
 		err = ext3_reserve_inode_write(handle, inode, &iloc);
 		if (err == 0) {
-			inode->i_ctime = CURRENT_TIME_SEC;
+			inode->i_ctime = ext3_current_time(inode);
 			inode->i_generation = generation;
 			err = ext3_mark_iloc_dirty(handle, inode, &iloc);
 		}
Index: linux-2.6.11-rc1-mm1/include/linux/ext3_fs.h
===================================================================
--- linux-2.6.11-rc1-mm1.orig/include/linux/ext3_fs.h
+++ linux-2.6.11-rc1-mm1/include/linux/ext3_fs.h
@@ -245,9 +245,9 @@ struct ext3_inode {
 	__le16	i_mode;		/* File mode */
 	__le16	i_uid;		/* Low 16 bits of Owner Uid */
 	__le32	i_size;		/* Size in bytes */
-	__le32	i_atime;	/* Access time */
-	__le32	i_ctime;	/* Creation time */
-	__le32	i_mtime;	/* Modification time */
+	__le32	i_atime;	/* Access time (seconds) */
+	__le32	i_ctime;	/* Change time (seconds) */
+	__le32	i_mtime;	/* Modification time (seconds) */
 	__le32	i_dtime;	/* Deletion Time */
 	__le16	i_gid;		/* Low 16 bits of Group Id */
 	__le16	i_links_count;	/* Links count */
@@ -295,6 +295,9 @@ struct ext3_inode {
 	} osd2;				/* OS dependent 2 */
 	__le16	i_extra_isize;
 	__le16	i_pad1;
+	__le32	i_atime_nsec;		/* Access time (nanoseconds) */
+	__le32	i_ctime_nsec;		/* Change time (nanoseconds) */
+	__le32	i_mtime_nsec;		/* Modification time (nanoseconds) */
 };
 
 #define i_size_high	i_dir_acl
@@ -476,6 +479,11 @@ static inline struct ext3_inode_info *EX
 {
 	return container_of(inode, struct ext3_inode_info, vfs_inode);
 }
+static inline struct timespec ext3_current_time(struct inode *inode)
+{
+	return (inode->i_sb->s_time_gran == 1) ?
+	       CURRENT_TIME : CURRENT_TIME_SEC;
+}
 #else
 /* Assume that user mode programs are passing in an ext3fs superblock, not
  * a kernel struct super_block.  This will allow us to call the feature-test
Index: linux-2.6.11-rc1-mm1/fs/ext3/namei.c
===================================================================
--- linux-2.6.11-rc1-mm1.orig/fs/ext3/namei.c
+++ linux-2.6.11-rc1-mm1/fs/ext3/namei.c
@@ -1256,7 +1256,7 @@ static int add_dirent_to_buf(handle_t *h
 	 * happen is that the times are slightly out of date
 	 * and/or different from the directory change time.
 	 */
-	dir->i_mtime = dir->i_ctime = CURRENT_TIME_SEC;
+	dir->i_mtime = dir->i_ctime = ext3_current_time(dir);
 	ext3_update_dx_flag(dir);
 	dir->i_version++;
 	ext3_mark_inode_dirty(handle, dir);
@@ -2034,7 +2034,7 @@ static int ext3_rmdir (struct inode * di
 	 * recovery. */
 	inode->i_size = 0;
 	ext3_orphan_add(handle, inode);
-	inode->i_ctime = dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	inode->i_ctime = dir->i_ctime = dir->i_mtime = ext3_current_time(inode);
 	ext3_mark_inode_dirty(handle, inode);
 	dir->i_nlink--;
 	ext3_update_dx_flag(dir);
@@ -2084,7 +2084,7 @@ static int ext3_unlink(struct inode * di
 	retval = ext3_delete_entry(handle, dir, de, bh);
 	if (retval)
 		goto end_unlink;
-	dir->i_ctime = dir->i_mtime = CURRENT_TIME_SEC;
+	dir->i_ctime = dir->i_mtime = ext3_current_time(dir);
 	ext3_update_dx_flag(dir);
 	ext3_mark_inode_dirty(handle, dir);
 	inode->i_nlink--;
@@ -2174,7 +2174,7 @@ retry:
 	if (IS_DIRSYNC(dir))
 		handle->h_sync = 1;
 
-	inode->i_ctime = CURRENT_TIME_SEC;
+	inode->i_ctime = ext3_current_time(inode);
 	ext3_inc_count(handle, inode);
 	atomic_inc(&inode->i_count);
 
@@ -2275,7 +2275,7 @@ static int ext3_rename (struct inode * o
 	 * Like most other Unix systems, set the ctime for inodes on a
 	 * rename.
 	 */
-	old_inode->i_ctime = CURRENT_TIME_SEC;
+	old_inode->i_ctime = ext3_current_time(old_inode);
 	ext3_mark_inode_dirty(handle, old_inode);
 
 	/*
@@ -2308,9 +2308,9 @@ static int ext3_rename (struct inode * o
 
 	if (new_inode) {
 		new_inode->i_nlink--;
-		new_inode->i_ctime = CURRENT_TIME_SEC;
+		new_inode->i_ctime = ext3_current_time(new_inode);
 	}
-	old_dir->i_ctime = old_dir->i_mtime = CURRENT_TIME_SEC;
+	old_dir->i_ctime = old_dir->i_mtime = ext3_current_time(old_dir);
 	ext3_update_dx_flag(old_dir);
 	if (dir_bh) {
 		BUFFER_TRACE(dir_bh, "get_write_access");

Regards,
-- 
Andreas Gruenbacher <agruen@suse.de>
SUSE Labs, SUSE LINUX PRODUCTS GMBH

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] Ext3 nanosecond timestamps in big inodes
  2005-01-16  4:37 ` Andi Kleen
@ 2005-01-15 14:41   ` Andreas Gruenbacher
  0 siblings, 0 replies; 5+ messages in thread
From: Andreas Gruenbacher @ 2005-01-15 14:41 UTC (permalink / raw)
  To: Andi Kleen; +Cc: Alex Tomas, Andrew Tridgell, linux-kernel

On Sunday 16 January 2005 05:37, Andi Kleen wrote:
> Andreas Gruenbacher <agruen@suse.de> writes:
> > this is a spin-off of an old patch by Alex Tomas <alex@clusterfs.com>:
> > Alex originally had nanosecond timestamps in his original patch; here is
> > a rejuvenated version. Please tell me what you think. Alex also added a
> > create timestamp in his original patch. Do we actually need that?
> >
> > Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits
> > currently are zeroed out implicitly. We could later use them remaining
> > two bits for years beyond 2038.
>
> Looks good. Just two suggestions:
>
> - Provide an mount option to turn it off because there may be
> performance regressions in some workload because inodes will be
> flushed more often.
> [I actually considered doing this generally at the VFS level
> when doing the s_time_gran patch, but it needed some more changes
> that I didn't want to do at that time. Doing it in the FS as
> interim solution would be fine too]

I think I agree, also with doing it at the VFS layer. We must make sure that 
eventually the most recent timestamp makes it to disk (unless the machine 
crashes, in which case slightly out-of-date timestamps probably can be 
tolerated).

> - Use the 2 bits for additionals years right now on 64bit
> hosts. No need to keep the y2038 issue around longer than necessary.

Yes. Actually zeroing out the upper two bits is wrong for timestamps before 
the epoch if we use plain two's complement representation. The two bits give 
us an extended range of about [1697 .. 2242] instead of [1901 .. 2038].

-- 
Andreas Gruenbacher <agruen@suse.de>
SUSE Labs, SUSE LINUX PRODUCTS GMBH

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] Ext3 nanosecond timestamps in big inodes
  2005-01-14 21:16 [RFC] Ext3 nanosecond timestamps in big inodes Andreas Gruenbacher
@ 2005-01-16  4:37 ` Andi Kleen
  2005-01-15 14:41   ` Andreas Gruenbacher
  2005-01-16  5:46 ` Andreas Dilger
  1 sibling, 1 reply; 5+ messages in thread
From: Andi Kleen @ 2005-01-16  4:37 UTC (permalink / raw)
  To: Andreas Gruenbacher; +Cc: Alex Tomas, Andrew Tridgell, linux-kernel

Andreas Gruenbacher <agruen@suse.de> writes:

> this is a spin-off of an old patch by Alex Tomas <alex@clusterfs.com>:
> Alex originally had nanosecond timestamps in his original patch; here is
> a rejuvenated version. Please tell me what you think. Alex also added a
> create timestamp in his original patch. Do we actually need that?
>
> Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits 
> currently are zeroed out implicitly. We could later use them remaining two 
> bits for years beyond 2038.

Looks good. Just two suggestions:

- Provide an mount option to turn it off because there may be
performance regressions in some workload because inodes will be
flushed more often.
[I actually considered doing this generally at the VFS level
when doing the s_time_gran patch, but it needed some more changes
that I didn't want to do at that time. Doing it in the FS as
interim solution would be fine too]

- Use the 2 bits for additionals years right now on 64bit
hosts. No need to keep the y2038 issue around longer than necessary.

-Andi

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] Ext3 nanosecond timestamps in big inodes
  2005-01-14 21:16 [RFC] Ext3 nanosecond timestamps in big inodes Andreas Gruenbacher
  2005-01-16  4:37 ` Andi Kleen
@ 2005-01-16  5:46 ` Andreas Dilger
  2005-01-17  9:49   ` Anton Altaparmakov
  1 sibling, 1 reply; 5+ messages in thread
From: Andreas Dilger @ 2005-01-16  5:46 UTC (permalink / raw)
  To: Andreas Gruenbacher
  Cc: linux-fsdevel, linux-kernel, Alex Tomas, Andrew Tridgell

[-- Attachment #1: Type: text/plain, Size: 3360 bytes --]

On Jan 14, 2005  22:16 +0100, Andreas Gruenbacher wrote:
> this is a spin-off of an old patch by Alex Tomas <alex@clusterfs.com>:
> Alex originally had nanosecond timestamps in his original patch; here is
> a rejuvenated version. Please tell me what you think. Alex also added a
> create timestamp in his original patch. Do we actually need that?

I think the create timestamp can't hurt things and might be useful in some
cases (e.g. CIFS could use this for exporting, and Lustre will also).  It
would also be useful for some disk "forensics" (term used loosely) unlike
the ctime.  I haven't ever been very keen on nsecond atimes, but it can't
kill us (atime is already flushed at most 1/sec I think).

> Nanoseconds consume 30 bits in the 32-bit fields. The remaining two bits 
> currently are zeroed out implicitly. We could later use them remaining two 
> bits for years beyond 2038.

My thought has always been to use, say, 24 bits for "nseconds>>6" and 8 bits
for "seconds<<32" or something close to this (a few bits either way) to give
us more range on the high end.  In the past we talked of usecond timestamps
(which would have only been 20 bits, leaving 12 bits for seconds msb),
but the kernel ended up using nseconds for the internal timestamps.

> @@ -2501,6 +2501,14 @@ void ext3_read_inode(struct inode * inod
> +		if (ei->i_extra_isize >= 2*sizeof(__le16) + 3*sizeof(__le32)) {
> +			inode->i_atime.tv_nsec =
> +				le32_to_cpu(raw_inode->i_atime_nsec);
> +			inode->i_ctime.tv_nsec =
> +				le32_to_cpu(raw_inode->i_ctime_nsec);
> +			inode->i_mtime.tv_nsec =
> +				le32_to_cpu(raw_inode->i_mtime_nsec);
> +		}

You may as well put a macro here to convert from the raw timestamp
(probably shouldn't just be called "nsec" if we are also encoding more
info there, maybe msb_and_ns?) so we can mask (and shift) as necessary.

> @@ -2638,8 +2646,17 @@ static int ext3_do_update_inode(handle_t
> -	if (EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE)
> +	if (ei->i_extra_isize) {

I don't think this is right.  This means we would only ever store nsec
timestamps if there was already other data in the large inode.  Since
writing that part of the inode is free (we will be writing a full sector
anyways), no reason not to save it.

>  		raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
> +		if (ei->i_extra_isize > 2*sizeof(__le16) + 3*sizeof(__le32)) {
> +			raw_inode->i_atime_nsec =
> +				cpu_to_le32(inode->i_atime.tv_nsec);
> +			raw_inode->i_ctime_nsec =
> +				cpu_to_le32(inode->i_ctime.tv_nsec);
> +			raw_inode->i_mtime_nsec =
> +				cpu_to_le32(inode->i_mtime.tv_nsec);
> +		}
> +	}

Macro here also to convert from nsec to the ext3-internal "msb_and_ns" format.

> +static inline struct timespec ext3_current_time(struct inode *inode)
> +{
> +	return (inode->i_sb->s_time_gran == 1) ?
> +	       CURRENT_TIME : CURRENT_TIME_SEC;
> +}

If "s_time_gran" (I haven't seen this before but it doesn't appear to
be a part of your patch) had some useful meaning we could use it to e.g.
shift the nsec part of the timestamps as Andy requested so as not to make
the timestamps change too often.

Cheers, Andreas
--
Andreas Dilger
http://sourceforge.net/projects/ext2resize/
http://members.shaw.ca/adilger/             http://members.shaw.ca/golinux/


[-- Attachment #2: Type: application/pgp-signature, Size: 189 bytes --]

^ permalink raw reply	[flat|nested] 5+ messages in thread

* Re: [RFC] Ext3 nanosecond timestamps in big inodes
  2005-01-16  5:46 ` Andreas Dilger
@ 2005-01-17  9:49   ` Anton Altaparmakov
  0 siblings, 0 replies; 5+ messages in thread
From: Anton Altaparmakov @ 2005-01-17  9:49 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Andreas Gruenbacher, linux-fsdevel, lkml, Alex Tomas, Andrew Tridgell

On Sat, 2005-01-15 at 22:46 -0700, Andreas Dilger wrote:
> On Jan 14, 2005  22:16 +0100, Andreas Gruenbacher wrote:
> > +static inline struct timespec ext3_current_time(struct inode *inode)
> > +{
> > +	return (inode->i_sb->s_time_gran == 1) ?
> > +	       CURRENT_TIME : CURRENT_TIME_SEC;
> > +}
> 
> If "s_time_gran" (I haven't seen this before but it doesn't appear to
> be a part of your patch) had some useful meaning we could use it to e.g.
> shift the nsec part of the timestamps as Andy requested so as not to make
> the timestamps change too often.

sb->s_time_gran is the granularity used for the time in each fs in
nanoseconds.  So, for example in NTFS it is set to 100 as NTFS stores
time as 100ns intervals.  This means the kernel time can be rounded
appropriately when the fs inode times are being updated.  Without this
you can see inode time jumping backwards in time if the inode is thrown
out of memory and then read in again and in the process it had some of
the time bits truncated...

See the original post of the patch from Andi Kleen for details:

http://marc.theaimsgroup.com/?l=linux-kernel&m=110134111125012&w=2

Best regards,

        Anton
-- 
Anton Altaparmakov <aia21 at cam.ac.uk> (replace at with @)
Unix Support, Computing Service, University of Cambridge, CB2 3QH, UK
Linux NTFS maintainer / IRC: #ntfs on irc.freenode.net
WWW: http://linux-ntfs.sf.net/ & http://www-stu.christs.cam.ac.uk/~aia21/


^ permalink raw reply	[flat|nested] 5+ messages in thread

end of thread, other threads:[~2005-01-17  9:49 UTC | newest]

Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2005-01-14 21:16 [RFC] Ext3 nanosecond timestamps in big inodes Andreas Gruenbacher
2005-01-16  4:37 ` Andi Kleen
2005-01-15 14:41   ` Andreas Gruenbacher
2005-01-16  5:46 ` Andreas Dilger
2005-01-17  9:49   ` Anton Altaparmakov

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).