All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-18  7:36 Lukas Czerner
  2010-11-18  7:36 ` [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Lukas Czerner
                   ` (2 more replies)
  0 siblings, 3 replies; 106+ messages in thread
From: Lukas Czerner @ 2010-11-18  7:36 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen, lczerner

There was concern that FITRIM ioctl is not common enough to be included
in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
in dispatching this out to a separate vector instead of just through
->ioctl.

So this commit removes ioctl_fstrim() from vfs ioctl and trim_fs
from super_operation structure.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/super.c    |    1 -
 fs/ioctl.c         |   39 ---------------------------------------
 include/linux/fs.h |    1 -
 3 files changed, 0 insertions(+), 41 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 61182fe..1d3c2aa 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1197,7 +1197,6 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
-	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
diff --git a/fs/ioctl.c b/fs/ioctl.c
index e92fdbb..f855ea4 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -530,41 +530,6 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
-static int ioctl_fstrim(struct file *filp, void __user *argp)
-{
-	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
-	struct fstrim_range range;
-	int ret = 0;
-
-	if (!capable(CAP_SYS_ADMIN))
-		return -EPERM;
-
-	/* If filesystem doesn't support trim feature, return. */
-	if (sb->s_op->trim_fs == NULL)
-		return -EOPNOTSUPP;
-
-	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
-	if (sb->s_bdev == NULL)
-		return -EINVAL;
-
-	if (argp == NULL) {
-		range.start = 0;
-		range.len = ULLONG_MAX;
-		range.minlen = 0;
-	} else if (copy_from_user(&range, argp, sizeof(range)))
-		return -EFAULT;
-
-	ret = sb->s_op->trim_fs(sb, &range);
-	if (ret < 0)
-		return ret;
-
-	if ((argp != NULL) &&
-	    (copy_to_user(argp, &range, sizeof(range))))
-		return -EFAULT;
-
-	return 0;
-}
-
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -615,10 +580,6 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
-	case FITRIM:
-		error = ioctl_fstrim(filp, argp);
-		break;
-
 	case FS_IOC_FIEMAP:
 		return ioctl_fiemap(filp, arg);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 334d68a..eedc00b 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1612,7 +1612,6 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
-	int (*trim_fs) (struct super_block *, struct fstrim_range *);
 };
 
 /*
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard
  2010-11-18  7:36 [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Lukas Czerner
@ 2010-11-18  7:36 ` Lukas Czerner
  2010-11-19 16:19   ` Ted Ts'o
  2010-11-18 13:06 ` [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Matthew Wilcox
  2010-11-19 15:41 ` Ted Ts'o
  2 siblings, 1 reply; 106+ messages in thread
From: Lukas Czerner @ 2010-11-18  7:36 UTC (permalink / raw)
  To: tytso; +Cc: linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen, lczerner

Filesystem independent ioctl was rejected as not common enough to be in
core vfs ioctl. Since we still need to access to this functionality this
commit adds ext4 specific ioctl EXT4_IOC_TRIM to dispatch
ext4_trim_fs().

It takes fstrim_range structure as an argument. fstrim_range is definec in
the include/linux/fs.h and its definition is as follows.

struct fstrim_range {
	__u64 start;
	__u64 len;
	__u64 minlen;
}

start	- first Byte to trim
len	- number of Bytes to trim from start
minlen	- minimum extent length to trim, free extents shorter than this
  number of Bytes will be ignored. This will be rounded up to fs
  block size.

After the FITRIM is done, the number of actually discarded Bytes is stored
in fstrim_range.len to give the user better insight on how much storage
space has been really released for wear-leveling.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext4/ext4.h  |    1 +
 fs/ext4/ioctl.c |   24 ++++++++++++++++++++++++
 2 files changed, 25 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 6a5edea..2af5042 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -541,6 +541,7 @@ struct ext4_new_group_data {
  /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
 #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
 #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
+#define EXT4_IOC_TRIM			FITRIM
 
 #if defined(__KERNEL__) && defined(CONFIG_COMPAT)
 /*
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index bf5ae88..e07944a 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -331,6 +331,30 @@ mext_out:
 		return err;
 	}
 
+	case EXT4_IOC_TRIM:
+	{
+		struct super_block *sb = inode->i_sb;
+		struct fstrim_range range;
+		int ret = 0;
+
+		if (!capable(CAP_SYS_ADMIN))
+			return -EPERM;
+
+		if (copy_from_user(&range, (struct fstrim_range *)arg,
+		    sizeof(range)))
+			return -EFAULT;
+
+		ret = ext4_trim_fs(sb, &range);
+		if (ret < 0)
+			return ret;
+
+		if (copy_to_user((struct fstrim_range *)arg, &range,
+		    sizeof(range)))
+			return -EFAULT;
+
+		return 0;
+	}
+
 	default:
 		return -ENOTTY;
 	}
-- 
1.7.2.3


^ permalink raw reply related	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18  7:36 [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Lukas Czerner
  2010-11-18  7:36 ` [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Lukas Czerner
@ 2010-11-18 13:06 ` Matthew Wilcox
  2010-11-18 13:48   ` Josef Bacik
  2010-11-19 15:41 ` Ted Ts'o
  2 siblings, 1 reply; 106+ messages in thread
From: Matthew Wilcox @ 2010-11-18 13:06 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: tytso, linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen,
	Josef Bacik

On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
> There was concern that FITRIM ioctl is not common enough to be included
> in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
> in dispatching this out to a separate vector instead of just through
> ->ioctl.

Um, are you and Josef working independently of each other?  You don't
seem to be cc'ing each other on your patches, and you're basically doing
the same thing.

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 13:06 ` [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Matthew Wilcox
@ 2010-11-18 13:48   ` Josef Bacik
  2010-11-18 14:19     ` Matthew Wilcox
  0 siblings, 1 reply; 106+ messages in thread
From: Josef Bacik @ 2010-11-18 13:48 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	hch, sandeen, Josef Bacik

On Thu, Nov 18, 2010 at 06:06:30AM -0700, Matthew Wilcox wrote:
> On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
> > There was concern that FITRIM ioctl is not common enough to be included
> > in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
> > in dispatching this out to a separate vector instead of just through
> > ->ioctl.
> 
> Um, are you and Josef working independently of each other?  You don't
> seem to be cc'ing each other on your patches, and you're basically doing
> the same thing.
>

I guess they are the same thing in that we're both dealing with free'ing up
space, but thats about where the similarities end.  Lukas' work is in TRIM'ing
already free'd space, mine is in creating free'd space.  Plus I don't know
anything nor wish to ever know anything about TRIM ;).  Thanks,

Josef 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 13:48   ` Josef Bacik
@ 2010-11-18 14:19     ` Matthew Wilcox
  2010-11-18 14:29       ` Christoph Hellwig
  2010-11-18 14:31       ` Josef Bacik
  0 siblings, 2 replies; 106+ messages in thread
From: Matthew Wilcox @ 2010-11-18 14:19 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	hch, sandeen

On Thu, Nov 18, 2010 at 08:48:04AM -0500, Josef Bacik wrote:
> On Thu, Nov 18, 2010 at 06:06:30AM -0700, Matthew Wilcox wrote:
> > On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
> > > There was concern that FITRIM ioctl is not common enough to be included
> > > in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
> > > in dispatching this out to a separate vector instead of just through
> > > ->ioctl.
> > 
> > Um, are you and Josef working independently of each other?  You don't
> > seem to be cc'ing each other on your patches, and you're basically doing
> > the same thing.
> >
> 
> I guess they are the same thing in that we're both dealing with free'ing up
> space, but thats about where the similarities end.  Lukas' work is in TRIM'ing
> already free'd space, mine is in creating free'd space.  Plus I don't know
> anything nor wish to ever know anything about TRIM ;).  Thanks,

I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
filesystem that was TRIM-aware would pass that information down to the
block device that it's mounted on.  I strongly feel that we shouldn't
have two interfaces to do essentially the same thing.

I guess I'm saying that you're going to have to learn about TRIM :-)

-- 
Matthew Wilcox				Intel Open Source Technology Centre
"Bill, look, we understand that you're interested in selling us this
operating system, but compare it to ours.  We can't possibly take such
a retrograde step."

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 14:19     ` Matthew Wilcox
@ 2010-11-18 14:29       ` Christoph Hellwig
  2010-11-18 17:19         ` James Bottomley
  2010-11-18 14:31       ` Josef Bacik
  1 sibling, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-18 14:29 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, hch, sandeen

On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> filesystem that was TRIM-aware would pass that information down to the
> block device that it's mounted on.  I strongly feel that we shouldn't
> have two interfaces to do essentially the same thing.
> 
> I guess I'm saying that you're going to have to learn about TRIM :-)

Did you actually look Lukas FITRIM code (not the slight reordering here,
but the original one).  It's the ext4 version of the batched discard
model, that is a userspace ioctl to discard free space in the
filesystem.

hole punching will free the blocks into the free space pool.  If you do
online discard it will also get discarded, but a filesystem that has
online discard enabled doesn't need FITRIM.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 14:19     ` Matthew Wilcox
  2010-11-18 14:29       ` Christoph Hellwig
@ 2010-11-18 14:31       ` Josef Bacik
  2010-11-18 14:36         ` Tao Ma
  1 sibling, 1 reply; 106+ messages in thread
From: Josef Bacik @ 2010-11-18 14:31 UTC (permalink / raw)
  To: Matthew Wilcox
  Cc: Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, hch, sandeen

On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> On Thu, Nov 18, 2010 at 08:48:04AM -0500, Josef Bacik wrote:
> > On Thu, Nov 18, 2010 at 06:06:30AM -0700, Matthew Wilcox wrote:
> > > On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
> > > > There was concern that FITRIM ioctl is not common enough to be included
> > > > in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
> > > > in dispatching this out to a separate vector instead of just through
> > > > ->ioctl.
> > > 
> > > Um, are you and Josef working independently of each other?  You don't
> > > seem to be cc'ing each other on your patches, and you're basically doing
> > > the same thing.
> > >
> > 
> > I guess they are the same thing in that we're both dealing with free'ing up
> > space, but thats about where the similarities end.  Lukas' work is in TRIM'ing
> > already free'd space, mine is in creating free'd space.  Plus I don't know
> > anything nor wish to ever know anything about TRIM ;).  Thanks,
> 
> I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> filesystem that was TRIM-aware would pass that information down to the
> block device that it's mounted on.  I strongly feel that we shouldn't
> have two interfaces to do essentially the same thing.

But they aren't doing the same thing, his is discarding already free'd space,
I'm enabling people to de-allocate space in the middle of files, they are two
seperate things.  Of course if the filesystem is TRIM aware the de-allocation
would lead to a TRIM, but not if the filesystem isn't mounted with -o discard.
Hole punching is useful independantly of the ability to do TRIM.  Thanks,

Josef

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 14:31       ` Josef Bacik
@ 2010-11-18 14:36         ` Tao Ma
  0 siblings, 0 replies; 106+ messages in thread
From: Tao Ma @ 2010-11-18 14:36 UTC (permalink / raw)
  To: Josef Bacik
  Cc: Matthew Wilcox, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, hch, sandeen



On 11/18/2010 10:31 PM, Josef Bacik wrote:
> On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
>> On Thu, Nov 18, 2010 at 08:48:04AM -0500, Josef Bacik wrote:
>>> On Thu, Nov 18, 2010 at 06:06:30AM -0700, Matthew Wilcox wrote:
>>>> On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
>>>>> There was concern that FITRIM ioctl is not common enough to be included
>>>>> in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
>>>>> in dispatching this out to a separate vector instead of just through
>>>>> ->ioctl.
>>>>
>>>> Um, are you and Josef working independently of each other?  You don't
>>>> seem to be cc'ing each other on your patches, and you're basically doing
>>>> the same thing.
>>>>
>>>
>>> I guess they are the same thing in that we're both dealing with free'ing up
>>> space, but thats about where the similarities end.  Lukas' work is in TRIM'ing
>>> already free'd space, mine is in creating free'd space.  Plus I don't know
>>> anything nor wish to ever know anything about TRIM ;).  Thanks,
>>
>> I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
>> filesystem that was TRIM-aware would pass that information down to the
>> block device that it's mounted on.  I strongly feel that we shouldn't
>> have two interfaces to do essentially the same thing.
>
> But they aren't doing the same thing, his is discarding already free'd space,
> I'm enabling people to de-allocate space in the middle of files, they are two
> seperate things.  Of course if the filesystem is TRIM aware the de-allocation
> would lead to a TRIM, but not if the filesystem isn't mounted with -o discard.
> Hole punching is useful independantly of the ability to do TRIM.  Thanks,
yeah, actually, ocfs2 has implemented punching holes while we don't have 
TRIM support enabled yet.

Regards,
Tao

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 14:29       ` Christoph Hellwig
@ 2010-11-18 17:19         ` James Bottomley
  2010-11-18 17:22           ` Jeff Moyer
                             ` (3 more replies)
  0 siblings, 4 replies; 106+ messages in thread
From: James Bottomley @ 2010-11-18 17:19 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Thu, 2010-11-18 at 09:29 -0500, Christoph Hellwig wrote:
> On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> > I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> > filesystem that was TRIM-aware would pass that information down to the
> > block device that it's mounted on.  I strongly feel that we shouldn't
> > have two interfaces to do essentially the same thing.
> > 
> > I guess I'm saying that you're going to have to learn about TRIM :-)
> 
> Did you actually look Lukas FITRIM code (not the slight reordering here,
> but the original one).  It's the ext4 version of the batched discard
> model, that is a userspace ioctl to discard free space in the
> filesystem.
> 
> hole punching will free the blocks into the free space pool.  If you do
> online discard it will also get discarded, but a filesystem that has
> online discard enabled doesn't need FITRIM.

Not stepping into the debate: I'm happy to see punch go to the mapping
data and FITRIM pick it up later.

However, I think it's time to question whether we actually still want to
allow online discard at all.  Most of the benchmarks show it to be a net
lose to almost everything (either SSD or Thinly Provisioned arrays), so
it's become an "enable this to degrade performance" option with no
upside.

James



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:19         ` James Bottomley
@ 2010-11-18 17:22           ` Jeff Moyer
  2010-11-18 17:41             ` James Bottomley
  2010-11-18 18:05             ` Jamie Lokier
  2010-11-18 17:35           ` Lukas Czerner
                             ` (2 subsequent siblings)
  3 siblings, 2 replies; 106+ messages in thread
From: Jeff Moyer @ 2010-11-18 17:22 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

James Bottomley <James.Bottomley@suse.de> writes:

> Not stepping into the debate: I'm happy to see punch go to the mapping
> data and FITRIM pick it up later.
>
> However, I think it's time to question whether we actually still want to
> allow online discard at all.  Most of the benchmarks show it to be a net

Define online discard, please.

> lose to almost everything (either SSD or Thinly Provisioned arrays), so
> it's become an "enable this to degrade performance" option with no
> upside.

Some SSDs very much require TRIMming to perform well as they age.  If
you're suggesting that we move from doing discards in journal commits to
a batched discard, like the one Lukas implemented, then I think that's
fine.  If we need to reintroduce the finer-grained discards due to some
hardware changes in the future, we can always do that.

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:19         ` James Bottomley
  2010-11-18 17:22           ` Jeff Moyer
@ 2010-11-18 17:35           ` Lukas Czerner
  2010-11-19 12:16             ` Steven Whitehouse
  2010-11-18 17:55           ` Chris Mason
  2010-11-18 21:37           ` Mark Lord
  3 siblings, 1 reply; 106+ messages in thread
From: Lukas Czerner @ 2010-11-18 17:35 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Thu, 18 Nov 2010, James Bottomley wrote:

> On Thu, 2010-11-18 at 09:29 -0500, Christoph Hellwig wrote:
> > On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> > > I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> > > filesystem that was TRIM-aware would pass that information down to the
> > > block device that it's mounted on.  I strongly feel that we shouldn't
> > > have two interfaces to do essentially the same thing.
> > > 
> > > I guess I'm saying that you're going to have to learn about TRIM :-)
> > 
> > Did you actually look Lukas FITRIM code (not the slight reordering here,
> > but the original one).  It's the ext4 version of the batched discard
> > model, that is a userspace ioctl to discard free space in the
> > filesystem.
> > 
> > hole punching will free the blocks into the free space pool.  If you do
> > online discard it will also get discarded, but a filesystem that has
> > online discard enabled doesn't need FITRIM.
> 
> Not stepping into the debate: I'm happy to see punch go to the mapping
> data and FITRIM pick it up later.
> 
> However, I think it's time to question whether we actually still want to
> allow online discard at all.  Most of the benchmarks show it to be a net
> lose to almost everything (either SSD or Thinly Provisioned arrays), so
> it's become an "enable this to degrade performance" option with no
> upside.
> 
> James
> 

This time began a long time ago :) that is why am I originally created
batched discard for ext4 (ext3) accessible through FITRIM ioctl. Ext4
performance with -o discard mount option goes down on the most of the
SSD's and every Thinly-provisioned storage I have a chance to benchmark.

But, for example SSD's are getting better and as time goes by we might
see devices that does not suffer terrible performance loss with discard
enabled (discard on unlink in ext4 etc...), so this "online" discard
probably still does make sense.

-Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:22           ` Jeff Moyer
@ 2010-11-18 17:41             ` James Bottomley
  2010-11-18 20:04                 ` Greg Freemyer
  2010-11-18 18:05             ` Jamie Lokier
  1 sibling, 1 reply; 106+ messages in thread
From: James Bottomley @ 2010-11-18 17:41 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Thu, 2010-11-18 at 12:22 -0500, Jeff Moyer wrote:
> James Bottomley <James.Bottomley@suse.de> writes:
> 
> > Not stepping into the debate: I'm happy to see punch go to the mapping
> > data and FITRIM pick it up later.
> >
> > However, I think it's time to question whether we actually still want to
> > allow online discard at all.  Most of the benchmarks show it to be a net
> 
> Define online discard, please.

Trims emitted inline at the FS operates (mount option -o discard)

> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
> > it's become an "enable this to degrade performance" option with no
> > upside.
> 
> Some SSDs very much require TRIMming to perform well as they age.  If
> you're suggesting that we move from doing discards in journal commits to
> a batched discard, like the one Lukas implemented, then I think that's
> fine.  If we need to reintroduce the finer-grained discards due to some
> hardware changes in the future, we can always do that.

Right, I'm suggesting we just rely on offline methods.  Regardless of
what happens to FITRIM, we have wiper.sh now (although it does require
unmounted use for most of the less than modern fs).

James



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:19         ` James Bottomley
  2010-11-18 17:22           ` Jeff Moyer
  2010-11-18 17:35           ` Lukas Czerner
@ 2010-11-18 17:55           ` Chris Mason
  2010-12-03 18:24             ` Ric Wheeler
  2010-11-18 21:37           ` Mark Lord
  3 siblings, 1 reply; 106+ messages in thread
From: Chris Mason @ 2010-11-18 17:55 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

Excerpts from James Bottomley's message of 2010-11-18 12:19:10 -0500:
> On Thu, 2010-11-18 at 09:29 -0500, Christoph Hellwig wrote:
> > On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> > > I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> > > filesystem that was TRIM-aware would pass that information down to the
> > > block device that it's mounted on.  I strongly feel that we shouldn't
> > > have two interfaces to do essentially the same thing.
> > > 
> > > I guess I'm saying that you're going to have to learn about TRIM :-)
> > 
> > Did you actually look Lukas FITRIM code (not the slight reordering here,
> > but the original one).  It's the ext4 version of the batched discard
> > model, that is a userspace ioctl to discard free space in the
> > filesystem.
> > 
> > hole punching will free the blocks into the free space pool.  If you do
> > online discard it will also get discarded, but a filesystem that has
> > online discard enabled doesn't need FITRIM.
> 
> Not stepping into the debate: I'm happy to see punch go to the mapping
> data and FITRIM pick it up later.
> 
> However, I think it's time to question whether we actually still want to
> allow online discard at all.  Most of the benchmarks show it to be a net
> lose to almost everything (either SSD or Thinly Provisioned arrays), so
> it's become an "enable this to degrade performance" option with no
> upside.

I think we want to keep it.  In general we've (except for hch) spent
almost zero time actually tuning online discard, and the benchmarking
needs to be redone with the shiny new barrier code.

-chris

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:22           ` Jeff Moyer
  2010-11-18 17:41             ` James Bottomley
@ 2010-11-18 18:05             ` Jamie Lokier
  2010-11-18 19:32               ` Markus Trippelsdorf
  1 sibling, 1 reply; 106+ messages in thread
From: Jamie Lokier @ 2010-11-18 18:05 UTC (permalink / raw)
  To: Jeff Moyer
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

Jeff Moyer wrote:
> James Bottomley <James.Bottomley@suse.de> writes:
> 
> > Not stepping into the debate: I'm happy to see punch go to the mapping
> > data and FITRIM pick it up later.
> >
> > However, I think it's time to question whether we actually still want to
> > allow online discard at all.  Most of the benchmarks show it to be a net
> 
> Define online discard, please.
> 
> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
> > it's become an "enable this to degrade performance" option with no
> > upside.
> 
> Some SSDs very much require TRIMming to perform well as they age.  If
> you're suggesting that we move from doing discards in journal commits to
> a batched discard, like the one Lukas implemented, then I think that's
> fine.  If we need to reintroduce the finer-grained discards due to some
> hardware changes in the future, we can always do that.

"Growable" virtual disks benefit from it too, if it frees up a lot of space.

Windows has some ability to trim unused space in NTFS on virtual disks
for this reason; I'm not sure if it's an online or offline procedure.

Online trim may be slow, but offline would be awfully inconvenient
when an fs is big and needed for a live system, or when it's your root fs.

-- Jamie

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 18:05             ` Jamie Lokier
@ 2010-11-18 19:32               ` Markus Trippelsdorf
  2010-11-18 21:45                 ` Mark Lord
  0 siblings, 1 reply; 106+ messages in thread
From: Markus Trippelsdorf @ 2010-11-18 19:32 UTC (permalink / raw)
  To: Jamie Lokier
  Cc: Jeff Moyer, James Bottomley, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On 2010.11.18 at 18:05 +0000, Jamie Lokier wrote:
> Jeff Moyer wrote:
> > James Bottomley <James.Bottomley@suse.de> writes:
> > 
> > > Not stepping into the debate: I'm happy to see punch go to the mapping
> > > data and FITRIM pick it up later.
> > >
> > > However, I think it's time to question whether we actually still want to
> > > allow online discard at all.  Most of the benchmarks show it to be a net
> > 
> > Define online discard, please.
> > 
> > > lose to almost everything (either SSD or Thinly Provisioned arrays), so
> > > it's become an "enable this to degrade performance" option with no
> > > upside.
> > 
> > Some SSDs very much require TRIMming to perform well as they age.  If
> > you're suggesting that we move from doing discards in journal commits to
> > a batched discard, like the one Lukas implemented, then I think that's
> > fine.  If we need to reintroduce the finer-grained discards due to some
> > hardware changes in the future, we can always do that.
> 
> "Growable" virtual disks benefit from it too, if it frees up a lot of space.
> 
> Windows has some ability to trim unused space in NTFS on virtual disks
> for this reason; I'm not sure if it's an online or offline procedure.
> 
> Online trim may be slow, but offline would be awfully inconvenient
> when an fs is big and needed for a live system, or when it's your root fs.

You can call FITRIM from a running system. Infact I run it once per week
as a cron job on my (mounted) root fs.

-- 
Markus

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:41             ` James Bottomley
@ 2010-11-18 20:04                 ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-18 20:04 UTC (permalink / raw)
  To: James Bottomley, Mark Lord
  Cc: Jeff Moyer, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

Adding Mark Lord in CC:

On Thu, Nov 18, 2010 at 9:41 AM, James Bottomley
<James.Bottomley@suse.de> wrote:
> On Thu, 2010-11-18 at 12:22 -0500, Jeff Moyer wrote:
>> James Bottomley <James.Bottomley@suse.de> writes:
>>
>> > Not stepping into the debate: I'm happy to see punch go to the mapping
>> > data and FITRIM pick it up later.
>> >
>> > However, I think it's time to question whether we actually still want to
>> > allow online discard at all.  Most of the benchmarks show it to be a net
>>
>> Define online discard, please.
>
> Trims emitted inline at the FS operates (mount option -o discard)
>
>> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
>> > it's become an "enable this to degrade performance" option with no
>> > upside.
>>
>> Some SSDs very much require TRIMming to perform well as they age.  If
>> you're suggesting that we move from doing discards in journal commits to
>> a batched discard, like the one Lukas implemented, then I think that's
>> fine.  If we need to reintroduce the finer-grained discards due to some
>> hardware changes in the future, we can always do that.
>
> Right, I'm suggesting we just rely on offline methods.  Regardless of
> what happens to FITRIM, we have wiper.sh now (although it does require
> unmounted use for most of the less than modern fs).
>
> James

I'm a fan of wiper.sh, but afaik it still cannot address a
multi-spindle LVM setup, Nor a MDraid setup.  etc.

That's because it bypasses the block stack and talks directly to the
devices.  Thus it doesn't get the benefit of all the logical to
physical sector remapping handled via the block stack.

Mark, please correct me if I'm wrong.

The LVM and MDraid setup are important to support, and only "mount -o
discard" and Lucas's FITRIM support them.

So afaik we have 3 options, each with an opportunity for improvement:

1) mount -o discard - needs kernel tuning / new hardware to be a
performance win.

2) FITRIM doesn't leverage the fact the a TRIM command can handle
multiple ranges per TRIM command payload.  I haven't seen any FITRIM
vs. wiper.sh benchmarks, so I don't know what impact that has in
practice.  Mark Lord thought that this lacking feature would cause
FITRIM to take minutes or hours with some hardware.  Especially early
generation SSDs.

3) wiper.sh does leverage the multiple ranges per TRIM command, but it
really needs a new block layer interface that would allow it to push
discard commands into the kernel via the block layer, not just down at
the physical drive layer.  The interface should accept multiple ranges
per invocation and trigger TRIM commands to the SSD that have have a
multi-range discard payload.

So it seems that for now keeping all 3 is best.  My personal hope is
that the block layer grows the ability to accept multirange discard
requests and  FITRIM is updated to leverage it.

Greg

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-18 20:04                 ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-18 20:04 UTC (permalink / raw)
  To: James Bottomley, Mark Lord
  Cc: Jeff Moyer, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

Adding Mark Lord in CC:

On Thu, Nov 18, 2010 at 9:41 AM, James Bottomley
<James.Bottomley@suse.de> wrote:
> On Thu, 2010-11-18 at 12:22 -0500, Jeff Moyer wrote:
>> James Bottomley <James.Bottomley@suse.de> writes:
>>
>> > Not stepping into the debate: I'm happy to see punch go to the mapping
>> > data and FITRIM pick it up later.
>> >
>> > However, I think it's time to question whether we actually still want to
>> > allow online discard at all.  Most of the benchmarks show it to be a net
>>
>> Define online discard, please.
>
> Trims emitted inline at the FS operates (mount option -o discard)
>
>> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
>> > it's become an "enable this to degrade performance" option with no
>> > upside.
>>
>> Some SSDs very much require TRIMming to perform well as they age.  If
>> you're suggesting that we move from doing discards in journal commits to
>> a batched discard, like the one Lukas implemented, then I think that's
>> fine.  If we need to reintroduce the finer-grained discards due to some
>> hardware changes in the future, we can always do that.
>
> Right, I'm suggesting we just rely on offline methods.  Regardless of
> what happens to FITRIM, we have wiper.sh now (although it does require
> unmounted use for most of the less than modern fs).
>
> James

I'm a fan of wiper.sh, but afaik it still cannot address a
multi-spindle LVM setup, Nor a MDraid setup.  etc.

That's because it bypasses the block stack and talks directly to the
devices.  Thus it doesn't get the benefit of all the logical to
physical sector remapping handled via the block stack.

Mark, please correct me if I'm wrong.

The LVM and MDraid setup are important to support, and only "mount -o
discard" and Lucas's FITRIM support them.

So afaik we have 3 options, each with an opportunity for improvement:

1) mount -o discard - needs kernel tuning / new hardware to be a
performance win.

2) FITRIM doesn't leverage the fact the a TRIM command can handle
multiple ranges per TRIM command payload.  I haven't seen any FITRIM
vs. wiper.sh benchmarks, so I don't know what impact that has in
practice.  Mark Lord thought that this lacking feature would cause
FITRIM to take minutes or hours with some hardware.  Especially early
generation SSDs.

3) wiper.sh does leverage the multiple ranges per TRIM command, but it
really needs a new block layer interface that would allow it to push
discard commands into the kernel via the block layer, not just down at
the physical drive layer.  The interface should accept multiple ranges
per invocation and trigger TRIM commands to the SSD that have have a
multi-range discard payload.

So it seems that for now keeping all 3 is best.  My personal hope is
that the block layer grows the ability to accept multirange discard
requests and  FITRIM is updated to leverage it.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:19         ` James Bottomley
                             ` (2 preceding siblings ...)
  2010-11-18 17:55           ` Chris Mason
@ 2010-11-18 21:37           ` Mark Lord
  2010-11-19 11:09             ` Christoph Hellwig
  2010-11-19 14:40             ` Chris Mason
  3 siblings, 2 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-18 21:37 UTC (permalink / raw)
  To: James Bottomley
  Cc: Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On 10-11-18 12:19 PM, James Bottomley wrote:
>
> Not stepping into the debate: I'm happy to see punch go to the mapping
> data and FITRIM pick it up later.
>
> However, I think it's time to question whether we actually still want to
> allow online discard at all.  Most of the benchmarks show it to be a net
> lose to almost everything (either SSD or Thinly Provisioned arrays), so
> it's become an "enable this to degrade performance" option with no
> upside.

I also suspect that online TRIM exerts significant premature wear on the SSDs.
TRIM operations most likely trigger immediate copy/erase operations internal
to most SSDs, often regardless of the amount of data being trimmed.

Performing a 256KB erase because of a 1024-byte TRIM, over and over, is going
to harm the expected lifetime of an SSD.  Sure, some SSDs may do things differently
internally, but I don't see it working that way in much of the current crop of SSDs.

Currently, I patch my kernels to remove the automatic online TRIMs.
Is there a knob somewhere for this in the later kernels?

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 20:04                 ` Greg Freemyer
  (?)
@ 2010-11-18 21:42                 ` Mark Lord
  -1 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-18 21:42 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: James Bottomley, Jeff Moyer, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On 10-11-18 03:04 PM, Greg Freemyer wrote:
>
> I'm a fan of wiper.sh, but afaik it still cannot address a
> multi-spindle LVM setup, Nor a MDraid setup.  etc.
>
> That's because it bypasses the block stack and talks directly to the
> devices.  Thus it doesn't get the benefit of all the logical to
> physical sector remapping handled via the block stack.
>
> Mark, please correct me if I'm wrong.

I'm a big fan of FITRIM, especially as it should work on MD devices
as well, which are problematic for wiper.sh today.   I originally proposed
FITRIM (without the name, though) back when first implementing wiper.sh,
and I really believe we should extend FITRIM to btrfs and xfs.

hdparm is picking up support for FITRIM in the next release,
and wiper.sh will use it when it can in place of raw TRIMs.

This remains my own preferred method for TRIM:  offline, that is.  :)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 20:04                 ` Greg Freemyer
  (?)
  (?)
@ 2010-11-18 21:44                 ` Mark Lord
  2010-11-18 21:50                   ` James Bottomley
  2010-11-18 23:52                   ` Martin K. Petersen
  -1 siblings, 2 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-18 21:44 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: James Bottomley, Jeff Moyer, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On 10-11-18 03:04 PM, Greg Freemyer wrote:
>
> 2) FITRIM doesn't leverage the fact the a TRIM command can handle
> multiple ranges per TRIM command payload.  I haven't seen any FITRIM
> vs. wiper.sh benchmarks, so I don't know what impact that has in
> practice.  Mark Lord thought that this lacking feature would cause
> FITRIM to take minutes or hours with some hardware.  Especially early
> generation SSDs.

If FITRIM is still issuing single-range-at-a-time TRIMs,
then I'd call that a BUG that needs fixing.  Doing TRIM like that
causes tons of unnecessary ERASE cycles, shortening the SSD lifetime.
It really needs to batch them into groups of (up to) 64 ranges at a time
(64 ranges fits into a single 512-byte parameter block).

> 3) wiper.sh does leverage the multiple ranges per TRIM command, but it
> really needs a new block layer interface that would allow it to push
> discard commands into the kernel via the block layer, not just down at
> the physical drive layer.  The interface should accept multiple ranges
> per invocation and trigger TRIM commands to the SSD that have have a
> multi-range discard payload.

I think FITRIM should be doing that, and it should work for more than just ext4.

Cheers!

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 19:32               ` Markus Trippelsdorf
@ 2010-11-18 21:45                 ` Mark Lord
  2010-11-18 21:50                   ` Markus Trippelsdorf
  0 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-18 21:45 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Jamie Lokier, Jeff Moyer, James Bottomley, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-18 02:32 PM, Markus Trippelsdorf wrote:
> On 2010.11.18 at 18:05 +0000, Jamie Lokier wrote:
>> Online trim may be slow, but offline would be awfully inconvenient
>> when an fs is big and needed for a live system, or when it's your root fs.
>
> You can call FITRIM from a running system. Infact I run it once per week
> as a cron job on my (mounted) root fs.


Ditto for wiper.sh.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:45                 ` Mark Lord
@ 2010-11-18 21:50                   ` Markus Trippelsdorf
  2010-11-18 22:09                     ` Mark Lord
  0 siblings, 1 reply; 106+ messages in thread
From: Markus Trippelsdorf @ 2010-11-18 21:50 UTC (permalink / raw)
  To: Mark Lord
  Cc: Jamie Lokier, Jeff Moyer, James Bottomley, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 2010.11.18 at 16:45 -0500, Mark Lord wrote:
> On 10-11-18 02:32 PM, Markus Trippelsdorf wrote:
> >On 2010.11.18 at 18:05 +0000, Jamie Lokier wrote:
> >>Online trim may be slow, but offline would be awfully inconvenient
> >>when an fs is big and needed for a live system, or when it's your root fs.
> >
> >You can call FITRIM from a running system. Infact I run it once per week
> >as a cron job on my (mounted) root fs.
> 
> 
> Ditto for wiper.sh.

But I always thought that wiper has no access to the filesystem
internals. So there is always a chance that you write to a sector that
wiper.sh is currently trimming. FITRIM should be safer in this regard.

-- 
Markus

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:44                 ` Mark Lord
@ 2010-11-18 21:50                   ` James Bottomley
  2010-11-18 22:07                     ` Mark Lord
  2010-11-18 23:52                   ` Martin K. Petersen
  1 sibling, 1 reply; 106+ messages in thread
From: James Bottomley @ 2010-11-18 21:50 UTC (permalink / raw)
  To: Mark Lord
  Cc: Greg Freemyer, Jeff Moyer, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Thu, 2010-11-18 at 16:44 -0500, Mark Lord wrote:
> On 10-11-18 03:04 PM, Greg Freemyer wrote:
> >
> > 2) FITRIM doesn't leverage the fact the a TRIM command can handle
> > multiple ranges per TRIM command payload.  I haven't seen any FITRIM
> > vs. wiper.sh benchmarks, so I don't know what impact that has in
> > practice.  Mark Lord thought that this lacking feature would cause
> > FITRIM to take minutes or hours with some hardware.  Especially early
> > generation SSDs.
> 
> If FITRIM is still issuing single-range-at-a-time TRIMs,
> then I'd call that a BUG that needs fixing.  Doing TRIM like that
> causes tons of unnecessary ERASE cycles, shortening the SSD lifetime.
> It really needs to batch them into groups of (up to) 64 ranges at a time
> (64 ranges fits into a single 512-byte parameter block).

Before we go gung ho on this, there's no evidence that N discontiguous
ranges in one command are any better than the ranges sent N times ...
the same amount of erase overhead gets sent on SSDs.  However, we also
have to remember Thin Provisioning.  Right at the moment WRITE SAME
seems to be preferred to UNMAP ... and WRITE SAME only has a single
effective range.

Now if you're thinking of multiple contiguous ranges because of the LBA
limits to TRIM, then yes, I'm fine with that because WRITE SAME doesn't
have that problem and we can do the translation in the SATL.

James



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:50                   ` James Bottomley
@ 2010-11-18 22:07                     ` Mark Lord
  2010-11-19  1:33                       ` Ted Ts'o
  0 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-18 22:07 UTC (permalink / raw)
  To: James Bottomley
  Cc: Greg Freemyer, Jeff Moyer, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On 10-11-18 04:50 PM, James Bottomley wrote:
>
> Before we go gung ho on this, there's no evidence that N discontiguous
> ranges in one command are any better than the ranges sent N times ...
> the same amount of erase overhead gets sent on SSDs.

No, we do have evidence:  execution time of the TRIM commands on the SSD.

The one-range-at-a-time is incredibly slow compared to multiple ranges at a time.
That slowness comes from somewhere, with about 99.9% certainty that it is due
to the drive performing slow flash erase cycles.

I think perhaps we should do the batching as much as possible,
and then split them into single ranges for LLDs that cannot handle multi-ranges.

Way more efficient that way.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:50                   ` Markus Trippelsdorf
@ 2010-11-18 22:09                     ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-18 22:09 UTC (permalink / raw)
  To: Markus Trippelsdorf
  Cc: Jamie Lokier, Jeff Moyer, James Bottomley, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-18 04:50 PM, Markus Trippelsdorf wrote:
> On 2010.11.18 at 16:45 -0500, Mark Lord wrote:
>> On 10-11-18 02:32 PM, Markus Trippelsdorf wrote:
>>> On 2010.11.18 at 18:05 +0000, Jamie Lokier wrote:
>>>> Online trim may be slow, but offline would be awfully inconvenient
>>>> when an fs is big and needed for a live system, or when it's your root fs.
>>>
>>> You can call FITRIM from a running system. Infact I run it once per week
>>> as a cron job on my (mounted) root fs.
>>
>>
>> Ditto for wiper.sh.
>
> But I always thought that wiper has no access to the filesystem
> internals. So there is always a chance that you write to a sector that
> wiper.sh is currently trimming. FITRIM should be safer in this regard.


No, wiper.sh is very safe for online TRIM.
It reserves the blocks before trimming them, so nothing else can write to them.

The danger with wiper.sh, is that WHILE it is running, it reserves most
of the free space of the filesystem.  So if there are other apps writing
tons of new data to the drive, while not freeing up old data, the system
can run low on diskspace for a moment.

wiper.sh counters that by not reserving/trimming *everything* when run online,
leaving a reasonable amount of scratch space available for other apps during the 
run.

This is where having a fully capable FITRIM, with multi-range TRIMs, would be 
useful.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:44                 ` Mark Lord
  2010-11-18 21:50                   ` James Bottomley
@ 2010-11-18 23:52                   ` Martin K. Petersen
  2010-11-19  0:34                     ` Mark Lord
  1 sibling, 1 reply; 106+ messages in thread
From: Martin K. Petersen @ 2010-11-18 23:52 UTC (permalink / raw)
  To: Mark Lord
  Cc: Greg Freemyer, James Bottomley, Jeff Moyer, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:

>> 2) FITRIM doesn't leverage the fact the a TRIM command can handle
>> multiple ranges per TRIM command payload.  I haven't seen any FITRIM
>> vs. wiper.sh benchmarks, so I don't know what impact that has in
>> practice.  Mark Lord thought that this lacking feature would cause
>> FITRIM to take minutes or hours with some hardware.  Especially early
>> generation SSDs.

Mark> If FITRIM is still issuing single-range-at-a-time TRIMs, then I'd
Mark> call that a BUG that needs fixing.  Doing TRIM like that causes
Mark> tons of unnecessary ERASE cycles, shortening the SSD lifetime.  It
Mark> really needs to batch them into groups of (up to) 64 ranges at a
Mark> time (64 ranges fits into a single 512-byte parameter block).

We don't support coalescing discontiguous requests into one command. But
we will issue contiguous TRIM requests as big as the payload can
handle. That's just short of two gigs per command given a 512-byte
block.

I spent quite a bit of time trying to make coalescing work in the
spring. It got very big and unwieldy. When we discussed it at the
filesystem summit the consensus was that it was too intrusive to the I/O
stack, elevators, etc.

I have previously posted a list of reasons why it is hard to support
given how our I/O stack works. I can dig them out if there's interest.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 23:52                   ` Martin K. Petersen
@ 2010-11-19  0:34                     ` Mark Lord
  2010-11-19  1:16                         ` Greg Freemyer
  2010-11-19  1:49                       ` Martin K. Petersen
  0 siblings, 2 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19  0:34 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Greg Freemyer, James Bottomley, Jeff Moyer, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-18 06:52 PM, Martin K. Petersen wrote:
>>>>>> "Mark" == Mark Lord<kernel@teksavvy.com>  writes:
> Mark>  If FITRIM is still issuing single-range-at-a-time TRIMs, then I'd
> Mark>  call that a BUG that needs fixing.  Doing TRIM like that causes
> Mark>  tons of unnecessary ERASE cycles, shortening the SSD lifetime.  It
> Mark>  really needs to batch them into groups of (up to) 64 ranges at a
> Mark>  time (64 ranges fits into a single 512-byte parameter block).
>
> We don't support coalescing discontiguous requests into one command. But
> we will issue contiguous TRIM requests as big as the payload can
> handle. That's just short of two gigs per command given a 512-byte
> block.
>
> I spent quite a bit of time trying to make coalescing work in the
> spring. It got very big and unwieldy. When we discussed it at the
> filesystem summit the consensus was that it was too intrusive to the I/O
> stack, elevators, etc.

Surely if a userspace tool and shell-script can accomplish this,
totally lacking real filesystem knowledge, then we should be able
to approximate it in kernel space?

This is FITRIM we're talking about, not the on-the-fly automatic TRIM.

FITRIM could perhaps use a similar approach to what wiper.sh does:
reserve a large number of free blocks, and issue coalesced TRIM(s) on them.

The difference being, it could walk through the filesystem,
trimming in sections, rather than trying to reserve/trim the entire
freespace all in one go.

Over-thinking it???

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  0:34                     ` Mark Lord
@ 2010-11-19  1:16                         ` Greg Freemyer
  2010-11-19  1:49                       ` Martin K. Petersen
  1 sibling, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-19  1:16 UTC (permalink / raw)
  To: Mark Lord
  Cc: Martin K. Petersen, James Bottomley, Jeff Moyer,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Thu, Nov 18, 2010 at 4:34 PM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-18 06:52 PM, Martin K. Petersen wrote:
>>>>>>>
>>>>>>> "Mark" == Mark Lord<kernel@teksavvy.com>  writes:
>>
>> Mark>  If FITRIM is still issuing single-range-at-a-time TRIMs, then I'd
>> Mark>  call that a BUG that needs fixing.  Doing TRIM like that causes
>> Mark>  tons of unnecessary ERASE cycles, shortening the SSD lifetime.  It
>> Mark>  really needs to batch them into groups of (up to) 64 ranges at a
>> Mark>  time (64 ranges fits into a single 512-byte parameter block).
>>
>> We don't support coalescing discontiguous requests into one command. But
>> we will issue contiguous TRIM requests as big as the payload can
>> handle. That's just short of two gigs per command given a 512-byte
>> block.
>>
>> I spent quite a bit of time trying to make coalescing work in the
>> spring. It got very big and unwieldy. When we discussed it at the
>> filesystem summit the consensus was that it was too intrusive to the I/O
>> stack, elevators, etc.
>
> Surely if a userspace tool and shell-script can accomplish this,
> totally lacking real filesystem knowledge, then we should be able
> to approximate it in kernel space?
>
> This is FITRIM we're talking about, not the on-the-fly automatic TRIM.
>
> FITRIM could perhaps use a similar approach to what wiper.sh does:
> reserve a large number of free blocks, and issue coalesced TRIM(s) on them.
>
> The difference being, it could walk through the filesystem,
> trimming in sections, rather than trying to reserve/trim the entire
> freespace all in one go.
>
> Over-thinking it???

Martin,

I agree with Mark.  When you say "make coalescing work" it sounds like
major overkill.

FITRIM should be able to lock a group of non-contiguous free ranges,
send them down to the block layer as a single pre-coalesced set, and
the block layer just needs to pass it on in a synchronous way.  Then
when that group of ranges is discarded, FITRIM releases the locks.

Every TRIM causes a cache flush anyway on the SSD, so the synchronous
aspect of the process should not be a problem.  (Maybe SCSI with thin
provisioning would see a performance hit?)

To my small brain, the only really complex part is sending something
like that into MDraid, because that one set of ranges might explode
into thousands of ranges and then have to be coalesced back down to a
more manageable number of ranges.

ie. with a simple raid 0, each range will need to be broken into a
bunch of stride sized ranges, then the contiguous strides on each
spindle coalesced back into larger ranges.

But if MDraid can handle discards now with one range, it should not be
that hard to teach it handle a group of ranges.

Greg


-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19  1:16                         ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-19  1:16 UTC (permalink / raw)
  To: Mark Lord
  Cc: Martin K. Petersen, James Bottomley, Jeff Moyer,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Thu, Nov 18, 2010 at 4:34 PM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-18 06:52 PM, Martin K. Petersen wrote:
>>>>>>>
>>>>>>> "Mark" == Mark Lord<kernel@teksavvy.com>  writes:
>>
>> Mark>  If FITRIM is still issuing single-range-at-a-time TRIMs, then I'd
>> Mark>  call that a BUG that needs fixing.  Doing TRIM like that causes
>> Mark>  tons of unnecessary ERASE cycles, shortening the SSD lifetime.  It
>> Mark>  really needs to batch them into groups of (up to) 64 ranges at a
>> Mark>  time (64 ranges fits into a single 512-byte parameter block).
>>
>> We don't support coalescing discontiguous requests into one command. But
>> we will issue contiguous TRIM requests as big as the payload can
>> handle. That's just short of two gigs per command given a 512-byte
>> block.
>>
>> I spent quite a bit of time trying to make coalescing work in the
>> spring. It got very big and unwieldy. When we discussed it at the
>> filesystem summit the consensus was that it was too intrusive to the I/O
>> stack, elevators, etc.
>
> Surely if a userspace tool and shell-script can accomplish this,
> totally lacking real filesystem knowledge, then we should be able
> to approximate it in kernel space?
>
> This is FITRIM we're talking about, not the on-the-fly automatic TRIM.
>
> FITRIM could perhaps use a similar approach to what wiper.sh does:
> reserve a large number of free blocks, and issue coalesced TRIM(s) on them.
>
> The difference being, it could walk through the filesystem,
> trimming in sections, rather than trying to reserve/trim the entire
> freespace all in one go.
>
> Over-thinking it???

Martin,

I agree with Mark.  When you say "make coalescing work" it sounds like
major overkill.

FITRIM should be able to lock a group of non-contiguous free ranges,
send them down to the block layer as a single pre-coalesced set, and
the block layer just needs to pass it on in a synchronous way.  Then
when that group of ranges is discarded, FITRIM releases the locks.

Every TRIM causes a cache flush anyway on the SSD, so the synchronous
aspect of the process should not be a problem.  (Maybe SCSI with thin
provisioning would see a performance hit?)

To my small brain, the only really complex part is sending something
like that into MDraid, because that one set of ranges might explode
into thousands of ranges and then have to be coalesced back down to a
more manageable number of ranges.

ie. with a simple raid 0, each range will need to be broken into a
bunch of stride sized ranges, then the contiguous strides on each
spindle coalesced back into larger ranges.

But if MDraid can handle discards now with one range, it should not be
that hard to teach it handle a group of ranges.

Greg


-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 22:07                     ` Mark Lord
@ 2010-11-19  1:33                       ` Ted Ts'o
  2010-11-19  3:44                         ` Mark Lord
  2010-11-19  3:44                           ` Mark Lord
  0 siblings, 2 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19  1:33 UTC (permalink / raw)
  To: Mark Lord
  Cc: James Bottomley, Greg Freemyer, Jeff Moyer, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

> >
> >Before we go gung ho on this, there's no evidence that N discontiguous
> >ranges in one command are any better than the ranges sent N times ...
> >the same amount of erase overhead gets sent on SSDs.
> 
> No, we do have evidence:  execution time of the TRIM commands on the SSD.
> 
> The one-range-at-a-time is incredibly slow compared to multiple
> ranges at a time.  That slowness comes from somewhere, with about
> 99.9% certainty that it is due to the drive performing slow flash
> erase cycles.

Mark, I think you are over-generalizing here.  You have observed with
some number of flash drives --- maybe only one, but I don't know that
for sure --- that TRIM is slow.  Even if we grant that you are correct
in your conclusion that it is because the drive is doing slow flash
erase cycles (and I don't completely accept that; I haven't seen your
your measurements since we know that any kind of command that requires
a queue drain/flush before it can execute is going to be slow, and I
don't know what kind of _slow_ you are observing).

But even if we *do* grant that you've seen one disk, or even a lot of
disks which is doing something stupid, that just means that their
manufacturer has some idiotic engineers.  It does not follow that all
SSD's, or thin-provisioned drives, or other devices implementing the
the ATA TRIM command, will do so in an incompetent way.

If you look a the the T13 definition of TRIM, it is just a hint that
the contents of the block range do not _have_ to be preserved.  It
does not say that they *must* be erased.  This is not a security erase
command.  In fact, it is perfectly reasonable for the TRIM command to
store state in volatile storage, and the information of which blocks
have been TRIM gets discarded on a power failure.

So if SSD's are doing a full flash erase cycle for each TRIM, that may
not necessarily be a good idea.  I accept that there may be some
incompetent implementations out there.  But I don't think this means
we should assume that _all_ implementations are incompetent.  It does
mean, though, that we can't turn any of these features on by default.
But that's something we know already.

					- Ted


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  0:34                     ` Mark Lord
  2010-11-19  1:16                         ` Greg Freemyer
@ 2010-11-19  1:49                       ` Martin K. Petersen
  2010-11-19  3:42                         ` Mark Lord
  1 sibling, 1 reply; 106+ messages in thread
From: Martin K. Petersen @ 2010-11-19  1:49 UTC (permalink / raw)
  To: Mark Lord
  Cc: Martin K. Petersen, Greg Freemyer, James Bottomley, Jeff Moyer,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:

Mark> Surely if a userspace tool and shell-script can accomplish this,
Mark> totally lacking real filesystem knowledge, then we should be able
Mark> to approximate it in kernel space?

It's the splitting and merging on stacked devices that's the hard
part. Something wiper.sh does not have to deal with. And thanks to
differences in the protocols the SCSI-ATA translation isn't a perfect
fit.

Every time TRIM comes up the discussion turns into how much we suck at
it because we don't support coalescing of discontiguous ranges.

However, we *do* support discarding contiguous ranges of up to about 2GB
per command on ATA. It's not like we're issuing a TRIM command for every
sector.

For offline/weekly reclaim/FITRIM we have the full picture when the
discard is issued. And thus we have the luxury of being able to send out
relatively big contiguous discards unless the filesystem is insanely
fragmented.

For runtime discard usage we'll inevitably be issuing lots of itty-bitty
512 or 4KB single-command discards. That's going to suck for performance
on your average ATA SSD. Doctor, it hurts when I do this...

So assuming we walk the filesystem to reclaim space on ATA SSDs on a
weekly basis (since that's the only sane approach): 

       What is the performance impact of not coalescing discontiguous
       block ranges when cron scrubs your /home at 4am Sunday morning?

That, to me, is the important question. That obviously depends on the
SSD, filesystem, fragmentation and so on. Is the win really big enough
to justify a bunch of highly intrusive changes to our I/O stack?

Thanks to PCIe SSDs and other upcoming I/O technologies we're working
hard to bring request latency down by simplifying things. Adding
complexity seems like a bad idea at this time. And that was the
rationale behind the consensus at the filesystem workshop.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  1:49                       ` Martin K. Petersen
@ 2010-11-19  3:42                         ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19  3:42 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Greg Freemyer, James Bottomley, Jeff Moyer, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-18 08:49 PM, Martin K. Petersen wrote:
> So assuming we walk the filesystem to reclaim space on ATA SSDs on a
> weekly basis (since that's the only sane approach):
>
>         What is the performance impact of not coalescing discontiguous
>         block ranges when cron scrubs your /home at 4am Sunday morning?

In the case of FITRIM, you're right: the performance impact
probably doesn't matter much for a 4am cronjob.

But a lot of people currently (at least) prefer to run it manually,
and they don't want it to take forever.

Though that's still not the primary worry:  each TRIM seems to trigger
a flash erase cycle (or cycles) on the most common SSDs on the market,
including anything Indilinx-based and as far as I can tell also for
anything SandForce based.  That's probably 70% of the SSDs out there today.

And I'm very concerned about premature wear -- MLC is only for 10000 cycles (avg).

Also, the current one-range-at-a-time interface is just not correct
for the majority of devices out there:  they are SATA, not some obscure
enterprise-only one-range-at-a-time thing.  We need an implementation
that reflects real-life for uses other than data centres.

If nobody else does it, I'll probably implement it correctly this winter.
But it would really be better for a real filesystem/DM person to do it.

Thanks for hanging in there this far, though!

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  1:33                       ` Ted Ts'o
@ 2010-11-19  3:44                           ` Mark Lord
  2010-11-19  3:44                           ` Mark Lord
  1 sibling, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19  3:44 UTC (permalink / raw)
  To: Ted Ts'o, James Bottomley, Greg Freemyer, Jeff Moyer,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On 10-11-18 08:33 PM, Ted Ts'o wrote:
>>>
>>> Before we go gung ho on this, there's no evidence that N discontiguous
>>> ranges in one command are any better than the ranges sent N times ...
>>> the same amount of erase overhead gets sent on SSDs.
>>
>> No, we do have evidence:  execution time of the TRIM commands on the SSD.
>>
>> The one-range-at-a-time is incredibly slow compared to multiple
>> ranges at a time.  That slowness comes from somewhere, with about
>> 99.9% certainty that it is due to the drive performing slow flash
>> erase cycles.
>
> Mark, I think you are over-generalizing here.  You have observed with
> some number of flash drives --- maybe only one, but I don't know that
> for sure --- that TRIM is slow.  Even if we grant that you are correct
> in your conclusion that it is because the drive is doing slow flash
> erase cycles (and I don't completely accept that; I haven't seen your
> your measurements since we know that any kind of command that requires
> a queue drain/flush before it can execute is going to be slow, and I
> don't know what kind of _slow_ you are observing).

I do this stuff on modest hardware:  ata_piix.
There is NO QUEUE TO FLUSH.

So one might expect TRIM to operate at the same speed as ordinary WRITEs.
But it doesn't.  When I measured this in detail (and things have not changed
much since then), we were talking 10s of milliseconds to 100s of milliseconds
per TRIM command.

The only possible explanation for that would be waiting on flash erase commands.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  1:33                       ` Ted Ts'o
@ 2010-11-19  3:44                         ` Mark Lord
  2010-11-19  3:44                           ` Mark Lord
  1 sibling, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19  3:44 UTC (permalink / raw)
  To: Ted Ts'o, James Bottomley, Greg Freemyer, Jeff Moyer,
	Christoph Hellwig

On 10-11-18 08:33 PM, Ted Ts'o wrote:
>>>
>>> Before we go gung ho on this, there's no evidence that N discontiguous
>>> ranges in one command are any better than the ranges sent N times ...
>>> the same amount of erase overhead gets sent on SSDs.
>>
>> No, we do have evidence:  execution time of the TRIM commands on the SSD.
>>
>> The one-range-at-a-time is incredibly slow compared to multiple
>> ranges at a time.  That slowness comes from somewhere, with about
>> 99.9% certainty that it is due to the drive performing slow flash
>> erase cycles.
>
> Mark, I think you are over-generalizing here.  You have observed with
> some number of flash drives --- maybe only one, but I don't know that
> for sure --- that TRIM is slow.  Even if we grant that you are correct
> in your conclusion that it is because the drive is doing slow flash
> erase cycles (and I don't completely accept that; I haven't seen your
> your measurements since we know that any kind of command that requires
> a queue drain/flush before it can execute is going to be slow, and I
> don't know what kind of _slow_ you are observing).

I do this stuff on modest hardware:  ata_piix.
There is NO QUEUE TO FLUSH.

So one might expect TRIM to operate at the same speed as ordinary WRITEs.
But it doesn't.  When I measured this in detail (and things have not changed
much since then), we were talking 10s of milliseconds to 100s of milliseconds
per TRIM command.

The only possible explanation for that would be waiting on flash erase commands.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19  3:44                           ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19  3:44 UTC (permalink / raw)
  To: Ted Ts'o, James Bottomley, Greg Freemyer, Jeff Moyer,
	Christoph Hellwig

On 10-11-18 08:33 PM, Ted Ts'o wrote:
>>>
>>> Before we go gung ho on this, there's no evidence that N discontiguous
>>> ranges in one command are any better than the ranges sent N times ...
>>> the same amount of erase overhead gets sent on SSDs.
>>
>> No, we do have evidence:  execution time of the TRIM commands on the SSD.
>>
>> The one-range-at-a-time is incredibly slow compared to multiple
>> ranges at a time.  That slowness comes from somewhere, with about
>> 99.9% certainty that it is due to the drive performing slow flash
>> erase cycles.
>
> Mark, I think you are over-generalizing here.  You have observed with
> some number of flash drives --- maybe only one, but I don't know that
> for sure --- that TRIM is slow.  Even if we grant that you are correct
> in your conclusion that it is because the drive is doing slow flash
> erase cycles (and I don't completely accept that; I haven't seen your
> your measurements since we know that any kind of command that requires
> a queue drain/flush before it can execute is going to be slow, and I
> don't know what kind of _slow_ you are observing).

I do this stuff on modest hardware:  ata_piix.
There is NO QUEUE TO FLUSH.

So one might expect TRIM to operate at the same speed as ordinary WRITEs.
But it doesn't.  When I measured this in detail (and things have not changed
much since then), we were talking 10s of milliseconds to 100s of milliseconds
per TRIM command.

The only possible explanation for that would be waiting on flash erase commands.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:37           ` Mark Lord
@ 2010-11-19 11:09             ` Christoph Hellwig
  2010-11-19 13:54               ` Mark Lord
  2010-11-19 14:40             ` Chris Mason
  1 sibling, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 11:09 UTC (permalink / raw)
  To: Mark Lord
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Thu, Nov 18, 2010 at 04:37:34PM -0500, Mark Lord wrote:
> Currently, I patch my kernels to remove the automatic online TRIMs.
> Is there a knob somewhere for this in the later kernels?

There is not automatic discard in any recent kernel.  You always need
to mount with -o discard to enabled it.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  1:16                         ` Greg Freemyer
  (?)
@ 2010-11-19 11:55                         ` Christoph Hellwig
  2010-11-19 14:01                           ` Mark Lord
  2010-11-19 16:20                           ` Greg Freemyer
  -1 siblings, 2 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 11:55 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Mark Lord, Martin K. Petersen, James Bottomley, Jeff Moyer,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Thu, Nov 18, 2010 at 05:16:36PM -0800, Greg Freemyer wrote:
> I agree with Mark.  When you say "make coalescing work" it sounds like
> major overkill.
> 
> FITRIM should be able to lock a group of non-contiguous free ranges,
> send them down to the block layer as a single pre-coalesced set, and
> the block layer just needs to pass it on in a synchronous way.  Then
> when that group of ranges is discarded, FITRIM releases the locks.

Given that you know the Linux I/O stack and hardware so well may I
volunteer you to implement it?  Including remapping the multiple ranges
in device mapper, and dealing with the incompatible range formats for
TRIM vs UNMAP.  If your implementation is clean enough I'm pretty
sure no one will object to merge it.

> Every TRIM causes a cache flush anyway on the SSD, so the synchronous
> aspect of the process should not be a problem.  (Maybe SCSI with thin
> provisioning would see a performance hit?)

What's your evidence?


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:35           ` Lukas Czerner
@ 2010-11-19 12:16             ` Steven Whitehouse
  2010-11-19 13:53               ` Mark Lord
  0 siblings, 1 reply; 106+ messages in thread
From: Steven Whitehouse @ 2010-11-19 12:16 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

Hi,

On Thu, 2010-11-18 at 18:35 +0100, Lukas Czerner wrote:
> On Thu, 18 Nov 2010, James Bottomley wrote:
> 
> > On Thu, 2010-11-18 at 09:29 -0500, Christoph Hellwig wrote:
> > > On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
> > > > I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
> > > > filesystem that was TRIM-aware would pass that information down to the
> > > > block device that it's mounted on.  I strongly feel that we shouldn't
> > > > have two interfaces to do essentially the same thing.
> > > > 
> > > > I guess I'm saying that you're going to have to learn about TRIM :-)
> > > 
> > > Did you actually look Lukas FITRIM code (not the slight reordering here,
> > > but the original one).  It's the ext4 version of the batched discard
> > > model, that is a userspace ioctl to discard free space in the
> > > filesystem.
> > > 
> > > hole punching will free the blocks into the free space pool.  If you do
> > > online discard it will also get discarded, but a filesystem that has
> > > online discard enabled doesn't need FITRIM.
> > 
> > Not stepping into the debate: I'm happy to see punch go to the mapping
> > data and FITRIM pick it up later.
> > 
> > However, I think it's time to question whether we actually still want to
> > allow online discard at all.  Most of the benchmarks show it to be a net
> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
> > it's become an "enable this to degrade performance" option with no
> > upside.
> > 
> > James
> > 
> 
> This time began a long time ago :) that is why am I originally created
> batched discard for ext4 (ext3) accessible through FITRIM ioctl. Ext4
> performance with -o discard mount option goes down on the most of the
> SSD's and every Thinly-provisioned storage I have a chance to benchmark.
> 
> But, for example SSD's are getting better and as time goes by we might
> see devices that does not suffer terrible performance loss with discard
> enabled (discard on unlink in ext4 etc...), so this "online" discard
> probably still does make sense.
> 
> -Lukas

I agree that it is early days for trim/discard hardware implementations.
I hope that if it can be shown that it is useful (and if the hardware
vendors are following this thread!) then maybe it will spur them on to
providing faster implementations in the future. There doesn't seem to be
any technical reason why faster implementations are not possible.

Equally, FITRIM is useful since the overhead can be reduced to certain
points in time when a system is less busy. With GFS2 (this may well also
apply to OCFS2) doing a userspace trim is not very easy since there is
no simple way to access the locking for the fs from userspace, so it
would imply that an admin unmounted the filesystem on all nodes and then
runs a utility on just one node. Using FITRIM though, it would be
possible to perform the trim during normal fs operation (initiating
FITRIM from multiple nodes at once would also work correctly, even if it
is not desirable from a performance point of view)

GFS2 already has code for online discard, and the changes required to
support FITRIM are relatively small. Both online discard and FITRIM
simply require iterating through the resource group bitmaps and
generating discard requests depending on the bitmap states encountered.
I'm intending to put a patch together fairly shortly to implement FITRIM
for GFS2.

So I think there is a place for both approaches,

Steve.



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 12:16             ` Steven Whitehouse
@ 2010-11-19 13:53               ` Mark Lord
  2010-11-19 14:02                 ` Ted Ts'o
  0 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-19 13:53 UTC (permalink / raw)
  To: Steven Whitehouse
  Cc: Lukas Czerner, James Bottomley, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On 10-11-19 07:16 AM, Steven Whitehouse wrote:
>
> There doesn't seem to be any technical reason why faster implementations are not possible.

There is a very good reason why faster implementations may be *difficult*
(if not impossible) in many cases:  DETERMINISTIC trim.  This requires
that the drive guarantee the block ranges will return a constant known
value after TRIM.  Which means they MUST write to flash during the trim.
And any WRITE to flash means a potential ERASE operation may be needed.

Simply buffering the trim in RAM and returning success is not an option here,
because loss of power would negate the (virtual) TRIM.  So they MUST record
the trim operation to non-volatile storage.  This can be done in a variety of
ways, but one of the simplest is to just do the full TRIM then and there,
shuffling data and erasing the blocks before signaling completion.

Another, possibly faster way, is to have TRIM just update a block bitmap
somewhere inside FLASH, and avoid ERASE until most of an entire flash block
(eg. 256KB) is marked as "trimmed".  This is the implementation we all hope
for, but which many (most?) current drives do not seem to implement.

Non-deterministic TRIM should also try to ensure that the original data
is no longer there (for security reasons), so it may have the same issues.

> Equally, FITRIM is useful since the overhead can be reduced to certain
> points in time when a system is less busy. With GFS2 (this may well also
> apply to OCFS2) doing a userspace trim is not very easy since there is
> no simple way to access the locking for the fs from userspace

wiper.sh locks the blocks by reserving the space for a file.
But it has to lock ALL freespace, whereas FITRIM could be clever
and only lock the bits it is actually trimming at any instant
(I'm agreeing with you!).

> I'm intending to put a patch together fairly shortly to implement FITRIM for GFS2.

Excellent.  So eventually we might expect FITRIM to reappear at the VFS level,
rather than being buried inside each individual fs's ioctl() handler?

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 11:09             ` Christoph Hellwig
@ 2010-11-19 13:54               ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 13:54 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: James Bottomley, Matthew Wilcox, Josef Bacik, Lukas Czerner,
	tytso, linux-ext4, linux-kernel, linux-fsdevel, sandeen

On 10-11-19 06:09 AM, Christoph Hellwig wrote:
> On Thu, Nov 18, 2010 at 04:37:34PM -0500, Mark Lord wrote:
>> Currently, I patch my kernels to remove the automatic online TRIMs.
>> Is there a knob somewhere for this in the later kernels?
>
> There is not automatic discard in any recent kernel.  You always need
> to mount with -o discard to enabled it.

Peachy.  Thanks, Christoph.

My 2-year old userspace here doesn't have "-o discard",
so I couldn't find it in the man pages etc.. thus my paranoia.  :)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19  3:44                           ` Mark Lord
  (?)
@ 2010-11-19 13:58                           ` Jeff Moyer
  -1 siblings, 0 replies; 106+ messages in thread
From: Jeff Moyer @ 2010-11-19 13:58 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, James Bottomley, Greg Freemyer, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

Mark Lord <kernel@teksavvy.com> writes:

> On 10-11-18 08:33 PM, Ted Ts'o wrote:
>>>>
>>>> Before we go gung ho on this, there's no evidence that N discontiguous
>>>> ranges in one command are any better than the ranges sent N times ...
>>>> the same amount of erase overhead gets sent on SSDs.
>>>
>>> No, we do have evidence:  execution time of the TRIM commands on the SSD.
>>>
>>> The one-range-at-a-time is incredibly slow compared to multiple
>>> ranges at a time.  That slowness comes from somewhere, with about
>>> 99.9% certainty that it is due to the drive performing slow flash
>>> erase cycles.
>>
>> Mark, I think you are over-generalizing here.  You have observed with
>> some number of flash drives --- maybe only one, but I don't know that
>> for sure --- that TRIM is slow.  Even if we grant that you are correct
>> in your conclusion that it is because the drive is doing slow flash
>> erase cycles (and I don't completely accept that; I haven't seen your
>> your measurements since we know that any kind of command that requires
>> a queue drain/flush before it can execute is going to be slow, and I
>> don't know what kind of _slow_ you are observing).
>
> I do this stuff on modest hardware:  ata_piix.
> There is NO QUEUE TO FLUSH.
>
> So one might expect TRIM to operate at the same speed as ordinary WRITEs.
> But it doesn't.  When I measured this in detail (and things have not changed
> much since then), we were talking 10s of milliseconds to 100s of milliseconds
> per TRIM command.
>
> The only possible explanation for that would be waiting on flash erase commands.

If you guys want to test how long trims take, Lukas wrote a test program
that does this.  It can be found here:
  http://sourceforge.net/projects/test-discard/

It will even spit out nice graphs that show you b/w, average trim
duration, maximum duration, etc.

Some devices are better than others.  We've definitely seen trims take a
lot of time compared to regular I/O.  However, using the batched discard
ioctl in a cron job, I don't think we have to worry about this
particular problem.  And I don't buy the argument that users want to do
this by hand.  Most users want things to Just Work(TM).

Cheers,
Jeff

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 11:55                         ` Christoph Hellwig
@ 2010-11-19 14:01                           ` Mark Lord
  2010-11-19 14:06                             ` Christoph Hellwig
  2010-11-19 16:20                           ` Greg Freemyer
  1 sibling, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:01 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Greg Freemyer, Martin K. Petersen, James Bottomley, Jeff Moyer,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 06:55 AM, Christoph Hellwig wrote:
> On Thu, Nov 18, 2010 at 05:16:36PM -0800, Greg Freemyer wrote:
>> I agree with Mark.  When you say "make coalescing work" it sounds like
>> major overkill.
>>
>> FITRIM should be able to lock a group of non-contiguous free ranges,
>> send them down to the block layer as a single pre-coalesced set, and
>> the block layer just needs to pass it on in a synchronous way.  Then
>> when that group of ranges is discarded, FITRIM releases the locks.
>
> Given that you know the Linux I/O stack and hardware so well may I
> volunteer you to implement it?

That is my intent already, thanks.  Just needs time, perhaps this winter.

I think a reasonable approach would be to modify the existing interfaces
so that the LLD can report a "max discard ranges per command" back up
the stack.

This way, libata could report a max of say, 64 ranges per "discard" (trim),
and DM/RAID could simply (for now) report a max of one range per discard.

Way up at the FITRIM level, code could interrogate the "discard" limit
for the device holding the fs, and construct the discard commands such that
they respect that limit.  For a filesystem on DM/RAID, we would (for now)
end up with single-range discards, no change from the present.

For the much more common case of end-user SATA SSDs, though, we would
suddenly get multi-range trims working with probably very little effort.

That's the plan.  Feel free to beat me to it -- you've been working on
the I/O stack nearly as long as I have (since 1992), and I expect you
know it far better by now, too!  ;)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 13:53               ` Mark Lord
@ 2010-11-19 14:02                 ` Ted Ts'o
  2010-11-19 14:10                   ` Christoph Hellwig
                                     ` (6 more replies)
  0 siblings, 7 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 14:02 UTC (permalink / raw)
  To: Mark Lord
  Cc: Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 08:53:25AM -0500, Mark Lord wrote:
> There is a very good reason why faster implementations may be *difficult*
> (if not impossible) in many cases:  DETERMINISTIC trim.  This requires
> that the drive guarantee the block ranges will return a constant known
> value after TRIM.  Which means they MUST write to flash during the trim.
> And any WRITE to flash means a potential ERASE operation may be needed.

Deterministic TRIM is an option.  It doesn't have to be implemented.
And as you even pointed out, there are ways of doing this
intelligently.  Whether "intelligently" and "drive firmware authors"
are two phrases that should be used in the same sentence is a concern
that I will grant, but that's why mount -o discard is not the default.

> Non-deterministic TRIM should also try to ensure that the original data
> is no longer there (for security reasons), so it may have the same issues.

Says who?  We've deleted files on hard drives for a long time without
scrubbing data blocks.  Why should a non-deterministic trim be any
different.  If the goal is a better performing SSD, and not security,
then non-deterministic trim should definitely _not_ ensure that the
original data is no longer accessible.

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:01                           ` Mark Lord
@ 2010-11-19 14:06                             ` Christoph Hellwig
  2010-11-19 14:48                               ` Mark Lord
  0 siblings, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:06 UTC (permalink / raw)
  To: Mark Lord
  Cc: Christoph Hellwig, Greg Freemyer, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Fri, Nov 19, 2010 at 09:01:02AM -0500, Mark Lord wrote:
> That is my intent already, thanks.  Just needs time, perhaps this winter.

This wasn't addressed at you, but a snide remark at Greg, who is just
contantly bickering without actually beeing any help.

> I think a reasonable approach would be to modify the existing interfaces
> so that the LLD can report a "max discard ranges per command" back up
> the stack.
> 
> This way, libata could report a max of say, 64 ranges per "discard" (trim),
> and DM/RAID could simply (for now) report a max of one range per discard.

That's certainly the easy way out.  You'll need a good way to actually
transport the ranges as we can't simply sote them in bi_sector/bi_size
and adapt the whole block layer to deal with the two types of different
discards.  Not saying it's impossible, but when I tried it before it
wasn't pretty.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
  2010-11-19 14:10                   ` Christoph Hellwig
  2010-11-19 14:10                   ` Christoph Hellwig
@ 2010-11-19 14:10                   ` Christoph Hellwig
  2010-11-19 15:37                     ` Ted Ts'o
  2010-11-19 14:50                   ` Mark Lord
                                     ` (3 subsequent siblings)
  6 siblings, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:10 UTC (permalink / raw)
  To: Ted Ts'o, Mark Lord, Steven Whitehouse, Lukas Czerner,
	James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 09:02:03AM -0500, Ted Ts'o wrote:
> Deterministic TRIM is an option.  It doesn't have to be implemented.

For reasonable use it has to.  That rationale for it is pretty clearly
outline in ftp://ftp.t10.org/t10/document.08/08-347r1.pdf.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
  2010-11-19 14:10                   ` Christoph Hellwig
@ 2010-11-19 14:10                   ` Christoph Hellwig
  2010-11-19 14:10                   ` Christoph Hellwig
                                     ` (4 subsequent siblings)
  6 siblings, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:10 UTC (permalink / raw)
  To: Ted Ts'o, Mark Lord, Steven Whitehouse, Lukas Czerner,
	James Bottomley, Ch

On Fri, Nov 19, 2010 at 09:02:03AM -0500, Ted Ts'o wrote:
> Deterministic TRIM is an option.  It doesn't have to be implemented.

For reasonable use it has to.  That rationale for it is pretty clearly
outline in ftp://ftp.t10.org/t10/document.08/08-347r1.pdf.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
@ 2010-11-19 14:10                   ` Christoph Hellwig
  2010-11-19 14:10                   ` Christoph Hellwig
                                     ` (5 subsequent siblings)
  6 siblings, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:10 UTC (permalink / raw)
  To: Ted Ts'o, Mark Lord, Steven Whitehouse, Lukas Czerner,
	James Bottomley, Ch

On Fri, Nov 19, 2010 at 09:02:03AM -0500, Ted Ts'o wrote:
> Deterministic TRIM is an option.  It doesn't have to be implemented.

For reasonable use it has to.  That rationale for it is pretty clearly
outline in ftp://ftp.t10.org/t10/document.08/08-347r1.pdf.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 21:37           ` Mark Lord
  2010-11-19 11:09             ` Christoph Hellwig
@ 2010-11-19 14:40             ` Chris Mason
  2010-11-19 14:53               ` Mark Lord
  1 sibling, 1 reply; 106+ messages in thread
From: Chris Mason @ 2010-11-19 14:40 UTC (permalink / raw)
  To: Mark Lord
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

Excerpts from Mark Lord's message of 2010-11-18 16:37:34 -0500:
> On 10-11-18 12:19 PM, James Bottomley wrote:
> >
> > Not stepping into the debate: I'm happy to see punch go to the mapping
> > data and FITRIM pick it up later.
> >
> > However, I think it's time to question whether we actually still want to
> > allow online discard at all.  Most of the benchmarks show it to be a net
> > lose to almost everything (either SSD or Thinly Provisioned arrays), so
> > it's become an "enable this to degrade performance" option with no
> > upside.
> 
> I also suspect that online TRIM exerts significant premature wear on the SSDs.
> TRIM operations most likely trigger immediate copy/erase operations internal
> to most SSDs, often regardless of the amount of data being trimmed.
> 
> Performing a 256KB erase because of a 1024-byte TRIM, over and over, is going
> to harm the expected lifetime of an SSD.  Sure, some SSDs may do things differently
> internally, but I don't see it working that way in much of the current crop of SSDs.
> 
> Currently, I patch my kernels to remove the automatic online TRIMs.
> Is there a knob somewhere for this in the later kernels?

We've been told that online and constant trimming is the default in
windows7.   The ssds are most likely to just start ignoring the trims
they can't service efficiently.

-chris

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:06                             ` Christoph Hellwig
@ 2010-11-19 14:48                               ` Mark Lord
  2010-11-19 14:54                                 ` Christoph Hellwig
  0 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:48 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Greg Freemyer, Martin K. Petersen, James Bottomley, Jeff Moyer,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 09:06 AM, Christoph Hellwig wrote:
> On Fri, Nov 19, 2010 at 09:01:02AM -0500, Mark Lord wrote:
>> Ithink a reasonable approach would be to modify the existing interfaces
>> so that the LLD can report a "max discard ranges per command" back up
>> the stack.
>>
>> This way, libata could report a max of say, 64 ranges per "discard" (trim),
>> and DM/RAID could simply (for now) report a max of one range per discard.
>
> That's certainly the easy way out.  You'll need a good way to actually
> transport the ranges as we can't simply sote them in bi_sector/bi_size
> and adapt the whole block layer to deal with the two types of different
> discards.  Not saying it's impossible, but when I tried it before it
> wasn't pretty.

Yeah.  I think for this to happen is has to be evolutionary,
and the proposal above looks like a reasonable first crack at it.

I'm not sure about the issues on "adapting the block layer" ?
For FITRIM, the blocks being trimmed would be reserved at the fs level,
before issuing the discard for them.  So ordering through the block layer
shouldn't matter much for it.  Does that simplify things?

I see FITRIM just allocating a page to hold the ranges (for the >1 case)
and passing that page down through the layers to libata (or any other
LLD that supports >1 ranges).

??

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
@ 2010-11-19 14:50                     ` Mark Lord
  2010-11-19 14:10                   ` Christoph Hellwig
                                       ` (5 subsequent siblings)
  6 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:50 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 09:02 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 08:53:25AM -0500, Mark Lord wrote:
>
>> Non-deterministic TRIM _should_ also try to ensure that the original data
>> is no longer there (for security reasons), so it _may_ have the same issues.
>
> Says who?  We've deleted files on hard drives for a long time without
> scrubbing data blocks.

"should" try, not "must" try.  :)

It's not mandatory, but I feel it is a Good Idea(tm).
That's why it "may" have the same issues.  Or not.

Thanks Ted.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
                                     ` (2 preceding siblings ...)
  2010-11-19 14:10                   ` Christoph Hellwig
@ 2010-11-19 14:50                   ` Mark Lord
  2010-11-19 14:50                     ` Mark Lord
                                     ` (2 subsequent siblings)
  6 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:50 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig

On 10-11-19 09:02 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 08:53:25AM -0500, Mark Lord wrote:
>
>> Non-deterministic TRIM _should_ also try to ensure that the original data
>> is no longer there (for security reasons), so it _may_ have the same issues.
>
> Says who?  We've deleted files on hard drives for a long time without
> scrubbing data blocks.

"should" try, not "must" try.  :)

It's not mandatory, but I feel it is a Good Idea(tm).
That's why it "may" have the same issues.  Or not.

Thanks Ted.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19 14:50                     ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:50 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig

On 10-11-19 09:02 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 08:53:25AM -0500, Mark Lord wrote:
>
>> Non-deterministic TRIM _should_ also try to ensure that the original data
>> is no longer there (for security reasons), so it _may_ have the same issues.
>
> Says who?  We've deleted files on hard drives for a long time without
> scrubbing data blocks.

"should" try, not "must" try.  :)

It's not mandatory, but I feel it is a Good Idea(tm).
That's why it "may" have the same issues.  Or not.

Thanks Ted.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:40             ` Chris Mason
@ 2010-11-19 14:53               ` Mark Lord
  2010-11-19 14:57                 ` Christoph Hellwig
  2010-11-21 19:07                 ` Valdis.Kletnieks
  0 siblings, 2 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 14:53 UTC (permalink / raw)
  To: Chris Mason
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On 10-11-19 09:40 AM, Chris Mason wrote:
>
> We've been told that online and constant trimming is the default in
> windows7.   The ssds are most likely to just start ignoring the trims
> they can't service efficiently.

Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
(as reported to me by an SSD vendor who traced it with a SATA analyzer,
  and who also apparently has "inside info").

That's what we also need Linux to do.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:48                               ` Mark Lord
@ 2010-11-19 14:54                                 ` Christoph Hellwig
  2010-11-19 15:24                                   ` Mark Lord
  0 siblings, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:54 UTC (permalink / raw)
  To: Mark Lord
  Cc: Christoph Hellwig, Greg Freemyer, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Fri, Nov 19, 2010 at 09:48:12AM -0500, Mark Lord wrote:
> I'm not sure about the issues on "adapting the block layer" ?
> For FITRIM, the blocks being trimmed would be reserved at the fs level,
> before issuing the discard for them.  So ordering through the block layer
> shouldn't matter much for it.  Does that simplify things?
> 
> I see FITRIM just allocating a page to hold the ranges (for the >1 case)
> and passing that page down through the layers to libata (or any other
> LLD that supports >1 ranges).

Ordering should not be an issue.  What were problems when I tried this
before is that we currently assume in the block layer that discard
bios have a valid bi_sector/bi_size, which is already needed e.g. for
the trivial remapping use for partitions and that they don't have
a payload.  You'd need to teach various places that discard payloads
may have a payload, which contains multiple ranges that have a
sector/len tuple that needs to be remapped and checked in various
places.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:53               ` Mark Lord
@ 2010-11-19 14:57                 ` Christoph Hellwig
  2010-11-19 15:21                   ` Mark Lord
  2010-11-19 15:30                   ` Mark Lord
  2010-11-21 19:07                 ` Valdis.Kletnieks
  1 sibling, 2 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 14:57 UTC (permalink / raw)
  To: Mark Lord
  Cc: Chris Mason, James Bottomley, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 09:53:52AM -0500, Mark Lord wrote:
> On 10-11-19 09:40 AM, Chris Mason wrote:
> >
> >We've been told that online and constant trimming is the default in
> >windows7.   The ssds are most likely to just start ignoring the trims
> >they can't service efficiently.
> 
> Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
> (as reported to me by an SSD vendor who traced it with a SATA analyzer,
>  and who also apparently has "inside info").

I really hate to rely on this third party hearsay (from all sides), and
have implement TRIM support in qemu now.  I'll soon install win7 and
will check out the TRIM patters myself.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:57                 ` Christoph Hellwig
@ 2010-11-19 15:21                   ` Mark Lord
  2010-12-07  9:27                     ` Christoph Hellwig
  2010-11-19 15:30                   ` Mark Lord
  1 sibling, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:21 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chris Mason, James Bottomley, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On 10-11-19 09:57 AM, Christoph Hellwig wrote:
> On Fri, Nov 19, 2010 at 09:53:52AM -0500, Mark Lord wrote:
>> On 10-11-19 09:40 AM, Chris Mason wrote:
>>>
>>> We've been told that online and constant trimming is the default in
>>> windows7.   The ssds are most likely to just start ignoring the trims
>>> they can't service efficiently.
>>
>> Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
>> (as reported to me by an SSD vendor who traced it with a SATA analyzer,
>>   and who also apparently has "inside info").
>
> I really hate to rely on this third party hearsay (from all sides), and
> have implement TRIM support in qemu now.  I'll soon install win7 and
> will check out the TRIM patters myself.

Excellent!

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:54                                 ` Christoph Hellwig
@ 2010-11-19 15:24                                   ` Mark Lord
  2010-11-19 15:34                                     ` Christoph Hellwig
  0 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:24 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Greg Freemyer, Martin K. Petersen, James Bottomley, Jeff Moyer,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 09:54 AM, Christoph Hellwig wrote:
> On Fri, Nov 19, 2010 at 09:48:12AM -0500, Mark Lord wrote:
>> I'm not sure about the issues on "adapting the block layer" ?
>> For FITRIM, the blocks being trimmed would be reserved at the fs level,
>> before issuing the discard for them.  So ordering through the block layer
>> shouldn't matter much for it.  Does that simplify things?
>>
>> I see FITRIM just allocating a page to hold the ranges (for the>1 case)
>> and passing that page down through the layers to libata (or any other
>> LLD that supports>1 ranges).
>
> Ordering should not be an issue.  What were problems when I tried this
> before is that we currently assume in the block layer that discard
> bios have a valid bi_sector/bi_size, which is already needed e.g. for
> the trivial remapping use for partitions and that they don't have
> a payload.  You'd need to teach various places that discard payloads
> may have a payload, which contains multiple ranges that have a
> sector/len tuple that needs to be remapped and checked in various
> places.

I wonder if this can be treated more like how SG_IO does things?
The SG_IO mechanism seems to have no issues passing through stuff
like this, so perhaps we could implement something in a similar fashion?

-ml

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:57                 ` Christoph Hellwig
  2010-11-19 15:21                   ` Mark Lord
@ 2010-11-19 15:30                   ` Mark Lord
  1 sibling, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:30 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Chris Mason, James Bottomley, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On 10-11-19 09:57 AM, Christoph Hellwig wrote:
> On Fri, Nov 19, 2010 at 09:53:52AM -0500, Mark Lord wrote:
>> On 10-11-19 09:40 AM, Chris Mason wrote:
>>>
>>> We've been told that online and constant trimming is the default in
>>> windows7.   The ssds are most likely to just start ignoring the trims
>>> they can't service efficiently.
>>
>> Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
>> (as reported to me by an SSD vendor who traced it with a SATA analyzer,
>>   and who also apparently has "inside info").
>
> I really hate to rely on this third party hearsay (from all sides), and
> have implement TRIM support in qemu now.  I'll soon install win7 and
> will check out the TRIM patters myself.

Here is some of the heresay about Win7:

   TRIM commands are sent when:
      -- About 2/3 of partition is filled up, when file is (really) deleted,
           not when merely sending file to trash bin.
      -- In the above case, when trash bin gets emptied.
      -- In the above case, when partition is deleted.

So the behaviour may vary, depending upon how full the filesystem is
at any point in time.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:24                                   ` Mark Lord
@ 2010-11-19 15:34                                     ` Christoph Hellwig
  0 siblings, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 15:34 UTC (permalink / raw)
  To: Mark Lord
  Cc: Christoph Hellwig, Greg Freemyer, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Fri, Nov 19, 2010 at 10:24:52AM -0500, Mark Lord wrote:
> I wonder if this can be treated more like how SG_IO does things?
> The SG_IO mechanism seems to have no issues passing through stuff
> like this, so perhaps we could implement something in a similar fashion?

We actually sent discards down as BLOCK_PC commands, which basically
is SG_IO style I/O from kernelspace.  But that caused a lot of problems
with scsi error handling so we moved away from it.  But even when
you do a BLOCK_PC UNMAP command that later gets translated to a TRIM
by libata you have a few issues:

 - you need to get partion offsets in the I/O sumitter to include them
   in every range.  That's doable but a quite nasty layering violation,
   and it also prevents us from ever adding DM/MD support to that
   scheme.
 - you'll have to allocate new memory for the TRIM payload in libata,
   and switch the libata command to use it, instead of the current
   hack to reuse the zeroed page sent down with the WRITE SAME command.
   I tried to get the payload switching in libata to work a few times,
   but never managed to get it right.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
@ 2010-11-19 15:35                     ` Mark Lord
  2010-11-19 14:10                   ` Christoph Hellwig
                                       ` (5 subsequent siblings)
  6 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:35 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 09:02 AM, Ted Ts'o wrote:
>but that's why mount -o discard is not the default.


But, oddly, it _is_ the default for mke2fs -t ext4,
which really threw me for a loop recently.

I though my system had locked up when suddenly everything
went dead for a very long time (many minutes) while installing a new system.

Then I figured out what was going on.
I really believe that "-K" should be the _default_ for mke2fs,
as otherwise lots of other people probably also wonder
if Linux is killing their SSD at system install time..

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:02                 ` Ted Ts'o
                                     ` (4 preceding siblings ...)
  2010-11-19 14:50                     ` Mark Lord
@ 2010-11-19 15:35                   ` Mark Lord
  2010-11-19 15:35                     ` Mark Lord
  6 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:35 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig

On 10-11-19 09:02 AM, Ted Ts'o wrote:
>but that's why mount -o discard is not the default.


But, oddly, it _is_ the default for mke2fs -t ext4,
which really threw me for a loop recently.

I though my system had locked up when suddenly everything
went dead for a very long time (many minutes) while installing a new system.

Then I figured out what was going on.
I really believe that "-K" should be the _default_ for mke2fs,
as otherwise lots of other people probably also wonder
if Linux is killing their SSD at system install time..

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19 15:35                     ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 15:35 UTC (permalink / raw)
  To: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig

On 10-11-19 09:02 AM, Ted Ts'o wrote:
>but that's why mount -o discard is not the default.


But, oddly, it _is_ the default for mke2fs -t ext4,
which really threw me for a loop recently.

I though my system had locked up when suddenly everything
went dead for a very long time (many minutes) while installing a new system.

Then I figured out what was going on.
I really believe that "-K" should be the _default_ for mke2fs,
as otherwise lots of other people probably also wonder
if Linux is killing their SSD at system install time..

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:10                   ` Christoph Hellwig
@ 2010-11-19 15:37                     ` Ted Ts'o
  2010-11-19 15:50                       ` Christoph Hellwig
  2010-11-19 15:50                         ` Christoph Hellwig
  0 siblings, 2 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 15:37 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mark Lord, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Matthew Wilcox, Josef Bacik, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 09:10:07AM -0500, Christoph Hellwig wrote:
> On Fri, Nov 19, 2010 at 09:02:03AM -0500, Ted Ts'o wrote:
> > Deterministic TRIM is an option.  It doesn't have to be implemented.
> 
> For reasonable use it has to.  That rationale for it is pretty clearly
> outline in ftp://ftp.t10.org/t10/document.08/08-347r1.pdf.
> 

I think the author of that slide deck was being a little hysterical.
In nearly all of these cases, if the file system is comptently
implemented (i.e., you don't do a trim on anything but a deleted file
block, and _only_ when you know the deleted is committed, and only the
filesystem --- not non-privileged users --- are allowed to use the
TRIM command), there's no issue.

One case where I'll admit he may have a point is where you are using
software RAID, where if the TRIM is not determinstic, the only thing
which is safe to do is a TRIM of an entire RAID stripe.  If the file
system doesn't know about the RAID geometry, then yes, in that case
determinstic TRIM is better --- although even then, it implies that
you have to rebuild the raid strip after you do a TRIM, which a
RAID-oblivious file system wouldn't do, and so you would still be
toast.

In the case of incompetently implemented, or buggy file systems, I'll
agree that non-determinstic trim offers a bunch of pitfalls --- but so
does deterministic trim in many of these cases.  For example, one of
their disaster scenarios was where you do a trim on an allocated
block, and then proceed to read from that allocated block.  That could
only happen if the trim happened but the delete didn't commit.  But in
that case, the fact that data blocks would disappear for a non-deleted
file is also a disaster, and means the file system wasn't implemented
correctly.

The other case where non-determinstic trim could get you in trouble is
if (a) you give a user direct read access to a partition of the disk,
without any file system as an intermediary, and (b) you allow that
non-privileged user to issue TRIM commands to that partition.  In that
case, it's possible that non-deterministic trim might allow you access
to non-deleted blocks in other partitions.  But this goes back to my
observation that you're totally safe so long as TRIM is a privileged
operation and/or only allowed to be executed by the file system.

So yeah, deterministic trim is tricker, and you have to be more
careful, especially if you're using RAID, but it's not inherently a
disaster...

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18  7:36 [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Lukas Czerner
  2010-11-18  7:36 ` [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Lukas Czerner
  2010-11-18 13:06 ` [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Matthew Wilcox
@ 2010-11-19 15:41 ` Ted Ts'o
  2010-11-19 15:50   ` Christoph Hellwig
  2 siblings, 1 reply; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 15:41 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen

On Thu, Nov 18, 2010 at 08:36:48AM +0100, Lukas Czerner wrote:
> There was concern that FITRIM ioctl is not common enough to be included
> in core vfs ioctl, as Christoph Hellwig pointed out there's no real point
> in dispatching this out to a separate vector instead of just through
> ->ioctl.
> 
> So this commit removes ioctl_fstrim() from vfs ioctl and trim_fs
> from super_operation structure.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>

Trying to redirect this thread back to the original patches....

What do people think?  Should I try to push these two patches to Linus
now, to make it easier for other file systems who might be interested
in implementing FITRIM functionality?

Or should I wait until the 2.6.37-rc1 window?

We're at -rc3, so a change like this is a bit late, but so far no one
else is using trim_fs besides ext4, and it will make life easier for
other file systems, so I'm willing to try pushing this to Linus if
there is consensus from other fs developers.

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:35                     ` Mark Lord
  (?)
@ 2010-11-19 15:44                     ` Lukas Czerner
  2010-11-19 16:30                       ` Ted Ts'o
  -1 siblings, 1 reply; 106+ messages in thread
From: Lukas Czerner @ 2010-11-19 15:44 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Fri, 19 Nov 2010, Mark Lord wrote:

> On 10-11-19 09:02 AM, Ted Ts'o wrote:
> > but that's why mount -o discard is not the default.
> 
> 
> But, oddly, it _is_ the default for mke2fs -t ext4,
> which really threw me for a loop recently.
> 
> I though my system had locked up when suddenly everything
> went dead for a very long time (many minutes) while installing a new system.
> 
> Then I figured out what was going on.
> I really believe that "-K" should be the _default_ for mke2fs,
> as otherwise lots of other people probably also wonder
> if Linux is killing their SSD at system install time..
> 
> Cheers
> 

That is exactly a reason why I posted a patch for
"Make blkdev_issue_discard() interruptible", but nobody seems to care. As
an addition I have patched mke2fs to inform user about ongoing discard,
also with not much attention (Ted?).


Thanks!
-Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:37                     ` Ted Ts'o
@ 2010-11-19 15:50                         ` Christoph Hellwig
  2010-11-19 15:50                         ` Christoph Hellwig
  1 sibling, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 15:50 UTC (permalink / raw)
  To: Ted Ts'o, Christoph Hellwig, Mark Lord, Steven Whitehouse,
	Lukas Czerner, James Bottomley, Matthew Wilcox, Josef Bacik,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 10:37:48AM -0500, Ted Ts'o wrote:
> I think the author of that slide deck was being a little hysterical.
> In nearly all of these cases, if the file system is comptently
> implemented (i.e., you don't do a trim on anything but a deleted file
> block, and _only_ when you know the deleted is committed, and only the
> filesystem --- not non-privileged users --- are allowed to use the
> TRIM command), there's no issue.

It's a huge issue for virtualization, where naive TRIM implementations
can expose data deleted in one VM to others.  It's also a huge issues
for RAIDs as mentioned by you.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:37                     ` Ted Ts'o
@ 2010-11-19 15:50                       ` Christoph Hellwig
  2010-11-19 15:50                         ` Christoph Hellwig
  1 sibling, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 15:50 UTC (permalink / raw)
  To: Ted Ts'o, Christoph Hellwig, Mark Lord, Steven Whitehouse,
	Lukas Czerner, James

On Fri, Nov 19, 2010 at 10:37:48AM -0500, Ted Ts'o wrote:
> I think the author of that slide deck was being a little hysterical.
> In nearly all of these cases, if the file system is comptently
> implemented (i.e., you don't do a trim on anything but a deleted file
> block, and _only_ when you know the deleted is committed, and only the
> filesystem --- not non-privileged users --- are allowed to use the
> TRIM command), there's no issue.

It's a huge issue for virtualization, where naive TRIM implementations
can expose data deleted in one VM to others.  It's also a huge issues
for RAIDs as mentioned by you.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19 15:50                         ` Christoph Hellwig
  0 siblings, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 15:50 UTC (permalink / raw)
  To: Ted Ts'o, Christoph Hellwig, Mark Lord, Steven Whitehouse,
	Lukas Czerner, James

On Fri, Nov 19, 2010 at 10:37:48AM -0500, Ted Ts'o wrote:
> I think the author of that slide deck was being a little hysterical.
> In nearly all of these cases, if the file system is comptently
> implemented (i.e., you don't do a trim on anything but a deleted file
> block, and _only_ when you know the deleted is committed, and only the
> filesystem --- not non-privileged users --- are allowed to use the
> TRIM command), there's no issue.

It's a huge issue for virtualization, where naive TRIM implementations
can expose data deleted in one VM to others.  It's also a huge issues
for RAIDs as mentioned by you.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:41 ` Ted Ts'o
@ 2010-11-19 15:50   ` Christoph Hellwig
  0 siblings, 0 replies; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 15:50 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, linux-ext4, linux-kernel,
	linux-fsdevel, hch, sandeen

On Fri, Nov 19, 2010 at 10:41:19AM -0500, Ted Ts'o wrote:
> What do people think?  Should I try to push these two patches to Linus
> now, to make it easier for other file systems who might be interested
> in implementing FITRIM functionality?

Yes, please.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:50                         ` Christoph Hellwig
  (?)
@ 2010-11-19 16:16                         ` Ted Ts'o
  -1 siblings, 0 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 16:16 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mark Lord, Steven Whitehouse, Lukas Czerner, James Bottomley,
	Matthew Wilcox, Josef Bacik, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 10:50:03AM -0500, Christoph Hellwig wrote:
> 
> It's a huge issue for virtualization, where naive TRIM implementations
> can expose data deleted in one VM to others.  It's also a huge issues
> for RAIDs as mentioned by you.

Fair enough, I'll buy that.  If you are sharing an SSD using
virtualization across two VM's with different trust bounaries,
non-deterministic TRIM could very well be an issue, depending on how
it the "non-deterministic" bit was implemented.

(I can think of one PCIe-attached flash implementation I know of where
the trim is simply not persistent across a power failure, such that if
Alice trims a block, it will return 0, but if no one rewrites the
block before a power failure, then reading that some block will return
Alice's original data, and not data belonging to Bob.  This would be
an example of a non-deterministic TRIM that would be problematic for
RAID, but not from a security perspective.  OTOH, the trim command is
blazingly fast on this implementation, since there is absolutely no
flash write or erase operation associated with the TRIM, and so mount
-o discard makes a lot of sense for this particular storage device.)

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard
  2010-11-18  7:36 ` [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Lukas Czerner
@ 2010-11-19 16:19   ` Ted Ts'o
  2010-11-19 16:26     ` Lukas Czerner
  0 siblings, 1 reply; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 16:19 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen

On Thu, Nov 18, 2010 at 08:36:49AM +0100, Lukas Czerner wrote:
> @@ -541,6 +541,7 @@ struct ext4_new_group_data {
>   /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
>  #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
>  #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
> +#define EXT4_IOC_TRIM			FITRIM

If we're going to keep FITRIM defined in include/fs.h, then we should
just dispatch off of FITRIM below, and not define EXT4_IOC_TRIM.

> +	case EXT4_IOC_TRIM:
> +	{

Agreed?

There's no point making this look like an EXT4-specific interface; if
other file systems want to implement FITRIM as an ioctl, they are free
to do so.

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 11:55                         ` Christoph Hellwig
  2010-11-19 14:01                           ` Mark Lord
@ 2010-11-19 16:20                           ` Greg Freemyer
  2010-11-19 16:38                             ` Christoph Hellwig
  1 sibling, 1 reply; 106+ messages in thread
From: Greg Freemyer @ 2010-11-19 16:20 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mark Lord, Martin K. Petersen, James Bottomley, Jeff Moyer,
	Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 3:55 AM, Christoph Hellwig <hch@infradead.org> wrote:
> On Thu, Nov 18, 2010 at 05:16:36PM -0800, Greg Freemyer wrote:
>> I agree with Mark.  When you say "make coalescing work" it sounds like
>> major overkill.
>>
>> FITRIM should be able to lock a group of non-contiguous free ranges,
>> send them down to the block layer as a single pre-coalesced set, and
>> the block layer just needs to pass it on in a synchronous way.  Then
>> when that group of ranges is discarded, FITRIM releases the locks.
>
> Given that you know the Linux I/O stack and hardware so well may I
> volunteer you to implement it?  Including remapping the multiple ranges
> in device mapper, and dealing with the incompatible range formats for
> TRIM vs UNMAP.  If your implementation is clean enough I'm pretty
> sure no one will object to merge it.
>
>> Every TRIM causes a cache flush anyway on the SSD, so the synchronous
>> aspect of the process should not be a problem.  (Maybe SCSI with thin
>> provisioning would see a performance hit?)
>
> What's your evidence?

The kernel team has been coding around some Utopian SSD TRIM
implementation for at least 2 years with the basic assumption that
SSDs can handle thousands of trims per second.  Just mix em in with
the rest of the i/o.  No problem.  Intel swore to us its the right
thing to do.

I'm still waiting to see the first benchmark report from anywhere
(SSD, Thin Provisioned SCSI) that the online approach used by mount -o
discard is a win performance wise.  Linux has a history of designing
for reality, but for some reason when it comes to SSDs reality seems
not to be a big concern.

Greg

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard
  2010-11-19 16:19   ` Ted Ts'o
@ 2010-11-19 16:26     ` Lukas Czerner
  2010-11-20  1:37       ` Ted Ts'o
  0 siblings, 1 reply; 106+ messages in thread
From: Lukas Czerner @ 2010-11-19 16:26 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Lukas Czerner, linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen

On Fri, 19 Nov 2010, Ted Ts'o wrote:

> On Thu, Nov 18, 2010 at 08:36:49AM +0100, Lukas Czerner wrote:
> > @@ -541,6 +541,7 @@ struct ext4_new_group_data {
> >   /* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
> >  #define EXT4_IOC_ALLOC_DA_BLKS		_IO('f', 12)
> >  #define EXT4_IOC_MOVE_EXT		_IOWR('f', 15, struct move_extent)
> > +#define EXT4_IOC_TRIM			FITRIM
> 
> If we're going to keep FITRIM defined in include/fs.h, then we should
> just dispatch off of FITRIM below, and not define EXT4_IOC_TRIM.
> 
> > +	case EXT4_IOC_TRIM:
> > +	{
> 
> Agreed?

I am ok either way, I was just following example of EXT4_IOC_GETFLAGS -> 
FS_IOC_GETFLAGS etc..

-Lukas

> 
> There's no point making this look like an EXT4-specific interface; if
> other file systems want to implement FITRIM as an ioctl, they are free
> to do so.
> 
> 						- Ted
> 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:44                     ` Lukas Czerner
@ 2010-11-19 16:30                       ` Ted Ts'o
  2010-11-19 22:49                           ` Mark Lord
                                           ` (3 more replies)
  0 siblings, 4 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-19 16:30 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Mark Lord, Steven Whitehouse, James Bottomley, Christoph Hellwig,
	Matthew Wilcox, Josef Bacik, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
> > 
> > But, oddly, it _is_ the default for mke2fs -t ext4,
> > which really threw me for a loop recently.
> > 
> > I though my system had locked up when suddenly everything
> > went dead for a very long time (many minutes) while installing a 
> > new system.

Yeah, the assumption was doing a single big discard (which is all
mke2fs is doing) should be fast.  At least on sanely implemented SSD's
(i.e., like the Intel X25-M) it should be, since all that should
require is a flash write to the global mapping table, declaring all of
the blocks as free.

If there are some incompetently implemented SSD's out there which do a
flash erase of the entire SSD upon receiving a TRIM command (new
flash!  Part of the whole *point* of a TRIM was to increase write
endurance by eliminating the need to copy blocks that really weren't
in use any more by the OS when the SSD is doing a GC copy/compaction
of a partially written flash sector), all I can do is do a sigh, and
wish that T13 had defined a "comptently implemented SSD bit" --- not
that Indilinix would admit if it they were incompetent.  :-/

> That is exactly a reason why I posted a patch for
> "Make blkdev_issue_discard() interruptible", but nobody seems to care. As
> an addition I have patched mke2fs to inform user about ongoing discard,
> also with not much attention (Ted?).

Yeah, sorry.  I'm still recovering from the kernel summit and
plumber's.  I've got to get the critical bugfix patches out to Linus
before -rc3, and then I will try to get back to e2fsprogs.

       	      	       				- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:20                           ` Greg Freemyer
@ 2010-11-19 16:38                             ` Christoph Hellwig
  2010-11-19 18:06                               ` Lukas Czerner
  0 siblings, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-11-19 16:38 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Christoph Hellwig, Mark Lord, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Fri, Nov 19, 2010 at 08:20:58AM -0800, Greg Freemyer wrote:
> The kernel team has been coding around some Utopian SSD TRIM
> implementation for at least 2 years with the basic assumption that
> SSDs can handle thousands of trims per second.  Just mix em in with
> the rest of the i/o.  No problem.  Intel swore to us its the right
> thing to do.

Thanks Greg, good that you told us what we've been doing.  I would have
forgot myself if you didn't remember me.

> I'm still waiting to see the first benchmark report from anywhere
> (SSD, Thin Provisioned SCSI) that the online approach used by mount -o
> discard is a win performance wise.  Linux has a history of designing
> for reality, but for some reason when it comes to SSDs reality seems
> not to be a big concern.

Both Lukas and I have done extensive benchmarks on various SSDs and
thinkly provisioned raids.  Unfortunately most of the hardware is only
available under NDA so we can't publish it.

For the XFS side which I've looked it I can summarize that we do have
arrays that do the online discard without measureable performance
penalty on various workloads, and we have devices (both SSDs and arrays)
where the overhead is incredibly huge.  I can also say that doing the
walk of the freespace btrees similar to the offline discard, but every
30 seconds or at a similarly high interval is a sure way to completely
kill performance.

Or in short we haven't fund the holy grail yet.


^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:38                             ` Christoph Hellwig
@ 2010-11-19 18:06                               ` Lukas Czerner
  2010-11-19 18:10                                 ` Lukas Czerner
  2010-11-19 19:29                                 ` Chris Mason
  0 siblings, 2 replies; 106+ messages in thread
From: Lukas Czerner @ 2010-11-19 18:06 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Greg Freemyer, Mark Lord, Martin K. Petersen, James Bottomley,
	Jeff Moyer, Matthew Wilcox, Josef Bacik, Lukas Czerner, tytso,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, 19 Nov 2010, Christoph Hellwig wrote:

> On Fri, Nov 19, 2010 at 08:20:58AM -0800, Greg Freemyer wrote:
> > The kernel team has been coding around some Utopian SSD TRIM
> > implementation for at least 2 years with the basic assumption that
> > SSDs can handle thousands of trims per second.  Just mix em in with
> > the rest of the i/o.  No problem.  Intel swore to us its the right
> > thing to do.
> 
> Thanks Greg, good that you told us what we've been doing.  I would have
> forgot myself if you didn't remember me.
> 
> > I'm still waiting to see the first benchmark report from anywhere
> > (SSD, Thin Provisioned SCSI) that the online approach used by mount -o
> > discard is a win performance wise.  Linux has a history of designing
> > for reality, but for some reason when it comes to SSDs reality seems
> > not to be a big concern.
> 
> Both Lukas and I have done extensive benchmarks on various SSDs and
> thinkly provisioned raids.  Unfortunately most of the hardware is only
> available under NDA so we can't publish it.
> 
> For the XFS side which I've looked it I can summarize that we do have
> arrays that do the online discard without measureable performance
> penalty on various workloads, and we have devices (both SSDs and arrays)
> where the overhead is incredibly huge.  I can also say that doing the
> walk of the freespace btrees similar to the offline discard, but every
> 30 seconds or at a similarly high interval is a sure way to completely
> kill performance.
> 
> Or in short we haven't fund the holy grail yet.
> 

Indeed we have not. But speaking of benchmarks I have just finished
quick run (well, not so quick:)) of my discard-kit for btrfs filesystem
and here are results. Note that tool used for this benchmark is
postmark, hence it might not be the realest use-case, but it provides
nice comparison between ext4 (below) and btrfs online discard
implementation (FITRIM is NOT involved).


(Sadly the table is too wide so you have to...well, you guys can manage
it somehow, right?).

BTRFS
-----

                                   |          BUFFERING ENABLED         |       BUFFERING DISABLED           |
--------------------------------------------------------------------------------------------------------------
Type                               |NODISCARD    DISCARD      DIFF      |NODISCARD    DISCARD      DIFF      |
==============================================================================================================
Total_duration                     |230.90       336.20       45.60%    |232.00       335.00       44.40%    |
Duration_of_transactions           |159.60       266.10       66.73%    |158.90       264.60       66.52%    |
Transactions/s                     |313.32       188.01       -39.99%   |314.70       189.07       -39.92%   |
Files_created/s                    |323.84       222.48       -31.30%   |322.28       223.28       -30.72%   |
Creation_alone/s                   |778.08       796.37       2.35%     |756.66       787.68       4.10%     |
Creation_mixed_with_transaction/s  |155.16       93.11        -39.99%   |155.84       93.63        -39.92%   |
Read/s                             |156.50       93.91        -39.99%   |157.18       94.44        -39.92%   |
Append/s                           |156.82       94.10        -39.99%   |157.50       94.63        -39.92%   |
Deleted/s                          |323.84       222.48       -31.30%   |322.28       223.28       -30.72%   |
Deletion_alone/s                   |770.64       788.75       2.35%     |749.42       780.15       4.10%     |
Deletion_mixed_with_transaction/s  |158.16       94.90        -40.00%   |158.85       95.44        -39.92%   |
Read_B/s                           |11925050.90  8192800.35   -31.30%   |11867797.20  8221997.40   -30.72%   |
Write_B/s                          |37318466.00  25638695.00  -31.30%   |37139294.00  25730064.60  -30.72%   |
==============================================================================================================

EXT4
----
                                   |          BUFFERING ENABLED         |       BUFFERING DISABLED           |
--------------------------------------------------------------------------------------------------------------
Type                               |NODISCARD    DISCARD      DIFF      |NODISCARD    DISCARD      DIFF      |
==============================================================================================================
Total_duration                     |306.10       512.70       67.49%    |301.60       516.10       71.12%    |
Duration_of_transactions           |243.50       449.80       84.72%    |239.00       453.90       89.92%    |
Transactions/s                     |205.43       111.19       -45.87%   |209.32       110.17       -47.37%   |
Files_created/s                    |244.30       145.85       -40.30%   |247.97       144.87       -41.58%   |
Creation_alone/s                   |834.88       830.60       -0.51%    |830.60       833.42       0.34%     |
Creation_mixed_with_transaction/s  |101.73       55.06        -45.88%   |103.66       54.55        -47.38%   |
Read/s                             |102.61       55.54        -45.87%   |104.55       55.03        -47.36%   |
Append/s                           |102.82       55.65        -45.88%   |104.76       55.14        -47.37%   |
Deleted/s                          |244.30       145.85       -40.30%   |247.97       144.87       -41.58%   |
Deletion_alone/s                   |826.90       822.66       -0.51%    |822.66       825.46       0.34%     |
Deletion_mixed_with_transaction/s  |103.70       56.13        -45.87%   |105.66       55.61        -47.37%   |
Read_B/s                           |8996110.60   5370694.40   -40.30%   |9131349.20   5334560.40   -41.58%   |
Write_B/s                          |28152588.40  16807146.60  -40.30%   |28575806.40  16694068.00  -41.58%   |
==============================================================================================================


(Buffering means that C library function like fopen, fread, fwrite are
used instead of open, read, write. I have used the word buffering in the
same way as it is used in the postmark test)

So, you can see that Btrfs handles online discard quite better than ext4
(cca 20% difference), but it is still pretty massive performance loss on
not-so-good-but-I-have-seen-worse SSD. So, I would say that you guys
(Josef?) should at least consider the possibility of using FITRIM as well.

Thanks!

-Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 18:06                               ` Lukas Czerner
@ 2010-11-19 18:10                                 ` Lukas Czerner
  2010-11-19 18:14                                   ` Lukas Czerner
  2010-11-19 19:29                                 ` Chris Mason
  1 sibling, 1 reply; 106+ messages in thread
From: Lukas Czerner @ 2010-11-19 18:10 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Christoph Hellwig, Greg Freemyer, Mark Lord, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik, tytso,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, 19 Nov 2010, Lukas Czerner wrote:

> On Fri, 19 Nov 2010, Christoph Hellwig wrote:
> 
> > On Fri, Nov 19, 2010 at 08:20:58AM -0800, Greg Freemyer wrote:
> > > The kernel team has been coding around some Utopian SSD TRIM
> > > implementation for at least 2 years with the basic assumption that
> > > SSDs can handle thousands of trims per second.  Just mix em in with
> > > the rest of the i/o.  No problem.  Intel swore to us its the right
> > > thing to do.
> > 
> > Thanks Greg, good that you told us what we've been doing.  I would have
> > forgot myself if you didn't remember me.
> > 
> > > I'm still waiting to see the first benchmark report from anywhere
> > > (SSD, Thin Provisioned SCSI) that the online approach used by mount -o
> > > discard is a win performance wise.  Linux has a history of designing
> > > for reality, but for some reason when it comes to SSDs reality seems
> > > not to be a big concern.
> > 
> > Both Lukas and I have done extensive benchmarks on various SSDs and
> > thinkly provisioned raids.  Unfortunately most of the hardware is only
> > available under NDA so we can't publish it.
> > 
> > For the XFS side which I've looked it I can summarize that we do have
> > arrays that do the online discard without measureable performance
> > penalty on various workloads, and we have devices (both SSDs and arrays)
> > where the overhead is incredibly huge.  I can also say that doing the
> > walk of the freespace btrees similar to the offline discard, but every
> > 30 seconds or at a similarly high interval is a sure way to completely
> > kill performance.
> > 
> > Or in short we haven't fund the holy grail yet.
> > 
> 
> Indeed we have not. But speaking of benchmarks I have just finished
> quick run (well, not so quick:)) of my discard-kit for btrfs filesystem
> and here are results. Note that tool used for this benchmark is
> postmark, hence it might not be the realest use-case, but it provides
> nice comparison between ext4 (below) and btrfs online discard
> implementation (FITRIM is NOT involved).
> 
> 
> (Sadly the table is too wide so you have to...well, you guys can manage
> it somehow, right?).
> 
-snip-

In the sake of completeness here is an information how it was tested.

1. mkfs
2. mount -o (nodiscard|discard)
3. ./postmark
4. umount
5. repeat 1. - 4. ten times for each mnt option

-snip-
> 
> (Buffering means that C library function like fopen, fread, fwrite are
> used instead of open, read, write. I have used the word buffering in the
> same way as it is used in the postmark test)
> 
> So, you can see that Btrfs handles online discard quite better than ext4
> (cca 20% difference), but it is still pretty massive performance loss on
> not-so-good-but-I-have-seen-worse SSD. So, I would say that you guys
> (Josef?) should at least consider the possibility of using FITRIM as well.
> 
> Thanks!
> 
> -Lukas
> 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 18:10                                 ` Lukas Czerner
@ 2010-11-19 18:14                                   ` Lukas Czerner
  0 siblings, 0 replies; 106+ messages in thread
From: Lukas Czerner @ 2010-11-19 18:14 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Christoph Hellwig, Greg Freemyer, Mark Lord, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik, tytso,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, 19 Nov 2010, Lukas Czerner wrote:
> 
> 1. mkfs
> 2. mount -o (nodiscard|discard)
> 3. ./postmark
> 4. umount
> 5. repeat 1. - 4. ten times for each mnt option
> 
> -snip-

Done on 2.6.37-rc2 kernel.

-Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 18:06                               ` Lukas Czerner
  2010-11-19 18:10                                 ` Lukas Czerner
@ 2010-11-19 19:29                                 ` Chris Mason
  1 sibling, 0 replies; 106+ messages in thread
From: Chris Mason @ 2010-11-19 19:29 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Christoph Hellwig, Greg Freemyer, Mark Lord, Martin K. Petersen,
	James Bottomley, Jeff Moyer, Matthew Wilcox, Josef Bacik, tytso,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

Excerpts from Lukas Czerner's message of 2010-11-19 13:06:16 -0500:
> On Fri, 19 Nov 2010, Christoph Hellwig wrote:
> 
> > On Fri, Nov 19, 2010 at 08:20:58AM -0800, Greg Freemyer wrote:
> > > The kernel team has been coding around some Utopian SSD TRIM
> > > implementation for at least 2 years with the basic assumption that
> > > SSDs can handle thousands of trims per second.  Just mix em in with
> > > the rest of the i/o.  No problem.  Intel swore to us its the right
> > > thing to do.
> > 
> > Thanks Greg, good that you told us what we've been doing.  I would have
> > forgot myself if you didn't remember me.
> > 
> > > I'm still waiting to see the first benchmark report from anywhere
> > > (SSD, Thin Provisioned SCSI) that the online approach used by mount -o
> > > discard is a win performance wise.  Linux has a history of designing
> > > for reality, but for some reason when it comes to SSDs reality seems
> > > not to be a big concern.
> > 
> > Both Lukas and I have done extensive benchmarks on various SSDs and
> > thinkly provisioned raids.  Unfortunately most of the hardware is only
> > available under NDA so we can't publish it.
> > 
> > For the XFS side which I've looked it I can summarize that we do have
> > arrays that do the online discard without measureable performance
> > penalty on various workloads, and we have devices (both SSDs and arrays)
> > where the overhead is incredibly huge.  I can also say that doing the
> > walk of the freespace btrees similar to the offline discard, but every
> > 30 seconds or at a similarly high interval is a sure way to completely
> > kill performance.
> > 
> > Or in short we haven't fund the holy grail yet.
> > 
> 
> Indeed we have not. But speaking of benchmarks I have just finished
> quick run (well, not so quick:)) of my discard-kit for btrfs filesystem
> and here are results. Note that tool used for this benchmark is
> postmark, hence it might not be the realest use-case, but it provides
> nice comparison between ext4 (below) and btrfs online discard
> implementation (FITRIM is NOT involved).

Thanks a lot for posting these, I know it takes forever to run them.

I hesitate to trust postmark too much for comparing the ext4 trim with
the btrfs trim because we might have dramatically different lifetimes on
the blocks.  So if I manage to just do fewer allocations than ext4, I'll
also do fewer trims.

I'd also be curious to see how many trims each of us did, maybe running
w/blktrace could show that?

The btrfs online discard will trim all the metadata blocks as they are
freed, and in a COW filesystem this makes for a very noisy trim.  We
could reduce our trim load considerably by only trimming data blocks,
and only trimming metadata when we make a big free extent.

The default btrfs options duplicate metadata, so we actually end up
doing 2 trims for every metadata block we free.

At any rate, I definitely think both the online trim and the FITRIM have
their uses.  One thing that has burnt us in the past is coding too much
for the performance of the current crop of ssds when the next crop ends
up making our optimizations useless.

This is the main reason I think the online trim is going to be better
and better.  The FS has a ton of low hanging fruit in there and the
devices are going to improve.  At some point the biggest perf problem
will just be the non-queueable trim command.

One thing I haven't seen benchmarked is how trim changes the performance
of the SSD as the poor little log structured squirrels inside run out
of places to store things.  Does it get rid of the cliffs in performance as
the drive ages, and how do we measure that in general?

-chris

> 
> 
> (Sadly the table is too wide so you have to...well, you guys can manage
> it somehow, right?).
> 
> BTRFS
> -----
> 
>                                    |          BUFFERING ENABLED         |       BUFFERING DISABLED           |
> --------------------------------------------------------------------------------------------------------------
> Type                               |NODISCARD    DISCARD      DIFF      |NODISCARD    DISCARD      DIFF      |
> ==============================================================================================================
> Total_duration                     |230.90       336.20       45.60%    |232.00       335.00       44.40%    |
> Duration_of_transactions           |159.60       266.10       66.73%    |158.90       264.60       66.52%    |
> Transactions/s                     |313.32       188.01       -39.99%   |314.70       189.07       -39.92%   |
> Files_created/s                    |323.84       222.48       -31.30%   |322.28       223.28       -30.72%   |
> Creation_alone/s                   |778.08       796.37       2.35%     |756.66       787.68       4.10%     |
> Creation_mixed_with_transaction/s  |155.16       93.11        -39.99%   |155.84       93.63        -39.92%   |
> Read/s                             |156.50       93.91        -39.99%   |157.18       94.44        -39.92%   |
> Append/s                           |156.82       94.10        -39.99%   |157.50       94.63        -39.92%   |
> Deleted/s                          |323.84       222.48       -31.30%   |322.28       223.28       -30.72%   |
> Deletion_alone/s                   |770.64       788.75       2.35%     |749.42       780.15       4.10%     |
> Deletion_mixed_with_transaction/s  |158.16       94.90        -40.00%   |158.85       95.44        -39.92%   |
> Read_B/s                           |11925050.90  8192800.35   -31.30%   |11867797.20  8221997.40   -30.72%   |
> Write_B/s                          |37318466.00  25638695.00  -31.30%   |37139294.00  25730064.60  -30.72%   |
> ==============================================================================================================
> 
> EXT4
> ----
>                                    |          BUFFERING ENABLED         |       BUFFERING DISABLED           |
> --------------------------------------------------------------------------------------------------------------
> Type                               |NODISCARD    DISCARD      DIFF      |NODISCARD    DISCARD      DIFF      |
> ==============================================================================================================
> Total_duration                     |306.10       512.70       67.49%    |301.60       516.10       71.12%    |
> Duration_of_transactions           |243.50       449.80       84.72%    |239.00       453.90       89.92%    |
> Transactions/s                     |205.43       111.19       -45.87%   |209.32       110.17       -47.37%   |
> Files_created/s                    |244.30       145.85       -40.30%   |247.97       144.87       -41.58%   |
> Creation_alone/s                   |834.88       830.60       -0.51%    |830.60       833.42       0.34%     |
> Creation_mixed_with_transaction/s  |101.73       55.06        -45.88%   |103.66       54.55        -47.38%   |
> Read/s                             |102.61       55.54        -45.87%   |104.55       55.03        -47.36%   |
> Append/s                           |102.82       55.65        -45.88%   |104.76       55.14        -47.37%   |
> Deleted/s                          |244.30       145.85       -40.30%   |247.97       144.87       -41.58%   |
> Deletion_alone/s                   |826.90       822.66       -0.51%    |822.66       825.46       0.34%     |
> Deletion_mixed_with_transaction/s  |103.70       56.13        -45.87%   |105.66       55.61        -47.37%   |
> Read_B/s                           |8996110.60   5370694.40   -40.30%   |9131349.20   5334560.40   -41.58%   |
> Write_B/s                          |28152588.40  16807146.60  -40.30%   |28575806.40  16694068.00  -41.58%   |
> ==============================================================================================================
> 
> 
> (Buffering means that C library function like fopen, fread, fwrite are
> used instead of open, read, write. I have used the word buffering in the
> same way as it is used in the postmark test)
> 
> So, you can see that Btrfs handles online discard quite better than ext4
> (cca 20% difference), but it is still pretty massive performance loss on
> not-so-good-but-I-have-seen-worse SSD. So, I would say that you guys
> (Josef?) should at least consider the possibility of using FITRIM as well.
> 
> Thanks!
> 
> -Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:30                       ` Ted Ts'o
@ 2010-11-19 22:49                           ` Mark Lord
  2010-11-19 22:49                         ` Mark Lord
                                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 22:49 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 11:30 AM, Ted Ts'o wrote:
>
> If there are some incompetently implemented SSD's out there which do a
> flash erase of the entire SSD upon receiving a TRIM command (new
> flash!  Part of the whole *point* of a TRIM was to increase write
> endurance by eliminating the need to copy blocks that really weren't
> in use any more by the OS when the SSD is doing a GC copy/compaction
> of a partially written flash sector), all I can do is do a sigh, and
> wish that T13 had defined a "comptently implemented SSD bit" --- not
> that Indilinix would admit if it they were incompetent.  :-/


The Sandforce drives seem even slower for mke2fs than the Indilinx.
And both types are faster than the Intel units for desktop use.
At least they optimized for the important commands.  :)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-19 22:49                           ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 22:49 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig

On 10-11-19 11:30 AM, Ted Ts'o wrote:
>
> If there are some incompetently implemented SSD's out there which do a
> flash erase of the entire SSD upon receiving a TRIM command (new
> flash!  Part of the whole *point* of a TRIM was to increase write
> endurance by eliminating the need to copy blocks that really weren't
> in use any more by the OS when the SSD is doing a GC copy/compaction
> of a partially written flash sector), all I can do is do a sigh, and
> wish that T13 had defined a "comptently implemented SSD bit" --- not
> that Indilinix would admit if it they were incompetent.  :-/


The Sandforce drives seem even slower for mke2fs than the Indilinx.
And both types are faster than the Intel units for desktop use.
At least they optimized for the important commands.  :)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:30                       ` Ted Ts'o
  2010-11-19 22:49                           ` Mark Lord
@ 2010-11-19 22:49                         ` Mark Lord
  2010-11-25  2:48                         ` Mark Lord
  2010-11-25  2:48                           ` Mark Lord
  3 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-19 22:49 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig

On 10-11-19 11:30 AM, Ted Ts'o wrote:
>
> If there are some incompetently implemented SSD's out there which do a
> flash erase of the entire SSD upon receiving a TRIM command (new
> flash!  Part of the whole *point* of a TRIM was to increase write
> endurance by eliminating the need to copy blocks that really weren't
> in use any more by the OS when the SSD is doing a GC copy/compaction
> of a partially written flash sector), all I can do is do a sigh, and
> wish that T13 had defined a "comptently implemented SSD bit" --- not
> that Indilinix would admit if it they were incompetent.  :-/


The Sandforce drives seem even slower for mke2fs than the Indilinx.
And both types are faster than the Intel units for desktop use.
At least they optimized for the important commands.  :)

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard
  2010-11-19 16:26     ` Lukas Czerner
@ 2010-11-20  1:37       ` Ted Ts'o
  0 siblings, 0 replies; 106+ messages in thread
From: Ted Ts'o @ 2010-11-20  1:37 UTC (permalink / raw)
  To: Lukas Czerner; +Cc: linux-ext4, linux-kernel, linux-fsdevel, hch, sandeen

On Fri, Nov 19, 2010 at 05:26:51PM +0100, Lukas Czerner wrote:
> > If we're going to keep FITRIM defined in include/fs.h, then we should
> > just dispatch off of FITRIM below, and not define EXT4_IOC_TRIM.
> > 
> > > +	case EXT4_IOC_TRIM:
> > > +	{
> > 
> > Agreed?
> 
> I am ok either way, I was just following example of EXT4_IOC_GETFLAGS -> 
> FS_IOC_GETFLAGS etc..

The history of that was the ioctl was originally EXT4_IOC_GETFLAGS and
it was later generalized to be FS_IOC_GETFLAGS.  In this case though
we started with it being an generalized interface, so there's no need
to create an EXT4_IOC_TRIM ioctl.  I'll fix that up in your patch when
I add it to the ext4 patch queue.

						- Ted

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 14:53               ` Mark Lord
  2010-11-19 14:57                 ` Christoph Hellwig
@ 2010-11-21 19:07                 ` Valdis.Kletnieks
  2010-11-21 20:20                   ` James Bottomley
  1 sibling, 1 reply; 106+ messages in thread
From: Valdis.Kletnieks @ 2010-11-21 19:07 UTC (permalink / raw)
  To: Mark Lord
  Cc: Chris Mason, James Bottomley, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

[-- Attachment #1: Type: text/plain, Size: 755 bytes --]

On Fri, 19 Nov 2010 09:53:52 EST, Mark Lord said:
> On 10-11-19 09:40 AM, Chris Mason wrote:
> >
> > We've been told that online and constant trimming is the default in
> > windows7.   The ssds are most likely to just start ignoring the trims
> > they can't service efficiently.
> 
> Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
> (as reported to me by an SSD vendor who traced it with a SATA analyzer,
>   and who also apparently has "inside info").

What should happen if we have (for instance) a "collect 64 trims at a time" policy,
and the system crashes at trim number 47? (Probably not an issue if you're
doing non-deterministic trim, but is an exposure if you're relying on deterministic
trim for security reasons)

[-- Attachment #2: Type: application/pgp-signature, Size: 227 bytes --]

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-21 19:07                 ` Valdis.Kletnieks
@ 2010-11-21 20:20                   ` James Bottomley
  0 siblings, 0 replies; 106+ messages in thread
From: James Bottomley @ 2010-11-21 20:20 UTC (permalink / raw)
  To: Valdis.Kletnieks
  Cc: Mark Lord, Chris Mason, Christoph Hellwig, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Sun, 2010-11-21 at 14:07 -0500, Valdis.Kletnieks@vt.edu wrote:
> On Fri, 19 Nov 2010 09:53:52 EST, Mark Lord said:
> > On 10-11-19 09:40 AM, Chris Mason wrote:
> > >
> > > We've been told that online and constant trimming is the default in
> > > windows7.   The ssds are most likely to just start ignoring the trims
> > > they can't service efficiently.
> > 
> > Win7 collects multiple TRIM ranges over time and batches them as single TRIMs
> > (as reported to me by an SSD vendor who traced it with a SATA analyzer,
> >   and who also apparently has "inside info").
> 
> What should happen if we have (for instance) a "collect 64 trims at a time" policy,
> and the system crashes at trim number 47? (Probably not an issue if you're
> doing non-deterministic trim, but is an exposure if you're relying on deterministic
> trim for security reasons)

I think it's about the third time in the thread this has been said but
just in case anyone else missed it: TRIM != SECURE ERASE.
TRIM/UNMAP/WRITE_SAME are used to provide optional information about
which blocks the filesystem doesn't care about.  They have no bearing on
information security which is preserved by separate mechanisms.

James



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:30                       ` Ted Ts'o
@ 2010-11-25  2:48                           ` Mark Lord
  2010-11-19 22:49                         ` Mark Lord
                                             ` (2 subsequent siblings)
  3 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-25  2:48 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-19 11:30 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
>>>
>>> But, oddly, it _is_ the default for mke2fs -t ext4,
>>> which really threw me for a loop recently.
>>>
>>> I though my system had locked up when suddenly everything
>>> went dead for a very long time (many minutes) while installing a
>>> new system.
>
> Yeah, the assumption was doing a single big discard (which is all
> mke2fs is doing) should be fast.  At least on sanely implemented SSD's
> (i.e., like the Intel X25-M) it should be, since all that should
> require is a flash write to the global mapping table, declaring all of
> the blocks as free.

But mke2fs probably is NOT doing a "single big discard", because for SATA the
TRIM command is limited to 64K sectors per range.. and the in-kernel TRIM
code only ever does single ranges..

So doing a discard over an entire drive-encompassing partition, say.. 100GB,
will require 3000+ individual TRIM commands.  At (say) 200msecs each, that
adds up to about ten minutes of execution time.  Or less if the drive is
faster than that.

Whereas.. grouping them into 64-ranges per trim, could reduce the execution
time down to perhaps 1/50th of that, or in the range of 10-20 seconds instead.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 16:30                       ` Ted Ts'o
  2010-11-19 22:49                           ` Mark Lord
  2010-11-19 22:49                         ` Mark Lord
@ 2010-11-25  2:48                         ` Mark Lord
  2010-11-25  2:48                           ` Mark Lord
  3 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-25  2:48 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig

On 10-11-19 11:30 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
>>>
>>> But, oddly, it _is_ the default for mke2fs -t ext4,
>>> which really threw me for a loop recently.
>>>
>>> I though my system had locked up when suddenly everything
>>> went dead for a very long time (many minutes) while installing a
>>> new system.
>
> Yeah, the assumption was doing a single big discard (which is all
> mke2fs is doing) should be fast.  At least on sanely implemented SSD's
> (i.e., like the Intel X25-M) it should be, since all that should
> require is a flash write to the global mapping table, declaring all of
> the blocks as free.

But mke2fs probably is NOT doing a "single big discard", because for SATA the
TRIM command is limited to 64K sectors per range.. and the in-kernel TRIM
code only ever does single ranges..

So doing a discard over an entire drive-encompassing partition, say.. 100GB,
will require 3000+ individual TRIM commands.  At (say) 200msecs each, that
adds up to about ten minutes of execution time.  Or less if the drive is
faster than that.

Whereas.. grouping them into 64-ranges per trim, could reduce the execution
time down to perhaps 1/50th of that, or in the range of 10-20 seconds instead.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-25  2:48                           ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-25  2:48 UTC (permalink / raw)
  To: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig

On 10-11-19 11:30 AM, Ted Ts'o wrote:
> On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
>>>
>>> But, oddly, it _is_ the default for mke2fs -t ext4,
>>> which really threw me for a loop recently.
>>>
>>> I though my system had locked up when suddenly everything
>>> went dead for a very long time (many minutes) while installing a
>>> new system.
>
> Yeah, the assumption was doing a single big discard (which is all
> mke2fs is doing) should be fast.  At least on sanely implemented SSD's
> (i.e., like the Intel X25-M) it should be, since all that should
> require is a flash write to the global mapping table, declaring all of
> the blocks as free.

But mke2fs probably is NOT doing a "single big discard", because for SATA the
TRIM command is limited to 64K sectors per range.. and the in-kernel TRIM
code only ever does single ranges..

So doing a discard over an entire drive-encompassing partition, say.. 100GB,
will require 3000+ individual TRIM commands.  At (say) 200msecs each, that
adds up to about ten minutes of execution time.  Or less if the drive is
faster than that.

Whereas.. grouping them into 64-ranges per trim, could reduce the execution
time down to perhaps 1/50th of that, or in the range of 10-20 seconds instead.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25  2:48                           ` Mark Lord
  (?)
@ 2010-11-25  4:23                           ` Martin K. Petersen
  2010-11-25 14:44                             ` Mark Lord
  -1 siblings, 1 reply; 106+ messages in thread
From: Martin K. Petersen @ 2010-11-25  4:23 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

>>>>> "Mark" == Mark Lord <kernel@teksavvy.com> writes:

Mark> But mke2fs probably is NOT doing a "single big discard", because
Mark> for SATA the TRIM command is limited to 64K sectors per
Mark> range.. and the in-kernel TRIM code only ever does single ranges..

How many times must I reiterate that this is not true?

Due to various limitations in the way the block layer works we can not
clear LBA 10 to 20 and LBA 40 to 50 in one command. We don't have an
infrastructure in place that allows us to operate on several discrete
areas in one request.

But for contiguous areas we will cram as many ranges entries in as we
can fit in the TRIM payload. This means we'll issue one TRIM command for
every 2 GiB minus change (65535*64*512 / 1048576 = 2047 MiB).

In your example above blkdev_issue_discard() will loop over the 100 GiB
range requested by the caller (filesystem or BLKDISCARD ioctl) and issue
about fifty TRIM commands to the drive.

Large discards are a best-case scenario (short of SECURE ERASE). So if
your drive takes forever to perform this operation then all bets are
off in the TRIM department.


Mark> So doing a discard over an entire drive-encompassing partition,
Mark> say.. 100GB, will require 3000+ individual TRIM commands. 

No.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25  2:48                           ` Mark Lord
@ 2010-11-25  4:41                             ` Greg Freemyer
  -1 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-25  4:41 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Wed, Nov 24, 2010 at 9:48 PM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-19 11:30 AM, Ted Ts'o wrote:
>>
>> On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
>>>>
>>>> But, oddly, it _is_ the default for mke2fs -t ext4,
>>>> which really threw me for a loop recently.
>>>>
>>>> I though my system had locked up when suddenly everything
>>>> went dead for a very long time (many minutes) while installing a
>>>> new system.
>>
>> Yeah, the assumption was doing a single big discard (which is all
>> mke2fs is doing) should be fast.  At least on sanely implemented SSD's
>> (i.e., like the Intel X25-M) it should be, since all that should
>> require is a flash write to the global mapping table, declaring all of
>> the blocks as free.
>
> But mke2fs probably is NOT doing a "single big discard", because for SATA
> the
> TRIM command is limited to 64K sectors per range.. and the in-kernel TRIM
> code only ever does single ranges..
>
> So doing a discard over an entire drive-encompassing partition, say.. 100GB,
> will require 3000+ individual TRIM commands.  At (say) 200msecs each, that
> adds up to about ten minutes of execution time.  Or less if the drive is
> faster than that.
>
> Whereas.. grouping them into 64-ranges per trim, could reduce the execution
> time down to perhaps 1/50th of that, or in the range of 10-20 seconds
> instead.
>
> Cheers

Mark,

With recent kernels, this is supposed to work as you describe.  ie. 64
contiguous ranges per trim command.

If you see a significant speed difference between mke2fs and running
wiper.sh on that same filesystem immediately after formatting, then
their is likely a bug worth chasing.

Are you seeing an actual speed difference, or just assuming there is
one?  If mke2fs is slower than wiper.sh, what kernel are you testing
with?

Greg
-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-25  4:41                             ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-25  4:41 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Wed, Nov 24, 2010 at 9:48 PM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-19 11:30 AM, Ted Ts'o wrote:
>>
>> On Fri, Nov 19, 2010 at 04:44:33PM +0100, Lukas Czerner wrote:
>>>>
>>>> But, oddly, it _is_ the default for mke2fs -t ext4,
>>>> which really threw me for a loop recently.
>>>>
>>>> I though my system had locked up when suddenly everything
>>>> went dead for a very long time (many minutes) while installing a
>>>> new system.
>>
>> Yeah, the assumption was doing a single big discard (which is all
>> mke2fs is doing) should be fast.  At least on sanely implemented SSD's
>> (i.e., like the Intel X25-M) it should be, since all that should
>> require is a flash write to the global mapping table, declaring all of
>> the blocks as free.
>
> But mke2fs probably is NOT doing a "single big discard", because for SATA
> the
> TRIM command is limited to 64K sectors per range.. and the in-kernel TRIM
> code only ever does single ranges..
>
> So doing a discard over an entire drive-encompassing partition, say.. 100GB,
> will require 3000+ individual TRIM commands.  At (say) 200msecs each, that
> adds up to about ten minutes of execution time.  Or less if the drive is
> faster than that.
>
> Whereas.. grouping them into 64-ranges per trim, could reduce the execution
> time down to perhaps 1/50th of that, or in the range of 10-20 seconds
> instead.
>
> Cheers

Mark,

With recent kernels, this is supposed to work as you describe.  ie. 64
contiguous ranges per trim command.

If you see a significant speed difference between mke2fs and running
wiper.sh on that same filesystem immediately after formatting, then
their is likely a bug worth chasing.

Are you seeing an actual speed difference, or just assuming there is
one?  If mke2fs is slower than wiper.sh, what kernel are you testing
with?

Greg
-- 
Greg Freemyer
Head of EDD Tape Extraction and Processing team
Litigation Triage Solutions Specialist
http://www.linkedin.com/in/gregfreemyer
CNN/TruTV Aired Forensic Imaging Demo -
   http://insession.blogs.cnn.com/2010/03/23/how-computer-evidence-gets-retrieved/

The Norcross Group
The Intersection of Evidence & Technology
http://www.norcrossgroup.com
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25  4:23                           ` Martin K. Petersen
@ 2010-11-25 14:44                             ` Mark Lord
  0 siblings, 0 replies; 106+ messages in thread
From: Mark Lord @ 2010-11-25 14:44 UTC (permalink / raw)
  To: Martin K. Petersen
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-24 11:23 PM, Martin K. Petersen wrote:
>>>>>> "Mark" == Mark Lord<kernel@teksavvy.com>  writes:
>
> Mark>  But mke2fs probably is NOT doing a "single big discard", because
> Mark>  for SATA the TRIM command is limited to 64K sectors per
> Mark>  range.. and the in-kernel TRIM code only ever does single ranges..
>
> How many times must I reiterate that this is not true?

> Due to various limitations in the way the block layer works we can not
> clear LBA 10 to 20 and LBA 40 to 50 in one command. We don't have an
> infrastructure in place that allows us to operate on several discrete
> areas in one request.
>
> But for contiguous areas we will cram as many ranges entries in as we
> can fit in the TRIM payload. This means we'll issue one TRIM command for
> every 2 GiB minus change (65535*64*512 / 1048576 = 2047 MiB).

Once, will do, thanks.
That's the first time I've read it here with sufficient explanation.
You have my apologies if you've attempted it earlier.

Thanks!

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25  4:41                             ` Greg Freemyer
  (?)
@ 2010-11-25 14:53                             ` Mark Lord
  2010-11-25 16:24                                 ` Greg Freemyer
  -1 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-25 14:53 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-24 11:41 PM, Greg Freemyer wrote:
>
> If you see a significant speed difference between mke2fs and running
> wiper.sh on that same filesystem immediately after formatting, then
> their is likely a bug worth chasing.


You're right.  They're both equally slow on a blank Sandforce drive,
taking about 83-90 seconds to TRIM 100GB.

The difference being that mke2fs just sits there with zero indication
or information as to what the huge delay is.

It's a pity the TRIM commands are so sluggish on the Sandforce.
It's a generation newer than the original Vertex, and TRIM is way faster
on the older Vertex.  Not that either of them do it quickly enough that
I'd ever want the kernel doing it automatically "on my behalf".

The strange thing, is that when we TRIM the entire raw device,
using ATA SECURITY ERASE, it takes less than 2 seconds.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25 14:53                             ` Mark Lord
@ 2010-11-25 16:24                                 ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-25 16:24 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Thu, Nov 25, 2010 at 9:53 AM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-24 11:41 PM, Greg Freemyer wrote:
>>
>> If you see a significant speed difference between mke2fs and running
>> wiper.sh on that same filesystem immediately after formatting, then
>> their is likely a bug worth chasing.
>
>
> You're right.  They're both equally slow on a blank Sandforce drive,
> taking about 83-90 seconds to TRIM 100GB.
>
> The difference being that mke2fs just sits there with zero indication
> or information as to what the huge delay is.
>
> It's a pity the TRIM commands are so sluggish on the Sandforce.
> It's a generation newer than the original Vertex, and TRIM is way faster
> on the older Vertex.  Not that either of them do it quickly enough that
> I'd ever want the kernel doing it automatically "on my behalf".
>
> The strange thing, is that when we TRIM the entire raw device,
> using ATA SECURITY ERASE, it takes less than 2 seconds.
>
> Cheers

Mark,

I'm away from my systems today, but is there an easy way to tweak
wiper.sh or hdparm to cause only a single range per trim command.

I'm curious if for the Sandforce that would still be in the 90 second
range, or closer to 64x that.

Or if there is a readily available userspace tool to invoke FITRIM and
you have a 2.6.37-rc? kernel, you could just time that on a newly
formatted drive.

Greg

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2010-11-25 16:24                                 ` Greg Freemyer
  0 siblings, 0 replies; 106+ messages in thread
From: Greg Freemyer @ 2010-11-25 16:24 UTC (permalink / raw)
  To: Mark Lord
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Thu, Nov 25, 2010 at 9:53 AM, Mark Lord <kernel@teksavvy.com> wrote:
> On 10-11-24 11:41 PM, Greg Freemyer wrote:
>>
>> If you see a significant speed difference between mke2fs and running
>> wiper.sh on that same filesystem immediately after formatting, then
>> their is likely a bug worth chasing.
>
>
> You're right.  They're both equally slow on a blank Sandforce drive,
> taking about 83-90 seconds to TRIM 100GB.
>
> The difference being that mke2fs just sits there with zero indication
> or information as to what the huge delay is.
>
> It's a pity the TRIM commands are so sluggish on the Sandforce.
> It's a generation newer than the original Vertex, and TRIM is way faster
> on the older Vertex.  Not that either of them do it quickly enough that
> I'd ever want the kernel doing it automatically "on my behalf".
>
> The strange thing, is that when we TRIM the entire raw device,
> using ATA SECURITY ERASE, it takes less than 2 seconds.
>
> Cheers

Mark,

I'm away from my systems today, but is there an easy way to tweak
wiper.sh or hdparm to cause only a single range per trim command.

I'm curious if for the Sandforce that would still be in the 90 second
range, or closer to 64x that.

Or if there is a readily available userspace tool to invoke FITRIM and
you have a 2.6.37-rc? kernel, you could just time that on a newly
formatted drive.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-25 16:24                                 ` Greg Freemyer
  (?)
@ 2010-11-26 13:49                                 ` Mark Lord
  2010-11-26 14:00                                   ` Lukas Czerner
  -1 siblings, 1 reply; 106+ messages in thread
From: Mark Lord @ 2010-11-26 13:49 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Ted Ts'o, Lukas Czerner, Steven Whitehouse, James Bottomley,
	Christoph Hellwig, Matthew Wilcox, Josef Bacik, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On 10-11-25 11:24 AM, Greg Freemyer wrote:
>
> I'm away from my systems today, but is there an easy way to tweak
> wiper.sh or hdparm to cause only a single range per trim command.
>
> I'm curious if for the Sandforce that would still be in the 90 second
> range, or closer to 64x that.

Good question.  The Indilinx based drives would be in the 64x range,
no doubt there.  But I don't know about the Sandforce.

And I don't think I'm willing to inflict so many life-shortening erase
cycles onto it just to find out.

One thing to note:  the execution time for TRIM does vary depending upon
whether the (logical) LBAs are already mostly in a "trimmed" state or not.

So anyone aspiring to benchmark this stuff will need to keep that in mind.
My timings above were for "already trimmed" cases.  I would expect them
to be much slower (2x - 3x) if the sectors all held data prior to trim.

Cheers

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-26 13:49                                 ` Mark Lord
@ 2010-11-26 14:00                                   ` Lukas Czerner
  0 siblings, 0 replies; 106+ messages in thread
From: Lukas Czerner @ 2010-11-26 14:00 UTC (permalink / raw)
  To: Mark Lord
  Cc: Greg Freemyer, Ted Ts'o, Lukas Czerner, Steven Whitehouse,
	James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	linux-ext4, linux-kernel, linux-fsdevel, sandeen

On Fri, 26 Nov 2010, Mark Lord wrote:

> On 10-11-25 11:24 AM, Greg Freemyer wrote:
> > 
> > I'm away from my systems today, but is there an easy way to tweak
> > wiper.sh or hdparm to cause only a single range per trim command.
> > 
> > I'm curious if for the Sandforce that would still be in the 90 second
> > range, or closer to 64x that.
> 
> Good question.  The Indilinx based drives would be in the 64x range,
> no doubt there.  But I don't know about the Sandforce.
> 
> And I don't think I'm willing to inflict so many life-shortening erase
> cycles onto it just to find out.
> 
> One thing to note:  the execution time for TRIM does vary depending upon
> whether the (logical) LBAs are already mostly in a "trimmed" state or not.
> 
> So anyone aspiring to benchmark this stuff will need to keep that in mind.
> My timings above were for "already trimmed" cases.  I would expect them
> to be much slower (2x - 3x) if the sectors all held data prior to trim.
> 
> Cheers
> 

I have already did some TRIM benchmarking, but especially regarding trim
extent size. Also there is a tool test-discard for that purpose, so it
may be handy for anyone trying to do something similar.

http://people.redhat.com/lczerner/discard/

Thanks!

-Lukas

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-18 17:55           ` Chris Mason
@ 2010-12-03 18:24             ` Ric Wheeler
  0 siblings, 0 replies; 106+ messages in thread
From: Ric Wheeler @ 2010-12-03 18:24 UTC (permalink / raw)
  To: Chris Mason
  Cc: James Bottomley, Christoph Hellwig, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On 11/18/2010 12:55 PM, Chris Mason wrote:
> Excerpts from James Bottomley's message of 2010-11-18 12:19:10 -0500:
>> On Thu, 2010-11-18 at 09:29 -0500, Christoph Hellwig wrote:
>>> On Thu, Nov 18, 2010 at 07:19:58AM -0700, Matthew Wilcox wrote:
>>>> I guess I was assuming that, on receiving a FALLOC_FL_PUNCH_HOLE, a
>>>> filesystem that was TRIM-aware would pass that information down to the
>>>> block device that it's mounted on.  I strongly feel that we shouldn't
>>>> have two interfaces to do essentially the same thing.
>>>>
>>>> I guess I'm saying that you're going to have to learn about TRIM :-)
>>> Did you actually look Lukas FITRIM code (not the slight reordering here,
>>> but the original one).  It's the ext4 version of the batched discard
>>> model, that is a userspace ioctl to discard free space in the
>>> filesystem.
>>>
>>> hole punching will free the blocks into the free space pool.  If you do
>>> online discard it will also get discarded, but a filesystem that has
>>> online discard enabled doesn't need FITRIM.
>> Not stepping into the debate: I'm happy to see punch go to the mapping
>> data and FITRIM pick it up later.
>>
>> However, I think it's time to question whether we actually still want to
>> allow online discard at all.  Most of the benchmarks show it to be a net
>> lose to almost everything (either SSD or Thinly Provisioned arrays), so
>> it's become an "enable this to degrade performance" option with no
>> upside.
> I think we want to keep it.  In general we've (except for hch) spent
> almost zero time actually tuning online discard, and the benchmarking
> needs to be redone with the shiny new barrier code.
>
> -chris
>

Very belated response - I agree that we should keep the online discard support 
in (but off by default).

Some of the devices we have tested perform well with it and I expect that 
hardware vendors will get better now that we have the support for them to test with.

Ric



^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-11-19 15:21                   ` Mark Lord
@ 2010-12-07  9:27                     ` Christoph Hellwig
  2010-12-07 16:52                       ` Chris Mason
  0 siblings, 1 reply; 106+ messages in thread
From: Christoph Hellwig @ 2010-12-07  9:27 UTC (permalink / raw)
  To: Mark Lord
  Cc: Christoph Hellwig, Chris Mason, James Bottomley, Matthew Wilcox,
	Josef Bacik, Lukas Czerner, tytso, linux-ext4, linux-kernel,
	linux-fsdevel, sandeen

On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
> >I really hate to rely on this third party hearsay (from all sides), and
> >have implement TRIM support in qemu now.  I'll soon install win7 and
> >will check out the TRIM patters myself.
> 
> Excellent!

I did a Windows 7 installation under qemu today, and the result is:

 - it TRIMs the whole device early during the installation
 - after that I see a constant stream of small trims during the
   installation.  It's using lots of non-contiguous ranges in a single
   TRIM command, with sizes down to 8 sectors (4k) for a single range.
 - after installation there's is some background-trimming going on
   even when doing no user interaction with the VM at all.
 - removing files leads to an instant stream of TRIMs, again vectored
   and of all sizes down to 4k.  Note that the TRIMs are a lot more
   instant than even with btrfs and -o discard, which delays most
   TRIMs until doing a sync.

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-12-07  9:27                     ` Christoph Hellwig
@ 2010-12-07 16:52                       ` Chris Mason
  2011-06-02  4:52                         ` Kyungmin Park
  0 siblings, 1 reply; 106+ messages in thread
From: Chris Mason @ 2010-12-07 16:52 UTC (permalink / raw)
  To: Christoph Hellwig
  Cc: Mark Lord, James Bottomley, Matthew Wilcox, Josef Bacik,
	Lukas Czerner, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
> > >I really hate to rely on this third party hearsay (from all sides), and
> > >have implement TRIM support in qemu now.  I'll soon install win7 and
> > >will check out the TRIM patters myself.
> > 
> > Excellent!
> 
> I did a Windows 7 installation under qemu today, and the result is:

Great, thanks for testing this.

> 
>  - it TRIMs the whole device early during the installation
>  - after that I see a constant stream of small trims during the
>    installation.  It's using lots of non-contiguous ranges in a single
>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
>  - after installation there's is some background-trimming going on
>    even when doing no user interaction with the VM at all.
>  - removing files leads to an instant stream of TRIMs, again vectored
>    and of all sizes down to 4k.  Note that the TRIMs are a lot more
>    instant than even with btrfs and -o discard, which delays most
>    TRIMs until doing a sync.

Btrfs will do some small trims right when the block is freed, especially
in fsync heavy workloads but this is a suboptimal thing I want to fix.

The code tries to gather a whole transaction worth of trims and do them
after the commit is done.

-chris

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2010-12-07 16:52                       ` Chris Mason
@ 2011-06-02  4:52                         ` Kyungmin Park
  2011-06-02  8:14                           ` Lukas Czerner
  2011-06-03  2:06                             ` Dave Chinner
  0 siblings, 2 replies; 106+ messages in thread
From: Kyungmin Park @ 2011-06-02  4:52 UTC (permalink / raw)
  To: Chris Mason, Lukas Czerner
  Cc: Christoph Hellwig, Mark Lord, James Bottomley, Matthew Wilcox,
	Josef Bacik, tytso, linux-ext4, linux-kernel, linux-fsdevel,
	sandeen

On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
> Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
>> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
>> > >I really hate to rely on this third party hearsay (from all sides), and
>> > >have implement TRIM support in qemu now.  I'll soon install win7 and
>> > >will check out the TRIM patters myself.
>> >
>> > Excellent!
>>
>> I did a Windows 7 installation under qemu today, and the result is:
>
> Great, thanks for testing this.
>
>>
>>  - it TRIMs the whole device early during the installation
>>  - after that I see a constant stream of small trims during the
>>    installation.  It's using lots of non-contiguous ranges in a single
>>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
>>  - after installation there's is some background-trimming going on
>>    even when doing no user interaction with the VM at all.

Hi Lukas,

Now FITRIM is based on user interaction. So how about to implement the
AUTO batched discard at kernel level?
Idea is same as windows, make a single thread and iterate the
superblocks and call the trim.

here's pseudo codes.

1. generate the trim thread.
2. iterate the superblocks by iterate_supers() at fs/super.c
3. check the queue which support the discard feature or not.
blk_queue_discard(q)
4. wait on events
5. call the sb->trim (need to re-introduce it)

The difficult things are how to define the events and how to trigger
the trim thread.
e.g., notified from block layer, called from filesystem and so on.

How do you think?

Thank you,
Kyungmin Park

>>  - removing files leads to an instant stream of TRIMs, again vectored
>>    and of all sizes down to 4k.  Note that the TRIMs are a lot more
>>    instant than even with btrfs and -o discard, which delays most
>>    TRIMs until doing a sync.
>
> Btrfs will do some small trims right when the block is freed, especially
> in fsync heavy workloads but this is a suboptimal thing I want to fix.
>
> The code tries to gather a whole transaction worth of trims and do them
> after the commit is done.
>
> -chris
> --
> To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> Please read the FAQ at  http://www.tux.org/lkml/
>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2011-06-02  4:52                         ` Kyungmin Park
@ 2011-06-02  8:14                           ` Lukas Czerner
  2011-06-03  2:06                             ` Dave Chinner
  1 sibling, 0 replies; 106+ messages in thread
From: Lukas Czerner @ 2011-06-02  8:14 UTC (permalink / raw)
  To: Kyungmin Park
  Cc: Chris Mason, Lukas Czerner, Christoph Hellwig, Mark Lord,
	James Bottomley, Matthew Wilcox, Josef Bacik, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen, Dave Chinner

[-- Attachment #1: Type: TEXT/PLAIN, Size: 3812 bytes --]

On Thu, 2 Jun 2011, Kyungmin Park wrote:

> On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
> > Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
> >> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
> >> > >I really hate to rely on this third party hearsay (from all sides), and
> >> > >have implement TRIM support in qemu now.  I'll soon install win7 and
> >> > >will check out the TRIM patters myself.
> >> >
> >> > Excellent!
> >>
> >> I did a Windows 7 installation under qemu today, and the result is:
> >
> > Great, thanks for testing this.
> >
> >>
> >>  - it TRIMs the whole device early during the installation
> >>  - after that I see a constant stream of small trims during the
> >>    installation.  It's using lots of non-contiguous ranges in a single
> >>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
> >>  - after installation there's is some background-trimming going on
> >>    even when doing no user interaction with the VM at all.
> 
> Hi Lukas,
> 
> Now FITRIM is based on user interaction. So how about to implement the
> AUTO batched discard at kernel level?
> Idea is same as windows, make a single thread and iterate the
> superblocks and call the trim.
> 
> here's pseudo codes.
> 
> 1. generate the trim thread.
> 2. iterate the superblocks by iterate_supers() at fs/super.c
> 3. check the queue which support the discard feature or not.
> blk_queue_discard(q)
> 4. wait on events
> 5. call the sb->trim (need to re-introduce it)
> 
> The difficult things are how to define the events and how to trigger
> the trim thread.
> e.g., notified from block layer, called from filesystem and so on.
> 
> How do you think?

Hi Kyungmin,

generally I think this is a good idea and I thought about it as well.
However I also think that we might want to wait for the FITRIM and
discard supported devices to settle down to see how it performs and if
frequently calling discard on big chunks of the device does not have
some unwanted consequences (as Dave Chinner pointed out in a different
thread, that this automations usually do).

Regarding events, filesystem might watch amount of data written to it
(and it usually do right now) and trigger the event when it exceeds,
let's say 50% of the fs size and zero the counter. The downside of this
is that it is not controlled behaviour hence we'll end up with
unpredictable behaviour of the filesystem in the long term.

The solution might be (and it is something I want to look into)
infrastructure to determine size of the queue to the device from within
the filesystem (or vfs) so we can tell when it is busy and we want to wait,
or when it is doing nothing and we can discard.

Also per filesystem, or per device, control will be needed, so we can
selectively turn it on and off. But as I said I am not sure that this is
the best idea to do it right now, but others might have different
opinion.

Thanks!
-Lukas

> 
> Thank you,
> Kyungmin Park
> 
> >>  - removing files leads to an instant stream of TRIMs, again vectored
> >>    and of all sizes down to 4k.  Note that the TRIMs are a lot more
> >>    instant than even with btrfs and -o discard, which delays most
> >>    TRIMs until doing a sync.
> >
> > Btrfs will do some small trims right when the block is freed, especially
> > in fsync heavy workloads but this is a suboptimal thing I want to fix.
> >
> > The code tries to gather a whole transaction worth of trims and do them
> > after the commit is done.
> >
> > -chris
> > --
> > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
> > the body of a message to majordomo@vger.kernel.org
> > More majordomo info at  http://vger.kernel.org/majordomo-info.html
> > Please read the FAQ at  http://www.tux.org/lkml/
> >
> 

-- 

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2011-06-02  4:52                         ` Kyungmin Park
@ 2011-06-03  2:06                             ` Dave Chinner
  2011-06-03  2:06                             ` Dave Chinner
  1 sibling, 0 replies; 106+ messages in thread
From: Dave Chinner @ 2011-06-03  2:06 UTC (permalink / raw)
  To: Kyungmin Park
  Cc: Chris Mason, Lukas Czerner, Christoph Hellwig, Mark Lord,
	James Bottomley, Matthew Wilcox, Josef Bacik, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Thu, Jun 02, 2011 at 01:52:12PM +0900, Kyungmin Park wrote:
> On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
> > Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
> >> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
> >> > >I really hate to rely on this third party hearsay (from all sides), and
> >> > >have implement TRIM support in qemu now.  I'll soon install win7 and
> >> > >will check out the TRIM patters myself.
> >> >
> >> > Excellent!
> >>
> >> I did a Windows 7 installation under qemu today, and the result is:
> >
> > Great, thanks for testing this.
> >
> >>
> >>  - it TRIMs the whole device early during the installation
> >>  - after that I see a constant stream of small trims during the
> >>    installation.  It's using lots of non-contiguous ranges in a single
> >>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
> >>  - after installation there's is some background-trimming going on
> >>    even when doing no user interaction with the VM at all.
> 
> Hi Lukas,
> 
> Now FITRIM is based on user interaction. So how about to implement the
> AUTO batched discard at kernel level?
> Idea is same as windows, make a single thread and iterate the
> superblocks and call the trim.

Surely this the responsibility of a userspace daemon and a config
file to decide and implement the background trim policy?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2011-06-03  2:06                             ` Dave Chinner
  0 siblings, 0 replies; 106+ messages in thread
From: Dave Chinner @ 2011-06-03  2:06 UTC (permalink / raw)
  To: Kyungmin Park
  Cc: Chris Mason, Lukas Czerner, Christoph Hellwig, Mark Lord,
	James Bottomley, Matthew Wilcox, Josef Bacik, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Thu, Jun 02, 2011 at 01:52:12PM +0900, Kyungmin Park wrote:
> On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
> > Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
> >> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
> >> > >I really hate to rely on this third party hearsay (from all sides), and
> >> > >have implement TRIM support in qemu now.  I'll soon install win7 and
> >> > >will check out the TRIM patters myself.
> >> >
> >> > Excellent!
> >>
> >> I did a Windows 7 installation under qemu today, and the result is:
> >
> > Great, thanks for testing this.
> >
> >>
> >>  - it TRIMs the whole device early during the installation
> >>  - after that I see a constant stream of small trims during the
> >>    installation.  It's using lots of non-contiguous ranges in a single
> >>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
> >>  - after installation there's is some background-trimming going on
> >>    even when doing no user interaction with the VM at all.
> 
> Hi Lukas,
> 
> Now FITRIM is based on user interaction. So how about to implement the
> AUTO batched discard at kernel level?
> Idea is same as windows, make a single thread and iterate the
> superblocks and call the trim.

Surely this the responsibility of a userspace daemon and a config
file to decide and implement the background trim policy?

Cheers,

Dave.
-- 
Dave Chinner
david@fromorbit.com
--
To unsubscribe from this list: send the line "unsubscribe linux-fsdevel" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
  2011-06-03  2:06                             ` Dave Chinner
@ 2011-06-03  4:25                               ` Kyungmin Park
  -1 siblings, 0 replies; 106+ messages in thread
From: Kyungmin Park @ 2011-06-03  4:25 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Chris Mason, Lukas Czerner, Christoph Hellwig, Mark Lord,
	James Bottomley, Matthew Wilcox, Josef Bacik, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Fri, Jun 3, 2011 at 11:06 AM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, Jun 02, 2011 at 01:52:12PM +0900, Kyungmin Park wrote:
>> On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
>> > Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
>> >> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
>> >> > >I really hate to rely on this third party hearsay (from all sides), and
>> >> > >have implement TRIM support in qemu now.  I'll soon install win7 and
>> >> > >will check out the TRIM patters myself.
>> >> >
>> >> > Excellent!
>> >>
>> >> I did a Windows 7 installation under qemu today, and the result is:
>> >
>> > Great, thanks for testing this.
>> >
>> >>
>> >>  - it TRIMs the whole device early during the installation
>> >>  - after that I see a constant stream of small trims during the
>> >>    installation.  It's using lots of non-contiguous ranges in a single
>> >>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
>> >>  - after installation there's is some background-trimming going on
>> >>    even when doing no user interaction with the VM at all.
>>
>> Hi Lukas,
>>
>> Now FITRIM is based on user interaction. So how about to implement the
>> AUTO batched discard at kernel level?
>> Idea is same as windows, make a single thread and iterate the
>> superblocks and call the trim.
>
> Surely this the responsibility of a userspace daemon and a config
> file to decide and implement the background trim policy?

It seems to run as crond. BTW, what's config file? time based or behavior?
my rough idea to run at kernel level.

1. Block I/O aware notification.
2. Explicit filesystem call
3. user space request. (same as current)

Thank you,
Kyungmin Park
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>

^ permalink raw reply	[flat|nested] 106+ messages in thread

* Re: [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation
@ 2011-06-03  4:25                               ` Kyungmin Park
  0 siblings, 0 replies; 106+ messages in thread
From: Kyungmin Park @ 2011-06-03  4:25 UTC (permalink / raw)
  To: Dave Chinner
  Cc: Chris Mason, Lukas Czerner, Christoph Hellwig, Mark Lord,
	James Bottomley, Matthew Wilcox, Josef Bacik, tytso, linux-ext4,
	linux-kernel, linux-fsdevel, sandeen

On Fri, Jun 3, 2011 at 11:06 AM, Dave Chinner <david@fromorbit.com> wrote:
> On Thu, Jun 02, 2011 at 01:52:12PM +0900, Kyungmin Park wrote:
>> On Wed, Dec 8, 2010 at 1:52 AM, Chris Mason <chris.mason@oracle.com> wrote:
>> > Excerpts from Christoph Hellwig's message of 2010-12-07 04:27:49 -0500:
>> >> On Fri, Nov 19, 2010 at 10:21:35AM -0500, Mark Lord wrote:
>> >> > >I really hate to rely on this third party hearsay (from all sides), and
>> >> > >have implement TRIM support in qemu now.  I'll soon install win7 and
>> >> > >will check out the TRIM patters myself.
>> >> >
>> >> > Excellent!
>> >>
>> >> I did a Windows 7 installation under qemu today, and the result is:
>> >
>> > Great, thanks for testing this.
>> >
>> >>
>> >>  - it TRIMs the whole device early during the installation
>> >>  - after that I see a constant stream of small trims during the
>> >>    installation.  It's using lots of non-contiguous ranges in a single
>> >>    TRIM command, with sizes down to 8 sectors (4k) for a single range.
>> >>  - after installation there's is some background-trimming going on
>> >>    even when doing no user interaction with the VM at all.
>>
>> Hi Lukas,
>>
>> Now FITRIM is based on user interaction. So how about to implement the
>> AUTO batched discard at kernel level?
>> Idea is same as windows, make a single thread and iterate the
>> superblocks and call the trim.
>
> Surely this the responsibility of a userspace daemon and a config
> file to decide and implement the background trim policy?

It seems to run as crond. BTW, what's config file? time based or behavior?
my rough idea to run at kernel level.

1. Block I/O aware notification.
2. Explicit filesystem call
3. user space request. (same as current)

Thank you,
Kyungmin Park
>
> Cheers,
>
> Dave.
> --
> Dave Chinner
> david@fromorbit.com
>
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 106+ messages in thread

end of thread, other threads:[~2011-06-03  4:25 UTC | newest]

Thread overview: 106+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-11-18  7:36 [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Lukas Czerner
2010-11-18  7:36 ` [PATCH 2/2] ext4: Add EXT4_IOC_TRIM ioctl to handle batched discard Lukas Czerner
2010-11-19 16:19   ` Ted Ts'o
2010-11-19 16:26     ` Lukas Czerner
2010-11-20  1:37       ` Ted Ts'o
2010-11-18 13:06 ` [PATCH 1/2] fs: Do not dispatch FITRIM through separate super_operation Matthew Wilcox
2010-11-18 13:48   ` Josef Bacik
2010-11-18 14:19     ` Matthew Wilcox
2010-11-18 14:29       ` Christoph Hellwig
2010-11-18 17:19         ` James Bottomley
2010-11-18 17:22           ` Jeff Moyer
2010-11-18 17:41             ` James Bottomley
2010-11-18 20:04               ` Greg Freemyer
2010-11-18 20:04                 ` Greg Freemyer
2010-11-18 21:42                 ` Mark Lord
2010-11-18 21:44                 ` Mark Lord
2010-11-18 21:50                   ` James Bottomley
2010-11-18 22:07                     ` Mark Lord
2010-11-19  1:33                       ` Ted Ts'o
2010-11-19  3:44                         ` Mark Lord
2010-11-19  3:44                         ` Mark Lord
2010-11-19  3:44                           ` Mark Lord
2010-11-19 13:58                           ` Jeff Moyer
2010-11-18 23:52                   ` Martin K. Petersen
2010-11-19  0:34                     ` Mark Lord
2010-11-19  1:16                       ` Greg Freemyer
2010-11-19  1:16                         ` Greg Freemyer
2010-11-19 11:55                         ` Christoph Hellwig
2010-11-19 14:01                           ` Mark Lord
2010-11-19 14:06                             ` Christoph Hellwig
2010-11-19 14:48                               ` Mark Lord
2010-11-19 14:54                                 ` Christoph Hellwig
2010-11-19 15:24                                   ` Mark Lord
2010-11-19 15:34                                     ` Christoph Hellwig
2010-11-19 16:20                           ` Greg Freemyer
2010-11-19 16:38                             ` Christoph Hellwig
2010-11-19 18:06                               ` Lukas Czerner
2010-11-19 18:10                                 ` Lukas Czerner
2010-11-19 18:14                                   ` Lukas Czerner
2010-11-19 19:29                                 ` Chris Mason
2010-11-19  1:49                       ` Martin K. Petersen
2010-11-19  3:42                         ` Mark Lord
2010-11-18 18:05             ` Jamie Lokier
2010-11-18 19:32               ` Markus Trippelsdorf
2010-11-18 21:45                 ` Mark Lord
2010-11-18 21:50                   ` Markus Trippelsdorf
2010-11-18 22:09                     ` Mark Lord
2010-11-18 17:35           ` Lukas Czerner
2010-11-19 12:16             ` Steven Whitehouse
2010-11-19 13:53               ` Mark Lord
2010-11-19 14:02                 ` Ted Ts'o
2010-11-19 14:10                   ` Christoph Hellwig
2010-11-19 14:10                   ` Christoph Hellwig
2010-11-19 14:10                   ` Christoph Hellwig
2010-11-19 15:37                     ` Ted Ts'o
2010-11-19 15:50                       ` Christoph Hellwig
2010-11-19 15:50                       ` Christoph Hellwig
2010-11-19 15:50                         ` Christoph Hellwig
2010-11-19 16:16                         ` Ted Ts'o
2010-11-19 14:50                   ` Mark Lord
2010-11-19 14:50                   ` Mark Lord
2010-11-19 14:50                     ` Mark Lord
2010-11-19 15:35                   ` Mark Lord
2010-11-19 15:35                   ` Mark Lord
2010-11-19 15:35                     ` Mark Lord
2010-11-19 15:44                     ` Lukas Czerner
2010-11-19 16:30                       ` Ted Ts'o
2010-11-19 22:49                         ` Mark Lord
2010-11-19 22:49                           ` Mark Lord
2010-11-19 22:49                         ` Mark Lord
2010-11-25  2:48                         ` Mark Lord
2010-11-25  2:48                         ` Mark Lord
2010-11-25  2:48                           ` Mark Lord
2010-11-25  4:23                           ` Martin K. Petersen
2010-11-25 14:44                             ` Mark Lord
2010-11-25  4:41                           ` Greg Freemyer
2010-11-25  4:41                             ` Greg Freemyer
2010-11-25 14:53                             ` Mark Lord
2010-11-25 16:24                               ` Greg Freemyer
2010-11-25 16:24                                 ` Greg Freemyer
2010-11-26 13:49                                 ` Mark Lord
2010-11-26 14:00                                   ` Lukas Czerner
2010-11-18 17:55           ` Chris Mason
2010-12-03 18:24             ` Ric Wheeler
2010-11-18 21:37           ` Mark Lord
2010-11-19 11:09             ` Christoph Hellwig
2010-11-19 13:54               ` Mark Lord
2010-11-19 14:40             ` Chris Mason
2010-11-19 14:53               ` Mark Lord
2010-11-19 14:57                 ` Christoph Hellwig
2010-11-19 15:21                   ` Mark Lord
2010-12-07  9:27                     ` Christoph Hellwig
2010-12-07 16:52                       ` Chris Mason
2011-06-02  4:52                         ` Kyungmin Park
2011-06-02  8:14                           ` Lukas Czerner
2011-06-03  2:06                           ` Dave Chinner
2011-06-03  2:06                             ` Dave Chinner
2011-06-03  4:25                             ` Kyungmin Park
2011-06-03  4:25                               ` Kyungmin Park
2010-11-19 15:30                   ` Mark Lord
2010-11-21 19:07                 ` Valdis.Kletnieks
2010-11-21 20:20                   ` James Bottomley
2010-11-18 14:31       ` Josef Bacik
2010-11-18 14:36         ` Tao Ma
2010-11-19 15:41 ` Ted Ts'o
2010-11-19 15:50   ` Christoph Hellwig

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.