All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH 1/3] Add ioctl FITRIM.
@ 2010-08-04 13:44 Lukas Czerner
  2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
                   ` (2 more replies)
  0 siblings, 3 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-04 13:44 UTC (permalink / raw)
  To: linux-ext4; +Cc: jmoyer, rwheeler, eshishki, sandeen, jack, tytso, lczerner

Adds an filesystem independent ioctl to allow implementation of file
system batched discard support.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
 include/linux/fs.h |    2 ++
 2 files changed, 33 insertions(+), 0 deletions(-)

diff --git a/fs/ioctl.c b/fs/ioctl.c
index 2d140a7..6c01c3c 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
 	return thaw_super(sb);
 }
 
+static int ioctl_fstrim(struct file *filp, unsigned long arg)
+{
+	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
+	unsigned int minlen;
+	int err;
+
+	if (!capable(CAP_SYS_ADMIN))
+		return -EPERM;
+
+	/* If filesystem doesn't support trim feature, return. */
+	if (sb->s_op->trim_fs == NULL)
+		return -EOPNOTSUPP;
+
+	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
+	if (sb->s_bdev == NULL)
+		return -EINVAL;
+
+	err = get_user(minlen, (unsigned int __user *) arg);
+	if (err)
+		return err;
+
+	err = sb->s_op->trim_fs(minlen, sb);
+	if (err)
+		return err;
+	return 0;
+}
+
 /*
  * When you add any new common ioctls to the switches above and below
  * please update compat_sys_ioctl() too.
@@ -590,6 +617,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
 		error = ioctl_fsthaw(filp);
 		break;
 
+	case FITRIM:
+		error = ioctl_fstrim(filp, arg);
+		break;
+
 	case FS_IOC_FIEMAP:
 		return ioctl_fiemap(filp, arg);
 
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 68ca1b0..01632e4 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -315,6 +315,7 @@ struct inodes_stat_t {
 #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
 #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
 #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
+#define FITRIM		_IOWR('X', 121, int)	/* Trim */
 
 #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
 #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
@@ -1580,6 +1581,7 @@ struct super_operations {
 	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
 #endif
 	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
+	int (*trim_fs) (unsigned int, struct super_block *);
 };
 
 /*
-- 
1.7.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 2/3] Add batched discard support for ext3
  2010-08-04 13:44 [PATCH 1/3] Add ioctl FITRIM Lukas Czerner
@ 2010-08-04 13:44 ` Lukas Czerner
  2010-08-04 14:03   ` Jan Kara
  2010-08-04 19:39   ` Andreas Dilger
  2010-08-04 13:44 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
  2010-08-04 14:57 ` [PATCH 1/3] Add ioctl FITRIM Dmitry Monakhov
  2 siblings, 2 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-04 13:44 UTC (permalink / raw)
  To: linux-ext4; +Cc: jmoyer, rwheeler, eshishki, sandeen, jack, tytso, lczerner

Walk through each allocation group and trim all free extents. It can be
invoked through FITRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search for free extents in each allocation group. When the free
extent is found, blocks are marked as used and then trimmed. Afterwards
these blocks are marked as free in per-group bitmap.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
---
 fs/ext3/balloc.c        |  217 +++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext3/super.c         |    1 +
 include/linux/ext3_fs.h |    1 +
 3 files changed, 219 insertions(+), 0 deletions(-)

diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c
index 4a32511..dffc2e2 100644
--- a/fs/ext3/balloc.c
+++ b/fs/ext3/balloc.c
@@ -20,6 +20,7 @@
 #include <linux/ext3_jbd.h>
 #include <linux/quotaops.h>
 #include <linux/buffer_head.h>
+#include <linux/blkdev.h>
 
 /*
  * balloc.c contains the blocks allocation and deallocation routines
@@ -1882,3 +1883,219 @@ unsigned long ext3_bg_num_gdb(struct super_block *sb, int group)
 	return ext3_bg_num_gdb_meta(sb,group);
 
 }
+
+/**
+ * ext3_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @group:		allocation group to trim
+ * @gdp:		allocation group description structure
+ * @minblocks:		minimum extent block count
+ *
+ * ext3_trim_all_free walks through group's block bitmap searching for free
+ * blocks. When the free block is found, it tries to allocate this block and
+ * consequent free block to get the biggest free extent possible, until it
+ * reaches any used block. Then issue a TRIM command on this extent and free
+ * the extent in the block bitmap. This is done until whole group is scanned.
+ */
+ext3_grpblk_t ext3_trim_all_free(struct super_block *sb,
+			unsigned int group, ext3_grpblk_t minblocks)
+{
+	handle_t *handle;
+	ext3_grpblk_t max = EXT3_BLOCKS_PER_GROUP(sb);
+	ext3_grpblk_t next, count = 0, start, bit;
+	struct ext3_sb_info *sbi;
+	ext3_fsblk_t discard_block;
+	struct buffer_head *bitmap_bh = NULL;
+	struct buffer_head *gdp_bh;
+	ext3_grpblk_t free_blocks;
+	struct ext3_group_desc *gdp;
+	int err = 0, ret;
+	ext3_grpblk_t freed;
+
+	/*
+	 * We will update one block bitmap, and one group descriptor
+	 */
+	handle = ext3_journal_start_sb(sb, 2);
+	if (IS_ERR(handle)) {
+		err = PTR_ERR(handle);
+		return err;
+	}
+
+	bitmap_bh = read_block_bitmap(sb, group);
+	if (!bitmap_bh)
+		goto err_out;
+
+	BUFFER_TRACE(bitmap_bh, "getting undo access");
+	err = ext3_journal_get_undo_access(handle, bitmap_bh);
+	if (err)
+		goto err_out;
+
+	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
+	if (!gdp)
+		goto err_out;
+
+	BUFFER_TRACE(gd_bh, "get_write_access");
+	err = ext3_journal_get_write_access(handle, gdp_bh);
+	if (err)
+		goto err_out;
+
+	free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+	sbi = EXT3_SB(sb);
+
+	 /* Walk through the whole group */
+	start = 0;
+	while (start < max) {
+
+		start = bitmap_search_next_usable_block(start, bitmap_bh, max);
+		if (start < 0)
+			break;
+		next = start;
+
+		/*
+		 * Allocate contiguous free extents by setting bits in the
+		 * block bitmap
+		 */
+		while (next < max
+			&& claim_block(sb_bgl_lock(sbi, group),
+					next, bitmap_bh)) {
+			next++;
+		}
+
+		 /* We did not claim any blocks */
+		if (next == start)
+			continue;
+
+		discard_block = (ext3_fsblk_t)start +
+				ext3_group_first_block_no(sb, group);
+
+		/* Update counters */
+		spin_lock(sb_bgl_lock(sbi, group));
+		le16_add_cpu(&gdp->bg_free_blocks_count, start - next);
+		spin_unlock(sb_bgl_lock(sbi, group));
+		percpu_counter_sub(&sbi->s_freeblocks_counter, next - start);
+
+		/* Do not issue a TRIM on extents smaller than minblocks */
+		if ((next - start) < minblocks)
+			goto free_extent;
+
+		 /* Send the TRIM command down to the device */
+		ret = sb_issue_discard(sb, discard_block, next - start);
+		count += (next - start);
+
+free_extent:
+		freed = 0;
+
+		/*
+		 * Clear bits in the bitmap
+		 */
+		for (bit = start; bit < next; bit++) {
+			BUFFER_TRACE(bitmap_bh, "clear bit");
+			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
+						bit, bitmap_bh->b_data)) {
+				ext3_error(sb, __func__,
+					"bit already cleared for block "E3FSBLK,
+					 (unsigned long)bit);
+				BUFFER_TRACE(bitmap_bh, "bit already cleared");
+			} else {
+				freed++;
+			}
+		}
+
+		/* Update couters */
+		spin_lock(sb_bgl_lock(sbi, group));
+		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
+		spin_unlock(sb_bgl_lock(sbi, group));
+		percpu_counter_add(&sbi->s_freeblocks_counter, next - start);
+
+		start = next;
+
+		if (ret == EOPNOTSUPP) {
+			ext3_warning(sb, __func__,
+				"discard not supported!");
+			count = -EOPNOTSUPP;
+			break;
+
+		}
+
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		cond_resched();
+
+		/* No more suitable extents */
+		if ((free_blocks - count) < minblocks)
+			break;
+	}
+
+	/* We dirtied the bitmap block */
+	BUFFER_TRACE(bitmap_bh, "dirtied bitmap block");
+	err = ext3_journal_dirty_metadata(handle, bitmap_bh);
+
+	/* And the group descriptor block */
+	BUFFER_TRACE(gdp_bh, "dirtied group descriptor block");
+	ret = ext3_journal_dirty_metadata(handle, gdp_bh);
+	if (!err)
+		err = ret;
+
+	ext3_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+err_out:
+	if (err) {
+		ext3_std_error(sb, err);
+		count = -err;
+	}
+
+	ext3_journal_stop(handle);
+	brelse(bitmap_bh);
+
+	return count;
+}
+
+/**
+ * ext3_trim_fs() -- trim ioctl handle function
+ * @minlen:		minimum extent length in Bytes
+ * @sb:			superblock for filesystem
+ *
+ * ext3_trim_fs goes through all allocation group searching for groups with more
+ * free space than minlen. For such a group ext3_trim_all_free function is
+ * invoked to trim all free space.
+ */
+int ext3_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	ext3_grpblk_t minblocks;
+	unsigned long ngroups;
+	unsigned int group;
+	struct ext3_group_desc *gdp;
+	ext3_grpblk_t free_blocks;
+	int ret = 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT3_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	ngroups = EXT3_SB(sb)->s_groups_count;
+	smp_rmb();
+
+	for (group = 0; group < ngroups; group++) {
+
+		gdp = ext3_get_group_desc(sb, group, NULL);
+		if (!gdp)
+			break;
+
+		free_blocks = le16_to_cpu(gdp->bg_free_blocks_count);
+		if (free_blocks < minblocks)
+			continue;
+
+		ret = ext3_trim_all_free(sb, group, minblocks);
+		if (ret < 0)
+			break;
+	}
+
+	if (ret >= 0)
+		ret = 0;
+
+	return ret;
+}
diff --git a/fs/ext3/super.c b/fs/ext3/super.c
index 6c953bb..b172c88 100644
--- a/fs/ext3/super.c
+++ b/fs/ext3/super.c
@@ -797,6 +797,7 @@ static const struct super_operations ext3_sops = {
 	.quota_write	= ext3_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext3_trim_fs,
 };
 
 static const struct export_operations ext3_export_ops = {
diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h
index 7fc62d4..9083e16 100644
--- a/include/linux/ext3_fs.h
+++ b/include/linux/ext3_fs.h
@@ -857,6 +857,7 @@ extern struct ext3_group_desc * ext3_get_group_desc(struct super_block * sb,
 extern int ext3_should_retry_alloc(struct super_block *sb, int *retries);
 extern void ext3_init_block_alloc_info(struct inode *);
 extern void ext3_rsv_window_add(struct super_block *sb, struct ext3_reserve_window_node *rsv);
+extern int ext3_trim_fs(unsigned int minlen, struct super_block *sb);
 
 /* dir.c */
 extern int ext3_check_dir_entry(const char *, struct inode *,
-- 
1.7.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3] Add batched discard support for ext4
  2010-08-04 13:44 [PATCH 1/3] Add ioctl FITRIM Lukas Czerner
  2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
@ 2010-08-04 13:44 ` Lukas Czerner
  2010-08-04 14:17   ` Jan Kara
  2010-08-04 14:57 ` [PATCH 1/3] Add ioctl FITRIM Dmitry Monakhov
  2 siblings, 1 reply; 24+ messages in thread
From: Lukas Czerner @ 2010-08-04 13:44 UTC (permalink / raw)
  To: linux-ext4
  Cc: jmoyer, rwheeler, eshishki, sandeen, jack, tytso, lczerner,
	Dmitry Monakhov

Walk through each allocation group and trim all free extents. It can be
invoked through TRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search fro free extents in each allocation group. When the free
extent is found, blocks are marked as used in the buddy bitmap and then
trimmed. Afterwards these blocks are marked as free in per-group buddy
bitmap.

Since fstrim is a long operation it is good to have an ability to interrupt
it by a signal. This was added by Dmitry Monakhov. Thanks Dimitry.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  157 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |    1 +
 3 files changed, 160 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de5..a3447d7 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1558,6 +1558,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(unsigned int, struct super_block *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 12b3bc0..1faf858 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4690,3 +4690,160 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	ret = sb_issue_discard(sb, discard_block, count);
+	if (ret == EOPNOTSUPP) {
+		ext4_warning(sb,
+			"discard not supported!");
+		ret = -EOPNOTSUPP;
+	}
+	cond_resched();
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @e4b:		ext4 buddy
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+  */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t start, next, count = 0;
+	ext4_group_t group;
+	int ret = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = e4b->bd_info->bb_first_free;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+			if (ret < 0)
+				break;
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @minlen:		minimum extent length in Bytes
+ * @sb:			superblock for filesystem
+ *
+ * ext4_trim_fs goes through all allocation group searching for groups with more
+ * free space than minlen. For such a group ext4_trim_all_free function is
+ * invoked to trim all free space.
+ */
+int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	struct ext4_buddy e4b;
+	ext4_group_t group;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t minblocks, cnt;
+	int ret = 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	for (group = 0; group < ngroups; group++) {
+
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			break;
+		}
+
+		if (e4b.bd_info->bb_free >= minblocks) {
+			cnt = ext4_trim_all_free(sb, &e4b, minblocks);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
+		}
+		ext4_mb_unload_buddy(&e4b);
+	}
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 4e8983a..995989b 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1114,6 +1114,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.7.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/3] Add batched discard support for ext3
  2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
@ 2010-08-04 14:03   ` Jan Kara
  2010-08-04 14:32     ` Lukas Czerner
  2010-08-04 19:39   ` Andreas Dilger
  1 sibling, 1 reply; 24+ messages in thread
From: Jan Kara @ 2010-08-04 14:03 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

  Hi,

  The patch looks good. Just one small thing:

On Wed 04-08-10 15:44:34, Lukas Czerner wrote:
> +	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
> +	if (!gdp)
> +		goto err_out;
> +
> +	BUFFER_TRACE(gd_bh, "get_write_access");
                     ^^^^^ gdp_bh

  So you can add: Reviewed-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-08-04 13:44 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
@ 2010-08-04 14:17   ` Jan Kara
  0 siblings, 0 replies; 24+ messages in thread
From: Jan Kara @ 2010-08-04 14:17 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso,
	Dmitry Monakhov

On Wed 04-08-10 15:44:35, Lukas Czerner wrote:
> Walk through each allocation group and trim all free extents. It can be
> invoked through TRIM ioctl on the file system. The main idea is to
> provide a way to trim the whole file system if needed, since some SSD's
> may suffer from performance loss after the whole device was filled (it
> does not mean that fs is full!).
> 
> It search fro free extents in each allocation group. When the free
> extent is found, blocks are marked as used in the buddy bitmap and then
> trimmed. Afterwards these blocks are marked as free in per-group buddy
> bitmap.
> 
> Since fstrim is a long operation it is good to have an ability to interrupt
> it by a signal. This was added by Dmitry Monakhov. Thanks Dimitry.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
  This patch looks OK as well. Just one question:

> +	ret = sb_issue_discard(sb, discard_block, count);
> +	if (ret == EOPNOTSUPP) {
                   ^^ Here should be -EOPNOTSUPP, or generally a check
that ret < 0 would be better as -EIO or so can be returned as well. The
same comment actually applies for the ext3 patch.

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/3] Add batched discard support for ext3
  2010-08-04 14:03   ` Jan Kara
@ 2010-08-04 14:32     ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-04 14:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: Lukas Czerner, linux-ext4, jmoyer, rwheeler, eshishki, sandeen, tytso

On Wed, 4 Aug 2010, Jan Kara wrote:

>   Hi,
> 
>   The patch looks good. Just one small thing:
> 
> On Wed 04-08-10 15:44:34, Lukas Czerner wrote:
> > +	gdp = ext3_get_group_desc(sb, group, &gdp_bh);
> > +	if (!gdp)
> > +		goto err_out;
> > +
> > +	BUFFER_TRACE(gd_bh, "get_write_access");
>                      ^^^^^ gdp_bh
> 
>   So you can add: Reviewed-by: Jan Kara <jack@suse.cz>
> 
> 								Honza
> 

Hi,

thanks a lot for your review. I will resend both patches shortly.

Regards.

-Lukas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-04 13:44 [PATCH 1/3] Add ioctl FITRIM Lukas Czerner
  2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
  2010-08-04 13:44 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
@ 2010-08-04 14:57 ` Dmitry Monakhov
  2010-08-04 15:13   ` Lukas Czerner
  2 siblings, 1 reply; 24+ messages in thread
From: Dmitry Monakhov @ 2010-08-04 14:57 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

Lukas Czerner <lczerner@redhat.com> writes:

> Adds an filesystem independent ioctl to allow implementation of file
> system batched discard support.
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> ---
>  fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
>  include/linux/fs.h |    2 ++
>  2 files changed, 33 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ioctl.c b/fs/ioctl.c
> index 2d140a7..6c01c3c 100644
> --- a/fs/ioctl.c
> +++ b/fs/ioctl.c
> @@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
>  	return thaw_super(sb);
>  }
>  
> +static int ioctl_fstrim(struct file *filp, unsigned long arg)
BTW why do we have to trim fs in one shot ?
IMHO it is much suitable to provide start,len parameters as we 
do in most functions(truncate, bdevdiscard, getdents).
It allow userspace caller to implement a fancy looking progress bars.
> +{
> +	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
> +	unsigned int minlen;
> +	int err;
> +
> +	if (!capable(CAP_SYS_ADMIN))
> +		return -EPERM;
> +
> +	/* If filesystem doesn't support trim feature, return. */
> +	if (sb->s_op->trim_fs == NULL)
> +		return -EOPNOTSUPP;
> +
> +	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
> +	if (sb->s_bdev == NULL)
> +		return -EINVAL;
> +
> +	err = get_user(minlen, (unsigned int __user *) arg);
> +	if (err)
> +		return err;
> +
> +	err = sb->s_op->trim_fs(minlen, sb);
> +	if (err)
> +		return err;
> +	return 0;
> +}
> +
>  /*
>   * When you add any new common ioctls to the switches above and below
>   * please update compat_sys_ioctl() too.
> @@ -590,6 +617,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
>  		error = ioctl_fsthaw(filp);
>  		break;
>  
> +	case FITRIM:
> +		error = ioctl_fstrim(filp, arg);
> +		break;
> +
>  	case FS_IOC_FIEMAP:
>  		return ioctl_fiemap(filp, arg);
>  
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 68ca1b0..01632e4 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -315,6 +315,7 @@ struct inodes_stat_t {
>  #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
>  #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
>  #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
> +#define FITRIM		_IOWR('X', 121, int)	/* Trim */
>  
>  #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
>  #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
> @@ -1580,6 +1581,7 @@ struct super_operations {
>  	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
>  #endif
>  	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
> +	int (*trim_fs) (unsigned int, struct super_block *);
>  };
>  
>  /*

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-04 14:57 ` [PATCH 1/3] Add ioctl FITRIM Dmitry Monakhov
@ 2010-08-04 15:13   ` Lukas Czerner
  2010-08-04 15:26     ` Greg Freemyer
  2010-08-05  7:00     ` Dmitry Monakhov
  0 siblings, 2 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-04 15:13 UTC (permalink / raw)
  To: Dmitry Monakhov
  Cc: Lukas Czerner, linux-ext4, jmoyer, rwheeler, eshishki, sandeen,
	jack, tytso

On Wed, 4 Aug 2010, Dmitry Monakhov wrote:

> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > Adds an filesystem independent ioctl to allow implementation of file
> > system batched discard support.
> >
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > ---
> >  fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
> >  include/linux/fs.h |    2 ++
> >  2 files changed, 33 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> > index 2d140a7..6c01c3c 100644
> > --- a/fs/ioctl.c
> > +++ b/fs/ioctl.c
> > @@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
> >  	return thaw_super(sb);
> >  }
> >  
> > +static int ioctl_fstrim(struct file *filp, unsigned long arg)
> BTW why do we have to trim fs in one shot ?
> IMHO it is much suitable to provide start,len parameters as we 
> do in most functions(truncate, bdevdiscard, getdents).
> It allow userspace caller to implement a fancy looking progress bars.

Hi,

do you think it is really needed when even with todays SSD's it takes
just a couple of seconds ? And I suppose it will improve in future. But
generally I think we can do that..I would like to hear some more
opinions before I start looking at this.

Thanks.

-Lukas.

> > +{
> > +	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
> > +	unsigned int minlen;
> > +	int err;
> > +
> > +	if (!capable(CAP_SYS_ADMIN))
> > +		return -EPERM;
> > +
> > +	/* If filesystem doesn't support trim feature, return. */
> > +	if (sb->s_op->trim_fs == NULL)
> > +		return -EOPNOTSUPP;
> > +
> > +	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
> > +	if (sb->s_bdev == NULL)
> > +		return -EINVAL;
> > +
> > +	err = get_user(minlen, (unsigned int __user *) arg);
> > +	if (err)
> > +		return err;
> > +
> > +	err = sb->s_op->trim_fs(minlen, sb);
> > +	if (err)
> > +		return err;
> > +	return 0;
> > +}
> > +
> >  /*
> >   * When you add any new common ioctls to the switches above and below
> >   * please update compat_sys_ioctl() too.
> > @@ -590,6 +617,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
> >  		error = ioctl_fsthaw(filp);
> >  		break;
> >  
> > +	case FITRIM:
> > +		error = ioctl_fstrim(filp, arg);
> > +		break;
> > +
> >  	case FS_IOC_FIEMAP:
> >  		return ioctl_fiemap(filp, arg);
> >  
> > diff --git a/include/linux/fs.h b/include/linux/fs.h
> > index 68ca1b0..01632e4 100644
> > --- a/include/linux/fs.h
> > +++ b/include/linux/fs.h
> > @@ -315,6 +315,7 @@ struct inodes_stat_t {
> >  #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
> >  #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
> >  #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
> > +#define FITRIM		_IOWR('X', 121, int)	/* Trim */
> >  
> >  #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
> >  #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
> > @@ -1580,6 +1581,7 @@ struct super_operations {
> >  	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
> >  #endif
> >  	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
> > +	int (*trim_fs) (unsigned int, struct super_block *);
> >  };
> >  
> >  /*
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-04 15:13   ` Lukas Czerner
@ 2010-08-04 15:26     ` Greg Freemyer
  2010-08-05  0:28       ` Ted Ts'o
  2010-08-05  7:00     ` Dmitry Monakhov
  1 sibling, 1 reply; 24+ messages in thread
From: Greg Freemyer @ 2010-08-04 15:26 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: Dmitry Monakhov, linux-ext4, jmoyer, rwheeler, eshishki, sandeen,
	jack, tytso, Mark Lord

On Wed, Aug 4, 2010 at 11:13 AM, Lukas Czerner <lczerner@redhat.com> wrote:
> On Wed, 4 Aug 2010, Dmitry Monakhov wrote:
>
>> Lukas Czerner <lczerner@redhat.com> writes:
>>
>> > Adds an filesystem independent ioctl to allow implementation of file
>> > system batched discard support.
>> >
>> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
>> > ---
>> >  fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
>> >  include/linux/fs.h |    2 ++
>> >  2 files changed, 33 insertions(+), 0 deletions(-)
>> >
>> > diff --git a/fs/ioctl.c b/fs/ioctl.c
>> > index 2d140a7..6c01c3c 100644
>> > --- a/fs/ioctl.c
>> > +++ b/fs/ioctl.c
>> > @@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
>> >     return thaw_super(sb);
>> >  }
>> >
>> > +static int ioctl_fstrim(struct file *filp, unsigned long arg)
>> BTW why do we have to trim fs in one shot ?
>> IMHO it is much suitable to provide start,len parameters as we
>> do in most functions(truncate, bdevdiscard, getdents).
>> It allow userspace caller to implement a fancy looking progress bars.
>
> Hi,
>
> do you think it is really needed when even with todays SSD's it takes
> just a couple of seconds ? And I suppose it will improve in future. But
> generally I think we can do that..I would like to hear some more
> opinions before I start looking at this.
>
> Thanks.
>
> -Lukas.

Since the proposed patch is not aggregating discards into multiple
ranges per ATA command, I thought some of the non-optimized devices
would take minutes / hours?

If true, a way to control the progress from userspace is important.

If in general it is only going to take a few seconds for a full FITRIM
to run, it is much less important, but I suppose the the RT project
might find even that problematic.

Greg
--
To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
the body of a message to majordomo@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/3] Add batched discard support for ext3
  2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
  2010-08-04 14:03   ` Jan Kara
@ 2010-08-04 19:39   ` Andreas Dilger
  2010-08-05 14:00     ` Lukas Czerner
  1 sibling, 1 reply; 24+ messages in thread
From: Andreas Dilger @ 2010-08-04 19:39 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

On 2010-08-04, at 07:44, Lukas Czerner wrote:
> +		/*
> +		 * Clear bits in the bitmap
> +		 */
> +		for (bit = start; bit < next; bit++) {
> +			BUFFER_TRACE(bitmap_bh, "clear bit");
> +			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
> +						bit, bitmap_bh->b_data)) {
> +				ext3_error(sb, __func__,
> +					"bit already cleared for block "E3FSBLK,
> +					 (unsigned long)bit);
> +				BUFFER_TRACE(bitmap_bh, "bit already cleared");
> +			} else {
> +				freed++;
> +			}
> +		}
> +
> +		/* Update couters */
> +		spin_lock(sb_bgl_lock(sbi, group));
> +		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
> +		spin_unlock(sb_bgl_lock(sbi, group));
> +		percpu_counter_add(&sbi->s_freeblocks_counter, next - start);

It wouldn't be a terrible idea to put this code into a helper function that is shared with ext3_free_blocks_sb() so that it is kept in sync.  There are a lot of places that need to be kept coordinated (disk bitmap, memory bitmap, group descriptor, percpu counter) so I'd like that to be localized to one part of the code.


Cheers, Andreas






^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-04 15:26     ` Greg Freemyer
@ 2010-08-05  0:28       ` Ted Ts'o
  2010-08-05  6:51         ` Dmitry Monakhov
  2010-08-05 15:47         ` Andreas Dilger
  0 siblings, 2 replies; 24+ messages in thread
From: Ted Ts'o @ 2010-08-05  0:28 UTC (permalink / raw)
  To: Greg Freemyer
  Cc: Lukas Czerner, Dmitry Monakhov, linux-ext4, jmoyer, rwheeler,
	eshishki, sandeen, jack, Mark Lord

On Wed, Aug 04, 2010 at 11:26:56AM -0400, Greg Freemyer wrote:
> 
> Since the proposed patch is not aggregating discards into multiple
> ranges per ATA command, I thought some of the non-optimized devices
> would take minutes / hours?
> 
> If true, a way to control the progress from userspace is important.
> 
> If in general it is only going to take a few seconds for a full FITRIM
> to run, it is much less important, but I suppose the the RT project
> might find even that problematic.

Even if it without the RT project, if disk activity is slowed or
completely stopped for a few seconds, I can think of plenty of
workloads where this would be totally unacceptable.  Suppose you are
running a web site; it doesn't really matter whether it is at Google,
Facebook, Twitter, etc.  If this means that one or more web pages get
stalled by "a few seconds" while the FITRIM is going on, this is
generally not considered acceptable.  Even if it slows down the server
by 30-50%, for some sites this would also be quite unacceptable.

This is a hard problem to solve, though, especially if there is an
insistence to solve it in a fs-independent fashion.  I could imagine
doing this at work, by doing things one block group at a time, and
then I could measure, for our specific hardware, how badly disk
performance would get hit, and for how long, and then the userspace
daemon could control how many block groups to do per unit time.
But this would be of necessity ext2/3/4 specific....

So I'm not sure what to suggest here.  Maybe the answer is we can have
a fs-independent ioctl for desktop workloads, and one which gives more
fine-grained control for those who need it?  That seems ugly, but it
might be the best compromise.

						- Ted

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-05  0:28       ` Ted Ts'o
@ 2010-08-05  6:51         ` Dmitry Monakhov
  2010-08-05 15:47         ` Andreas Dilger
  1 sibling, 0 replies; 24+ messages in thread
From: Dmitry Monakhov @ 2010-08-05  6:51 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Greg Freemyer, Lukas Czerner, linux-ext4, jmoyer, rwheeler,
	eshishki, sandeen, jack, Mark Lord

Ted Ts'o <tytso@mit.edu> writes:

> On Wed, Aug 04, 2010 at 11:26:56AM -0400, Greg Freemyer wrote:
>> 
>> Since the proposed patch is not aggregating discards into multiple
>> ranges per ATA command, I thought some of the non-optimized devices
>> would take minutes / hours?
>> 
>> If true, a way to control the progress from userspace is important.
>> 
>> If in general it is only going to take a few seconds for a full FITRIM
>> to run, it is much less important, but I suppose the the RT project
>> might find even that problematic.
Few second may not being true, We always have to think about crazy
user, and crazy fs-layouts.
My SSD is able to process 8*10^3 requests per second
So in worst case( where 1k fs block are bysy like this 010101010101)
it can process about 10^3/s * 2*1Kb = 16Mb/s
Which is no good.
>
> Even if it without the RT project, if disk activity is slowed or
> completely stopped for a few seconds, I can think of plenty of
> workloads where this would be totally unacceptable.  Suppose you are
> running a web site; it doesn't really matter whether it is at Google,
> Facebook, Twitter, etc.  If this means that one or more web pages get
> stalled by "a few seconds" while the FITRIM is going on, this is
> generally not considered acceptable.  Even if it slows down the server
> by 30-50%, for some sites this would also be quite unacceptable.
>
> This is a hard problem to solve, though, especially if there is an
> insistence to solve it in a fs-independent fashion.  I could imagine
> doing this at work, by doing things one block group at a time, and
> then I could measure, for our specific hardware, how badly disk
> performance would get hit, and for how long, and then the userspace
> daemon could control how many block groups to do per unit time.
> But this would be of necessity ext2/3/4 specific....
>
> So I'm not sure what to suggest here.  Maybe the answer is we can have
> a fs-independent ioctl for desktop workloads, and one which gives more
> fine-grained control for those who need it?  That seems ugly, but it
> might be the best compromise.
Why do we have to invent a wheel again?
We already have  BLKDISCARD which has following arguments:
uint64_t range[2]

So IMHO it is reasonable that FITRIM should have following arguments 
uint64_t start, uint64_t len, uint64_t minlen.
No problems with compat, no problems with interactivity.
User who does not care about interactivity may just call
ioctl(fd, FITRIM, 0, LLONG_MAX, 0), 

>
> 						- Ted
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-04 15:13   ` Lukas Czerner
  2010-08-04 15:26     ` Greg Freemyer
@ 2010-08-05  7:00     ` Dmitry Monakhov
  2010-08-05  8:36       ` Lukas Czerner
  1 sibling, 1 reply; 24+ messages in thread
From: Dmitry Monakhov @ 2010-08-05  7:00 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

Lukas Czerner <lczerner@redhat.com> writes:

> On Wed, 4 Aug 2010, Dmitry Monakhov wrote:
>
>> Lukas Czerner <lczerner@redhat.com> writes:
>> 
>> > Adds an filesystem independent ioctl to allow implementation of file
>> > system batched discard support.
>> >
>> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
>> > ---
>> >  fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
>> >  include/linux/fs.h |    2 ++
>> >  2 files changed, 33 insertions(+), 0 deletions(-)
>> >
>> > diff --git a/fs/ioctl.c b/fs/ioctl.c
>> > index 2d140a7..6c01c3c 100644
>> > --- a/fs/ioctl.c
>> > +++ b/fs/ioctl.c
>> > @@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
>> >  	return thaw_super(sb);
>> >  }
>> >  
>> > +static int ioctl_fstrim(struct file *filp, unsigned long arg)
>> BTW why do we have to trim fs in one shot ?
>> IMHO it is much suitable to provide start,len parameters as we 
>> do in most functions(truncate, bdevdiscard, getdents).
>> It allow userspace caller to implement a fancy looking progress bars.
>
> Hi,
>
> do you think it is really needed when even with todays SSD's it takes
> just a couple of seconds ? And I suppose it will improve in future. But
> generally I think we can do that..I would like to hear some more
> opinions before I start looking at this.
Hi, Lukas
we may face a really long delays due to bad layouts and slow devices
Please read my response to Ted
I'm agree with you what this interface is important, BTW i already
enabled FITRIM support on my notebook, my speed difference is about 2-3%.
But let's provide right user interface from very beginning.
>
> Thanks.
>
> -Lukas.
>
>> > +{
>> > +	struct super_block *sb = filp->f_path.dentry->d_inode->i_sb;
>> > +	unsigned int minlen;
>> > +	int err;
>> > +
>> > +	if (!capable(CAP_SYS_ADMIN))
>> > +		return -EPERM;
>> > +
>> > +	/* If filesystem doesn't support trim feature, return. */
>> > +	if (sb->s_op->trim_fs == NULL)
>> > +		return -EOPNOTSUPP;
>> > +
>> > +	/* If a blockdevice-backed filesystem isn't specified, return EINVAL. */
>> > +	if (sb->s_bdev == NULL)
>> > +		return -EINVAL;
>> > +
>> > +	err = get_user(minlen, (unsigned int __user *) arg);
>> > +	if (err)
>> > +		return err;
>> > +
>> > +	err = sb->s_op->trim_fs(minlen, sb);
>> > +	if (err)
>> > +		return err;
>> > +	return 0;
>> > +}
>> > +
>> >  /*
>> >   * When you add any new common ioctls to the switches above and below
>> >   * please update compat_sys_ioctl() too.
>> > @@ -590,6 +617,10 @@ int do_vfs_ioctl(struct file *filp, unsigned int fd, unsigned int cmd,
>> >  		error = ioctl_fsthaw(filp);
>> >  		break;
>> >  
>> > +	case FITRIM:
>> > +		error = ioctl_fstrim(filp, arg);
>> > +		break;
>> > +
>> >  	case FS_IOC_FIEMAP:
>> >  		return ioctl_fiemap(filp, arg);
>> >  
>> > diff --git a/include/linux/fs.h b/include/linux/fs.h
>> > index 68ca1b0..01632e4 100644
>> > --- a/include/linux/fs.h
>> > +++ b/include/linux/fs.h
>> > @@ -315,6 +315,7 @@ struct inodes_stat_t {
>> >  #define FIGETBSZ   _IO(0x00,2)	/* get the block size used for bmap */
>> >  #define FIFREEZE	_IOWR('X', 119, int)	/* Freeze */
>> >  #define FITHAW		_IOWR('X', 120, int)	/* Thaw */
>> > +#define FITRIM		_IOWR('X', 121, int)	/* Trim */
>> >  
>> >  #define	FS_IOC_GETFLAGS			_IOR('f', 1, long)
>> >  #define	FS_IOC_SETFLAGS			_IOW('f', 2, long)
>> > @@ -1580,6 +1581,7 @@ struct super_operations {
>> >  	ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
>> >  #endif
>> >  	int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
>> > +	int (*trim_fs) (unsigned int, struct super_block *);
>> >  };
>> >  
>> >  /*
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
>> the body of a message to majordomo@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html
>> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-05  7:00     ` Dmitry Monakhov
@ 2010-08-05  8:36       ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-05  8:36 UTC (permalink / raw)
  To: Dmitry Monakhov
  Cc: Lukas Czerner, linux-ext4, jmoyer, rwheeler, eshishki, sandeen,
	jack, tytso

On Thu, 5 Aug 2010, Dmitry Monakhov wrote:

> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > On Wed, 4 Aug 2010, Dmitry Monakhov wrote:
> >
> >> Lukas Czerner <lczerner@redhat.com> writes:
> >> 
> >> > Adds an filesystem independent ioctl to allow implementation of file
> >> > system batched discard support.
> >> >
> >> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> >> > ---
> >> >  fs/ioctl.c         |   31 +++++++++++++++++++++++++++++++
> >> >  include/linux/fs.h |    2 ++
> >> >  2 files changed, 33 insertions(+), 0 deletions(-)
> >> >
> >> > diff --git a/fs/ioctl.c b/fs/ioctl.c
> >> > index 2d140a7..6c01c3c 100644
> >> > --- a/fs/ioctl.c
> >> > +++ b/fs/ioctl.c
> >> > @@ -540,6 +540,33 @@ static int ioctl_fsthaw(struct file *filp)
> >> >  	return thaw_super(sb);
> >> >  }
> >> >  
> >> > +static int ioctl_fstrim(struct file *filp, unsigned long arg)
> >> BTW why do we have to trim fs in one shot ?
> >> IMHO it is much suitable to provide start,len parameters as we 
> >> do in most functions(truncate, bdevdiscard, getdents).
> >> It allow userspace caller to implement a fancy looking progress bars.
> >
> > Hi,
> >
> > do you think it is really needed when even with todays SSD's it takes
> > just a couple of seconds ? And I suppose it will improve in future. But
> > generally I think we can do that..I would like to hear some more
> > opinions before I start looking at this.
> Hi, Lukas
> we may face a really long delays due to bad layouts and slow devices
> Please read my response to Ted
> I'm agree with you what this interface is important, BTW i already
> enabled FITRIM support on my notebook, my speed difference is about 2-3%.
> But let's provide right user interface from very beginning.

Hi, Dimitry

I read the thread and really it makes sense to me. Sometimes it can be useful
to have more fine-grained control beside just specifying minlen argument,
which works quite well, however it is a little bit fuzzy because when you do
not know how much space was actually trimmed.

I think that there is no need to have two separate ioctls, even though
it would be more effective to specify block group instead of block range.
I am thinking about something like int optimize which will tell us to round
the range to block group boundaries. But I can not tell if it would
really help someone (probably not).

So I will try to do something to be able to break the FITRIM to smaller
pieces, uint64_t start, uint64_t len, uint64_t minlen seems good to me.

Thanks

-Lukas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 2/3] Add batched discard support for ext3
  2010-08-04 19:39   ` Andreas Dilger
@ 2010-08-05 14:00     ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-05 14:00 UTC (permalink / raw)
  To: Andreas Dilger
  Cc: Lukas Czerner, linux-ext4, jmoyer, rwheeler, eshishki, sandeen,
	jack, tytso

On Wed, 4 Aug 2010, Andreas Dilger wrote:

> On 2010-08-04, at 07:44, Lukas Czerner wrote:
> > +		/*
> > +		 * Clear bits in the bitmap
> > +		 */
> > +		for (bit = start; bit < next; bit++) {
> > +			BUFFER_TRACE(bitmap_bh, "clear bit");
> > +			if (!ext3_clear_bit_atomic(sb_bgl_lock(sbi, group),
> > +						bit, bitmap_bh->b_data)) {
> > +				ext3_error(sb, __func__,
> > +					"bit already cleared for block "E3FSBLK,
> > +					 (unsigned long)bit);
> > +				BUFFER_TRACE(bitmap_bh, "bit already cleared");
> > +			} else {
> > +				freed++;
> > +			}
> > +		}
> > +
> > +		/* Update couters */
> > +		spin_lock(sb_bgl_lock(sbi, group));
> > +		le16_add_cpu(&gdp->bg_free_blocks_count, freed);
> > +		spin_unlock(sb_bgl_lock(sbi, group));
> > +		percpu_counter_add(&sbi->s_freeblocks_counter, next - start);
> 
> It wouldn't be a terrible idea to put this code into a helper function that is shared with ext3_free_blocks_sb() so that it is kept in sync.  There are a lot of places that need to be kept coordinated (disk bitmap, memory bitmap, group descriptor, percpu counter) so I'd like that to be localized to one part of the code.
> 
> 
> Cheers, Andreas
> 

Ok, I will see what I can do with that.

Thanks.

-Lukas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 1/3] Add ioctl FITRIM.
  2010-08-05  0:28       ` Ted Ts'o
  2010-08-05  6:51         ` Dmitry Monakhov
@ 2010-08-05 15:47         ` Andreas Dilger
  1 sibling, 0 replies; 24+ messages in thread
From: Andreas Dilger @ 2010-08-05 15:47 UTC (permalink / raw)
  To: Ted Ts'o
  Cc: Greg Freemyer, Lukas Czerner, Dmitry Monakhov, linux-ext4,
	jmoyer, rwheeler, eshishki, sandeen, jack, Mark Lord

On 2010-08-04, at 18:28, Ted Ts'o wrote:
> On Wed, Aug 04, 2010 at 11:26:56AM -0400, Greg Freemyer wrote:
>> If true, a way to control the progress from userspace is important.
>> 
>> If in general it is only going to take a few seconds for a full FITRIM
>> to run, it is much less important, but I suppose the the RT project
>> might find even that problematic.
> 
> Even if it without the RT project, if disk activity is slowed or
> completely stopped for a few seconds, I can think of plenty of
> workloads where this would be totally unacceptable.  Suppose you are
> running a web site; it doesn't really matter whether it is at Google,
> Facebook, Twitter, etc.  If this means that one or more web pages get
> stalled by "a few seconds" while the FITRIM is going on, this is
> generally not considered acceptable.  Even if it slows down the server
> by 30-50%, for some sites this would also be quite unacceptable.
> 
> This is a hard problem to solve, though, especially if there is an
> insistence to solve it in a fs-independent fashion.  I could imagine
> doing this at work, by doing things one block group at a time, and
> then I could measure, for our specific hardware, how badly disk
> performance would get hit, and for how long, and then the userspace
> daemon could control how many block groups to do per unit time.
> But this would be of necessity ext2/3/4 specific....

I think "blockgroup at a time" is simply the extN way of "range of blocks at a time".  Having an API that is requesting "trim free blocks from [M,N]" is a generic enough interface to apply to any filesystem.  If there is some way to query the "efficient trim increment size" (i.e. block group for extN, allocation group for xfs, ??? for btrfs) then userspace could do it that way, or simply pick some fraction of the filesystem and use a nice power-of-two value and hope it works out.

> So I'm not sure what to suggest here.  Maybe the answer is we can have
> a fs-independent ioctl for desktop workloads, and one which gives more
> fine-grained control for those who need it?  That seems ugly, but it
> might be the best compromise.

No, too ugly.

Cheers, Andreas






^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] Add batched discard support for ext4
  2010-08-10 14:19 [PATCH 0/3 ver. 7] Ext3/Ext4 Batched discard support Lukas Czerner
@ 2010-08-10 14:19 ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-10 14:19 UTC (permalink / raw)
  To: linux-ext4
  Cc: dmonakhov, jmoyer, rwheeler, eshishki, sandeen, jack, tytso, lczerner

Walk through allocation groups and trim all free extents. It can be
invoked through FITRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search for free extents in allocation groups specified by Byte range
start -> start+len. When the free extent is within this range, blocks
are marked as used and then trimmed. Afterwards these blocks are marked
as free in per-group bitmap.

Since fstrim is a long operation it is good to have an ability to
interrupt it by a signal. This was added by Dmitry Monakhov.
Thanks Dimitry.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Reviewed-by: Jan Kara <jack@suse.cz>
Reviewed-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  200 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |    1 +
 3 files changed, 203 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de5..2f96dc5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1558,6 +1558,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, uint64_t, uint64_t, uint64_t);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0e83dfd..4c4132b 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4690,3 +4690,203 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	ret = sb_issue_discard(sb, discard_block, count);
+	if (ret == -EOPNOTSUPP) {
+		ext4_warning(sb,
+			"discard not supported!");
+	} else if (ret < 0) {
+		ext4_std_error(sb, ret);
+	}
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @e4b:		ext4 buddy
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0;
+	ext4_group_t group;
+	int ret = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+			if (ret < 0)
+				break;
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @start:		First Byte to trim
+ * @len:		number of Bytes to trim from start
+ * @minlen:		minimum extent length in Bytes
+ *
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, uint64_t start, uint64_t len,
+		uint64_t minlen)
+{
+	struct ext4_buddy e4b;
+	ext4_fsblk_t first_group, last_group;
+	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t cnt, first_block, last_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	int ret = 0;
+
+	start >>= sb->s_blocksize_bits;
+	len >>= sb->s_blocksize_bits;
+	minlen >>= sb->s_blocksize_bits;
+
+	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = (start - le32_to_cpu(es->s_first_data_block)) /
+		      EXT4_BLOCKS_PER_GROUP(sb);
+	last_group = (start + len - le32_to_cpu(es->s_first_data_block)) /
+		     EXT4_BLOCKS_PER_GROUP(sb);
+	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+
+	if (first_group > last_group)
+		return -EINVAL;
+
+	first_block = (start - le32_to_cpu(es->s_first_data_block)) %
+			EXT4_BLOCKS_PER_GROUP(sb);
+	last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+	for (group = first_group; group <= last_group; group++) {
+
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			break;
+		}
+
+		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+		else
+			last_block = len;
+
+		if (e4b.bd_info->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, &e4b, first_block,
+						last_block, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
+		}
+		ext4_mb_unload_buddy(&e4b);
+
+		first_block = 0;
+	}
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e72d323..3b05b3a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1114,6 +1114,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.7.2.1


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-08-07 22:25   ` Jan Kara
@ 2010-08-10 11:32     ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-10 11:32 UTC (permalink / raw)
  To: Jan Kara
  Cc: Lukas Czerner, linux-ext4, dmonakhov, jmoyer, rwheeler, eshishki,
	sandeen, tytso

On Sun, 8 Aug 2010, Jan Kara wrote:

> On Fri 06-08-10 13:31:16, Lukas Czerner wrote:
> ...
> > +/**
> > + * ext4_trim_fs() -- trim ioctl handle function
> > + * @sb:			superblock for filesystem
> > + * @start:		First Byte to trim
> > + * @len:		number of Bytes to trim from start
> > + * @minlen:		minimum extent length in Bytes
> > + *
> > + * ext4_trim_fs goes through all allocation groups containing Bytes from
> > + * start to start+len. For each such a group ext4_trim_all_free function
> > + * is invoked to trim all free space.
> > + */
> > +int ext4_trim_fs(struct super_block *sb, uint64_t start, uint64_t len,
> > +		uint64_t minlen)
> > +{
> > +	struct ext4_buddy e4b;
> > +	ext4_fsblk_t first_group, last_group;
> > +	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
> > +	ext4_grpblk_t cnt, first_block, last_block;
> > +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> > +	int ret = 0;
> > +
> > +	start >>= sb->s_blocksize_bits;
> > +	len >>= sb->s_blocksize_bits;
> > +	minlen >>= sb->s_blocksize_bits;
> > +
> > +	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
> > +		return -EINVAL;
> > +
> > +	/* Determine first and last group to examine based on start and len */
> > +	first_group = (start - le32_to_cpu(es->s_first_data_block)) /
> > +		      EXT4_BLOCKS_PER_GROUP(sb);
> > +	last_group = (start + len - le32_to_cpu(es->s_first_data_block)) /
> > +		     EXT4_BLOCKS_PER_GROUP(sb);
> > +	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
> > +
> > +	if (first_group > last_group)
> > +		return -EINVAL;
> > +
> > +	first_block = start % EXT4_BLOCKS_PER_GROUP(sb);
>   I think you should substract "le32_to_cpu(es->s_first_data_block)" from
> the start before doing modulo.
> 
> 									Honza

Oh, right thanks.

-Lukas

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-08-06 11:31 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
  2010-08-06 13:03   ` Dmitry Monakhov
@ 2010-08-07 22:25   ` Jan Kara
  2010-08-10 11:32     ` Lukas Czerner
  1 sibling, 1 reply; 24+ messages in thread
From: Jan Kara @ 2010-08-07 22:25 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, dmonakhov, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

On Fri 06-08-10 13:31:16, Lukas Czerner wrote:
...
> +/**
> + * ext4_trim_fs() -- trim ioctl handle function
> + * @sb:			superblock for filesystem
> + * @start:		First Byte to trim
> + * @len:		number of Bytes to trim from start
> + * @minlen:		minimum extent length in Bytes
> + *
> + * ext4_trim_fs goes through all allocation groups containing Bytes from
> + * start to start+len. For each such a group ext4_trim_all_free function
> + * is invoked to trim all free space.
> + */
> +int ext4_trim_fs(struct super_block *sb, uint64_t start, uint64_t len,
> +		uint64_t minlen)
> +{
> +	struct ext4_buddy e4b;
> +	ext4_fsblk_t first_group, last_group;
> +	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
> +	ext4_grpblk_t cnt, first_block, last_block;
> +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +	int ret = 0;
> +
> +	start >>= sb->s_blocksize_bits;
> +	len >>= sb->s_blocksize_bits;
> +	minlen >>= sb->s_blocksize_bits;
> +
> +	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
> +		return -EINVAL;
> +
> +	/* Determine first and last group to examine based on start and len */
> +	first_group = (start - le32_to_cpu(es->s_first_data_block)) /
> +		      EXT4_BLOCKS_PER_GROUP(sb);
> +	last_group = (start + len - le32_to_cpu(es->s_first_data_block)) /
> +		     EXT4_BLOCKS_PER_GROUP(sb);
> +	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
> +
> +	if (first_group > last_group)
> +		return -EINVAL;
> +
> +	first_block = start % EXT4_BLOCKS_PER_GROUP(sb);
  I think you should substract "le32_to_cpu(es->s_first_data_block)" from
the start before doing modulo.

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-08-06 13:03   ` Dmitry Monakhov
@ 2010-08-06 13:23     ` Lukas Czerner
  0 siblings, 0 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-06 13:23 UTC (permalink / raw)
  To: Dmitry Monakhov
  Cc: Lukas Czerner, linux-ext4, jmoyer, rwheeler, eshishki, sandeen,
	jack, tytso

On Fri, 6 Aug 2010, Dmitry Monakhov wrote:

> Lukas Czerner <lczerner@redhat.com> writes:
> 
> > Walk through allocation groups and trim all free extents. It can be
> > invoked through FITRIM ioctl on the file system. The main idea is to
> > provide a way to trim the whole file system if needed, since some SSD's
> > may suffer from performance loss after the whole device was filled (it
> > does not mean that fs is full!).
> >
> > It search for free extents in allocation groups specified by Byte range
> > start -> start+len. When the free extent is within this range, blocks
> > are marked as used and then trimmed. Afterwards these blocks are marked
> > as free in per-group bitmap.
> >
> > Since fstrim is a long operation it is good to have an ability to
> > interrupt it by a signal. This was added by Dmitry Monakhov.
> > Thanks Dimitry.
> >
> > Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> > Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> > Reviewed-by: Jan Kara <jack@suse.cz>
> > ---
> >  fs/ext4/ext4.h    |    2 +
> >  fs/ext4/mballoc.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++++++
> >  fs/ext4/super.c   |    1 +
> >  3 files changed, 197 insertions(+), 0 deletions(-)
> >
> > diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> > index 19a4de5..2f96dc5 100644
> > --- a/fs/ext4/ext4.h
> > +++ b/fs/ext4/ext4.h
> > @@ -1558,6 +1558,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
> >  extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
> >  extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
> >  						ext4_group_t, int);
> > +extern int ext4_trim_fs(struct super_block *, uint64_t, uint64_t, uint64_t);
> > +
> >  /* inode.c */
> >  struct buffer_head *ext4_getblk(handle_t *, struct inode *,
> >  						ext4_lblk_t, int, int *);
> > diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> > index 0e83dfd..5210676 100644
> > --- a/fs/ext4/mballoc.c
> > +++ b/fs/ext4/mballoc.c
> > @@ -4690,3 +4690,197 @@ error_return:
> >  		kmem_cache_free(ext4_ac_cachep, ac);
> >  	return;
> >  }
> > +
> > +/**
> > + * ext4_trim_extent -- function to TRIM one single free extent in the group
> > + * @sb:		super block for the file system
> > + * @start:	starting block of the free extent in the alloc. group
> > + * @count:	number of blocks to TRIM
> > + * @group:	alloc. group we are working with
> > + * @e4b:	ext4 buddy for the group
> > + *
> > + * Trim "count" blocks starting at "start" in the "group". To assure that no
> > + * one will allocate those blocks, mark it as used in buddy bitmap. This must
> > + * be called with under the group lock.
> > + */
> > +static int ext4_trim_extent(struct super_block *sb, int start, int count,
> > +		ext4_group_t group, struct ext4_buddy *e4b)
> > +{
> > +	ext4_fsblk_t discard_block;
> > +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> > +	struct ext4_free_extent ex;
> > +	int ret = 0;
> > +
> > +	assert_spin_locked(ext4_group_lock_ptr(sb, group));
> > +
> > +	ex.fe_start = start;
> > +	ex.fe_group = group;
> > +	ex.fe_len = count;
> > +
> > +	/*
> > +	 * Mark blocks used, so no one can reuse them while
> > +	 * being trimmed.
> > +	 */
> > +	mb_mark_used(e4b, &ex);
> > +	ext4_unlock_group(sb, group);
> > +
> > +	discard_block = (ext4_fsblk_t)group *
> > +			EXT4_BLOCKS_PER_GROUP(sb)
> > +			+ start
> > +			+ le32_to_cpu(es->s_first_data_block);
> > +	trace_ext4_discard_blocks(sb,
> > +			(unsigned long long)discard_block,
> > +			count);
> > +	ret = sb_issue_discard(sb, discard_block, count);
> > +	if (ret == -EOPNOTSUPP) {
> > +		ext4_warning(sb,
> > +			"discard not supported!");
> > +	} else if (ret < 0) {
> > +		ext4_std_error(sb, ret);
> > +	}
> > +	cond_resched();
> OOps seems i'm was wrong when suggested to put cond_resched() here.
> It is useless because we probably already slept inside sb_issue_discard()
> But it is reasonable to move it to ext4_trim_all_free()
> because fs size may be huge so simple bitmap traverse will take long
> enough, but almost fully populated so there are no extents longer than
> minlen so discard would not being called.
> BTW: Since Jan already noted about that in ext3's patch so ext3 version
> is good.
> 
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index d3a0763..00e6147 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4688,7 +4688,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
>  	} else if (ret < 0) {
>  		ext4_std_error(sb, ret);
>  	}
> -	cond_resched();
>  
>  	ext4_lock_group(sb, group);
>  	mb_free_blocks(NULL, e4b, start, ex.fe_len);
> @@ -4745,6 +4744,11 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
>  		}
>  		start = next + 1;
>  
> +		if (need_resched()) {
> +			ext4_unlock_group(sb, group);
> +			cond_resched();
> +			ext4_lock_group(sb, group);
> +		}
>  		if (signal_pending(current)) {
>  			count = -ERESTARTSYS;
>  			break;
> 
> 

Hmm, are you sure about this ? Wouldn't make more sense to put just
cond_resched() into the ext4_trim_fs loop, since there is no need to
check for need_resched() and lock/unlock the group. Do you expect the
group traversal to take that long ?

There are other examples of traversing whole group, but none of it is
trying to reschedule in the middle of the group.

Regards
-Lukas


^ permalink raw reply	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-08-06 11:31 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
@ 2010-08-06 13:03   ` Dmitry Monakhov
  2010-08-06 13:23     ` Lukas Czerner
  2010-08-07 22:25   ` Jan Kara
  1 sibling, 1 reply; 24+ messages in thread
From: Dmitry Monakhov @ 2010-08-06 13:03 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso

Lukas Czerner <lczerner@redhat.com> writes:

> Walk through allocation groups and trim all free extents. It can be
> invoked through FITRIM ioctl on the file system. The main idea is to
> provide a way to trim the whole file system if needed, since some SSD's
> may suffer from performance loss after the whole device was filled (it
> does not mean that fs is full!).
>
> It search for free extents in allocation groups specified by Byte range
> start -> start+len. When the free extent is within this range, blocks
> are marked as used and then trimmed. Afterwards these blocks are marked
> as free in per-group bitmap.
>
> Since fstrim is a long operation it is good to have an ability to
> interrupt it by a signal. This was added by Dmitry Monakhov.
> Thanks Dimitry.
>
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> Reviewed-by: Jan Kara <jack@suse.cz>
> ---
>  fs/ext4/ext4.h    |    2 +
>  fs/ext4/mballoc.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/super.c   |    1 +
>  3 files changed, 197 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
> index 19a4de5..2f96dc5 100644
> --- a/fs/ext4/ext4.h
> +++ b/fs/ext4/ext4.h
> @@ -1558,6 +1558,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
>  extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
>  extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
>  						ext4_group_t, int);
> +extern int ext4_trim_fs(struct super_block *, uint64_t, uint64_t, uint64_t);
> +
>  /* inode.c */
>  struct buffer_head *ext4_getblk(handle_t *, struct inode *,
>  						ext4_lblk_t, int, int *);
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index 0e83dfd..5210676 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4690,3 +4690,197 @@ error_return:
>  		kmem_cache_free(ext4_ac_cachep, ac);
>  	return;
>  }
> +
> +/**
> + * ext4_trim_extent -- function to TRIM one single free extent in the group
> + * @sb:		super block for the file system
> + * @start:	starting block of the free extent in the alloc. group
> + * @count:	number of blocks to TRIM
> + * @group:	alloc. group we are working with
> + * @e4b:	ext4 buddy for the group
> + *
> + * Trim "count" blocks starting at "start" in the "group". To assure that no
> + * one will allocate those blocks, mark it as used in buddy bitmap. This must
> + * be called with under the group lock.
> + */
> +static int ext4_trim_extent(struct super_block *sb, int start, int count,
> +		ext4_group_t group, struct ext4_buddy *e4b)
> +{
> +	ext4_fsblk_t discard_block;
> +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +	struct ext4_free_extent ex;
> +	int ret = 0;
> +
> +	assert_spin_locked(ext4_group_lock_ptr(sb, group));
> +
> +	ex.fe_start = start;
> +	ex.fe_group = group;
> +	ex.fe_len = count;
> +
> +	/*
> +	 * Mark blocks used, so no one can reuse them while
> +	 * being trimmed.
> +	 */
> +	mb_mark_used(e4b, &ex);
> +	ext4_unlock_group(sb, group);
> +
> +	discard_block = (ext4_fsblk_t)group *
> +			EXT4_BLOCKS_PER_GROUP(sb)
> +			+ start
> +			+ le32_to_cpu(es->s_first_data_block);
> +	trace_ext4_discard_blocks(sb,
> +			(unsigned long long)discard_block,
> +			count);
> +	ret = sb_issue_discard(sb, discard_block, count);
> +	if (ret == -EOPNOTSUPP) {
> +		ext4_warning(sb,
> +			"discard not supported!");
> +	} else if (ret < 0) {
> +		ext4_std_error(sb, ret);
> +	}
> +	cond_resched();
OOps seems i'm was wrong when suggested to put cond_resched() here.
It is useless because we probably already slept inside sb_issue_discard()
But it is reasonable to move it to ext4_trim_all_free()
because fs size may be huge so simple bitmap traverse will take long
enough, but almost fully populated so there are no extents longer than
minlen so discard would not being called.
BTW: Since Jan already noted about that in ext3's patch so ext3 version
is good.

diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index d3a0763..00e6147 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4688,7 +4688,6 @@ static int ext4_trim_extent(struct super_block *sb, int start, int count,
 	} else if (ret < 0) {
 		ext4_std_error(sb, ret);
 	}
-	cond_resched();
 
 	ext4_lock_group(sb, group);
 	mb_free_blocks(NULL, e4b, start, ex.fe_len);
@@ -4745,6 +4744,11 @@ ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
 		}
 		start = next + 1;
 
+		if (need_resched()) {
+			ext4_unlock_group(sb, group);
+			cond_resched();
+			ext4_lock_group(sb, group);
+		}
 		if (signal_pending(current)) {
 			count = -ERESTARTSYS;
 			break;


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* [PATCH 3/3] Add batched discard support for ext4
  2010-08-06 11:31 [PATCH 0/3] Batched discard support Lukas Czerner
@ 2010-08-06 11:31 ` Lukas Czerner
  2010-08-06 13:03   ` Dmitry Monakhov
  2010-08-07 22:25   ` Jan Kara
  0 siblings, 2 replies; 24+ messages in thread
From: Lukas Czerner @ 2010-08-06 11:31 UTC (permalink / raw)
  To: linux-ext4
  Cc: dmonakhov, jmoyer, rwheeler, eshishki, sandeen, jack, tytso, lczerner

Walk through allocation groups and trim all free extents. It can be
invoked through FITRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search for free extents in allocation groups specified by Byte range
start -> start+len. When the free extent is within this range, blocks
are marked as used and then trimmed. Afterwards these blocks are marked
as free in per-group bitmap.

Since fstrim is a long operation it is good to have an ability to
interrupt it by a signal. This was added by Dmitry Monakhov.
Thanks Dimitry.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
Reviewed-by: Jan Kara <jack@suse.cz>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  194 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |    1 +
 3 files changed, 197 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 19a4de5..2f96dc5 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1558,6 +1558,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(struct super_block *, uint64_t, uint64_t, uint64_t);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 0e83dfd..5210676 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4690,3 +4690,197 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * ext4_trim_extent -- function to TRIM one single free extent in the group
+ * @sb:		super block for the file system
+ * @start:	starting block of the free extent in the alloc. group
+ * @count:	number of blocks to TRIM
+ * @group:	alloc. group we are working with
+ * @e4b:	ext4 buddy for the group
+ *
+ * Trim "count" blocks starting at "start" in the "group". To assure that no
+ * one will allocate those blocks, mark it as used in buddy bitmap. This must
+ * be called with under the group lock.
+ */
+static int ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group, struct ext4_buddy *e4b)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	struct ext4_free_extent ex;
+	int ret = 0;
+
+	assert_spin_locked(ext4_group_lock_ptr(sb, group));
+
+	ex.fe_start = start;
+	ex.fe_group = group;
+	ex.fe_len = count;
+
+	/*
+	 * Mark blocks used, so no one can reuse them while
+	 * being trimmed.
+	 */
+	mb_mark_used(e4b, &ex);
+	ext4_unlock_group(sb, group);
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	ret = sb_issue_discard(sb, discard_block, count);
+	if (ret == -EOPNOTSUPP) {
+		ext4_warning(sb,
+			"discard not supported!");
+	} else if (ret < 0) {
+		ext4_std_error(sb, ret);
+	}
+	cond_resched();
+
+	ext4_lock_group(sb, group);
+	mb_free_blocks(NULL, e4b, start, ex.fe_len);
+	return ret;
+}
+
+/**
+ * ext4_trim_all_free -- function to trim all free space in alloc. group
+ * @sb:			super block for file system
+ * @e4b:		ext4 buddy
+ * @start:		first group block to examine
+ * @max:		last group block to examine
+ * @minblocks:		minimum extent block count
+ *
+ * ext4_trim_all_free walks through group's buddy bitmap searching for free
+ * extents. When the free block is found, ext4_trim_extent is called to TRIM
+ * the extent.
+ *
+ *
+ * ext4_trim_all_free walks through group's block bitmap searching for free
+ * extents. When the free extent is found, mark it as used in group buddy
+ * bitmap. Then issue a TRIM command on this extent and free the extent in
+ * the group buddy bitmap. This is done until whole group is scanned.
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, struct ext4_buddy *e4b,
+		ext4_grpblk_t start, ext4_grpblk_t max, ext4_grpblk_t minblocks)
+{
+	void *bitmap;
+	ext4_grpblk_t next, count = 0;
+	ext4_group_t group;
+	int ret = 0;
+
+	BUG_ON(e4b == NULL);
+
+	bitmap = e4b->bd_bitmap;
+	group = e4b->bd_group;
+	start = (e4b->bd_info->bb_first_free > start) ?
+		e4b->bd_info->bb_first_free : start;
+	ext4_lock_group(sb, group);
+
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap, max, start);
+
+		if ((next - start) >= minblocks) {
+			ret = ext4_trim_extent(sb, start,
+				next - start, group, e4b);
+			if (ret < 0)
+				break;
+			count += next - start;
+		}
+		start = next + 1;
+
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+
+		if ((e4b->bd_info->bb_free - count) < minblocks)
+			break;
+	}
+	ext4_unlock_group(sb, group);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	if (ret < 0)
+		count = ret;
+
+	return count;
+}
+
+/**
+ * ext4_trim_fs() -- trim ioctl handle function
+ * @sb:			superblock for filesystem
+ * @start:		First Byte to trim
+ * @len:		number of Bytes to trim from start
+ * @minlen:		minimum extent length in Bytes
+ *
+ * ext4_trim_fs goes through all allocation groups containing Bytes from
+ * start to start+len. For each such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(struct super_block *sb, uint64_t start, uint64_t len,
+		uint64_t minlen)
+{
+	struct ext4_buddy e4b;
+	ext4_fsblk_t first_group, last_group;
+	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t cnt, first_block, last_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+	int ret = 0;
+
+	start >>= sb->s_blocksize_bits;
+	len >>= sb->s_blocksize_bits;
+	minlen >>= sb->s_blocksize_bits;
+
+	if (unlikely(minlen > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	/* Determine first and last group to examine based on start and len */
+	first_group = (start - le32_to_cpu(es->s_first_data_block)) /
+		      EXT4_BLOCKS_PER_GROUP(sb);
+	last_group = (start + len - le32_to_cpu(es->s_first_data_block)) /
+		     EXT4_BLOCKS_PER_GROUP(sb);
+	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
+
+	if (first_group > last_group)
+		return -EINVAL;
+
+	first_block = start % EXT4_BLOCKS_PER_GROUP(sb);
+	last_block = EXT4_BLOCKS_PER_GROUP(sb);
+
+	for (group = first_group; group <= last_group; group++) {
+
+		ret = ext4_mb_load_buddy(sb, group, &e4b);
+		if (ret) {
+			ext4_error(sb, "Error in loading buddy "
+					"information for %u", group);
+			break;
+		}
+
+		if (len >= EXT4_BLOCKS_PER_GROUP(sb))
+			len -= (EXT4_BLOCKS_PER_GROUP(sb) - first_block);
+		else
+			last_block = len;
+
+		if (e4b.bd_info->bb_free >= minlen) {
+			cnt = ext4_trim_all_free(sb, &e4b, first_block,
+						last_block, minlen);
+			if (cnt < 0) {
+				ret = cnt;
+				ext4_mb_unload_buddy(&e4b);
+				break;
+			}
+		}
+		ext4_mb_unload_buddy(&e4b);
+
+		first_block = 0;
+	}
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e72d323..3b05b3a 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1114,6 +1114,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.7.2


^ permalink raw reply related	[flat|nested] 24+ messages in thread

* Re: [PATCH 3/3] Add batched discard support for ext4
  2010-07-27 12:41 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
@ 2010-07-27 16:28   ` Jan Kara
  0 siblings, 0 replies; 24+ messages in thread
From: Jan Kara @ 2010-07-27 16:28 UTC (permalink / raw)
  To: Lukas Czerner
  Cc: linux-ext4, jmoyer, rwheeler, eshishki, sandeen, jack, tytso,
	Dmitry Monakhov

On Tue 27-07-10 14:41:54, Lukas Czerner wrote:
> Walk through each allocation group and trim all free extents. It can be
> invoked through TRIM ioctl on the file system. The main idea is to
> provide a way to trim the whole file system if needed, since some SSD's
> may suffer from performance loss after the whole device was filled (it
> does not mean that fs is full!).
> 
> It search fro free extents in each allocation group. When the free
> extent is found, blocks are marked as used and then trimmed. Afterwards
> these blocks are marked as free in per-group bitmap.
> 
> Since fstrim is a long operation it is good to have an ability to interrupt
> it by a signal. This was added by Dmitry Monakhov. Thanks Dimitry.
> 
> Signed-off-by: Lukas Czerner <lczerner@redhat.com>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
  A couple of comments below...

> ---
>  fs/ext4/ext4.h    |    2 +
>  fs/ext4/mballoc.c |  103 +++++++++++++++++++++++++++++++++++++++++++++++++++++
>  fs/ext4/super.c   |    1 +
>  3 files changed, 106 insertions(+), 0 deletions(-)
>
> diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
> index b423a36..f00b7dd 100644
> --- a/fs/ext4/mballoc.c
> +++ b/fs/ext4/mballoc.c
> @@ -4640,3 +4640,106 @@ error_return:
>  		kmem_cache_free(ext4_ac_cachep, ac);
>  	return;
>  }
> +
> +/**
> + * Trim "count" blocks starting at "start" in "group"
> + * This must be called under group lock
> + */
> +static void ext4_trim_extent(struct super_block *sb, int start, int count,
> +		ext4_group_t group)
> +{
> +	ext4_fsblk_t discard_block;
> +	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
> +
> +	discard_block = (ext4_fsblk_t)group *
> +			EXT4_BLOCKS_PER_GROUP(sb)
> +			+ start
> +			+ le32_to_cpu(es->s_first_data_block);
   You can use ext4_group_first_block_no() I believe.

> +	trace_ext4_discard_blocks(sb,
> +			(unsigned long long)discard_block,
> +			count);
> +	sb_issue_discard(sb, discard_block, count);
> +	cond_resched();
> +}
> +
> +/**
> + * Trim all free extents in group at least minblocks long
> + */
> +ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
> +		ext4_grpblk_t minblocks)
> +{
> +	struct buffer_head *bitmap_bh = NULL;
> +	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
> +	ext4_grpblk_t start, next, count = 0;
> +	struct ext4_group_info *grp;
> +	int err = 0;
> +
> +	err = -EIO;
> +	bitmap_bh = ext4_read_block_bitmap(sb, group);
> +	if (!bitmap_bh)
> +		return 0;
> +
> +	grp = ext4_get_group_info(sb, group);
> +	start = grp->bb_first_free;
> +
> +	down_write(&grp->alloc_sem);
> +	while (start < max) {
> +
> +		start = mb_find_next_zero_bit(bitmap_bh->b_data, max, start);
> +		if (start >= max)
> +			break;
> +		next = mb_find_next_bit(bitmap_bh->b_data, max, start);
  Hmm, I don't think this is right. If you want to avoid doing the same
thing as you do for ext3, you have to use the buddy bitmap and not the
on-disk bitmap (we free blocks from a buddy bitmap only after a transaction
freeing them is committed). That way you avoid trimming blocks that were
freed in the transaction which is just committing (you mustn't do
that). So you have to load the buddy bitmap for the group into memory
(ext4_mb_load_buddy()), lock the group (ext4_group_lock()), and then you
can investigate the buddy bitmap (EXT4_MB_BITMAP()). You could actually use
the buddy information to make scanning bitmap faster (it carries
information about larger chunks of free blocks) but that's a voluntary
bonus I think ;).

									Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 24+ messages in thread

* [PATCH 3/3] Add batched discard support for ext4
  2010-07-27 12:41 [PATCH 0/3 v3] Batched discard support for Ext3/Ext4 Lukas Czerner
@ 2010-07-27 12:41 ` Lukas Czerner
  2010-07-27 16:28   ` Jan Kara
  0 siblings, 1 reply; 24+ messages in thread
From: Lukas Czerner @ 2010-07-27 12:41 UTC (permalink / raw)
  To: linux-ext4
  Cc: jmoyer, rwheeler, eshishki, sandeen, jack, tytso, Lukas Czerner,
	Dmitry Monakhov

Walk through each allocation group and trim all free extents. It can be
invoked through TRIM ioctl on the file system. The main idea is to
provide a way to trim the whole file system if needed, since some SSD's
may suffer from performance loss after the whole device was filled (it
does not mean that fs is full!).

It search fro free extents in each allocation group. When the free
extent is found, blocks are marked as used and then trimmed. Afterwards
these blocks are marked as free in per-group bitmap.

Since fstrim is a long operation it is good to have an ability to interrupt
it by a signal. This was added by Dmitry Monakhov. Thanks Dimitry.

Signed-off-by: Lukas Czerner <lczerner@redhat.com>
Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/ext4.h    |    2 +
 fs/ext4/mballoc.c |  103 +++++++++++++++++++++++++++++++++++++++++++++++++++++
 fs/ext4/super.c   |    1 +
 3 files changed, 106 insertions(+), 0 deletions(-)

diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index bf938cf..ba0fff0 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1437,6 +1437,8 @@ extern int ext4_mb_add_groupinfo(struct super_block *sb,
 extern int ext4_mb_get_buddy_cache_lock(struct super_block *, ext4_group_t);
 extern void ext4_mb_put_buddy_cache_lock(struct super_block *,
 						ext4_group_t, int);
+extern int ext4_trim_fs(unsigned int, struct super_block *);
+
 /* inode.c */
 struct buffer_head *ext4_getblk(handle_t *, struct inode *,
 						ext4_lblk_t, int, int *);
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b423a36..f00b7dd 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -4640,3 +4640,106 @@ error_return:
 		kmem_cache_free(ext4_ac_cachep, ac);
 	return;
 }
+
+/**
+ * Trim "count" blocks starting at "start" in "group"
+ * This must be called under group lock
+ */
+static void ext4_trim_extent(struct super_block *sb, int start, int count,
+		ext4_group_t group)
+{
+	ext4_fsblk_t discard_block;
+	struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+	discard_block = (ext4_fsblk_t)group *
+			EXT4_BLOCKS_PER_GROUP(sb)
+			+ start
+			+ le32_to_cpu(es->s_first_data_block);
+	trace_ext4_discard_blocks(sb,
+			(unsigned long long)discard_block,
+			count);
+	sb_issue_discard(sb, discard_block, count);
+	cond_resched();
+}
+
+/**
+ * Trim all free extents in group at least minblocks long
+ */
+ext4_grpblk_t ext4_trim_all_free(struct super_block *sb, ext4_group_t group,
+		ext4_grpblk_t minblocks)
+{
+	struct buffer_head *bitmap_bh = NULL;
+	ext4_grpblk_t max = EXT4_BLOCKS_PER_GROUP(sb);
+	ext4_grpblk_t start, next, count = 0;
+	struct ext4_group_info *grp;
+	int err = 0;
+
+	err = -EIO;
+	bitmap_bh = ext4_read_block_bitmap(sb, group);
+	if (!bitmap_bh)
+		return 0;
+
+	grp = ext4_get_group_info(sb, group);
+	start = grp->bb_first_free;
+
+	down_write(&grp->alloc_sem);
+	while (start < max) {
+
+		start = mb_find_next_zero_bit(bitmap_bh->b_data, max, start);
+		if (start >= max)
+			break;
+		next = mb_find_next_bit(bitmap_bh->b_data, max, start);
+
+		if ((next - start) >= minblocks) {
+			count += next - start;
+			ext4_trim_extent(sb, start,
+				next - start, group);
+		}
+		start = next + 1;
+		if (signal_pending(current)) {
+			count = -ERESTARTSYS;
+			break;
+		}
+		if ((grp->bb_free - count) < minblocks)
+			break;
+	}
+	up_write(&grp->alloc_sem);
+
+	ext4_debug("trimmed %d blocks in the group %d\n",
+		count, group);
+
+	brelse(bitmap_bh);
+	return count;
+}
+
+/**
+ * ext4_trim_fs goes through all allocation groups searching for group with
+ * more free space than minlen. For such a group ext4_trim_all_free function
+ * is invoked to trim all free space.
+ */
+int ext4_trim_fs(unsigned int minlen, struct super_block *sb)
+{
+	ext4_group_t group;
+	ext4_group_t ngroups = ext4_get_groups_count(sb);
+	ext4_grpblk_t minblocks, cnt;
+	struct ext4_group_info *grp;
+	int ret = 0;
+
+	minblocks = DIV_ROUND_UP(minlen, sb->s_blocksize);
+	if (unlikely(minblocks > EXT4_BLOCKS_PER_GROUP(sb)))
+		return -EINVAL;
+
+	for (group = 0; group < ngroups; group++) {
+
+		grp = ext4_get_group_info(sb, group);
+
+		if (grp->bb_free >= minblocks) {
+			cnt = ext4_trim_all_free(sb, group, minblocks);
+			if (cnt < 0) {
+				ret = cnt;
+				break;
+			}
+		}
+	}
+	return ret;
+}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index e14d22c..253eb98 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1109,6 +1109,7 @@ static const struct super_operations ext4_sops = {
 	.quota_write	= ext4_quota_write,
 #endif
 	.bdev_try_to_free_page = bdev_try_to_free_page,
+	.trim_fs	= ext4_trim_fs
 };
 
 static const struct super_operations ext4_nojournal_sops = {
-- 
1.7.1.1


^ permalink raw reply related	[flat|nested] 24+ messages in thread

end of thread, other threads:[~2010-08-10 14:20 UTC | newest]

Thread overview: 24+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2010-08-04 13:44 [PATCH 1/3] Add ioctl FITRIM Lukas Czerner
2010-08-04 13:44 ` [PATCH 2/3] Add batched discard support for ext3 Lukas Czerner
2010-08-04 14:03   ` Jan Kara
2010-08-04 14:32     ` Lukas Czerner
2010-08-04 19:39   ` Andreas Dilger
2010-08-05 14:00     ` Lukas Czerner
2010-08-04 13:44 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
2010-08-04 14:17   ` Jan Kara
2010-08-04 14:57 ` [PATCH 1/3] Add ioctl FITRIM Dmitry Monakhov
2010-08-04 15:13   ` Lukas Czerner
2010-08-04 15:26     ` Greg Freemyer
2010-08-05  0:28       ` Ted Ts'o
2010-08-05  6:51         ` Dmitry Monakhov
2010-08-05 15:47         ` Andreas Dilger
2010-08-05  7:00     ` Dmitry Monakhov
2010-08-05  8:36       ` Lukas Czerner
  -- strict thread matches above, loose matches on Subject: below --
2010-08-10 14:19 [PATCH 0/3 ver. 7] Ext3/Ext4 Batched discard support Lukas Czerner
2010-08-10 14:19 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
2010-08-06 11:31 [PATCH 0/3] Batched discard support Lukas Czerner
2010-08-06 11:31 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
2010-08-06 13:03   ` Dmitry Monakhov
2010-08-06 13:23     ` Lukas Czerner
2010-08-07 22:25   ` Jan Kara
2010-08-10 11:32     ` Lukas Czerner
2010-07-27 12:41 [PATCH 0/3 v3] Batched discard support for Ext3/Ext4 Lukas Czerner
2010-07-27 12:41 ` [PATCH 3/3] Add batched discard support for ext4 Lukas Czerner
2010-07-27 16:28   ` Jan Kara

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.