All of lore.kernel.org
 help / color / mirror / Atom feed
* [PATCH] quota: quota initialization improvements
       [not found] <0KUR00I4YVKZG820@mail.2ka.mipt.ru>
@ 2009-12-17  1:14 ` Dmitry Monakhov
  2009-12-17  1:15 ` [PATCH] ext4: journalled quota seed-up Dmitry Monakhov
  1 sibling, 0 replies; 6+ messages in thread
From: Dmitry Monakhov @ 2009-12-17  1:14 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, Dmitry Monakhov

vfs_dq_init() is called from many places (almost from all generic
vfs functions). That's why this function is so performance critical.
Current dquot_initialize() implementation is far from optimal because:

Usually it is called for an inode which already has quota initialized
 This fast path may be easily optimized. Just check i_dquot[].
 In fact, no locking is needed here.

In case of the slow path (where the inode has no quota yet) it is required
to hold dqptr_sem for write. This assumption comes from the rule which states:
"Any changes with i_dquot[] must be protected by sem locked for write".
In fact, changing i_dquot from NULL => !NULL (which is what vfs_dq_init does)
differs from changing it from !NULL => NULL (vfs_dq_drop, and others).

The difference is that if we allow i_dquot[] to change from
NULL => !NULL under dqptr_sem locked for read this will not break usual
charging functions (alloc/claim/free, etc.) because pointer assignment
is atomic for all arch. The only changes we have to do to prevent races
is just atomically copy i_dquot[] to local variable at the beginning of
each charging function.
Race between concurrent init() functions is solved by cmpxchg().

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/quota/dquot.c |   87 ++++++++++++++++++++++++++++++++---------------------
 1 files changed, 52 insertions(+), 35 deletions(-)

diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index de7e7de..6979224 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -1246,6 +1246,9 @@ static int info_bdq_free(struct dquot *dquot, qsize_t space)
  *	Initialize quota pointers in inode
  *	We do things in a bit complicated way but by that we avoid calling
  *	dqget() and thus filesystem callbacks under dqptr_sem.
+ *	Lock dqptr_sem for read here, is acceptable, because i_dquot it is
+ *	acceptable to change the value from NULL => !NULL, opposite change
+ *	must be protected by dqptr_sem for write.
  */
 int dquot_initialize(struct inode *inode, int type)
 {
@@ -1258,6 +1261,9 @@ int dquot_initialize(struct inode *inode, int type)
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return 0;
+	/* Check whenever quota was already initialized */
+	if (!dqinit_needed(inode, type))
+		return 0;
 
 	/* First get references to structures we might need. */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
@@ -1274,7 +1280,7 @@ int dquot_initialize(struct inode *inode, int type)
 		got[cnt] = dqget(sb, id, cnt);
 	}
 
-	down_write(&sb_dqopt(sb)->dqptr_sem);
+	down_read(&sb_dqopt(sb)->dqptr_sem);
 	/* Having dqptr_sem we know NOQUOTA flags can't be altered... */
 	if (IS_NOQUOTA(inode))
 		goto out_err;
@@ -1284,13 +1290,11 @@ int dquot_initialize(struct inode *inode, int type)
 		/* Avoid races with quotaoff() */
 		if (!sb_has_quota_active(sb, cnt))
 			continue;
-		if (!inode->i_dquot[cnt]) {
-			inode->i_dquot[cnt] = got[cnt];
+		if (!cmpxchg(&inode->i_dquot[cnt], NULL, got[cnt]))
 			got[cnt] = NULL;
-		}
 	}
 out_err:
-	up_write(&sb_dqopt(sb)->dqptr_sem);
+	up_read(&sb_dqopt(sb)->dqptr_sem);
 	/* Drop unused references */
 	dqput_all(got);
 	return ret;
@@ -1416,6 +1420,7 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 {
 	int cnt, ret = QUOTA_OK;
 	char warntype[MAXQUOTAS];
+	struct dquot *dquot[MAXQUOTAS];
 
 	/*
 	 * First test before acquiring mutex - solves deadlocks when we
@@ -1432,14 +1437,16 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 		goto out_unlock;
 	}
 
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		dquot[cnt] = inode->i_dquot[cnt];
 		warntype[cnt] = QUOTA_NL_NOWARN;
+	}
 
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		if (!dquot[cnt])
 			continue;
-		if (check_bdq(inode->i_dquot[cnt], number, warn, warntype+cnt)
+		if (check_bdq(dquot[cnt], number, warn, warntype+cnt)
 		    == NO_QUOTA) {
 			ret = NO_QUOTA;
 			spin_unlock(&dq_data_lock);
@@ -1447,21 +1454,21 @@ int __dquot_alloc_space(struct inode *inode, qsize_t number,
 		}
 	}
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		if (!dquot[cnt])
 			continue;
 		if (reserve)
-			dquot_resv_space(inode->i_dquot[cnt], number);
+			dquot_resv_space(dquot[cnt], number);
 		else
-			dquot_incr_space(inode->i_dquot[cnt], number);
+			dquot_incr_space(dquot[cnt], number);
 	}
 	inode_incr_space(inode, number, reserve);
 	spin_unlock(&dq_data_lock);
 
 	if (reserve)
 		goto out_flush_warn;
-	mark_all_dquot_dirty(inode->i_dquot);
+	mark_all_dquot_dirty(dquot);
 out_flush_warn:
-	flush_warnings(inode->i_dquot, warntype);
+	flush_warnings(dquot, warntype);
 out_unlock:
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 out:
@@ -1487,13 +1494,16 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 {
 	int cnt, ret = NO_QUOTA;
 	char warntype[MAXQUOTAS];
+	struct dquot *dquot[MAXQUOTAS];
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
 	if (IS_NOQUOTA(inode))
 		return QUOTA_OK;
-	for (cnt = 0; cnt < MAXQUOTAS; cnt++)
+	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
+		dquot[cnt] = inode->i_dquot[cnt];
 		warntype[cnt] = QUOTA_NL_NOWARN;
+	}
 	down_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	if (IS_NOQUOTA(inode)) {
 		up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -1501,24 +1511,24 @@ int dquot_alloc_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		if (!dquot[cnt])
 			continue;
-		if (check_idq(inode->i_dquot[cnt], number, warntype+cnt)
+		if (check_idq(dquot[cnt], number, warntype+cnt)
 		    == NO_QUOTA)
 			goto warn_put_all;
 	}
 
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		if (!dquot[cnt])
 			continue;
-		dquot_incr_inodes(inode->i_dquot[cnt], number);
+		dquot_incr_inodes(dquot[cnt], number);
 	}
 	ret = QUOTA_OK;
 warn_put_all:
 	spin_unlock(&dq_data_lock);
 	if (ret == QUOTA_OK)
-		mark_all_dquot_dirty(inode->i_dquot);
-	flush_warnings(inode->i_dquot, warntype);
+		mark_all_dquot_dirty(dquot);
+	flush_warnings(dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return ret;
 }
@@ -1528,6 +1538,7 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
 {
 	int cnt;
 	int ret = QUOTA_OK;
+	struct dquot *dquot[MAXQUOTAS];
 
 	if (IS_NOQUOTA(inode)) {
 		inode_claim_rsv_space(inode, number);
@@ -1544,14 +1555,14 @@ int dquot_claim_space(struct inode *inode, qsize_t number)
 	spin_lock(&dq_data_lock);
 	/* Claim reserved quotas to allocated quotas */
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (inode->i_dquot[cnt])
-			dquot_claim_reserved_space(inode->i_dquot[cnt],
-							number);
+		dquot[cnt] = inode->i_dquot[cnt];
+		if (dquot[cnt])
+			dquot_claim_reserved_space(dquot[cnt], number);
 	}
 	/* Update inode bytes */
 	inode_claim_rsv_space(inode, number);
 	spin_unlock(&dq_data_lock);
-	mark_all_dquot_dirty(inode->i_dquot);
+	mark_all_dquot_dirty(dquot);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 out:
 	return ret;
@@ -1565,6 +1576,8 @@ int __dquot_free_space(struct inode *inode, qsize_t number, int reserve)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	struct dquot *dquot[MAXQUOTAS];
+
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
@@ -1582,22 +1595,23 @@ out_sub:
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		dquot[cnt] = inode->i_dquot[cnt];
+		if (!dquot[cnt])
 			continue;
-		warntype[cnt] = info_bdq_free(inode->i_dquot[cnt], number);
+		warntype[cnt] = info_bdq_free(dquot[cnt], number);
 		if (reserve)
-			dquot_free_reserved_space(inode->i_dquot[cnt], number);
+			dquot_free_reserved_space(dquot[cnt], number);
 		else
-			dquot_decr_space(inode->i_dquot[cnt], number);
+			dquot_decr_space(dquot[cnt], number);
 	}
 	inode_decr_space(inode, number, reserve);
 	spin_unlock(&dq_data_lock);
 
 	if (reserve)
 		goto out_unlock;
-	mark_all_dquot_dirty(inode->i_dquot);
+	mark_all_dquot_dirty(dquot);
 out_unlock:
-	flush_warnings(inode->i_dquot, warntype);
+	flush_warnings(dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
@@ -1625,6 +1639,8 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 {
 	unsigned int cnt;
 	char warntype[MAXQUOTAS];
+	struct dquot *dquot[MAXQUOTAS];
+
 
 	/* First test before acquiring mutex - solves deadlocks when we
          * re-enter the quota code and are already holding the mutex */
@@ -1639,14 +1655,15 @@ int dquot_free_inode(const struct inode *inode, qsize_t number)
 	}
 	spin_lock(&dq_data_lock);
 	for (cnt = 0; cnt < MAXQUOTAS; cnt++) {
-		if (!inode->i_dquot[cnt])
+		dquot[cnt] = inode->i_dquot[cnt];
+		if (!dquot[cnt])
 			continue;
-		warntype[cnt] = info_idq_free(inode->i_dquot[cnt], number);
-		dquot_decr_inodes(inode->i_dquot[cnt], number);
+		warntype[cnt] = info_idq_free(dquot[cnt], number);
+		dquot_decr_inodes(dquot[cnt], number);
 	}
 	spin_unlock(&dq_data_lock);
-	mark_all_dquot_dirty(inode->i_dquot);
-	flush_warnings(inode->i_dquot, warntype);
+	mark_all_dquot_dirty(dquot);
+	flush_warnings(dquot, warntype);
 	up_read(&sb_dqopt(inode->i_sb)->dqptr_sem);
 	return QUOTA_OK;
 }
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* [PATCH]  ext4: journalled quota seed-up
       [not found] <0KUR00I4YVKZG820@mail.2ka.mipt.ru>
  2009-12-17  1:14 ` [PATCH] quota: quota initialization improvements Dmitry Monakhov
@ 2009-12-17  1:15 ` Dmitry Monakhov
  2009-12-17  1:33   ` Dmitry Monakhov
  2009-12-17 16:06   ` Jan Kara
  1 sibling, 2 replies; 6+ messages in thread
From: Dmitry Monakhov @ 2009-12-17  1:15 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, Dmitry Monakhov

 Currently each quota modification results in write_dquot()
 and later dquot_commit().  This means that each quota modification
 function must wait for dqio_mutex, which is a *huge* performance
 penalty on big SMP systems. AFAIU the core idea of this implementation
 is to guarantee that each quota modification will be written to journal
 atomically. But in fact this is not always true, because dquot may be
 changed after dquot modification, but before it was committed to disk.

 | Task 1                           | Task 2                      |
 | alloc_space(nr)                  |                             |
 | ->spin_lock(dq_data_lock)        |                             |
 | ->curspace += nr                 |                             |
 | ->spin_unlock(dq_data_lock)      |                             |
 | ->mark_dirty()                   | free_space(nr)              |
 | -->write_dquot()                 | ->spin_lock(dq_data_lock)   |
 | --->dquot_commit()               | ->curspace -= nr            |
 | ---->commit_dqblk()              | ->spin_unlock(dq_data_lock) |
 | ----->spin_lock(dq_data_lock)    |                             |
 | ----->mem2disk_dqblk(ddq, dquot) | <<< Copy updated value      |
 | ----->spin_unlock(dq_data_lock)  |                             |
 | ----->quota_write()              |                             |
Quota corruption does not happen only because the quota modification caller
has already started the journal. And ext3/4 allow only one running transaction
at a time. Let's exploit this fact and avoid writing quota each time.
Act similar to dirty_for_io in general write_back path in page-cache.
If we find that another task has already started to copy and write the
dquot then we skip the actual quota_write stage and let that task do the job.
Side effect: A task which skips the real quota_write() will not get an error
(if any). But this is not a big deal because:
 1) Any error has global consequences (RO_remount, err_print, etc).
 2) Real IO is deferred till the journal_commit.

Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
---
 fs/ext4/super.c          |   37 ++++++++++++++++++++++++++++++-------
 fs/quota/dquot.c         |   16 +++++++++++++---
 include/linux/quotaops.h |    1 +
 3 files changed, 44 insertions(+), 10 deletions(-)

diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 3929091..164217e 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -3777,14 +3777,37 @@ static int ext4_release_dquot(struct dquot *dquot)
 
 static int ext4_mark_dquot_dirty(struct dquot *dquot)
 {
-	/* Are we journaling quotas? */
-	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
-	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
-		dquot_mark_dquot_dirty(dquot);
-		return ext4_write_dquot(dquot);
-	} else {
+	int ret, err;
+	handle_t *handle;
+	struct inode *inode;
+
+	/* Are we not journaling quotas? */
+	if (!EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] &&
+	    !EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA])
 		return dquot_mark_dquot_dirty(dquot);
-	}
+
+	/* journaling quotas case */
+	inode = dquot_to_inode(dquot);
+	handle = ext4_journal_start(inode,
+				EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
+	if (IS_ERR(handle))
+		return PTR_ERR(handle);
+	if (!__dquot_mark_dquot_dirty(dquot))
+		ret = dquot_commit(dquot);
+	else
+		/*
+		 * Dquot was already dirty. This means that other task already
+		 * started a transaction but not clear dirty bit yet (see
+		 * dquot_commit). Since the only one running transaction is
+		 * possible at a time. Then that task belongs to the same
+		 * transaction. We don'n have to actually write dquot changes
+		 * because that task will write it for for us.
+		 */
+		ret = 0;
+	err = ext4_journal_stop(handle);
+	if (!ret)
+		ret = err;
+	return ret;
 }
 
 static int ext4_write_info(struct super_block *sb, int type)
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 6979224..e03eea0 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -311,14 +311,24 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
 {
 	return dquot->dq_sb->dq_op->mark_dirty(dquot);
 }
-
-int dquot_mark_dquot_dirty(struct dquot *dquot)
+/* mark dquot dirty in atomic meaner, and return old dirty state */
+int __dquot_mark_dquot_dirty(struct dquot *dquot)
 {
+	int ret = 1;
 	spin_lock(&dq_list_lock);
-	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
+	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
 		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
 				info[dquot->dq_type].dqi_dirty_list);
+		ret = 0;
+	}
 	spin_unlock(&dq_list_lock);
+	return ret;
+}
+EXPORT_SYMBOL(__dquot_mark_dquot_dirty);
+
+int dquot_mark_dquot_dirty(struct dquot *dquot)
+{
+	__dquot_mark_dquot_dirty(dquot);
 	return 0;
 }
 EXPORT_SYMBOL(dquot_mark_dquot_dirty);
diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
index ae7ef25..c950f7d 100644
--- a/include/linux/quotaops.h
+++ b/include/linux/quotaops.h
@@ -56,6 +56,7 @@ int dquot_commit(struct dquot *dquot);
 int dquot_acquire(struct dquot *dquot);
 int dquot_release(struct dquot *dquot);
 int dquot_commit_info(struct super_block *sb, int type);
+int __dquot_mark_dquot_dirty(struct dquot *dquot);
 int dquot_mark_dquot_dirty(struct dquot *dquot);
 
 int vfs_quota_on(struct super_block *sb, int type, int format_id,
-- 
1.6.0.4


^ permalink raw reply related	[flat|nested] 6+ messages in thread

* Re: [PATCH]  ext4: journalled quota seed-up
  2009-12-17  1:15 ` [PATCH] ext4: journalled quota seed-up Dmitry Monakhov
@ 2009-12-17  1:33   ` Dmitry Monakhov
  2009-12-17 10:07     ` Dmitry Monakhov
  2009-12-17 16:06   ` Jan Kara
  1 sibling, 1 reply; 6+ messages in thread
From: Dmitry Monakhov @ 2009-12-17  1:33 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, linux-ext4

Dmitry Monakhov <dmonakhov@openvz.org> writes:

Sorry forgot to add linux-ext4@vger.kernel.org list to CC
>  Currently each quota modification result in write_dquot()
>  and later dquot_commit().  This means what each quota modification
>  function must wait for dqio_mutex. Which is *huge* performance
>  penalty on big SMP systems. ASAIU The core idea of this implementation
>  is to guarantee that each quota modification will be written to journal
>  atomically. But in fact this is not always true, because dquot may be
>  changed after dquot modification, but before it was committed to disk.
>
>  | Task 1                           | Task 2                      |
>  | alloc_space(nr)                  |                             |
>  | ->spin_lock(dq_data_lock)        |                             |
>  | ->curspace += nr                 |                             |
>  | ->spin_unlock(dq_data_lock)      |                             |
>  | ->mark_dirty()                   | free_space(nr)              |
>  | -->write_dquot()                 | ->spin_lock(dq_data_lock)   |
>  | --->dquot_commit()               | ->curspace -= nr            |
>  | ---->commit_dqblk()              | ->spin_unlock(dq_data_lock) |
>  | ----->spin_lock(dq_data_lock)    |                             |
>  | ----->mem2disk_dqblk(ddq, dquot) | <<< Copy updated value      |
>  | ----->spin_unlock(dq_data_lock)  |                             |
>  | ----->quota_write()              |                             |
> Quota corruption not happens only because quota modification caller
> started journal already. And ext3/4 allow only one running quota
> at a time. Let's exploit this fact and avoid writing quota each time.
> Act similar to dirty_for_io in general write_back path in page-cache.
> If we have found what other task already started on copy and write the
> dquot then we skip actual quota_write stage. And let that task do the job.
> Side effect: Task which skip real quota_write() will not get an error
> (if any). But this is not big deal because:
>  1) Any error has global consequences (RO_remount, err_print, etc).
>  2) Real IO is differed till the journall_commit.
>
> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
> ---
>  fs/ext4/super.c          |   37 ++++++++++++++++++++++++++++++-------
>  fs/quota/dquot.c         |   16 +++++++++++++---
>  include/linux/quotaops.h |    1 +
>  3 files changed, 44 insertions(+), 10 deletions(-)
>
> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
> index 3929091..164217e 100644
> --- a/fs/ext4/super.c
> +++ b/fs/ext4/super.c
> @@ -3777,14 +3777,37 @@ static int ext4_release_dquot(struct dquot *dquot)
>  
>  static int ext4_mark_dquot_dirty(struct dquot *dquot)
>  {
> -	/* Are we journaling quotas? */
> -	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
> -	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
> -		dquot_mark_dquot_dirty(dquot);
> -		return ext4_write_dquot(dquot);
> -	} else {
> +	int ret, err;
> +	handle_t *handle;
> +	struct inode *inode;
> +
> +	/* Are we not journaling quotas? */
> +	if (!EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] &&
> +	    !EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA])
>  		return dquot_mark_dquot_dirty(dquot);
> -	}
> +
> +	/* journaling quotas case */
> +	inode = dquot_to_inode(dquot);
> +	handle = ext4_journal_start(inode,
> +				EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
> +	if (IS_ERR(handle))
> +		return PTR_ERR(handle);
> +	if (!__dquot_mark_dquot_dirty(dquot))
> +		ret = dquot_commit(dquot);
> +	else
> +		/*
> +		 * Dquot was already dirty. This means that other task already
> +		 * started a transaction but not clear dirty bit yet (see
> +		 * dquot_commit). Since the only one running transaction is
> +		 * possible at a time. Then that task belongs to the same
> +		 * transaction. We don'n have to actually write dquot changes
> +		 * because that task will write it for for us.
> +		 */
> +		ret = 0;
> +	err = ext4_journal_stop(handle);
> +	if (!ret)
> +		ret = err;
> +	return ret;
>  }
>  
>  static int ext4_write_info(struct super_block *sb, int type)
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 6979224..e03eea0 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -311,14 +311,24 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
>  {
>  	return dquot->dq_sb->dq_op->mark_dirty(dquot);
>  }
> -
> -int dquot_mark_dquot_dirty(struct dquot *dquot)
> +/* mark dquot dirty in atomic meaner, and return old dirty state */
> +int __dquot_mark_dquot_dirty(struct dquot *dquot)
>  {
> +	int ret = 1;
>  	spin_lock(&dq_list_lock);
> -	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
> +	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
>  		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
>  				info[dquot->dq_type].dqi_dirty_list);
> +		ret = 0;
> +	}
>  	spin_unlock(&dq_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(__dquot_mark_dquot_dirty);
> +
> +int dquot_mark_dquot_dirty(struct dquot *dquot)
> +{
> +	__dquot_mark_dquot_dirty(dquot);
>  	return 0;
>  }
>  EXPORT_SYMBOL(dquot_mark_dquot_dirty);
> diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
> index ae7ef25..c950f7d 100644
> --- a/include/linux/quotaops.h
> +++ b/include/linux/quotaops.h
> @@ -56,6 +56,7 @@ int dquot_commit(struct dquot *dquot);
>  int dquot_acquire(struct dquot *dquot);
>  int dquot_release(struct dquot *dquot);
>  int dquot_commit_info(struct super_block *sb, int type);
> +int __dquot_mark_dquot_dirty(struct dquot *dquot);
>  int dquot_mark_dquot_dirty(struct dquot *dquot);
>  
>  int vfs_quota_on(struct super_block *sb, int type, int format_id,

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]  ext4: journalled quota seed-up
  2009-12-17  1:33   ` Dmitry Monakhov
@ 2009-12-17 10:07     ` Dmitry Monakhov
  0 siblings, 0 replies; 6+ messages in thread
From: Dmitry Monakhov @ 2009-12-17 10:07 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel, linux-ext4

Dmitry Monakhov <dmonakhov@openvz.org> writes:

> Dmitry Monakhov <dmonakhov@openvz.org> writes:
>
> Sorry forgot to add linux-ext4@vger.kernel.org list to CC



>>  Currently each quota modification result in write_dquot()
>>  and later dquot_commit().  This means what each quota modification
>>  function must wait for dqio_mutex. Which is *huge* performance
>>  penalty on big SMP systems.
performance results:
simple test case: pwrite(,,40960,0); ftruncate(,0) in a loop
measured value: ops/ms == loops per millisecond
mount: with journaled quota, and nodelalloc in order to force
curspace modification for each operation.
It seems it would be reasonable to prepare the same patch for ext3.
| write-truncate | w/o quot | w jquot | w jquot mk_drt patch |
|   nodelalloc   |          |         |                      |
|     NR tasks 1 |     20.4 |    14.5 |                14.83 |
|                |     20.3 |    14.7 |                 14.7 |
|                |     20.4 |    14.8 |                 14.8 |
|                |          |         |                      |
|              4 |      6.1 |    4.76 |                  6.3 |
|                |      6.3 |     5.1 |                  6.1 |
|                |      6.5 |     4.8 |                  6.3 |
|                |          |         |                      |
|              8 |     3.75 |    2.59 |                 3.34 |
|                |      4.0 |     2.6 |                 3.44 |
|                |     3.76 |    2.63 |                 3.47 |
>>  ASAIU The core idea of this implementation
>>  is to guarantee that each quota modification will be written to journal
>>  atomically. But in fact this is not always true, because dquot may be
>>  changed after dquot modification, but before it was committed to disk.
>>
>>  | Task 1                           | Task 2                      |
>>  | alloc_space(nr)                  |                             |
>>  | ->spin_lock(dq_data_lock)        |                             |
>>  | ->curspace += nr                 |                             |
>>  | ->spin_unlock(dq_data_lock)      |                             |
>>  | ->mark_dirty()                   | free_space(nr)              |
>>  | -->write_dquot()                 | ->spin_lock(dq_data_lock)   |
>>  | --->dquot_commit()               | ->curspace -= nr            |
>>  | ---->commit_dqblk()              | ->spin_unlock(dq_data_lock) |
>>  | ----->spin_lock(dq_data_lock)    |                             |
>>  | ----->mem2disk_dqblk(ddq, dquot) | <<< Copy updated value      |
>>  | ----->spin_unlock(dq_data_lock)  |                             |
>>  | ----->quota_write()              |                             |
>> Quota corruption not happens only because quota modification caller
>> started journal already. And ext3/4 allow only one running quota
>> at a time. Let's exploit this fact and avoid writing quota each time.
>> Act similar to dirty_for_io in general write_back path in page-cache.
>> If we have found what other task already started on copy and write the
>> dquot then we skip actual quota_write stage. And let that task do the job.
>> Side effect: Task which skip real quota_write() will not get an error
>> (if any). But this is not big deal because:
>>  1) Any error has global consequences (RO_remount, err_print, etc).
>>  2) Real IO is differed till the journall_commit.
>>
>> Signed-off-by: Dmitry Monakhov <dmonakhov@openvz.org>
>> ---
>>  fs/ext4/super.c          |   37 ++++++++++++++++++++++++++++++-------
>>  fs/quota/dquot.c         |   16 +++++++++++++---
>>  include/linux/quotaops.h |    1 +
>>  3 files changed, 44 insertions(+), 10 deletions(-)
>>
>> diff --git a/fs/ext4/super.c b/fs/ext4/super.c
>> index 3929091..164217e 100644
>> --- a/fs/ext4/super.c
>> +++ b/fs/ext4/super.c
>> @@ -3777,14 +3777,37 @@ static int ext4_release_dquot(struct dquot *dquot)
>>  
>>  static int ext4_mark_dquot_dirty(struct dquot *dquot)
>>  {
>> -	/* Are we journaling quotas? */
>> -	if (EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] ||
>> -	    EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA]) {
>> -		dquot_mark_dquot_dirty(dquot);
>> -		return ext4_write_dquot(dquot);
>> -	} else {
>> +	int ret, err;
>> +	handle_t *handle;
>> +	struct inode *inode;
>> +
>> +	/* Are we not journaling quotas? */
>> +	if (!EXT4_SB(dquot->dq_sb)->s_qf_names[USRQUOTA] &&
>> +	    !EXT4_SB(dquot->dq_sb)->s_qf_names[GRPQUOTA])
>>  		return dquot_mark_dquot_dirty(dquot);
>> -	}
>> +
>> +	/* journaling quotas case */
>> +	inode = dquot_to_inode(dquot);
>> +	handle = ext4_journal_start(inode,
>> +				EXT4_QUOTA_TRANS_BLOCKS(dquot->dq_sb));
>> +	if (IS_ERR(handle))
>> +		return PTR_ERR(handle);
>> +	if (!__dquot_mark_dquot_dirty(dquot))
>> +		ret = dquot_commit(dquot);
>> +	else
>> +		/*
>> +		 * Dquot was already dirty. This means that other task already
>> +		 * started a transaction but not clear dirty bit yet (see
>> +		 * dquot_commit). Since the only one running transaction is
>> +		 * possible at a time. Then that task belongs to the same
>> +		 * transaction. We don'n have to actually write dquot changes
>> +		 * because that task will write it for for us.
>> +		 */
>> +		ret = 0;
>> +	err = ext4_journal_stop(handle);
>> +	if (!ret)
>> +		ret = err;
>> +	return ret;
>>  }
>>  
>>  static int ext4_write_info(struct super_block *sb, int type)
>> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
>> index 6979224..e03eea0 100644
>> --- a/fs/quota/dquot.c
>> +++ b/fs/quota/dquot.c
>> @@ -311,14 +311,24 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
>>  {
>>  	return dquot->dq_sb->dq_op->mark_dirty(dquot);
>>  }
>> -
>> -int dquot_mark_dquot_dirty(struct dquot *dquot)
>> +/* mark dquot dirty in atomic meaner, and return old dirty state */
>> +int __dquot_mark_dquot_dirty(struct dquot *dquot)
>>  {
>> +	int ret = 1;
>>  	spin_lock(&dq_list_lock);
>> -	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
>> +	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
>>  		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
>>  				info[dquot->dq_type].dqi_dirty_list);
>> +		ret = 0;
>> +	}
>>  	spin_unlock(&dq_list_lock);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(__dquot_mark_dquot_dirty);
>> +
>> +int dquot_mark_dquot_dirty(struct dquot *dquot)
>> +{
>> +	__dquot_mark_dquot_dirty(dquot);
>>  	return 0;
>>  }
>>  EXPORT_SYMBOL(dquot_mark_dquot_dirty);
>> diff --git a/include/linux/quotaops.h b/include/linux/quotaops.h
>> index ae7ef25..c950f7d 100644
>> --- a/include/linux/quotaops.h
>> +++ b/include/linux/quotaops.h
>> @@ -56,6 +56,7 @@ int dquot_commit(struct dquot *dquot);
>>  int dquot_acquire(struct dquot *dquot);
>>  int dquot_release(struct dquot *dquot);
>>  int dquot_commit_info(struct super_block *sb, int type);
>> +int __dquot_mark_dquot_dirty(struct dquot *dquot);
>>  int dquot_mark_dquot_dirty(struct dquot *dquot);
>>  
>>  int vfs_quota_on(struct super_block *sb, int type, int format_id,
> --
> To unsubscribe from this list: send the line "unsubscribe linux-ext4" in
> the body of a message to majordomo@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]  ext4: journalled quota seed-up
  2009-12-17  1:15 ` [PATCH] ext4: journalled quota seed-up Dmitry Monakhov
  2009-12-17  1:33   ` Dmitry Monakhov
@ 2009-12-17 16:06   ` Jan Kara
  2009-12-17 16:37     ` Dmitry Monakhov
  1 sibling, 1 reply; 6+ messages in thread
From: Jan Kara @ 2009-12-17 16:06 UTC (permalink / raw)
  To: Dmitry Monakhov; +Cc: Jan Kara, linux-fsdevel

  Hi,

On Thu 17-12-09 04:15:34, Dmitry Monakhov wrote:
>  Currently each quota modification results in write_dquot()
>  and later dquot_commit().  This means that each quota modification
>  function must wait for dqio_mutex, which is a *huge* performance
>  penalty on big SMP systems. AFAIU, the core idea of this implementation
>  is to guarantee that each quota modification will be written to the
>  journal atomically. But in fact this is not always true, because the
>  dquot may be changed after a dquot modification, but before it is
>  committed to disk.
> 
>  | Task 1                           | Task 2                      |
>  | alloc_space(nr)                  |                             |
>  | ->spin_lock(dq_data_lock)        |                             |
>  | ->curspace += nr                 |                             |
>  | ->spin_unlock(dq_data_lock)      |                             |
>  | ->mark_dirty()                   | free_space(nr)              |
>  | -->write_dquot()                 | ->spin_lock(dq_data_lock)   |
>  | --->dquot_commit()               | ->curspace -= nr            |
>  | ---->commit_dqblk()              | ->spin_unlock(dq_data_lock) |
>  | ----->spin_lock(dq_data_lock)    |                             |
>  | ----->mem2disk_dqblk(ddq, dquot) | <<< Copy updated value      |
>  | ----->spin_unlock(dq_data_lock)  |                             |
>  | ----->quota_write()              |                             |
> Quota corruption does not happen only because the quota modification
> caller has already started the journal. And ext3/4 allow only one
> running transaction at a time.
  It was actually designed to work this way. Ext3/4 handles, for example,
inodes like this as well... You're right that it might be good to document
this fact, as in the future someone might come up with a filesystem with a
different scheme.

> Let's exploit this fact and avoid writing the quota each time.
> Act similarly to dirty_for_io in the general writeback path in the
> page cache. If we find that another task has already started to copy
> and write the dquot, then we skip the actual quota_write stage and let
> that task do the job.
> Side effect: a task which skips the real quota_write() will not get an
> error (if any). But this is not a big deal because:
>  1) Any error has global consequences (RO_remount, err_print, etc).
>  2) Real IO is deferred till the journal_commit.
...
> +	if (!__dquot_mark_dquot_dirty(dquot))
> +		ret = dquot_commit(dquot);
  Hmm, but dquot_commit() clears the dirty bit just after acquiring
dqio_mutex so if I get it right, you save something only if someone is
waiting contended on dqio_mutex. So isn't this just working around a
contention on that mutex (and in a not very nice way)? For case where dquot
is already active (DQ_ACTIVE_B set), we can certainly make the locking much
more lightweight - essentially just a per-dquot lock for the time until we
copy the data from dquot to buffer_head...
  Also we could make writing dquot more lightweight (have ext4 specific
part of dquot and cache there buffer head which carries the dquot). Even
more, we could track whether the dquot is already part of the
transaction and save get_write_access etc for that case (but that already
starts to be ugly)...

> +	else
> +		/*
> +		 * Dquot was already dirty. This means that other task already
> +		 * started a transaction but not clear dirty bit yet (see
> +		 * dquot_commit). Since the only one running transaction is
> +		 * possible at a time. Then that task belongs to the same
> +		 * transaction. We don'n have to actually write dquot changes
> +		 * because that task will write it for for us.
> +		 */
> +		ret = 0;
> +	err = ext4_journal_stop(handle);
> +	if (!ret)
> +		ret = err;
> +	return ret;
>  }
>  
>  static int ext4_write_info(struct super_block *sb, int type)
> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
> index 6979224..e03eea0 100644
> --- a/fs/quota/dquot.c
> +++ b/fs/quota/dquot.c
> @@ -311,14 +311,24 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
>  {
>  	return dquot->dq_sb->dq_op->mark_dirty(dquot);
>  }
> -
> -int dquot_mark_dquot_dirty(struct dquot *dquot)
> +/* mark dquot dirty in atomic meaner, and return old dirty state */
> +int __dquot_mark_dquot_dirty(struct dquot *dquot)
>  {
> +	int ret = 1;
>  	spin_lock(&dq_list_lock);
> -	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
> +	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
>  		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
>  				info[dquot->dq_type].dqi_dirty_list);
> +		ret = 0;
> +	}
>  	spin_unlock(&dq_list_lock);
> +	return ret;
> +}
> +EXPORT_SYMBOL(__dquot_mark_dquot_dirty);
> +
> +int dquot_mark_dquot_dirty(struct dquot *dquot)
> +{
> +	__dquot_mark_dquot_dirty(dquot);
>  	return 0;
>  }
  Please just do the change to dquot_mark_dquot_dirty(). There's no need
for the new function.

								Honza
-- 
Jan Kara <jack@suse.cz>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 6+ messages in thread

* Re: [PATCH]  ext4: journalled quota speed-up
  2009-12-17 16:06   ` Jan Kara
@ 2009-12-17 16:37     ` Dmitry Monakhov
  0 siblings, 0 replies; 6+ messages in thread
From: Dmitry Monakhov @ 2009-12-17 16:37 UTC (permalink / raw)
  To: Jan Kara; +Cc: linux-fsdevel

Jan Kara <jack@suse.cz> writes:

>   Hi,
>
> On Thu 17-12-09 04:15:34, Dmitry Monakhov wrote:
>>  Currently each quota modification results in write_dquot()
>>  and later dquot_commit().  This means that each quota modification
>>  function must wait for dqio_mutex, which is a *huge* performance
>>  penalty on big SMP systems. AFAIU, the core idea of this implementation
>>  is to guarantee that each quota modification will be written to the
>>  journal atomically. But in fact this is not always true, because the
>>  dquot may be changed after a dquot modification, but before it is
>>  committed to disk.
>> 
>>  | Task 1                           | Task 2                      |
>>  | alloc_space(nr)                  |                             |
>>  | ->spin_lock(dq_data_lock)        |                             |
>>  | ->curspace += nr                 |                             |
>>  | ->spin_unlock(dq_data_lock)      |                             |
>>  | ->mark_dirty()                   | free_space(nr)              |
>>  | -->write_dquot()                 | ->spin_lock(dq_data_lock)   |
>>  | --->dquot_commit()               | ->curspace -= nr            |
>>  | ---->commit_dqblk()              | ->spin_unlock(dq_data_lock) |
>>  | ----->spin_lock(dq_data_lock)    |                             |
>>  | ----->mem2disk_dqblk(ddq, dquot) | <<< Copy updated value      |
>>  | ----->spin_unlock(dq_data_lock)  |                             |
>>  | ----->quota_write()              |                             |
>> Quota corruption does not happen only because the quota modification
>> caller has already started the journal. And ext3/4 allow only one
>> running transaction at a time.
>   It was actually designed to work this way. Ext3/4 handles, for example,
> inodes like this as well... You're right that it might be good to document
> this fact, as in the future someone might come up with a filesystem with a
> different scheme.
>
>> Let's exploit this fact and avoid writing the quota each time.
>> Act similarly to dirty_for_io in the general writeback path in the
>> page cache. If we find that another task has already started to copy
>> and write the dquot, then we skip the actual quota_write stage and let
>> that task do the job.
>> Side effect: a task which skips the real quota_write() will not get an
>> error (if any). But this is not a big deal because:
>>  1) Any error has global consequences (RO_remount, err_print, etc).
>>  2) Real IO is deferred till the journal_commit.
> ...
>> +	if (!__dquot_mark_dquot_dirty(dquot))
>> +		ret = dquot_commit(dquot);
>   Hmm, but dquot_commit() clears the dirty bit just after acquiring
> dqio_mutex so if I get it right, you save something only if someone is
> waiting contended on dqio_mutex. So isn't this just working around a
> contention on that mutex (and in a not very nice way)? For case where dquot
> is already active (DQ_ACTIVE_B set), we can certainly make the locking much
> more lightweight - essentially just a per-dquot lock for the time until we
> copy the data from dquot to buffer_head...
Yes, this patch is aimed at decreasing contention on dqio_mutex only,
because it is the biggest performance consumer. The patch is not
complete. This is just a proof-of-concept patch. And according to my
measurements, quota overhead becomes approximately
(1 / nr_concurrent_task), which is not perfect. And of course avoiding
dqio_mutex entirely would be really good.
>   Also we could make writing dquot more lightweight (have ext4 specific
> part of dquot and cache there buffer head which carries the dquot). Even
> more, we could track whether the dquot is already part of the
> transaction and save get_write_access etc for that case (but that already
> starts to be ugly)...
When I thought about this I came to the idea of introducing a callback
which dquot_write() may attach to the journal, to be called at
the last journal_stop() :)
But your idea about BH tracking is much better.
>
>> +	else
>> +		/*
>> +		 * Dquot was already dirty. This means that other task already
>> +		 * started a transaction but not clear dirty bit yet (see
>> +		 * dquot_commit). Since the only one running transaction is
>> +		 * possible at a time. Then that task belongs to the same
>> +		 * transaction. We don'n have to actually write dquot changes
>> +		 * because that task will write it for for us.
>> +		 */
>> +		ret = 0;
>> +	err = ext4_journal_stop(handle);
>> +	if (!ret)
>> +		ret = err;
>> +	return ret;
>>  }
>>  
>>  static int ext4_write_info(struct super_block *sb, int type)
>> diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
>> index 6979224..e03eea0 100644
>> --- a/fs/quota/dquot.c
>> +++ b/fs/quota/dquot.c
>> @@ -311,14 +311,24 @@ static inline int mark_dquot_dirty(struct dquot *dquot)
>>  {
>>  	return dquot->dq_sb->dq_op->mark_dirty(dquot);
>>  }
>> -
>> -int dquot_mark_dquot_dirty(struct dquot *dquot)
>> +/* mark dquot dirty in atomic meaner, and return old dirty state */
>> +int __dquot_mark_dquot_dirty(struct dquot *dquot)
>>  {
>> +	int ret = 1;
>>  	spin_lock(&dq_list_lock);
>> -	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags))
>> +	if (!test_and_set_bit(DQ_MOD_B, &dquot->dq_flags)) {
>>  		list_add(&dquot->dq_dirty, &sb_dqopt(dquot->dq_sb)->
>>  				info[dquot->dq_type].dqi_dirty_list);
>> +		ret = 0;
>> +	}
>>  	spin_unlock(&dq_list_lock);
>> +	return ret;
>> +}
>> +EXPORT_SYMBOL(__dquot_mark_dquot_dirty);
>> +
>> +int dquot_mark_dquot_dirty(struct dquot *dquot)
>> +{
>> +	__dquot_mark_dquot_dirty(dquot);
>>  	return 0;
>>  }
>   Please just do the change to dquot_mark_dquot_dirty(). There's no need
> for the new function.
>
> 								Honza

^ permalink raw reply	[flat|nested] 6+ messages in thread

end of thread, other threads:[~2009-12-17 16:37 UTC | newest]

Thread overview: 6+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <0KUR00I4YVKZG820@mail.2ka.mipt.ru>
2009-12-17  1:14 ` [PATCH] quota: quota initialization improvements Dmitry Monakhov
2009-12-17  1:15 ` [PATCH] ext4: journalled quota speed-up Dmitry Monakhov
2009-12-17  1:33   ` Dmitry Monakhov
2009-12-17 10:07     ` Dmitry Monakhov
2009-12-17 16:06   ` Jan Kara
2009-12-17 16:37     ` Dmitry Monakhov

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.