linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches
       [not found]   ` <8844b550-d91c-38d5-997a-a899d1e4aa42@gmail.com>
@ 2017-12-11 19:50     ` Tejun Heo
  2017-12-12  6:04       ` xuejiufei
  2017-12-12 16:38       ` [PATCH v2] " Tejun Heo
  0 siblings, 2 replies; 8+ messages in thread
From: Tejun Heo @ 2017-12-11 19:50 UTC (permalink / raw)
  To: Jens Axboe, Jan Kara
  Cc: cgroups, xuejiufei, kernel-team, linux-kernel, linux-fsdevel

sync_inodes_sb() can race against cgwb (cgroup writeback) membership
switches and fail to writeback some inodes.  For example, if an inode
switches to another wb while sync_inodes_sb() is in progress, the new
wb might not be visible to bdi_split_work_to_wbs() at all or the inode
might jump from a wb which hasn't issued writebacks yet to one which
already has.

This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
switch path against sync_inodes_sb() so that sync_inodes_sb() is
guaranteed to see all the target wbs and inodes can't jump wbs to
escape syncing.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jiufei Xue <xuejiufei@gmail.com>
Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
---
 fs/fs-writeback.c                |   50 +++++++++++++++++++++++++++++++++++++--
 include/linux/backing-dev-defs.h |    1 
 mm/backing-dev.c                 |    3 ++
 3 files changed, 52 insertions(+), 2 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
 	struct work_struct	work;
 };
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	up_write(&bdi->wb_switch_rwsem);
+}
+
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(str
 	void **slot;
 
 	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
+
+	/*
 	 * By the time control reaches here, RCU grace period has passed
 	 * since I_WB_SWITCH assertion and all wb stat update transactions
 	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
@@ -435,6 +452,8 @@ skip_switch:
 	spin_unlock(&new_wb->list_lock);
 	spin_unlock(&old_wb->list_lock);
 
+	up_read(&bdi->wb_switch_rwsem);
+
 	if (switched) {
 		wb_wakeup(new_wb);
 		wb_put(old_wb);
@@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inod
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
+	/*
+	 * Avoid starting new switches while sync_inodes_sb() is in
+	 * progress.  Otherwise, if the down_write protected issue path
+	 * blocks heavily, we might end up starting a large number of
+	 * switches which will block on the rwsem.
+	 */
+	if (!down_read_trylock(&bdi->wb_switch_rwsem))
+		return;
+
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
-		return;
+		goto out_unlock;
 
 	/* find and pin the new wb */
 	rcu_read_lock();
@@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inod
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-	return;
+	goto out_unlock;
 
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
+out_unlock:
+	up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
@@ -893,6 +923,9 @@ fs_initcall(cgroup_writeback_init);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
 	__releases(&inode->i_lock)
@@ -2422,8 +2455,21 @@ void sync_inodes_sb(struct super_block *
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	/*
+	 * Protect against inode wb switch in inode_switch_wbs_work_fn().
+	 * We want to synchronize syncs against switches.  Synchronizing
+	 * among syncs isn't necessary but it shouldn't matter especially
+	 * as we're only protecting the issuing.
+	 *
+	 * Once we grab the rwsem, bdi_split_work_to_wbs() is guaranteed to
+	 * see all the target wbs.  If the wb membership hasn't changed,
+	 * through the ordering visible to the caller; otherwise, through
+	 * the transitive ordering via the rwsem.
+	 */
+	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
 	wb_wait_for_completion(bdi, &done);
+	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
 }
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -189,6 +189,7 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
+	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -813,6 +813,9 @@ static int cgwb_bdi_init(struct backing_
 		wb_congested_put(bdi->wb_congested);
 		return err;
 	}
+
+	init_rwsem(&bdi->wb_switch_rwsem);
+
 	return 0;
 }
 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-11 19:50     ` [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches Tejun Heo
@ 2017-12-12  6:04       ` xuejiufei
  2017-12-12 16:30         ` Tejun Heo
  2017-12-12 16:38       ` [PATCH v2] " Tejun Heo
  1 sibling, 1 reply; 8+ messages in thread
From: xuejiufei @ 2017-12-12  6:04 UTC (permalink / raw)
  To: Tejun Heo, Jens Axboe, Jan Kara
  Cc: cgroups, kernel-team, linux-kernel, linux-fsdevel

Hi Tejun,

On 2017/12/12 上午3:50, Tejun Heo wrote:
> sync_inodes_sb() can race against cgwb (cgroup writeback) membership
> switches and fail to writeback some inodes.  For example, if an inode
> switches to another wb while sync_inodes_sb() is in progress, the new
> wb might not be visible to bdi_split_work_to_wbs() at all or the inode
> might jump from a wb which hasn't issued writebacks yet to one which
> already has.
> 
> This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
> switch path against sync_inodes_sb() so that sync_inodes_sb() is
> guaranteed to see all the target wbs and inodes can't jump wbs to
> escape syncing.
> 
> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Jiufei Xue <xuejiufei@gmail.com>
> Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
> ---
>  fs/fs-writeback.c                |   50 +++++++++++++++++++++++++++++++++++++--
>  include/linux/backing-dev-defs.h |    1 
>  mm/backing-dev.c                 |    3 ++
>  3 files changed, 52 insertions(+), 2 deletions(-)
> 
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
>  	struct work_struct	work;
>  };
>  
> +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
> +{
> +	down_write(&bdi->wb_switch_rwsem);
> +}
> +
> +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
> +{
> +	up_write(&bdi->wb_switch_rwsem);
> +}
> +
>  static void inode_switch_wbs_work_fn(struct work_struct *work)
>  {
>  	struct inode_switch_wbs_context *isw =
>  		container_of(work, struct inode_switch_wbs_context, work);
>  	struct inode *inode = isw->inode;
> +	struct backing_dev_info *bdi = inode_to_bdi(inode);
>  	struct address_space *mapping = inode->i_mapping;
>  	struct bdi_writeback *old_wb = inode->i_wb;
>  	struct bdi_writeback *new_wb = isw->new_wb;
> @@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(str
>  	void **slot;
>  
>  	/*
> +	 * If @inode switches cgwb membership while sync_inodes_sb() is
> +	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
> +	 */
> +	down_read(&bdi->wb_switch_rwsem);
> +
> +	/*
>  	 * By the time control reaches here, RCU grace period has passed
>  	 * since I_WB_SWITCH assertion and all wb stat update transactions
>  	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
> @@ -435,6 +452,8 @@ skip_switch:
>  	spin_unlock(&new_wb->list_lock);
>  	spin_unlock(&old_wb->list_lock);
>  
> +	up_read(&bdi->wb_switch_rwsem);
> +
>  	if (switched) {
>  		wb_wakeup(new_wb);
>  		wb_put(old_wb);
> @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inod
>  	if (inode->i_state & I_WB_SWITCH)
>  		return;
>  
> +	/*
> +	 * Avoid starting new switches while sync_inodes_sb() is in
> +	 * progress.  Otherwise, if the down_write protected issue path
> +	 * blocks heavily, we might end up starting a large number of
> +	 * switches which will block on the rwsem.
> +	 */
> +	if (!down_read_trylock(&bdi->wb_switch_rwsem))
> +		return;
> +
>  	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
>  	if (!isw)
> -		return;
> +		goto out_unlock;
>  
>  	/* find and pin the new wb */
>  	rcu_read_lock();
> @@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inod
>  	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
>  	 */
>  	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
> -	return;
> +	goto out_unlock;
>  
>  out_free:
>  	if (isw->new_wb)
>  		wb_put(isw->new_wb);
>  	kfree(isw);
> +out_unlock:
> +	up_read(&bdi->wb_switch_rwsem);
>  }
>  
>  /**
> @@ -893,6 +923,9 @@ fs_initcall(cgroup_writeback_init);
>  
>  #else	/* CONFIG_CGROUP_WRITEBACK */
>  
> +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
> +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
> +
>  static struct bdi_writeback *
>  locked_inode_to_wb_and_lock_list(struct inode *inode)
>  	__releases(&inode->i_lock)
> @@ -2422,8 +2455,21 @@ void sync_inodes_sb(struct super_block *
>  		return;
>  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
>  
> +	/*
> +	 * Protect against inode wb switch in inode_switch_wbs_work_fn().
> +	 * We want to synchronize syncs against switches.  Synchronizing
> +	 * among syncs isn't necessary but it shouldn't matter especially
> +	 * as we're only protecting the issuing.
> +	 *
> +	 * Once we grab the rwsem, bdi_split_work_to_wbs() is guaranteed to
> +	 * see all the target wbs.  If the wb membership hasn't changed,
> +	 * through the ordering visible to the caller; otherwise, through
> +	 * the transitive ordering via the rwsem.
> +	 */
> +	bdi_down_write_wb_switch_rwsem(bdi);
>  	bdi_split_work_to_wbs(bdi, &work, false);
>  	wb_wait_for_completion(bdi, &done);
> +	bdi_up_write_wb_switch_rwsem(bdi);
>  
>  	wait_sb_inodes(sb);
>  }
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -189,6 +189,7 @@ struct backing_dev_info {
>  #ifdef CONFIG_CGROUP_WRITEBACK
>  	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
>  	struct rb_root cgwb_congested_tree; /* their congested states */
> +	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
>  #else
>  	struct bdi_writeback_congested *wb_congested;
>  #endif
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -813,6 +813,9 @@ static int cgwb_bdi_init(struct backing_
>  		wb_congested_put(bdi->wb_congested);
>  		return err;
>  	}
> +
> +	init_rwsem(&bdi->wb_switch_rwsem);
> +

The initialization of wb_switch_rwsem should be in
the conditional compilation of CONFIG_CGROUP_WRITEBACK,
right?

Thanks.

Xuejiufei
>  	return 0;
>  }
>  
> 

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-12  6:04       ` xuejiufei
@ 2017-12-12 16:30         ` Tejun Heo
  0 siblings, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2017-12-12 16:30 UTC (permalink / raw)
  To: xuejiufei
  Cc: Jens Axboe, Jan Kara, cgroups, kernel-team, linux-kernel, linux-fsdevel

Hello,

On Tue, Dec 12, 2017 at 02:04:45PM +0800, xuejiufei wrote:
> The initialization of wb_switch_rwsem should be in
> the conditional compilation of CONFIG_CGROUP_WRITEBACK,
> right?

Oops, you're right.  Will update the patch.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* [PATCH v2] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-11 19:50     ` [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches Tejun Heo
  2017-12-12  6:04       ` xuejiufei
@ 2017-12-12 16:38       ` Tejun Heo
  2017-12-13 11:00         ` Jan Kara
  1 sibling, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2017-12-12 16:38 UTC (permalink / raw)
  To: Jens Axboe, Jan Kara
  Cc: cgroups, xuejiufei, kernel-team, linux-kernel, linux-fsdevel

sync_inodes_sb() can race against cgwb (cgroup writeback) membership
switches and fail to writeback some inodes.  For example, if an inode
switches to another wb while sync_inodes_sb() is in progress, the new
wb might not be visible to bdi_split_work_to_wbs() at all or the inode
might jump from a wb which hasn't issued writebacks yet to one which
already has.

This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
switch path against sync_inodes_sb() so that sync_inodes_sb() is
guaranteed to see all the target wbs and inodes can't jump wbs to
escape syncing.

v2: Fixed misplaced rwsem init.  Spotted by Jiufei.

Signed-off-by: Tejun Heo <tj@kernel.org>
Reported-by: Jiufei Xue <xuejiufei@gmail.com>
Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
---
 fs/fs-writeback.c                |   40 +++++++++++++++++++++++++++++++++++++--
 include/linux/backing-dev-defs.h |    1 
 mm/backing-dev.c                 |    1 
 3 files changed, 40 insertions(+), 2 deletions(-)

--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
 	struct work_struct	work;
 };
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	down_write(&bdi->wb_switch_rwsem);
+}
+
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
+{
+	up_write(&bdi->wb_switch_rwsem);
+}
+
 static void inode_switch_wbs_work_fn(struct work_struct *work)
 {
 	struct inode_switch_wbs_context *isw =
 		container_of(work, struct inode_switch_wbs_context, work);
 	struct inode *inode = isw->inode;
+	struct backing_dev_info *bdi = inode_to_bdi(inode);
 	struct address_space *mapping = inode->i_mapping;
 	struct bdi_writeback *old_wb = inode->i_wb;
 	struct bdi_writeback *new_wb = isw->new_wb;
@@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(str
 	void **slot;
 
 	/*
+	 * If @inode switches cgwb membership while sync_inodes_sb() is
+	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
+	 */
+	down_read(&bdi->wb_switch_rwsem);
+
+	/*
 	 * By the time control reaches here, RCU grace period has passed
 	 * since I_WB_SWITCH assertion and all wb stat update transactions
 	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
@@ -435,6 +452,8 @@ skip_switch:
 	spin_unlock(&new_wb->list_lock);
 	spin_unlock(&old_wb->list_lock);
 
+	up_read(&bdi->wb_switch_rwsem);
+
 	if (switched) {
 		wb_wakeup(new_wb);
 		wb_put(old_wb);
@@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inod
 	if (inode->i_state & I_WB_SWITCH)
 		return;
 
+	/*
+	 * Avoid starting new switches while sync_inodes_sb() is in
+	 * progress.  Otherwise, if the down_write protected issue path
+	 * blocks heavily, we might end up starting a large number of
+	 * switches which will block on the rwsem.
+	 */
+	if (!down_read_trylock(&bdi->wb_switch_rwsem))
+		return;
+
 	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
 	if (!isw)
-		return;
+		goto out_unlock;
 
 	/* find and pin the new wb */
 	rcu_read_lock();
@@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inod
 	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
 	 */
 	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
-	return;
+	goto out_unlock;
 
 out_free:
 	if (isw->new_wb)
 		wb_put(isw->new_wb);
 	kfree(isw);
+out_unlock:
+	up_read(&bdi->wb_switch_rwsem);
 }
 
 /**
@@ -893,6 +923,9 @@ fs_initcall(cgroup_writeback_init);
 
 #else	/* CONFIG_CGROUP_WRITEBACK */
 
+static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
+
 static struct bdi_writeback *
 locked_inode_to_wb_and_lock_list(struct inode *inode)
 	__releases(&inode->i_lock)
@@ -2422,8 +2455,11 @@ void sync_inodes_sb(struct super_block *
 		return;
 	WARN_ON(!rwsem_is_locked(&sb->s_umount));
 
+	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
+	bdi_down_write_wb_switch_rwsem(bdi);
 	bdi_split_work_to_wbs(bdi, &work, false);
 	wb_wait_for_completion(bdi, &done);
+	bdi_up_write_wb_switch_rwsem(bdi);
 
 	wait_sb_inodes(sb);
 }
--- a/include/linux/backing-dev-defs.h
+++ b/include/linux/backing-dev-defs.h
@@ -189,6 +189,7 @@ struct backing_dev_info {
 #ifdef CONFIG_CGROUP_WRITEBACK
 	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
 	struct rb_root cgwb_congested_tree; /* their congested states */
+	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
 #else
 	struct bdi_writeback_congested *wb_congested;
 #endif
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -706,6 +706,7 @@ static int cgwb_bdi_init(struct backing_
 
 	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
 	bdi->cgwb_congested_tree = RB_ROOT;
+	init_rwsem(&bdi->wb_switch_rwsem);
 
 	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
 	if (!ret) {

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-12 16:38       ` [PATCH v2] " Tejun Heo
@ 2017-12-13 11:00         ` Jan Kara
  2017-12-13 15:39           ` Tejun Heo
  0 siblings, 1 reply; 8+ messages in thread
From: Jan Kara @ 2017-12-13 11:00 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jens Axboe, Jan Kara, cgroups, xuejiufei, kernel-team,
	linux-kernel, linux-fsdevel

On Tue 12-12-17 08:38:30, Tejun Heo wrote:
> sync_inodes_sb() can race against cgwb (cgroup writeback) membership
> switches and fail to writeback some inodes.  For example, if an inode
> switches to another wb while sync_inodes_sb() is in progress, the new
> wb might not be visible to bdi_split_work_to_wbs() at all or the inode
> might jump from a wb which hasn't issued writebacks yet to one which
> already has.
> 
> This patch adds backing_dev_info->wb_switch_rwsem to synchronize cgwb
> switch path against sync_inodes_sb() so that sync_inodes_sb() is
> guaranteed to see all the target wbs and inodes can't jump wbs to
> escape syncing.
> 
> v2: Fixed misplaced rwsem init.  Spotted by Jiufei.

OK, but this effectively prevents writeback from sync_inodes_sb() to ever
make inode switch wbs. Cannot that be abused in some way like making sure
writeback of our memcg is "invisible" by forcing it out using sync(2)? It
just looks a bit dangerous to me...

								Honza

> Signed-off-by: Tejun Heo <tj@kernel.org>
> Reported-by: Jiufei Xue <xuejiufei@gmail.com>
> Link: http://lkml.kernel.org/r/dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com
> ---
>  fs/fs-writeback.c                |   40 +++++++++++++++++++++++++++++++++++++--
>  include/linux/backing-dev-defs.h |    1 
>  mm/backing-dev.c                 |    1 
>  3 files changed, 40 insertions(+), 2 deletions(-)
> 
> --- a/fs/fs-writeback.c
> +++ b/fs/fs-writeback.c
> @@ -331,11 +331,22 @@ struct inode_switch_wbs_context {
>  	struct work_struct	work;
>  };
>  
> +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi)
> +{
> +	down_write(&bdi->wb_switch_rwsem);
> +}
> +
> +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi)
> +{
> +	up_write(&bdi->wb_switch_rwsem);
> +}
> +
>  static void inode_switch_wbs_work_fn(struct work_struct *work)
>  {
>  	struct inode_switch_wbs_context *isw =
>  		container_of(work, struct inode_switch_wbs_context, work);
>  	struct inode *inode = isw->inode;
> +	struct backing_dev_info *bdi = inode_to_bdi(inode);
>  	struct address_space *mapping = inode->i_mapping;
>  	struct bdi_writeback *old_wb = inode->i_wb;
>  	struct bdi_writeback *new_wb = isw->new_wb;
> @@ -344,6 +355,12 @@ static void inode_switch_wbs_work_fn(str
>  	void **slot;
>  
>  	/*
> +	 * If @inode switches cgwb membership while sync_inodes_sb() is
> +	 * being issued, sync_inodes_sb() might miss it.  Synchronize.
> +	 */
> +	down_read(&bdi->wb_switch_rwsem);
> +
> +	/*
>  	 * By the time control reaches here, RCU grace period has passed
>  	 * since I_WB_SWITCH assertion and all wb stat update transactions
>  	 * between unlocked_inode_to_wb_begin/end() are guaranteed to be
> @@ -435,6 +452,8 @@ skip_switch:
>  	spin_unlock(&new_wb->list_lock);
>  	spin_unlock(&old_wb->list_lock);
>  
> +	up_read(&bdi->wb_switch_rwsem);
> +
>  	if (switched) {
>  		wb_wakeup(new_wb);
>  		wb_put(old_wb);
> @@ -475,9 +494,18 @@ static void inode_switch_wbs(struct inod
>  	if (inode->i_state & I_WB_SWITCH)
>  		return;
>  
> +	/*
> +	 * Avoid starting new switches while sync_inodes_sb() is in
> +	 * progress.  Otherwise, if the down_write protected issue path
> +	 * blocks heavily, we might end up starting a large number of
> +	 * switches which will block on the rwsem.
> +	 */
> +	if (!down_read_trylock(&bdi->wb_switch_rwsem))
> +		return;
> +
>  	isw = kzalloc(sizeof(*isw), GFP_ATOMIC);
>  	if (!isw)
> -		return;
> +		goto out_unlock;
>  
>  	/* find and pin the new wb */
>  	rcu_read_lock();
> @@ -511,12 +539,14 @@ static void inode_switch_wbs(struct inod
>  	 * Let's continue after I_WB_SWITCH is guaranteed to be visible.
>  	 */
>  	call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
> -	return;
> +	goto out_unlock;
>  
>  out_free:
>  	if (isw->new_wb)
>  		wb_put(isw->new_wb);
>  	kfree(isw);
> +out_unlock:
> +	up_read(&bdi->wb_switch_rwsem);
>  }
>  
>  /**
> @@ -893,6 +923,9 @@ fs_initcall(cgroup_writeback_init);
>  
>  #else	/* CONFIG_CGROUP_WRITEBACK */
>  
> +static void bdi_down_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
> +static void bdi_up_write_wb_switch_rwsem(struct backing_dev_info *bdi) { }
> +
>  static struct bdi_writeback *
>  locked_inode_to_wb_and_lock_list(struct inode *inode)
>  	__releases(&inode->i_lock)
> @@ -2422,8 +2455,11 @@ void sync_inodes_sb(struct super_block *
>  		return;
>  	WARN_ON(!rwsem_is_locked(&sb->s_umount));
>  
> +	/* protect against inode wb switch, see inode_switch_wbs_work_fn() */
> +	bdi_down_write_wb_switch_rwsem(bdi);
>  	bdi_split_work_to_wbs(bdi, &work, false);
>  	wb_wait_for_completion(bdi, &done);
> +	bdi_up_write_wb_switch_rwsem(bdi);
>  
>  	wait_sb_inodes(sb);
>  }
> --- a/include/linux/backing-dev-defs.h
> +++ b/include/linux/backing-dev-defs.h
> @@ -189,6 +189,7 @@ struct backing_dev_info {
>  #ifdef CONFIG_CGROUP_WRITEBACK
>  	struct radix_tree_root cgwb_tree; /* radix tree of active cgroup wbs */
>  	struct rb_root cgwb_congested_tree; /* their congested states */
> +	struct rw_semaphore wb_switch_rwsem; /* no cgwb switch while syncing */
>  #else
>  	struct bdi_writeback_congested *wb_congested;
>  #endif
> --- a/mm/backing-dev.c
> +++ b/mm/backing-dev.c
> @@ -706,6 +706,7 @@ static int cgwb_bdi_init(struct backing_
>  
>  	INIT_RADIX_TREE(&bdi->cgwb_tree, GFP_ATOMIC);
>  	bdi->cgwb_congested_tree = RB_ROOT;
> +	init_rwsem(&bdi->wb_switch_rwsem);
>  
>  	ret = wb_init(&bdi->wb, bdi, 1, GFP_KERNEL);
>  	if (!ret) {
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-13 11:00         ` Jan Kara
@ 2017-12-13 15:39           ` Tejun Heo
  2017-12-19 13:04             ` Jan Kara
  0 siblings, 1 reply; 8+ messages in thread
From: Tejun Heo @ 2017-12-13 15:39 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jens Axboe, cgroups, xuejiufei, kernel-team, linux-kernel, linux-fsdevel

Hello,

On Wed, Dec 13, 2017 at 12:00:04PM +0100, Jan Kara wrote:
> OK, but this effectively prevents writeback from sync_inodes_sb() to ever
> make inode switch wbs. Cannot that be abused in some way like making sure
> writeback of our memcg is "invisible" by forcing it out using sync(2)? It
> just looks a bit dangerous to me...

That's true.  There are a couple mitigating factors tho.

* While it can delay switching during sync(2), it'll all still be
  recorded and the switch will happen soon if needed.

* sync(2) is hugely disruptive with or without this and can easily be
  used to DOS the whole system.  People are working on restricting the
  blast radius of sync(2) to mitigate this problem, which most likely
  make this a non-problem too.

If you can think of a better solution, I'm all ears.

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-13 15:39           ` Tejun Heo
@ 2017-12-19 13:04             ` Jan Kara
  2017-12-19 13:31               ` Tejun Heo
  0 siblings, 1 reply; 8+ messages in thread
From: Jan Kara @ 2017-12-19 13:04 UTC (permalink / raw)
  To: Tejun Heo
  Cc: Jan Kara, Jens Axboe, cgroups, xuejiufei, kernel-team,
	linux-kernel, linux-fsdevel

On Wed 13-12-17 07:39:30, Tejun Heo wrote:
> Hello,
> 
> On Wed, Dec 13, 2017 at 12:00:04PM +0100, Jan Kara wrote:
> > OK, but this effectively prevents writeback from sync_inodes_sb() to ever
> > make inode switch wbs. Cannot that be abused in some way like making sure
> > writeback of our memcg is "invisible" by forcing it out using sync(2)? It
> > just looks a bit dangerous to me...
> 
> That's true.  There are a couple mitigating factors tho.
> 
> * While it can delay switching during sync(2), it'll all still be
>   recorded and the switch will happen soon if needed.
> 
> * sync(2) is hugely disruptive with or without this and can easily be
>   used to DOS the whole system.  People are working on restricting the
>   blast radius of sync(2) to mitigate this problem, which most likely
>   make this a non-problem too.
> 
> If you can think of a better solution, I'm all ears.

After some thinking about this I don't have a better solution. So you can
add:

Acked-by: Jan Kara <jack@suse.cz>

								Honza
-- 
Jan Kara <jack@suse.com>
SUSE Labs, CR

^ permalink raw reply	[flat|nested] 8+ messages in thread

* Re: [PATCH v2] writeback: synchronize sync(2) against cgroup writeback membership switches
  2017-12-19 13:04             ` Jan Kara
@ 2017-12-19 13:31               ` Tejun Heo
  0 siblings, 0 replies; 8+ messages in thread
From: Tejun Heo @ 2017-12-19 13:31 UTC (permalink / raw)
  To: Jan Kara
  Cc: Jens Axboe, cgroups, xuejiufei, kernel-team, linux-kernel, linux-fsdevel

On Tue, Dec 19, 2017 at 02:04:54PM +0100, Jan Kara wrote:
> After some thinking about this I don't have a better solution. So you can
> add:
> 
> Acked-by: Jan Kara <jack@suse.cz>

Thanks, Jan.  Jens, can you please route this through block tree?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 8+ messages in thread

end of thread, other threads:[~2017-12-19 13:31 UTC | newest]

Thread overview: 8+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
     [not found] <dc694ae2-f07f-61e1-7097-7c8411cee12d@gmail.com>
     [not found] ` <20171205182007.GV2421075@devbig577.frc2.facebook.com>
     [not found]   ` <8844b550-d91c-38d5-997a-a899d1e4aa42@gmail.com>
2017-12-11 19:50     ` [PATCH] writeback: synchronize sync(2) against cgroup writeback membership switches Tejun Heo
2017-12-12  6:04       ` xuejiufei
2017-12-12 16:30         ` Tejun Heo
2017-12-12 16:38       ` [PATCH v2] " Tejun Heo
2017-12-13 11:00         ` Jan Kara
2017-12-13 15:39           ` Tejun Heo
2017-12-19 13:04             ` Jan Kara
2017-12-19 13:31               ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).