linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
@ 2017-03-26 10:54 Julia Lawall
  2017-03-27 18:29 ` Tahsin Erdogan
  0 siblings, 1 reply; 9+ messages in thread
From: Julia Lawall @ 2017-03-26 10:54 UTC (permalink / raw)
  To: Tahsin Erdogan
  Cc: Tejun Heo, Jens Axboe, linux-block, David Rientjes, linux-kernel,
	linux-block, David Rientjes, linux-kernel

Is an unlock needed before line 848?  Or does blkg_lookup_check take care
of it?

julia

---------- Forwarded message ----------
Date: Sun, 26 Mar 2017 07:50:17 +0800
From: kbuild test robot <fengguang.wu@intel.com>
To: kbuild@01.org
Cc: Julia Lawall <julia.lawall@lip6.fr>
Subject: Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue
    spinlock

CC: kbuild-all@01.org
In-Reply-To: <20170324215627.12831-1-tahsin@google.com>
TO: Tahsin Erdogan <tahsin@google.com>
CC: Tejun Heo <tj@kernel.org>, Jens Axboe <axboe@kernel.dk>, linux-block@vger.kernel.org, David Rientjes <rientjes@google.com>, linux-kernel@vger.kernel.org, Tahsin Erdogan <tahsin@google.com>
CC: linux-block@vger.kernel.org, David Rientjes <rientjes@google.com>, linux-kernel@vger.kernel.org, Tahsin Erdogan <tahsin@google.com>

Hi Tahsin,

[auto build test WARNING on block/for-next]
[also build test WARNING on v4.11-rc3 next-20170324]
[if your patch is applied to the wrong git tree, please drop us a note to help improve the system]

url:    https://github.com/0day-ci/linux/commits/Tahsin-Erdogan/blkcg-allocate-struct-blkcg_gq-outside-request-queue-spinlock/20170326-020755
base:   https://git.kernel.org/pub/scm/linux/kernel/git/axboe/linux-block.git for-next
:::::: branch date: 6 hours ago
:::::: commit date: 6 hours ago

>> block/blk-cgroup.c:901:1-7: preceding lock on line 839
   block/blk-cgroup.c:901:1-7: preceding lock on line 876

git remote add linux-review https://github.com/0day-ci/linux
git remote update linux-review
git checkout d6344a76907e470f483dcb19998d438d19f6432d
vim +901 block/blk-cgroup.c

d6344a76 Tahsin Erdogan 2017-03-24  833  		goto fail;
5f6c2d2b Tejun Heo      2015-07-22  834  	}
e56da7e2 Tejun Heo      2012-03-05  835
d6344a76 Tahsin Erdogan 2017-03-24  836  	q = disk->queue;
d6344a76 Tahsin Erdogan 2017-03-24  837
e56da7e2 Tejun Heo      2012-03-05  838  	rcu_read_lock();
d6344a76 Tahsin Erdogan 2017-03-24 @839  	spin_lock_irq(q->queue_lock);
da8b0662 Tejun Heo      2012-04-13  840
d6344a76 Tahsin Erdogan 2017-03-24  841  	blkg = blkg_lookup_check(blkcg, pol, q);
d6344a76 Tahsin Erdogan 2017-03-24  842  	if (IS_ERR(blkg)) {
d6344a76 Tahsin Erdogan 2017-03-24  843  		ret = PTR_ERR(blkg);
d6344a76 Tahsin Erdogan 2017-03-24  844  		goto fail_unlock;
d6344a76 Tahsin Erdogan 2017-03-24  845  	}
d6344a76 Tahsin Erdogan 2017-03-24  846
d6344a76 Tahsin Erdogan 2017-03-24  847  	if (blkg)
d6344a76 Tahsin Erdogan 2017-03-24  848  		goto success;
d6344a76 Tahsin Erdogan 2017-03-24  849
d6344a76 Tahsin Erdogan 2017-03-24  850  	/*
d6344a76 Tahsin Erdogan 2017-03-24  851  	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
d6344a76 Tahsin Erdogan 2017-03-24  852  	 * non-root blkgs have access to their parents.
d6344a76 Tahsin Erdogan 2017-03-24  853  	 */
d6344a76 Tahsin Erdogan 2017-03-24  854  	while (true) {
d6344a76 Tahsin Erdogan 2017-03-24  855  		struct blkcg *pos = blkcg;
d6344a76 Tahsin Erdogan 2017-03-24  856  		struct blkcg *parent;
d6344a76 Tahsin Erdogan 2017-03-24  857  		struct blkcg_gq *new_blkg;
e56da7e2 Tejun Heo      2012-03-05  858
d6344a76 Tahsin Erdogan 2017-03-24  859  		parent = blkcg_parent(blkcg);
d6344a76 Tahsin Erdogan 2017-03-24  860  		while (parent && !__blkg_lookup(parent, q, false)) {
d6344a76 Tahsin Erdogan 2017-03-24  861  			pos = parent;
d6344a76 Tahsin Erdogan 2017-03-24  862  			parent = blkcg_parent(parent);
d6344a76 Tahsin Erdogan 2017-03-24  863  		}
d6344a76 Tahsin Erdogan 2017-03-24  864
d6344a76 Tahsin Erdogan 2017-03-24  865  		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
d6344a76 Tahsin Erdogan 2017-03-24  866  		spin_unlock_irq(q->queue_lock);
d6344a76 Tahsin Erdogan 2017-03-24  867  		rcu_read_unlock();
d6344a76 Tahsin Erdogan 2017-03-24  868
d6344a76 Tahsin Erdogan 2017-03-24  869  		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
d6344a76 Tahsin Erdogan 2017-03-24  870  		if (unlikely(!new_blkg)) {
d6344a76 Tahsin Erdogan 2017-03-24  871  			ret = -ENOMEM;
d6344a76 Tahsin Erdogan 2017-03-24  872  			goto fail;
d6344a76 Tahsin Erdogan 2017-03-24  873  		}
d6344a76 Tahsin Erdogan 2017-03-24  874
d6344a76 Tahsin Erdogan 2017-03-24  875  		rcu_read_lock();
d6344a76 Tahsin Erdogan 2017-03-24  876  		spin_lock_irq(q->queue_lock);
d6344a76 Tahsin Erdogan 2017-03-24  877
d6344a76 Tahsin Erdogan 2017-03-24  878  		blkg = blkg_lookup_check(pos, pol, q);
e56da7e2 Tejun Heo      2012-03-05  879  		if (IS_ERR(blkg)) {
e56da7e2 Tejun Heo      2012-03-05  880  			ret = PTR_ERR(blkg);
d6344a76 Tahsin Erdogan 2017-03-24  881  			goto fail_unlock;
d6344a76 Tahsin Erdogan 2017-03-24  882  		}
d6344a76 Tahsin Erdogan 2017-03-24  883
d6344a76 Tahsin Erdogan 2017-03-24  884  		if (blkg) {
d6344a76 Tahsin Erdogan 2017-03-24  885  			blkg_free(new_blkg);
d6344a76 Tahsin Erdogan 2017-03-24  886  		} else {
d6344a76 Tahsin Erdogan 2017-03-24  887  			blkg = blkg_create(pos, q, new_blkg);
d6344a76 Tahsin Erdogan 2017-03-24  888  			if (unlikely(IS_ERR(blkg))) {
d6344a76 Tahsin Erdogan 2017-03-24  889  				ret = PTR_ERR(blkg);
d6344a76 Tahsin Erdogan 2017-03-24  890  				goto fail_unlock;
d6344a76 Tahsin Erdogan 2017-03-24  891  			}
d6344a76 Tahsin Erdogan 2017-03-24  892  		}
d6344a76 Tahsin Erdogan 2017-03-24  893
d6344a76 Tahsin Erdogan 2017-03-24  894  		if (pos == blkcg)
d6344a76 Tahsin Erdogan 2017-03-24  895  			goto success;
d6344a76 Tahsin Erdogan 2017-03-24  896  	}
d6344a76 Tahsin Erdogan 2017-03-24  897  success:
d6344a76 Tahsin Erdogan 2017-03-24  898  	ctx->disk = disk;
d6344a76 Tahsin Erdogan 2017-03-24  899  	ctx->blkg = blkg;
d6344a76 Tahsin Erdogan 2017-03-24  900  	ctx->body = body;
d6344a76 Tahsin Erdogan 2017-03-24 @901  	return 0;
d6344a76 Tahsin Erdogan 2017-03-24  902
d6344a76 Tahsin Erdogan 2017-03-24  903  fail_unlock:
d6344a76 Tahsin Erdogan 2017-03-24  904  	spin_unlock_irq(q->queue_lock);

---
0-DAY kernel test infrastructure                Open Source Technology Center
https://lists.01.org/pipermail/kbuild-all                   Intel Corporation

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-03-26 10:54 [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock Julia Lawall
@ 2017-03-27 18:29 ` Tahsin Erdogan
  0 siblings, 0 replies; 9+ messages in thread
From: Tahsin Erdogan @ 2017-03-27 18:29 UTC (permalink / raw)
  To: Julia Lawall
  Cc: Tejun Heo, Jens Axboe, linux-block, David Rientjes, linux-kernel

On Sun, Mar 26, 2017 at 3:54 AM, Julia Lawall <julia.lawall@lip6.fr> wrote:
> Is an unlock needed before line 848?  Or does blkg_lookup_check take care
> of it?

Unlock is not needed. On success, function returns with locks held.
It is documented at line 805:

"... This function returns with RCU read
 * lock and queue lock held and must be paired with blkg_conf_finish()."

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-03-24 22:04   ` Jens Axboe
@ 2017-03-28 21:53     ` Tejun Heo
  0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2017-03-28 21:53 UTC (permalink / raw)
  To: Jens Axboe; +Cc: Tahsin Erdogan, linux-block, David Rientjes, linux-kernel

On Fri, Mar 24, 2017 at 04:04:32PM -0600, Jens Axboe wrote:
> On 03/24/2017 03:56 PM, Tahsin Erdogan wrote:
> > blkg_conf_prep() currently calls blkg_lookup_create() while holding
> > request queue spinlock. This means allocating memory for struct
> > blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
> > failures in call paths like below:
> > 
> >   pcpu_alloc+0x68f/0x710
> >   __alloc_percpu_gfp+0xd/0x10
> >   __percpu_counter_init+0x55/0xc0
> >   cfq_pd_alloc+0x3b2/0x4e0
> >   blkg_alloc+0x187/0x230
> >   blkg_create+0x489/0x670
> >   blkg_lookup_create+0x9a/0x230
> >   blkg_conf_prep+0x1fb/0x240
> >   __cfqg_set_weight_device.isra.105+0x5c/0x180
> >   cfq_set_weight_on_dfl+0x69/0xc0
> >   cgroup_file_write+0x39/0x1c0
> >   kernfs_fop_write+0x13f/0x1d0
> >   __vfs_write+0x23/0x120
> >   vfs_write+0xc2/0x1f0
> >   SyS_write+0x44/0xb0
> >   entry_SYSCALL_64_fastpath+0x18/0xad
> > 
> > In the code path above, percpu allocator cannot call vmalloc() due to
> > queue spinlock.
> > 
> > A failure in this call path gives grief to tools which are trying to
> > configure io weights. We see occasional failures happen shortly after
> > reboots even when system is not under any memory pressure. Machines
> > with a lot of cpus are more vulnerable to this condition.
> > 
> > Do struct blkcg_gq allocations outside the queue spinlock to allow
> > blocking during memory allocations.
> 
> This looks much simpler/cleaner to me, compared to v5. Tejun, what do
> you think?

So, this patch in itself looks better but now we end up with two
separate mechanisms to handle non-atomic allocations.  This drop lock
/ alloc / relock / check invariants in the main path and preallocation
logic used in the init path.  Right now, both proposed implementations
aren't that satisfactory.  Personally, I'd prefer superficial ugliness
to structural duplications, but, ideally, we shouldn't have to make
this choice.  idk, it's a bug fix.  We can always clean things up
later.

Acked-by: Tejun Heo <tj@kernel.org>

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-03-24 21:56 ` [PATCH] " Tahsin Erdogan
@ 2017-03-24 22:04   ` Jens Axboe
  2017-03-28 21:53     ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Jens Axboe @ 2017-03-24 22:04 UTC (permalink / raw)
  To: Tahsin Erdogan, Tejun Heo; +Cc: linux-block, David Rientjes, linux-kernel

On 03/24/2017 03:56 PM, Tahsin Erdogan wrote:
> blkg_conf_prep() currently calls blkg_lookup_create() while holding
> request queue spinlock. This means allocating memory for struct
> blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
> failures in call paths like below:
> 
>   pcpu_alloc+0x68f/0x710
>   __alloc_percpu_gfp+0xd/0x10
>   __percpu_counter_init+0x55/0xc0
>   cfq_pd_alloc+0x3b2/0x4e0
>   blkg_alloc+0x187/0x230
>   blkg_create+0x489/0x670
>   blkg_lookup_create+0x9a/0x230
>   blkg_conf_prep+0x1fb/0x240
>   __cfqg_set_weight_device.isra.105+0x5c/0x180
>   cfq_set_weight_on_dfl+0x69/0xc0
>   cgroup_file_write+0x39/0x1c0
>   kernfs_fop_write+0x13f/0x1d0
>   __vfs_write+0x23/0x120
>   vfs_write+0xc2/0x1f0
>   SyS_write+0x44/0xb0
>   entry_SYSCALL_64_fastpath+0x18/0xad
> 
> In the code path above, percpu allocator cannot call vmalloc() due to
> queue spinlock.
> 
> A failure in this call path gives grief to tools which are trying to
> configure io weights. We see occasional failures happen shortly after
> reboots even when system is not under any memory pressure. Machines
> with a lot of cpus are more vulnerable to this condition.
> 
> Do struct blkcg_gq allocations outside the queue spinlock to allow
> blocking during memory allocations.

This looks much simpler/cleaner to me, compared to v5. Tejun, what do
you think?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-03-13 16:17 [PATCH v5] " Tahsin Erdogan
@ 2017-03-24 21:56 ` Tahsin Erdogan
  2017-03-24 22:04   ` Jens Axboe
  0 siblings, 1 reply; 9+ messages in thread
From: Tahsin Erdogan @ 2017-03-24 21:56 UTC (permalink / raw)
  To: Tejun Heo, Jens Axboe
  Cc: linux-block, David Rientjes, linux-kernel, Tahsin Erdogan

blkg_conf_prep() currently calls blkg_lookup_create() while holding
request queue spinlock. This means allocating memory for struct
blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
failures in call paths like below:

  pcpu_alloc+0x68f/0x710
  __alloc_percpu_gfp+0xd/0x10
  __percpu_counter_init+0x55/0xc0
  cfq_pd_alloc+0x3b2/0x4e0
  blkg_alloc+0x187/0x230
  blkg_create+0x489/0x670
  blkg_lookup_create+0x9a/0x230
  blkg_conf_prep+0x1fb/0x240
  __cfqg_set_weight_device.isra.105+0x5c/0x180
  cfq_set_weight_on_dfl+0x69/0xc0
  cgroup_file_write+0x39/0x1c0
  kernfs_fop_write+0x13f/0x1d0
  __vfs_write+0x23/0x120
  vfs_write+0xc2/0x1f0
  SyS_write+0x44/0xb0
  entry_SYSCALL_64_fastpath+0x18/0xad

In the code path above, percpu allocator cannot call vmalloc() due to
queue spinlock.

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen shortly after
reboots even when system is not under any memory pressure. Machines
with a lot of cpus are more vulnerable to this condition.

Do struct blkcg_gq allocations outside the queue spinlock to allow
blocking during memory allocations.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v6:
  Due to Jens' objection to conditionally dropping locks based on gfp
  flags, go back to v1 approach.
  Perform queue bypass and policy enabled checks at every iteration.
  Add blkg_lookup_check() to reduce code duplication.

v5:
  Removed stale blkg_alloc() in blkcg_init_queue()

  Pushed down radix_tree_preload() into blkg_create() because it
  disables preemption on return and makes it unsafe to call blocking
  memory allocations.

v4:
  Simplified error checking in blkg_create()
  Factored out __blkg_lookup_create()

v3:
  Pushed down all blkg allocations into blkg_create()

v2:
  Moved blkg creation into blkg_lookup_create() to avoid duplicating
  blkg_lookup_create() logic.

 block/blk-cgroup.c | 123 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 98 insertions(+), 25 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+					  const struct blkcg_policy *pol,
+					  struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
+	if (!blkcg_policy_enabled(q, pol))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+	return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
+	struct request_queue *q;
 	struct blkcg_gq *blkg;
 	struct module *owner;
 	unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	if (!disk)
 		return -ENODEV;
 	if (part) {
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto fail;
 	}
 
-	rcu_read_lock();
-	spin_lock_irq(disk->queue->queue_lock);
+	q = disk->queue;
 
-	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue);
-	else
-		blkg = ERR_PTR(-EOPNOTSUPP);
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
 
+	blkg = blkg_lookup_check(blkcg, pol, q);
 	if (IS_ERR(blkg)) {
 		ret = PTR_ERR(blkg);
+		goto fail_unlock;
+	}
+
+	if (blkg)
+		goto success;
+
+	/*
+	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
+	 * non-root blkgs have access to their parents.
+	 */
+	while (true) {
+		struct blkcg *pos = blkcg;
+		struct blkcg *parent;
+		struct blkcg_gq *new_blkg;
+
+		parent = blkcg_parent(blkcg);
+		while (parent && !__blkg_lookup(parent, q, false)) {
+			pos = parent;
+			parent = blkcg_parent(parent);
+		}
+
+		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
+		spin_unlock_irq(q->queue_lock);
 		rcu_read_unlock();
-		spin_unlock_irq(disk->queue->queue_lock);
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		/*
-		 * If queue was bypassing, we should retry.  Do so after a
-		 * short msleep().  It isn't strictly necessary but queue
-		 * can be bypassing for some time and it's always nice to
-		 * avoid busy looping.
-		 */
-		if (ret == -EBUSY) {
-			msleep(10);
-			ret = restart_syscall();
+
+		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+		if (unlikely(!new_blkg)) {
+			ret = -ENOMEM;
+			goto fail;
 		}
-		return ret;
-	}
 
+		rcu_read_lock();
+		spin_lock_irq(q->queue_lock);
+
+		blkg = blkg_lookup_check(pos, pol, q);
+		if (IS_ERR(blkg)) {
+			ret = PTR_ERR(blkg);
+			goto fail_unlock;
+		}
+
+		if (blkg) {
+			blkg_free(new_blkg);
+		} else {
+			blkg = blkg_create(pos, q, new_blkg);
+			if (unlikely(IS_ERR(blkg))) {
+				ret = PTR_ERR(blkg);
+				goto fail_unlock;
+			}
+		}
+
+		if (pos == blkcg)
+			goto success;
+	}
+success:
 	ctx->disk = disk;
 	ctx->blkg = blkg;
 	ctx->body = body;
 	return 0;
+
+fail_unlock:
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+fail:
+	owner = disk->fops->owner;
+	put_disk(disk);
+	module_put(owner);
+	/*
+	 * If queue was bypassing, we should retry.  Do so after a
+	 * short msleep().  It isn't strictly necessary but queue
+	 * can be bypassing for some time and it's always nice to
+	 * avoid busy looping.
+	 */
+	if (ret == -EBUSY) {
+		msleep(10);
+		ret = restart_syscall();
+	}
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
-- 
2.12.1.578.ge9c3154ca4-goog

^ permalink raw reply related	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-02-28 23:51   ` Tahsin Erdogan
@ 2017-03-01 16:55     ` Tejun Heo
  0 siblings, 0 replies; 9+ messages in thread
From: Tejun Heo @ 2017-03-01 16:55 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jens Axboe, linux-block, David Rientjes, linux-kernel

Hello,

On Tue, Feb 28, 2017 at 03:51:27PM -0800, Tahsin Erdogan wrote:
> On Tue, Feb 28, 2017 at 2:47 PM, Tejun Heo <tj@kernel.org> wrote:
> >> +     if (!blkcg_policy_enabled(q, pol)) {
> >> +             ret = -EOPNOTSUPP;
> >> +             goto fail;
> >
> > Pulling this out of the queue_lock doesn't seem safe to me.  This
> > function may end up calling into callbacks of disabled policies this
> > way.
> 
> I will move this to within the lock. To make things safe, I am also
> thinking of rechecking both blkcg_policy_enabled()  and
> blk_queue_bypass() after reacquiring the locks in each iteration.
> 
> >> +             parent = blkcg_parent(blkcg);
> >> +             while (parent && !__blkg_lookup(parent, q, false)) {
> >> +                     pos = parent;
> >> +                     parent = blkcg_parent(parent);
> >> +             }
> >
> > Hmm... how about adding @new_blkg to blkg_lookup_create() and calling
> > it with non-NULL @new_blkg until it succeeds?  Wouldn't that be
> > simpler?
> >
> >> +
> >> +             new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
> 
> The challenge with that approach is creating a new_blkg with the right
> blkcg before passing to blkg_lookup_create(). blkg_lookup_create()
> walks down the hierarchy and will try to fill the first missing entry
> and the preallocated new_blkg must have been created with the right
> blkcg (feel free to send a code fragment if you think I am
> misunderstanding the suggestion).

Ah, indeed, but we can break out allocation of blkg and its
initialization, right?  It's a bit more work but then we'd be able to
do something like.


retry:
	new_blkg = alloc;
	lock;
	sanity checks;
	blkg = blkg_lookup_and_create(..., new_blkg);
	if (!blkg) {
		unlock;
		goto retry;
	}

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-02-28 22:47 ` Tejun Heo
@ 2017-02-28 23:51   ` Tahsin Erdogan
  2017-03-01 16:55     ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Tahsin Erdogan @ 2017-02-28 23:51 UTC (permalink / raw)
  To: Tejun Heo; +Cc: Jens Axboe, linux-block, David Rientjes, linux-kernel

On Tue, Feb 28, 2017 at 2:47 PM, Tejun Heo <tj@kernel.org> wrote:
>> +     if (!blkcg_policy_enabled(q, pol)) {
>> +             ret = -EOPNOTSUPP;
>> +             goto fail;
>
> Pulling this out of the queue_lock doesn't seem safe to me.  This
> function may end up calling into callbacks of disabled policies this
> way.

I will move this to within the lock. To make things safe, I am also
thinking of rechecking both blkcg_policy_enabled()  and
blk_queue_bypass() after reacquiring the locks in each iteration.

>> +             parent = blkcg_parent(blkcg);
>> +             while (parent && !__blkg_lookup(parent, q, false)) {
>> +                     pos = parent;
>> +                     parent = blkcg_parent(parent);
>> +             }
>
> Hmm... how about adding @new_blkg to blkg_lookup_create() and calling
> it with non-NULL @new_blkg until it succeeds?  Wouldn't that be
> simpler?
>
>> +
>> +             new_blkg = blkg_alloc(pos, q, GFP_KERNEL);

The challenge with that approach is creating a new_blkg with the right
blkcg before passing to blkg_lookup_create(). blkg_lookup_create()
walks down the hierarchy and will try to fill the first missing entry
and the preallocated new_blkg must have been created with the right
blkcg (feel free to send a code fragment if you think I am
misunderstanding the suggestion).

^ permalink raw reply	[flat|nested] 9+ messages in thread

* Re: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
  2017-02-28  2:49 Tahsin Erdogan
@ 2017-02-28 22:47 ` Tejun Heo
  2017-02-28 23:51   ` Tahsin Erdogan
  0 siblings, 1 reply; 9+ messages in thread
From: Tejun Heo @ 2017-02-28 22:47 UTC (permalink / raw)
  To: Tahsin Erdogan; +Cc: Jens Axboe, linux-block, David Rientjes, linux-kernel

Hello,

Overall, the approach looks good to me but please see below.

On Mon, Feb 27, 2017 at 06:49:57PM -0800, Tahsin Erdogan wrote:
> @@ -806,44 +807,99 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
>  	if (!disk)
>  		return -ENODEV;
>  	if (part) {
> -		owner = disk->fops->owner;
> -		put_disk(disk);
> -		module_put(owner);
> -		return -ENODEV;
> +		ret = -ENODEV;
> +		goto fail;
> +	}
> +
> +	q = disk->queue;
> +
> +	if (!blkcg_policy_enabled(q, pol)) {
> +		ret = -EOPNOTSUPP;
> +		goto fail;

Pulling this out of the queue_lock doesn't seem safe to me.  This
function may end up calling into callbacks of disabled policies this
way.

> +	/*
> +	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
> +	 * non-root blkgs have access to their parents.
> +	 */
> +	while (true) {
> +		struct blkcg *pos = blkcg;
> +		struct blkcg *parent;
> +		struct blkcg_gq *new_blkg;
> +
> +		parent = blkcg_parent(blkcg);
> +		while (parent && !__blkg_lookup(parent, q, false)) {
> +			pos = parent;
> +			parent = blkcg_parent(parent);
> +		}

Hmm... how about adding @new_blkg to blkg_lookup_create() and calling
it with non-NULL @new_blkg until it succeeds?  Wouldn't that be
simpler?

> +
> +		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
> +		if (unlikely(!new_blkg)) {
> +			ret = -ENOMEM;
> +			goto fail;
> +		}
> +
> +		rcu_read_lock();
> +		spin_lock_irq(q->queue_lock);
> +
> +		/* Lookup again since we dropped the lock for blkg_alloc(). */
> +		blkg = __blkg_lookup(pos, q, false);
> +		if (blkg) {
> +			blkg_free(new_blkg);
> +		} else {
> +			blkg = blkg_create(pos, q, new_blkg);
> +			if (unlikely(IS_ERR(blkg))) {
> +				ret = PTR_ERR(blkg);
> +				goto fail_unlock;
> +			}

than duplicating the same logic here?

Thanks.

-- 
tejun

^ permalink raw reply	[flat|nested] 9+ messages in thread

* [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
@ 2017-02-28  2:49 Tahsin Erdogan
  2017-02-28 22:47 ` Tejun Heo
  0 siblings, 1 reply; 9+ messages in thread
From: Tahsin Erdogan @ 2017-02-28  2:49 UTC (permalink / raw)
  To: Tejun Heo, Jens Axboe
  Cc: linux-block, David Rientjes, linux-kernel, Tahsin Erdogan

blkg_conf_prep() currently calls blkg_lookup_create() while holding
request queue spinlock. This means allocating memory for struct
blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
failures in call paths like below:

  pcpu_alloc+0x68f/0x710
  __alloc_percpu_gfp+0xd/0x10
  __percpu_counter_init+0x55/0xc0
  cfq_pd_alloc+0x3b2/0x4e0
  blkg_alloc+0x187/0x230
  blkg_create+0x489/0x670
  blkg_lookup_create+0x9a/0x230
  blkg_conf_prep+0x1fb/0x240
  __cfqg_set_weight_device.isra.105+0x5c/0x180
  cfq_set_weight_on_dfl+0x69/0xc0
  cgroup_file_write+0x39/0x1c0
  kernfs_fop_write+0x13f/0x1d0
  __vfs_write+0x23/0x120
  vfs_write+0xc2/0x1f0
  SyS_write+0x44/0xb0
  entry_SYSCALL_64_fastpath+0x18/0xad

In the code path above, percpu allocator cannot call vmalloc() due to
queue spinlock.

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen shortly after
reboots even when system is not under any memory pressure. Machines
with a lot of cpus are more vulnerable to this condition.

Do struct blkcg_gq allocations outside the queue spinlock to allow blocking
during memory allocations.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
 block/blk-cgroup.c | 108 ++++++++++++++++++++++++++++++++++++++++-------------
 1 file changed, 82 insertions(+), 26 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 295e98c2c8cc..8ec95f333bc8 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -788,6 +788,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
+	struct request_queue *q;
 	struct blkcg_gq *blkg;
 	struct module *owner;
 	unsigned int major, minor;
@@ -806,44 +807,99 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	if (!disk)
 		return -ENODEV;
 	if (part) {
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto fail;
+	}
+
+	q = disk->queue;
+
+	if (!blkcg_policy_enabled(q, pol)) {
+		ret = -EOPNOTSUPP;
+		goto fail;
 	}
 
 	rcu_read_lock();
-	spin_lock_irq(disk->queue->queue_lock);
+	spin_lock_irq(q->queue_lock);
 
-	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue);
-	else
-		blkg = ERR_PTR(-EOPNOTSUPP);
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q))) {
+		ret = blk_queue_dying(q) ? -ENODEV : -EBUSY;
+		goto fail_unlock;
+	}
 
-	if (IS_ERR(blkg)) {
-		ret = PTR_ERR(blkg);
+	blkg = __blkg_lookup(blkcg, q, true);
+	if (blkg)
+		goto success;
+
+	/*
+	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
+	 * non-root blkgs have access to their parents.
+	 */
+	while (true) {
+		struct blkcg *pos = blkcg;
+		struct blkcg *parent;
+		struct blkcg_gq *new_blkg;
+
+		parent = blkcg_parent(blkcg);
+		while (parent && !__blkg_lookup(parent, q, false)) {
+			pos = parent;
+			parent = blkcg_parent(parent);
+		}
+
+		spin_unlock_irq(q->queue_lock);
 		rcu_read_unlock();
-		spin_unlock_irq(disk->queue->queue_lock);
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		/*
-		 * If queue was bypassing, we should retry.  Do so after a
-		 * short msleep().  It isn't strictly necessary but queue
-		 * can be bypassing for some time and it's always nice to
-		 * avoid busy looping.
-		 */
-		if (ret == -EBUSY) {
-			msleep(10);
-			ret = restart_syscall();
+
+		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+		if (unlikely(!new_blkg)) {
+			ret = -ENOMEM;
+			goto fail;
+		}
+
+		rcu_read_lock();
+		spin_lock_irq(q->queue_lock);
+
+		/* Lookup again since we dropped the lock for blkg_alloc(). */
+		blkg = __blkg_lookup(pos, q, false);
+		if (blkg) {
+			blkg_free(new_blkg);
+		} else {
+			blkg = blkg_create(pos, q, new_blkg);
+			if (unlikely(IS_ERR(blkg))) {
+				ret = PTR_ERR(blkg);
+				goto fail_unlock;
+			}
 		}
-		return ret;
-	}
 
+		if (pos == blkcg)
+			goto success;
+	}
+success:
 	ctx->disk = disk;
 	ctx->blkg = blkg;
 	ctx->body = body;
 	return 0;
+
+fail_unlock:
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+fail:
+	owner = disk->fops->owner;
+	put_disk(disk);
+	module_put(owner);
+	/*
+	 * If queue was bypassing, we should retry.  Do so after a
+	 * short msleep().  It isn't strictly necessary but queue
+	 * can be bypassing for some time and it's always nice to
+	 * avoid busy looping.
+	 */
+	if (ret == -EBUSY) {
+		msleep(10);
+		ret = restart_syscall();
+	}
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
-- 
2.11.0.483.g087da7b7c-goog

^ permalink raw reply related	[flat|nested] 9+ messages in thread

end of thread, other threads:[~2017-03-28 21:54 UTC | newest]

Thread overview: 9+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-03-26 10:54 [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock Julia Lawall
2017-03-27 18:29 ` Tahsin Erdogan
  -- strict thread matches above, loose matches on Subject: below --
2017-03-13 16:17 [PATCH v5] " Tahsin Erdogan
2017-03-24 21:56 ` [PATCH] " Tahsin Erdogan
2017-03-24 22:04   ` Jens Axboe
2017-03-28 21:53     ` Tejun Heo
2017-02-28  2:49 Tahsin Erdogan
2017-02-28 22:47 ` Tejun Heo
2017-02-28 23:51   ` Tahsin Erdogan
2017-03-01 16:55     ` Tejun Heo

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).