linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Tahsin Erdogan <tahsin@google.com>
To: Tejun Heo <tj@kernel.org>, Jens Axboe <axboe@kernel.dk>
Cc: linux-block@vger.kernel.org, David Rientjes <rientjes@google.com>,
	linux-kernel@vger.kernel.org, Tahsin Erdogan <tahsin@google.com>
Subject: [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock
Date: Fri, 24 Mar 2017 14:56:27 -0700	[thread overview]
Message-ID: <20170324215627.12831-1-tahsin@google.com> (raw)
In-Reply-To: <CAAeU0aMvuzfkAyhzheYRxfb7NzLk2oJSzUk5+BoVcktrZDtcRw@mail.gmail.com>

blkg_conf_prep() currently calls blkg_lookup_create() while holding
request queue spinlock. This means allocating memory for struct
blkcg_gq has to be made non-blocking. This causes occasional -ENOMEM
failures in call paths like below:

  pcpu_alloc+0x68f/0x710
  __alloc_percpu_gfp+0xd/0x10
  __percpu_counter_init+0x55/0xc0
  cfq_pd_alloc+0x3b2/0x4e0
  blkg_alloc+0x187/0x230
  blkg_create+0x489/0x670
  blkg_lookup_create+0x9a/0x230
  blkg_conf_prep+0x1fb/0x240
  __cfqg_set_weight_device.isra.105+0x5c/0x180
  cfq_set_weight_on_dfl+0x69/0xc0
  cgroup_file_write+0x39/0x1c0
  kernfs_fop_write+0x13f/0x1d0
  __vfs_write+0x23/0x120
  vfs_write+0xc2/0x1f0
  SyS_write+0x44/0xb0
  entry_SYSCALL_64_fastpath+0x18/0xad

In the code path above, percpu allocator cannot call vmalloc() due to
queue spinlock.

A failure in this call path gives grief to tools which are trying to
configure io weights. We see occasional failures happen shortly after
reboots even when system is not under any memory pressure. Machines
with a lot of cpus are more vulnerable to this condition.

Do struct blkcg_gq allocations outside the queue spinlock to allow
blocking during memory allocations.

Suggested-by: Tejun Heo <tj@kernel.org>
Signed-off-by: Tahsin Erdogan <tahsin@google.com>
---
v6:
  Due to Jens' objection to conditionally dropping locks based on gfp
  flags, go back to v1 approach.
  Perform queue bypass and policy enabled checks at every iteration.
  Add blkg_lookup_check() to reduce code duplication.

v5:
  Removed stale blkg_alloc() in blkcg_init_queue()

  Pushed down radix_tree_preload() into blkg_create() because it
  disables preemption on return and makes it unsafe to call blocking
  memory allocations.

v4:
  Simplified error checking in blkg_create()
  Factored out __blkg_lookup_create()

v3:
  Pushed down all blkg allocations into blkg_create()

v2:
  Moved blkg creation into blkg_lookup_create() to avoid duplicating
  blkg_lookup_create() logic.

 block/blk-cgroup.c | 123 ++++++++++++++++++++++++++++++++++++++++++-----------
 1 file changed, 98 insertions(+), 25 deletions(-)

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index bbe7ee00bd3d..7c2947128f58 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -772,6 +772,27 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
 }
 EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
 
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+					  const struct blkcg_policy *pol,
+					  struct request_queue *q)
+{
+	WARN_ON_ONCE(!rcu_read_lock_held());
+	lockdep_assert_held(q->queue_lock);
+
+	if (!blkcg_policy_enabled(q, pol))
+		return ERR_PTR(-EOPNOTSUPP);
+
+	/*
+	 * This could be the first entry point of blkcg implementation and
+	 * we shouldn't allow anything to go through for a bypassing queue.
+	 */
+	if (unlikely(blk_queue_bypass(q)))
+		return ERR_PTR(blk_queue_dying(q) ? -ENODEV : -EBUSY);
+
+	return __blkg_lookup(blkcg, q, true /* update_hint */);
+}
+
 /**
  * blkg_conf_prep - parse and prepare for per-blkg config update
  * @blkcg: target block cgroup
@@ -789,6 +810,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	__acquires(rcu) __acquires(disk->queue->queue_lock)
 {
 	struct gendisk *disk;
+	struct request_queue *q;
 	struct blkcg_gq *blkg;
 	struct module *owner;
 	unsigned int major, minor;
@@ -807,44 +829,95 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
 	if (!disk)
 		return -ENODEV;
 	if (part) {
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		return -ENODEV;
+		ret = -ENODEV;
+		goto fail;
 	}
 
-	rcu_read_lock();
-	spin_lock_irq(disk->queue->queue_lock);
+	q = disk->queue;
 
-	if (blkcg_policy_enabled(disk->queue, pol))
-		blkg = blkg_lookup_create(blkcg, disk->queue);
-	else
-		blkg = ERR_PTR(-EOPNOTSUPP);
+	rcu_read_lock();
+	spin_lock_irq(q->queue_lock);
 
+	blkg = blkg_lookup_check(blkcg, pol, q);
 	if (IS_ERR(blkg)) {
 		ret = PTR_ERR(blkg);
+		goto fail_unlock;
+	}
+
+	if (blkg)
+		goto success;
+
+	/*
+	 * Create blkgs walking down from blkcg_root to @blkcg, so that all
+	 * non-root blkgs have access to their parents.
+	 */
+	while (true) {
+		struct blkcg *pos = blkcg;
+		struct blkcg *parent;
+		struct blkcg_gq *new_blkg;
+
+		parent = blkcg_parent(blkcg);
+		while (parent && !__blkg_lookup(parent, q, false)) {
+			pos = parent;
+			parent = blkcg_parent(parent);
+		}
+
+		/* Drop locks to do new blkg allocation with GFP_KERNEL. */
+		spin_unlock_irq(q->queue_lock);
 		rcu_read_unlock();
-		spin_unlock_irq(disk->queue->queue_lock);
-		owner = disk->fops->owner;
-		put_disk(disk);
-		module_put(owner);
-		/*
-		 * If queue was bypassing, we should retry.  Do so after a
-		 * short msleep().  It isn't strictly necessary but queue
-		 * can be bypassing for some time and it's always nice to
-		 * avoid busy looping.
-		 */
-		if (ret == -EBUSY) {
-			msleep(10);
-			ret = restart_syscall();
+
+		new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+		if (unlikely(!new_blkg)) {
+			ret = -ENOMEM;
+			goto fail;
 		}
-		return ret;
-	}
 
+		rcu_read_lock();
+		spin_lock_irq(q->queue_lock);
+
+		blkg = blkg_lookup_check(pos, pol, q);
+		if (IS_ERR(blkg)) {
+			ret = PTR_ERR(blkg);
+			goto fail_unlock;
+		}
+
+		if (blkg) {
+			blkg_free(new_blkg);
+		} else {
+			blkg = blkg_create(pos, q, new_blkg);
+			if (unlikely(IS_ERR(blkg))) {
+				ret = PTR_ERR(blkg);
+				goto fail_unlock;
+			}
+		}
+
+		if (pos == blkcg)
+			goto success;
+	}
+success:
 	ctx->disk = disk;
 	ctx->blkg = blkg;
 	ctx->body = body;
 	return 0;
+
+fail_unlock:
+	spin_unlock_irq(q->queue_lock);
+	rcu_read_unlock();
+fail:
+	owner = disk->fops->owner;
+	put_disk(disk);
+	module_put(owner);
+	/*
+	 * If queue was bypassing, we should retry.  Do so after a
+	 * short msleep().  It isn't strictly necessary but queue
+	 * can be bypassing for some time and it's always nice to
+	 * avoid busy looping.
+	 */
+	if (ret == -EBUSY) {
+		msleep(10);
+		ret = restart_syscall();
+	}
+	return ret;
 }
 EXPORT_SYMBOL_GPL(blkg_conf_prep);
 
-- 
2.12.1.578.ge9c3154ca4-goog

  reply	other threads:[~2017-03-24 21:57 UTC|newest]

Thread overview: 30+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2017-02-28  2:49 [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock Tahsin Erdogan
2017-02-28 22:47 ` Tejun Heo
2017-02-28 23:51   ` Tahsin Erdogan
2017-03-01 16:55     ` Tejun Heo
2017-03-01 23:43       ` [PATCH v2] " Tahsin Erdogan
2017-03-01 23:49         ` Tahsin Erdogan
2017-03-02 19:32         ` Tejun Heo
2017-03-02 22:33           ` Tahsin Erdogan
2017-03-03 19:23             ` Tejun Heo
2017-03-04  1:40               ` [PATCH v3] " Tahsin Erdogan
2017-03-04 19:23                 ` Tejun Heo
2017-03-05 14:12                   ` [PATCH v4] " Tahsin Erdogan
2017-03-05 14:24                     ` Tahsin Erdogan
2017-03-06 20:03                     ` Tejun Heo
2017-03-09  8:05                       ` [PATCH v5] " Tahsin Erdogan
2017-03-09 18:27                         ` Tejun Heo
2017-03-11 22:42                         ` Jens Axboe
2017-03-11 22:52                           ` Jens Axboe
2017-03-12  4:35                             ` Tahsin Erdogan
2017-03-13 14:32                               ` Jens Axboe
2017-03-13 16:17                                 ` Tahsin Erdogan
2017-03-24 21:56                                   ` Tahsin Erdogan [this message]
2017-03-24 22:04                                     ` [PATCH] " Jens Axboe
2017-03-28 21:53                                       ` Tejun Heo
2017-03-28 21:59                         ` [PATCH v5] " Jens Axboe
2017-03-28 22:01                           ` Tahsin Erdogan
2017-03-09  5:25                 ` [lkp-robot] [blkcg] ad63af3cb7: BUG:sleeping_function_called_from_invalid_context_at_mm/slab.h kernel test robot
2017-03-09  7:59                   ` Tahsin Erdogan
2017-03-26 10:54 [PATCH] blkcg: allocate struct blkcg_gq outside request queue spinlock Julia Lawall
2017-03-27 18:29 ` Tahsin Erdogan

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20170324215627.12831-1-tahsin@google.com \
    --to=tahsin@google.com \
    --cc=axboe@kernel.dk \
    --cc=linux-block@vger.kernel.org \
    --cc=linux-kernel@vger.kernel.org \
    --cc=rientjes@google.com \
    --cc=tj@kernel.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).