linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
From: Jens Axboe <jens.axboe@oracle.com>
To: Matthew Wilcox <matthew@wil.cx>
Cc: Andi Kleen <andi@firstfloor.org>,
	Andrew Morton <akpm@linux-foundation.org>,
	"Wilcox, Matthew R" <matthew.r.wilcox@intel.com>,
	chinang.ma@intel.com, linux-kernel@vger.kernel.org,
	sharad.c.tripathi@intel.com, arjan@linux.intel.com,
	suresh.b.siddha@intel.com, harita.chilukuri@intel.com,
	douglas.w.styner@intel.com, peter.xihong.wang@intel.com,
	hubert.nueckel@intel.com, chris.mason@oracle.com,
	srostedt@redhat.com, linux-scsi@vger.kernel.org,
	Andrew Vasquez <andrew.vasquez@qlogic.com>,
	Anirban Chakraborty <anirban.chakraborty@qlogic.com>
Subject: Re: Mainline kernel OLTP performance update
Date: Tue, 20 Jan 2009 14:27:03 +0100	[thread overview]
Message-ID: <20090120132703.GJ30821@kernel.dk> (raw)
In-Reply-To: <20090115024740.GX29283@parisc-linux.org>

On Wed, Jan 14 2009, Matthew Wilcox wrote:
> On Thu, Jan 15, 2009 at 03:39:05AM +0100, Andi Kleen wrote:
> > Andrew Morton <akpm@linux-foundation.org> writes:
> > >>    some of that back, but not as much as taking them out (even when
> > >>    the sysctl'd variable is in a __read_mostly section).  We tried a
> > >>    patch from Jens to speed up the search for a new partition, but it
> > >>    had no effect.
> > >
> > > I find this surprising.
> > 
> > The test system has thousands of disks/LUNs which it writes to
> > all the time, in addition to a workload which is a real cache pig. 
> > So any increase in the per LUN overhead directly leads to a lot
> > more cache misses in the kernel because it increases the working set
> > there sigificantly.
> 
> This particular system has 450 spindles, but they're amalgamated into
> 30 logical volumes by the hardware or firmware.  Linux sees 30 LUNs.
> Each one, though, has fifteen partitions on it, so that brings us back
> up to 450 partitions.
> 
> This system, btw, is a scale model of the full system that would be used
> to get published results.  If I remember correctly, a 1% performance
> regression on this system is likely to translate to a 2% regression on
> the full-scale system.

Matthew, let's see if we can get this a little closer to disappearing. I
don't see lookup problems in the current kernel with the one-hit cache,
but perhaps it's either not getting enough hits in this bigger test case
or perhaps it's simply the rcu locking and preempt disables that build
up enough to cause a slowdown.

First things first, can you get a run of 2.6.29-rc2 with this patch?
It'll enable you to turn off per-partition stats in sysfs. I'd suggest
doing one run with 2.6.29-rc2 booted with this patch, and then another
run with part_stats set to 0 for every exposed spindle. Then post those
profiles!

diff --git a/block/blk-core.c b/block/blk-core.c
index a824e49..6f693ae 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -600,7 +600,8 @@ blk_init_queue_node(request_fn_proc *rfn, spinlock_t *lock, int node_id)
 	q->prep_rq_fn		= NULL;
 	q->unplug_fn		= generic_unplug_device;
 	q->queue_flags		= (1 << QUEUE_FLAG_CLUSTER |
-				   1 << QUEUE_FLAG_STACKABLE);
+				   1 << QUEUE_FLAG_STACKABLE |
+				   1 << QUEUE_FLAG_PART_STAT);
 	q->queue_lock		= lock;
 
 	blk_queue_segment_boundary(q, BLK_SEG_BOUNDARY_MASK);
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index a29cb78..a6ec2e3 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -158,6 +158,29 @@ static ssize_t queue_rq_affinity_show(struct request_queue *q, char *page)
 	return queue_var_show(set != 0, page);
 }
 
+static ssize_t queue_part_stat_store(struct request_queue *q, const char *page,
+				     size_t count)
+{
+	unsigned long nm;
+	ssize_t ret = queue_var_store(&nm, page, count);
+
+	spin_lock_irq(q->queue_lock);
+	if (nm)
+		queue_flag_set(QUEUE_FLAG_PART_STAT, q);
+	else
+		queue_flag_clear(QUEUE_FLAG_PART_STAT, q);
+
+	spin_unlock_irq(q->queue_lock);
+	return ret;
+}
+
+static ssize_t queue_part_stat_show(struct request_queue *q, char *page)
+{
+	unsigned int set = test_bit(QUEUE_FLAG_PART_STAT, &q->queue_flags);
+
+	return queue_var_show(set != 0, page);
+}
+
 static ssize_t
 queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
 {
@@ -222,6 +245,12 @@ static struct queue_sysfs_entry queue_rq_affinity_entry = {
 	.store = queue_rq_affinity_store,
 };
 
+static struct queue_sysfs_entry queue_part_stat_entry = {
+	.attr = {.name = "part_stats", .mode = S_IRUGO | S_IWUSR },
+	.show = queue_part_stat_show,
+	.store = queue_part_stat_store,
+};
+
 static struct attribute *default_attrs[] = {
 	&queue_requests_entry.attr,
 	&queue_ra_entry.attr,
@@ -231,6 +260,7 @@ static struct attribute *default_attrs[] = {
 	&queue_hw_sector_size_entry.attr,
 	&queue_nomerges_entry.attr,
 	&queue_rq_affinity_entry.attr,
+	&queue_part_stat_entry.attr,
 	NULL,
 };
 
diff --git a/block/genhd.c b/block/genhd.c
index 397960c..09cbac2 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -208,6 +208,9 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 	struct hd_struct *part;
 	int i;
 
+	if (!blk_queue_part_stat(disk->queue))
+		goto part0;
+
 	ptbl = rcu_dereference(disk->part_tbl);
 
 	part = rcu_dereference(ptbl->last_lookup);
@@ -222,6 +225,7 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
 			return part;
 		}
 	}
+part0:
 	return &disk->part0;
 }
 EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 044467e..4d45842 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -449,6 +449,7 @@ struct request_queue
 #define QUEUE_FLAG_STACKABLE   13	/* supports request stacking */
 #define QUEUE_FLAG_NONROT      14	/* non-rotational device (SSD) */
 #define QUEUE_FLAG_VIRT        QUEUE_FLAG_NONROT /* paravirt device */
+#define QUEUE_FLAG_PART_STAT   15	/* per-partition stats enabled */
 
 static inline int queue_is_locked(struct request_queue *q)
 {
@@ -568,6 +569,8 @@ enum {
 #define blk_queue_flushing(q)	((q)->ordseq)
 #define blk_queue_stackable(q)	\
 	test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags)
+#define blk_queue_part_stat(q)	\
+	test_bit(QUEUE_FLAG_PART_STAT, &(q)->queue_flags)
 
 #define blk_fs_request(rq)	((rq)->cmd_type == REQ_TYPE_FS)
 #define blk_pc_request(rq)	((rq)->cmd_type == REQ_TYPE_BLOCK_PC)

-- 
Jens Axboe


  parent reply	other threads:[~2009-01-20 13:29 UTC|newest]

Thread overview: 122+ messages / expand[flat|nested]  mbox.gz  Atom feed  top
2009-01-13 21:10 Mainline kernel OLTP performance update Ma, Chinang
2009-01-13 22:44 ` Wilcox, Matthew R
2009-01-15  0:35   ` Andrew Morton
2009-01-15  1:21     ` Matthew Wilcox
2009-01-15  2:04       ` Andrew Morton
2009-01-15  2:27         ` Steven Rostedt
2009-01-15  7:11           ` Ma, Chinang
2009-01-19 18:04             ` Chris Mason
2009-01-19 18:37               ` Steven Rostedt
2009-01-19 18:55                 ` Chris Mason
2009-01-19 19:07                   ` Steven Rostedt
2009-01-19 23:40                 ` Ingo Molnar
2009-01-15  2:39         ` Andi Kleen
2009-01-15  2:47           ` Matthew Wilcox
2009-01-15  3:36             ` Andi Kleen
2009-01-20 13:27             ` Jens Axboe [this message]
     [not found]               ` <588992150B702C48B3312184F1B810AD03A497632C@azsmsx501.amr.corp.intel.com>
2009-01-22 11:29                 ` Jens Axboe
     [not found]                   ` <588992150B702C48B3312184F1B810AD03A4F59632@azsmsx501.amr.corp.intel.com>
2009-01-27  8:28                     ` Jens Axboe
2009-01-15  7:24         ` Nick Piggin
2009-01-15  9:46           ` Pekka Enberg
2009-01-15 13:52             ` Matthew Wilcox
2009-01-15 14:42               ` Pekka Enberg
2009-01-16 10:16               ` Pekka Enberg
2009-01-16 10:21                 ` Nick Piggin
2009-01-16 10:31                   ` Pekka Enberg
2009-01-16 10:42                     ` Nick Piggin
2009-01-16 10:55                       ` Pekka Enberg
2009-01-19  7:13                         ` Nick Piggin
2009-01-19  8:05                           ` Pekka Enberg
2009-01-19  8:33                             ` Nick Piggin
2009-01-19  8:42                               ` Nick Piggin
2009-01-19  8:47                                 ` Pekka Enberg
2009-01-19  8:57                                   ` Nick Piggin
2009-01-19  9:48                               ` Pekka Enberg
2009-01-19 10:03                                 ` Nick Piggin
2009-01-16 20:59                     ` Christoph Lameter
2009-01-16  0:27           ` Andrew Morton
2009-01-16  4:03             ` Nick Piggin
2009-01-16  4:12               ` Andrew Morton
2009-01-16  6:46                 ` Nick Piggin
2009-01-16  6:55                   ` Matthew Wilcox
2009-01-16  7:06                     ` Nick Piggin
2009-01-16  7:53                     ` Zhang, Yanmin
2009-01-16 10:20                       ` Andi Kleen
2009-01-20  5:16                         ` Zhang, Yanmin
2009-01-21 23:58                           ` Christoph Lameter
2009-01-22  8:36                             ` Zhang, Yanmin
2009-01-22  9:15                               ` Pekka Enberg
2009-01-22  9:28                                 ` Zhang, Yanmin
2009-01-22  9:47                                   ` Pekka Enberg
2009-01-23  3:02                                     ` Zhang, Yanmin
2009-01-23  6:52                                       ` Pekka Enberg
2009-01-23  8:06                                         ` Pekka Enberg
2009-01-23  8:30                                           ` Zhang, Yanmin
2009-01-23  8:40                                             ` Pekka Enberg
2009-01-23  9:46                                             ` Pekka Enberg
2009-01-23 15:22                                               ` Christoph Lameter
2009-01-23 15:31                                                 ` Pekka Enberg
2009-01-23 15:55                                                   ` Christoph Lameter
2009-01-23 16:01                                                     ` Pekka Enberg
2009-01-24  2:55                                                 ` Zhang, Yanmin
2009-01-24  7:36                                                   ` Pekka Enberg
2009-02-12  5:22                                                     ` Zhang, Yanmin
2009-02-12  5:47                                                       ` Zhang, Yanmin
2009-02-12 15:25                                                         ` Christoph Lameter
2009-02-12 16:07                                                           ` Pekka Enberg
2009-02-12 16:03                                                         ` Pekka Enberg
2009-01-26 17:36                                                   ` Christoph Lameter
2009-02-01  2:52                                                     ` Zhang, Yanmin
2009-01-23  8:33                                       ` Nick Piggin
2009-01-23  9:02                                         ` Zhang, Yanmin
2009-01-23 18:40                                           ` care and feeding of netperf (Re: Mainline kernel OLTP performance update) Rick Jones
2009-01-23 18:51                                             ` Grant Grundler
2009-01-24  3:03                                             ` Zhang, Yanmin
2009-01-26 18:26                                               ` Rick Jones
2009-01-16  7:00                   ` Mainline kernel OLTP performance update Andrew Morton
2009-01-16  7:25                     ` Nick Piggin
2009-01-16  8:59                     ` Nick Piggin
2009-01-16 18:11                   ` Rick Jones
2009-01-19  7:43                     ` Nick Piggin
2009-01-19 22:19                       ` Rick Jones
2009-01-15 14:12         ` James Bottomley
2009-01-15 17:44           ` Andrew Morton
2009-01-15 18:00             ` Matthew Wilcox
2009-01-15 18:14               ` Steven Rostedt
2009-01-15 18:44                 ` Gregory Haskins
2009-01-15 18:46                   ` Wilcox, Matthew R
2009-01-15 19:44                     ` Ma, Chinang
2009-01-16 18:14                       ` Gregory Haskins
2009-01-16 19:09                         ` Steven Rostedt
2009-01-20 12:45                         ` Gregory Haskins
2009-01-15 19:28                 ` Ma, Chinang
2009-01-15 16:48       ` Ma, Chinang
  -- strict thread matches above, loose matches on Subject: below --
2010-01-25 18:26 Ma, Chinang
2009-05-04 15:54 Styner, Douglas W
2009-05-06  6:29 ` Anirban Chakraborty
2009-05-06 15:53   ` Wilcox, Matthew R
2009-05-06 18:05     ` Styner, Douglas W
2009-05-06 18:12       ` Wilcox, Matthew R
2009-05-06 18:24         ` Anirban Chakraborty
2009-05-06 19:25           ` Wilcox, Matthew R
2009-05-06 18:19   ` Styner, Douglas W
2009-04-28 17:22 Styner, Douglas W
2009-04-28 17:08 Styner, Douglas W
2009-04-29  7:29 ` Andrew Morton
2009-04-29  8:28   ` Andi Kleen
2009-04-29 16:00     ` Styner, Douglas W
2009-04-29 16:06       ` Wilcox, Matthew R
2009-04-29 16:19         ` Andi Kleen
2009-04-29 15:48   ` Styner, Douglas W
2009-04-29 16:07     ` Andrew Morton
2009-04-29 16:25       ` Peter Zijlstra
2009-04-29 17:46         ` Chris Mason
2009-04-29 18:06           ` Pallipadi, Venkatesh
2009-04-29 18:25             ` Styner, Douglas W
2009-04-29 17:52         ` Styner, Douglas W
2009-04-23 16:49 Styner, Douglas W
2009-04-27  7:02 ` Andi Kleen
2009-04-28 16:57   ` Chuck Ebbert
2009-04-28 17:15     ` James Bottomley
2009-04-28 17:17       ` Styner, Douglas W
2009-01-12 18:30 Ma, Chinang

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=20090120132703.GJ30821@kernel.dk \
    --to=jens.axboe@oracle.com \
    --cc=akpm@linux-foundation.org \
    --cc=andi@firstfloor.org \
    --cc=andrew.vasquez@qlogic.com \
    --cc=anirban.chakraborty@qlogic.com \
    --cc=arjan@linux.intel.com \
    --cc=chinang.ma@intel.com \
    --cc=chris.mason@oracle.com \
    --cc=douglas.w.styner@intel.com \
    --cc=harita.chilukuri@intel.com \
    --cc=hubert.nueckel@intel.com \
    --cc=linux-kernel@vger.kernel.org \
    --cc=linux-scsi@vger.kernel.org \
    --cc=matthew.r.wilcox@intel.com \
    --cc=matthew@wil.cx \
    --cc=peter.xihong.wang@intel.com \
    --cc=sharad.c.tripathi@intel.com \
    --cc=srostedt@redhat.com \
    --cc=suresh.b.siddha@intel.com \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).