* [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
@ 2017-06-28 21:12 Brian King
  2017-06-28 21:49   ` Jens Axboe
  2017-06-28 21:54   ` Jens Axboe
  0 siblings, 2 replies; 35+ messages in thread
From: Brian King @ 2017-06-28 21:12 UTC (permalink / raw)
  To: linux-block; +Cc: axboe, dm-devel, snitzer, agk, brking


This patch converts the in_flight counter in struct hd_struct from a pair of
atomics to a pair of percpu counters. This eliminates a couple of atomics from
the hot path. When running this on a Power system, to a single null_blk device
with 80 submission queues, irq mode 0, with 80 fio jobs, I saw IOPs go from
1.5M IO/s to 11.4M IO/s.

Signed-off-by: Brian King <brking@linux.vnet.ibm.com>
---

 block/bio.c               |    4 ++--
 block/blk-core.c          |    4 ++--
 block/blk-merge.c         |    2 +-
 block/genhd.c             |    2 +-
 block/partition-generic.c |    6 +++---
 drivers/md/dm.c           |   10 ++++++----
 include/linux/genhd.h     |   18 +++++++++---------
 7 files changed, 24 insertions(+), 22 deletions(-)

diff -puN include/linux/genhd.h~blk_in_flight_atomic_remove include/linux/genhd.h
--- linux-block/include/linux/genhd.h~blk_in_flight_atomic_remove	2017-06-28 16:06:43.037948079 -0500
+++ linux-block-bjking1/include/linux/genhd.h	2017-06-28 16:06:43.064947978 -0500
@@ -87,6 +87,7 @@ struct disk_stats {
 	unsigned long ticks[2];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+	unsigned long in_flight[2];
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
@@ -120,7 +121,6 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -362,23 +362,23 @@ static inline void free_part_stats(struc
 #define part_stat_sub(cpu, gendiskp, field, subnd)			\
 	part_stat_add(cpu, gendiskp, field, -subnd)
 
-static inline void part_inc_in_flight(struct hd_struct *part, int rw)
+static inline void part_inc_in_flight(int cpu, struct hd_struct *part, int rw)
 {
-	atomic_inc(&part->in_flight[rw]);
+	part_stat_inc(cpu, part, in_flight[rw]);
 	if (part->partno)
-		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_inc(cpu, &part_to_disk(part)->part0, in_flight[rw]);
 }
 
-static inline void part_dec_in_flight(struct hd_struct *part, int rw)
+static inline void part_dec_in_flight(int cpu, struct hd_struct *part, int rw)
 {
-	atomic_dec(&part->in_flight[rw]);
+	part_stat_dec(cpu, part, in_flight[rw]);
 	if (part->partno)
-		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_dec(cpu, &part_to_disk(part)->part0, in_flight[rw]);
 }
 
-static inline int part_in_flight(struct hd_struct *part)
+static inline unsigned long part_in_flight(struct hd_struct *part)
 {
-	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
+	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);
 }
 
 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
diff -puN block/bio.c~blk_in_flight_atomic_remove block/bio.c
--- linux-block/block/bio.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.041948064 -0500
+++ linux-block-bjking1/block/bio.c	2017-06-28 16:06:43.065947974 -0500
@@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsig
 	part_round_stats(cpu, part);
 	part_stat_inc(cpu, part, ios[rw]);
 	part_stat_add(cpu, part, sectors[rw], sectors);
-	part_inc_in_flight(part, rw);
+	part_inc_in_flight(cpu, part, rw);
 
 	part_stat_unlock();
 }
@@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct
 
 	part_stat_add(cpu, part, ticks[rw], duration);
 	part_round_stats(cpu, part);
-	part_dec_in_flight(part, rw);
+	part_dec_in_flight(cpu, part, rw);
 
 	part_stat_unlock();
 }
diff -puN block/blk-core.c~blk_in_flight_atomic_remove block/blk-core.c
--- linux-block/block/blk-core.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.045948049 -0500
+++ linux-block-bjking1/block/blk-core.c	2017-06-28 16:06:43.066947970 -0500
@@ -2435,7 +2435,7 @@ void blk_account_io_done(struct request
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rw);
+		part_dec_in_flight(cpu, part, rw);
 
 		hd_struct_put(part);
 		part_stat_unlock();
@@ -2493,7 +2493,7 @@ void blk_account_io_start(struct request
 			hd_struct_get(part);
 		}
 		part_round_stats(cpu, part);
-		part_inc_in_flight(part, rw);
+		part_inc_in_flight(cpu, part, rw);
 		rq->part = part;
 	}
 
diff -puN block/blk-merge.c~blk_in_flight_atomic_remove block/blk-merge.c
--- linux-block/block/blk-merge.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.048948038 -0500
+++ linux-block-bjking1/block/blk-merge.c	2017-06-28 16:06:43.067947967 -0500
@@ -634,7 +634,7 @@ static void blk_account_io_merge(struct
 		part = req->part;
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rq_data_dir(req));
+		part_dec_in_flight(cpu, part, rq_data_dir(req));
 
 		hd_struct_put(part);
 		part_stat_unlock();
diff -puN block/genhd.c~blk_in_flight_atomic_remove block/genhd.c
--- linux-block/block/genhd.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.052948023 -0500
+++ linux-block-bjking1/block/genhd.c	2017-06-28 16:06:43.068947963 -0500
@@ -1220,7 +1220,7 @@ static int diskstats_show(struct seq_fil
 		part_round_stats(cpu, hd);
 		part_stat_unlock();
 		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
-			   "%u %lu %lu %lu %u %u %u %u\n",
+			   "%u %lu %lu %lu %u %lu %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[READ]),
diff -puN block/partition-generic.c~blk_in_flight_atomic_remove block/partition-generic.c
--- linux-block/block/partition-generic.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.055948012 -0500
+++ linux-block-bjking1/block/partition-generic.c	2017-06-28 16:06:43.069947959 -0500
@@ -120,7 +120,7 @@ ssize_t part_stat_show(struct device *de
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
-		"%8u %8u %8u"
+		"%8lu %8u %8u"
 		"\n",
 		part_stat_read(p, ios[READ]),
 		part_stat_read(p, merges[READ]),
@@ -140,8 +140,8 @@ ssize_t part_inflight_show(struct device
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
-		atomic_read(&p->in_flight[1]));
+	return sprintf(buf, "%8lu %8lu\n", part_stat_read(p, in_flight[0]),
+		part_stat_read(p, in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff -puN drivers/md/dm.c~blk_in_flight_atomic_remove drivers/md/dm.c
--- linux-block/drivers/md/dm.c~blk_in_flight_atomic_remove	2017-06-28 16:06:43.058948000 -0500
+++ linux-block-bjking1/drivers/md/dm.c	2017-06-28 16:06:43.070947955 -0500
@@ -517,9 +517,9 @@ static void start_io_acct(struct dm_io *
 
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
+	part_inc_in_flight(cpu, &dm_disk(md)->part0, rw);
+	atomic_inc(&md->pending[rw]);
 	part_stat_unlock();
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		atomic_inc_return(&md->pending[rw]));
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -532,7 +532,7 @@ static void end_io_acct(struct dm_io *io
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->bio;
 	unsigned long duration = jiffies - io->start_time;
-	int pending;
+	int pending, cpu;
 	int rw = bio_data_dir(bio);
 
 	generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
@@ -546,9 +546,11 @@ static void end_io_acct(struct dm_io *io
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
 	 */
+	cpu = part_stat_lock();
 	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+	part_dec_in_flight(cpu, &dm_disk(md)->part0, rw);
 	pending += atomic_read(&md->pending[rw^0x1]);
+	part_stat_unlock();
 
 	/* nudge anyone waiting on suspend queue */
 	if (!pending)
_
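
For context on what the conversion costs on the read side: the part_stat
machinery keeps one struct disk_stats per CPU, so an update is a cheap local
increment while part_stat_read() has to walk every possible CPU and sum the
field. A simplified sketch of the CONFIG_SMP helpers, paraphrased for
illustration only (the real macros in include/linux/genhd.h differ in detail):

#define __part_stat_add(cpu, part, field, addnd)			\
	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))

#define part_stat_read(part, field)					\
({									\
	typeof((part)->dkstats->field) res = 0;				\
	unsigned int _cpu;						\
	for_each_possible_cpu(_cpu)					\
		res += per_cpu_ptr((part)->dkstats, _cpu)->field;	\
	res;								\
})

This is what makes the increments cheap for the new in_flight[] members, and
it is also what the review below pushes back on: every reader of the in-flight
count now pays a loop over all possible CPUs.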

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:12 [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu Brian King
@ 2017-06-28 21:49   ` Jens Axboe
  2017-06-28 21:54   ` Jens Axboe
  1 sibling, 0 replies; 35+ messages in thread
From: Jens Axboe @ 2017-06-28 21:49 UTC (permalink / raw)
  To: Brian King, linux-block; +Cc: dm-devel, snitzer, agk

On 06/28/2017 03:12 PM, Brian King wrote:
> This patch converts the in_flight counter in struct hd_struct from a
> pair of atomics to a pair of percpu counters. This eliminates a couple
> of atomics from the hot path. When running this on a Power system, to
> a single null_blk device with 80 submission queues, irq mode 0, with
> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.

This has been done before, but I've never really liked it. The reason is
that it means that reading the part stat inflight count now has to
iterate over every possible CPU. Did you use partitions in your testing?
How many CPUs were configured? When I last tested this a few years ago
on even a quad core nehalem (which is notoriously shitty for cross-node
latencies), it was a net loss.

I do agree that we should do something about it, and it's one of those
items I've highlighted in talks about blk-mq on pending issues to fix
up. It's just not great as it currently stands, but I don't think per
CPU counters is the right way to fix it, at least not for the inflight
counter.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:12 [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu Brian King
@ 2017-06-28 21:54   ` Jens Axboe
  2017-06-28 21:54   ` Jens Axboe
  1 sibling, 0 replies; 35+ messages in thread
From: Jens Axboe @ 2017-06-28 21:54 UTC (permalink / raw)
  To: Brian King, linux-block; +Cc: dm-devel, snitzer, agk

On 06/28/2017 03:12 PM, Brian King wrote:
> -static inline int part_in_flight(struct hd_struct *part)
> +static inline unsigned long part_in_flight(struct hd_struct *part)
>  {
> -	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
> +	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);

One obvious improvement would be to not do this twice, but only have to
loop once. Instead of making this an array, make it a structure with a
read and write count.

It still doesn't really fix the issue of someone running on a kernel
with a ton of possible CPUs configured. But it does reduce the overhead
by 50%.
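
A minimal sketch of that shape (hypothetical names; it assumes struct
disk_stats carries a two-field in_flight structure instead of the in_flight[2]
array from the patch), so both directions are summed in a single pass:

struct part_in_flight {
	unsigned long read;
	unsigned long write;
};

static inline unsigned long part_in_flight(struct hd_struct *part)
{
	unsigned long sum = 0;
	unsigned int cpu;

	/* one walk over the possible CPUs covers both directions */
	for_each_possible_cpu(cpu) {
		struct disk_stats *ds = per_cpu_ptr(part->dkstats, cpu);

		sum += ds->in_flight.read + ds->in_flight.write;
	}
	return sum;
}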

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:54   ` Jens Axboe
  (?)
@ 2017-06-28 21:59   ` Jens Axboe
  2017-06-28 22:07     ` [dm-devel] " Brian King
  -1 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-28 21:59 UTC (permalink / raw)
  To: Brian King, linux-block; +Cc: dm-devel, snitzer, agk

On 06/28/2017 03:54 PM, Jens Axboe wrote:
> On 06/28/2017 03:12 PM, Brian King wrote:
>> -static inline int part_in_flight(struct hd_struct *part)
>> +static inline unsigned long part_in_flight(struct hd_struct *part)
>>  {
>> -	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
>> +	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);
> 
> One obvious improvement would be to not do this twice, but only have to
> loop once. Instead of making this an array, make it a structure with a
> read and write count.
> 
> It still doesn't really fix the issue of someone running on a kernel
> with a ton of possible CPUs configured. But it does reduce the overhead
> by 50%.

Or something as simple as this:

#define part_stat_read_double(part, field1, field2)			\
({									\
	typeof((part)->dkstats->field1) res = 0;			\
	unsigned int _cpu;						\
	for_each_possible_cpu(_cpu) {					\
		res += per_cpu_ptr((part)->dkstats, _cpu)->field1;	\
		res += per_cpu_ptr((part)->dkstats, _cpu)->field2;	\
	}								\
	res;								\
})

static inline unsigned long part_in_flight(struct hd_struct *part)
{
	return part_stat_read_double(part, in_flight[0], in_flight[1]);
}

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:49   ` Jens Axboe
@ 2017-06-28 22:04     ` Brian King
  -1 siblings, 0 replies; 35+ messages in thread
From: Brian King @ 2017-06-28 22:04 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: dm-devel, snitzer, agk

On 06/28/2017 04:49 PM, Jens Axboe wrote:
> On 06/28/2017 03:12 PM, Brian King wrote:
>> This patch converts the in_flight counter in struct hd_struct from a
>> pair of atomics to a pair of percpu counters. This eliminates a couple
>> of atomics from the hot path. When running this on a Power system, to
>> a single null_blk device with 80 submission queues, irq mode 0, with
>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
> 
> This has been done before, but I've never really liked it. The reason is
> that it means that reading the part stat inflight count now has to
> iterate over every possible CPU. Did you use partitions in your testing?
> How many CPUs were configured? When I last tested this a few years ago

I did not use partitions. I was running this on a 4 socket Power 8 machine with
5 cores per socket, running with 4 threads per core, so a total of 80
logical CPUs were usable in Linux.

I was missing the fact that part_round_stats_single calls part_in_flight
and had only noticed the sysfs and procfs users of part_in_flight previously.

-Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:59   ` Jens Axboe
@ 2017-06-28 22:07     ` Brian King
  2017-06-28 22:19       ` Jens Axboe
  0 siblings, 1 reply; 35+ messages in thread
From: Brian King @ 2017-06-28 22:07 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: dm-devel, agk, snitzer

On 06/28/2017 04:59 PM, Jens Axboe wrote:
> On 06/28/2017 03:54 PM, Jens Axboe wrote:
>> On 06/28/2017 03:12 PM, Brian King wrote:
>>> -static inline int part_in_flight(struct hd_struct *part)
>>> +static inline unsigned long part_in_flight(struct hd_struct *part)
>>>  {
>>> -	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
>>> +	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);
>>
>> One obvious improvement would be to not do this twice, but only have to
>> loop once. Instead of making this an array, make it a structure with a
>> read and write count.
>>
>> It still doesn't really fix the issue of someone running on a kernel
>> with a ton of possible CPUs configured. But it does reduce the overhead
>> by 50%.
> 
> Or something as simple as this:
> 
> #define part_stat_read_double(part, field1, field2)			\
> ({									\
> 	typeof((part)->dkstats->field1) res = 0;			\
> 	unsigned int _cpu;						\
> 	for_each_possible_cpu(_cpu) {					\
> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field1;	\
> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field2;	\
> 	}								\
> 	res;								\
> })
> 
> static inline unsigned long part_in_flight(struct hd_struct *part)
> {
> 	return part_stat_read_double(part, in_flight[0], in_flight[1]);
> }
> 

I'll give this a try and also see about running some more exhaustive
runs to see if there are any cases where we go backwards in performance.

I'll also run with partitions and see how that impacts this.

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 22:07     ` [dm-devel] " Brian King
@ 2017-06-28 22:19       ` Jens Axboe
  2017-06-29 12:59         ` Brian King
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-28 22:19 UTC (permalink / raw)
  To: Brian King, linux-block; +Cc: dm-devel, agk, snitzer

On 06/28/2017 04:07 PM, Brian King wrote:
> On 06/28/2017 04:59 PM, Jens Axboe wrote:
>> On 06/28/2017 03:54 PM, Jens Axboe wrote:
>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>> -static inline int part_in_flight(struct hd_struct *part)
>>>> +static inline unsigned long part_in_flight(struct hd_struct *part)
>>>>  {
>>>> -	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
>>>> +	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);
>>>
>>> One obvious improvement would be to not do this twice, but only have to
>>> loop once. Instead of making this an array, make it a structure with a
>>> read and write count.
>>>
>>> It still doesn't really fix the issue of someone running on a kernel
>>> with a ton of possible CPUs configured. But it does reduce the overhead
>>> by 50%.
>>
>> Or something as simple as this:
>>
>> #define part_stat_read_double(part, field1, field2)			\
>> ({									\
>> 	typeof((part)->dkstats->field1) res = 0;			\
>> 	unsigned int _cpu;						\
>> 	for_each_possible_cpu(_cpu) {					\
>> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field1;	\
>> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field2;	\
>> 	}								\
>> 	res;								\
>> })
>>
>> static inline unsigned long part_in_flight(struct hd_struct *part)
>> {
>> 	return part_stat_read_double(part, in_flight[0], in_flight[1]);
>> }
>>
> 
> I'll give this a try and also see about running some more exhaustive
> runs to see if there are any cases where we go backwards in performance.
> 
> I'll also run with partitions and see how that impacts this.

And do something nuts, like setting NR_CPUS to 512 or whatever. What do
distros ship with?

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 21:49   ` Jens Axboe
  (?)
  (?)
@ 2017-06-29  8:40   ` Ming Lei
  2017-06-29 15:58     ` Jens Axboe
  -1 siblings, 1 reply; 35+ messages in thread
From: Ming Lei @ 2017-06-29  8:40 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 06/28/2017 03:12 PM, Brian King wrote:
>> This patch converts the in_flight counter in struct hd_struct from a
>> pair of atomics to a pair of percpu counters. This eliminates a couple
>> of atomics from the hot path. When running this on a Power system, to
>> a single null_blk device with 80 submission queues, irq mode 0, with
>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>
> This has been done before, but I've never really liked it. The reason is
> that it means that reading the part stat inflight count now has to
> iterate over every possible CPU. Did you use partitions in your testing?
> How many CPUs were configured? When I last tested this a few years ago
> on even a quad core nehalem (which is notoriously shitty for cross-node
> latencies), it was a net loss.

One year ago, I saw null_blk's IOPS can be decreased to 10%
of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
96 cores, and dual numa nodes) too, the performance can be
recovered basically if per numa-node counter is introduced and
used in this case, but the patch was never posted out.
If anyone is interested in that, I can rebase the patch on current
block tree and post out. I guess the performance issue might be
related with system cache coherency implementation more or less.
This issue on ARM64 can be observed with the following userspace
atomic counting test too:

       http://kernel.ubuntu.com/~ming/test/cache/
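
(The test behind that URL is not reproduced in this thread. As a rough,
purely illustrative sketch of that kind of experiment -- N threads bumping
either one shared atomic counter or padded per-thread counters, so the cost
of cross-socket cache-line bouncing can be compared -- something like the
following; this is not Ming's actual test program:)

/* build: cc -O2 -pthread ctr.c -o ctr ; run: ./ctr <threads> <1=shared|0=private> */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define ITERS 10000000UL

static atomic_ulong shared_ctr;

struct worker {
	pthread_t tid;
	int use_shared;
	/* pad private counters apart so "private" mode avoids false sharing */
	_Alignas(64) atomic_ulong private_ctr;
};

static void *bump(void *arg)
{
	struct worker *w = arg;
	atomic_ulong *ctr = w->use_shared ? &shared_ctr : &w->private_ctr;
	unsigned long i;

	for (i = 0; i < ITERS; i++)
		atomic_fetch_add_explicit(ctr, 1, memory_order_relaxed);
	return NULL;
}

int main(int argc, char **argv)
{
	int nthreads = argc > 1 ? atoi(argv[1]) : 4;
	int use_shared = argc > 2 ? atoi(argv[2]) : 1;
	struct worker *w = calloc(nthreads, sizeof(*w));
	int i;

	for (i = 0; i < nthreads; i++) {
		w[i].use_shared = use_shared;
		pthread_create(&w[i].tid, NULL, bump, &w[i]);
	}
	for (i = 0; i < nthreads; i++)
		pthread_join(w[i].tid, NULL);

	/* time the two modes externally (e.g. "time ./ctr 64 1") and compare */
	printf("%d threads, %s counter\n", nthreads,
	       use_shared ? "shared" : "per-thread");
	free(w);
	return 0;
}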

>
> I do agree that we should do something about it, and it's one of those
> items I've highlighted in talks about blk-mq on pending issues to fix
> up. It's just not great as it currently stands, but I don't think per
> CPU counters is the right way to fix it, at least not for the inflight
> counter.

Yeah, it won't be a issue for non-mq path, and for blk-mq path,
maybe we can use some blk-mq knowledge(tagset?) to figure out
the 'in_flight' counter. I thought about it before, but never got a perfect
solution, and looks it is a bit hard, :-)


Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-28 22:19       ` Jens Axboe
@ 2017-06-29 12:59         ` Brian King
  0 siblings, 0 replies; 35+ messages in thread
From: Brian King @ 2017-06-29 12:59 UTC (permalink / raw)
  To: Jens Axboe, linux-block; +Cc: dm-devel, agk, snitzer

On 06/28/2017 05:19 PM, Jens Axboe wrote:
> On 06/28/2017 04:07 PM, Brian King wrote:
>> On 06/28/2017 04:59 PM, Jens Axboe wrote:
>>> On 06/28/2017 03:54 PM, Jens Axboe wrote:
>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>> -static inline int part_in_flight(struct hd_struct *part)
>>>>> +static inline unsigned long part_in_flight(struct hd_struct *part)
>>>>>  {
>>>>> -	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
>>>>> +	return part_stat_read(part, in_flight[0]) + part_stat_read(part, in_flight[1]);
>>>>
>>>> One obvious improvement would be to not do this twice, but only have to
>>>> loop once. Instead of making this an array, make it a structure with a
>>>> read and write count.
>>>>
>>>> It still doesn't really fix the issue of someone running on a kernel
>>>> with a ton of possible CPUs configured. But it does reduce the overhead
>>>> by 50%.
>>>
>>> Or something as simple as this:
>>>
>>> #define part_stat_read_double(part, field1, field2)			\
>>> ({									\
>>> 	typeof((part)->dkstats->field1) res = 0;			\
>>> 	unsigned int _cpu;						\
>>> 	for_each_possible_cpu(_cpu) {					\
>>> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field1;	\
>>> 		res += per_cpu_ptr((part)->dkstats, _cpu)->field2;	\
>>> 	}								\
>>> 	res;								\
>>> })
>>>
>>> static inline unsigned long part_in_flight(struct hd_struct *part)
>>> {
>>> 	return part_stat_read_double(part, in_flight[0], in_flight[1]);
>>> }
>>>
>>
>> I'll give this a try and also see about running some more exhaustive
>> runs to see if there are any cases where we go backwards in performance.
>>
>> I'll also run with partitions and see how that impacts this.
> 
> And do something nuts, like setting NR_CPUS to 512 or whatever. What do
> distros ship with?

Both RHEL and SLES set NR_CPUS=2048 for the Power architecture. I can easily
switch the SMT mode of the machine I used for this from 4 to 8 to have a total
of 160 online logical CPUs and see how that affects the performance. I'll
see if I can find a larger machine as well.

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29  8:40   ` Ming Lei
@ 2017-06-29 15:58     ` Jens Axboe
  2017-06-29 16:00       ` Jens Axboe
  2017-06-29 16:25       ` Ming Lei
  0 siblings, 2 replies; 35+ messages in thread
From: Jens Axboe @ 2017-06-29 15:58 UTC (permalink / raw)
  To: Ming Lei
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 06/29/2017 02:40 AM, Ming Lei wrote:
> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 06/28/2017 03:12 PM, Brian King wrote:
>>> This patch converts the in_flight counter in struct hd_struct from a
>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>> of atomics from the hot path. When running this on a Power system, to
>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>
>> This has been done before, but I've never really liked it. The reason is
>> that it means that reading the part stat inflight count now has to
>> iterate over every possible CPU. Did you use partitions in your testing?
>> How many CPUs were configured? When I last tested this a few years ago
>> on even a quad core nehalem (which is notoriously shitty for cross-node
>> latencies), it was a net loss.
> 
> One year ago, I saw null_blk's IOPS can be decreased to 10%
> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
> 96 cores, and dual numa nodes) too, the performance can be
> recovered basically if per numa-node counter is introduced and
> used in this case, but the patch was never posted out.
> If anyone is interested in that, I can rebase the patch on current
> block tree and post out. I guess the performance issue might be
> related with system cache coherency implementation more or less.
> This issue on ARM64 can be observed with the following userspace
> atomic counting test too:
> 
>        http://kernel.ubuntu.com/~ming/test/cache/

How well did the per-node thing work? Doesn't seem to me like it would
go far enough. And per CPU is too much. One potential improvement would
be to change the part_stat_read() to just loop online CPUs, instead of
all possible CPUs. When CPUs go on/offline, use that as the slow path to
ensure the stats are sane. Often there's a huge difference between
NR_CPUS configured and what the system has. As Brian states, RH ships
with 2048, while I doubt a lot of customers actually run that...
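
A sketch of that read-side tweak (assuming a CPU-hotplug callback, not shown
here, folds a departing CPU's counts into a CPU that stays online so the sum
remains correct, as described above):

#define part_stat_read_online(part, field)				\
({									\
	typeof((part)->dkstats->field) res = 0;				\
	unsigned int _cpu;						\
	for_each_online_cpu(_cpu)					\
		res += per_cpu_ptr((part)->dkstats, _cpu)->field;	\
	res;								\
})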

Outside of coming up with a more clever data structure that is fully
CPU topology aware, one thing that could work is just having X cache
line separated read/write inflight counters per node, where X is some
suitable value (like 4). That prevents us from having cross node
traffic, and it also keeps the cross cpu traffic fairly low. That should
provide a nice balance between cost of incrementing the inflight
counting, and the cost of looping for reading it.
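
As a sketch of that layout (hypothetical types and names, with X = 4 as
suggested above): each node gets a few cacheline-padded counter groups, a CPU
picks a group within its own node, and a reader sums nr_node_ids * X slots
rather than every possible CPU. A completion that runs on a different CPU may
decrement a different slot, so individual slots can go transiently negative;
only the sum has to be right.

#define INFLIGHT_GROUPS_PER_NODE	4	/* the "X" above */

struct inflight_group {
	atomic_t cnt[2];			/* [READ], [WRITE] */
} ____cacheline_aligned_in_smp;

struct node_inflight {
	struct inflight_group grp[INFLIGHT_GROUPS_PER_NODE];
};

/* ni_base: array of nr_node_ids entries hanging off the hd_struct */
static inline void node_inflight_inc(struct node_inflight *ni_base, int rw)
{
	int cpu = raw_smp_processor_id();
	struct inflight_group *g =
		&ni_base[cpu_to_node(cpu)].grp[cpu % INFLIGHT_GROUPS_PER_NODE];

	atomic_inc(&g->cnt[rw]);
}

static inline long node_inflight_sum(struct node_inflight *ni_base)
{
	long sum = 0;
	int nid, g;

	for_each_node(nid)
		for (g = 0; g < INFLIGHT_GROUPS_PER_NODE; g++)
			sum += atomic_read(&ni_base[nid].grp[g].cnt[0]) +
			       atomic_read(&ni_base[nid].grp[g].cnt[1]);
	return sum;
}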

And that brings me to the next part...

>> I do agree that we should do something about it, and it's one of those
>> items I've highlighted in talks about blk-mq on pending issues to fix
>> up. It's just not great as it currently stands, but I don't think per
>> CPU counters is the right way to fix it, at least not for the inflight
>> counter.
> 
> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
> we can use some blk-mq knowledge(tagset?) to figure out the
> 'in_flight' counter. I thought about it before, but never got a
> perfect solution, and looks it is a bit hard, :-)

The tags are already a bit spread out, so it's worth a shot. That would
remove the need to do anything in the inc/dec path, as the tags already
do that. The inlight count could be easily retrieved with
sbitmap_weight(). The only issue here is that we need separate read and
write counters, and the weight would obviously only get us the total
count. But we can have a slower path for that, just iterate the tags and
count them. The fast path only cares about total count.

Let me try that out real quick.
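
For illustration only -- this is not the proof of concept posted further down
the thread, and it assumes the blk-mq tag iteration helpers of that era
(blk_mq_tagset_busy_iter() walking busy tags) are a fit here -- the slow path
described above could look roughly like:

struct mq_inflight_count {
	unsigned int inflight[2];		/* [READ], [WRITE] */
};

static void blk_mq_count_inflight(struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight_count *count = priv;

	/* assumption: a started request is what counts as "in flight" here */
	if (rq->rq_flags & RQF_STARTED)
		count->inflight[rq_data_dir(rq)]++;
}

static void blk_mq_in_flight(struct blk_mq_tag_set *set,
			     unsigned int inflight[2])
{
	struct mq_inflight_count count = { };

	/* slow path: visit every allocated tag, classify by data direction */
	blk_mq_tagset_busy_iter(set, blk_mq_count_inflight, &count);
	inflight[0] = count.inflight[0];
	inflight[1] = count.inflight[1];
}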

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29 15:58     ` Jens Axboe
@ 2017-06-29 16:00       ` Jens Axboe
  2017-06-29 18:42         ` Jens Axboe
  2017-06-29 16:25       ` Ming Lei
  1 sibling, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-29 16:00 UTC (permalink / raw)
  To: Ming Lei
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 06/29/2017 09:58 AM, Jens Axboe wrote:
> On 06/29/2017 02:40 AM, Ming Lei wrote:
>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>> of atomics from the hot path. When running this on a Power system, to
>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>
>>> This has been done before, but I've never really liked it. The reason is
>>> that it means that reading the part stat inflight count now has to
>>> iterate over every possible CPU. Did you use partitions in your testing?
>>> How many CPUs were configured? When I last tested this a few years ago
>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>> latencies), it was a net loss.
>>
>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>> 96 cores, and dual numa nodes) too, the performance can be
>> recovered basically if per numa-node counter is introduced and
>> used in this case, but the patch was never posted out.
>> If anyone is interested in that, I can rebase the patch on current
>> block tree and post out. I guess the performance issue might be
>> related with system cache coherency implementation more or less.
>> This issue on ARM64 can be observed with the following userspace
>> atomic counting test too:
>>
>>        http://kernel.ubuntu.com/~ming/test/cache/
> 
> How well did the per-node thing work? Doesn't seem to me like it would
> go far enough. And per CPU is too much. One potential improvement would
> be to change the part_stat_read() to just loop online CPUs, instead of
> all possible CPUs. When CPUs go on/offline, use that as the slow path to
> ensure the stats are sane. Often there's a huge difference between
> NR_CPUS configured and what the system has. As Brian states, RH ships
> with 2048, while I doubt a lot of customers actually run that...
> 
> Outside of coming up with a more clever data structure that is fully
> CPU topology aware, one thing that could work is just having X cache
> line separated read/write inflight counters per node, where X is some
> suitable value (like 4). That prevents us from having cross node
> traffic, and it also keeps the cross cpu traffic fairly low. That should
> provide a nice balance between cost of incrementing the inflight
> counting, and the cost of looping for reading it.
> 
> And that brings me to the next part...
> 
>>> I do agree that we should do something about it, and it's one of those
>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>> up. It's just not great as it currently stands, but I don't think per
>>> CPU counters is the right way to fix it, at least not for the inflight
>>> counter.
>>
>> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
>> we can use some blk-mq knowledge(tagset?) to figure out the
>> 'in_flight' counter. I thought about it before, but never got a
>> perfect solution, and looks it is a bit hard, :-)
> 
> The tags are already a bit spread out, so it's worth a shot. That would
> remove the need to do anything in the inc/dec path, as the tags already
> do that. The inlight count could be easily retrieved with
> sbitmap_weight(). The only issue here is that we need separate read and
> write counters, and the weight would obviously only get us the total
> count. But we can have a slower path for that, just iterate the tags and
> count them. The fast path only cares about total count.
> 
> Let me try that out real quick.

Well, that only works for whole disk stats, of course... There's no way
around iterating the tags and checking for this to truly work.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29 15:58     ` Jens Axboe
  2017-06-29 16:00       ` Jens Axboe
@ 2017-06-29 16:25       ` Ming Lei
  2017-06-29 17:31         ` Brian King
  1 sibling, 1 reply; 35+ messages in thread
From: Ming Lei @ 2017-06-29 16:25 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

[-- Attachment #1: Type: text/plain, Size: 4405 bytes --]

On Thu, Jun 29, 2017 at 11:58 PM, Jens Axboe <axboe@kernel.dk> wrote:
> On 06/29/2017 02:40 AM, Ming Lei wrote:
>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>> of atomics from the hot path. When running this on a Power system, to
>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>
>>> This has been done before, but I've never really liked it. The reason is
>>> that it means that reading the part stat inflight count now has to
>>> iterate over every possible CPU. Did you use partitions in your testing?
>>> How many CPUs were configured? When I last tested this a few years ago
>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>> latencies), it was a net loss.
>>
>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>> 96 cores, and dual numa nodes) too, the performance can be
>> recovered basically if per numa-node counter is introduced and
>> used in this case, but the patch was never posted out.
>> If anyone is interested in that, I can rebase the patch on current
>> block tree and post out. I guess the performance issue might be
>> related with system cache coherency implementation more or less.
>> This issue on ARM64 can be observed with the following userspace
>> atomic counting test too:
>>
>>        http://kernel.ubuntu.com/~ming/test/cache/
>
> How well did the per-node thing work? Doesn't seem to me like it would

Last time, on ARM64, I remembered that the IOPS was basically recovered,
but now I don't have a such machine to test. Could Brian test the attached patch
to see if it works on big Power machine?

And the idea is simple, just make the atomic counter per-node.

> go far enough. And per CPU is too much. One potential improvement would
> be to change the part_stat_read() to just loop online CPUs, instead of
> all possible CPUs. When CPUs go on/offline, use that as the slow path to
> ensure the stats are sane. Often there's a huge difference between
> NR_CPUS configured and what the system has. As Brian states, RH ships
> with 2048, while I doubt a lot of customers actually run that...

One observation I saw on arm64 dual socket before is that atomic inc/dec on
counter stored in local numa node is much cheaper than cross-node, that is
why I tried the per-node counter. And wrt. in-flight atomic counter, both inc
and dec should happen on CPUs belonging to same numa node in case of
blk-mq.

>
> Outside of coming up with a more clever data structure that is fully
> CPU topology aware, one thing that could work is just having X cache
> line separated read/write inflight counters per node, where X is some
> suitable value (like 4). That prevents us from having cross node
> traffic, and it also keeps the cross cpu traffic fairly low. That should
> provide a nice balance between cost of incrementing the inflight
> counting, and the cost of looping for reading it.
>
> And that brings me to the next part...
>
>>> I do agree that we should do something about it, and it's one of those
>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>> up. It's just not great as it currently stands, but I don't think per
>>> CPU counters is the right way to fix it, at least not for the inflight
>>> counter.
>>
>> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
>> we can use some blk-mq knowledge(tagset?) to figure out the
>> 'in_flight' counter. I thought about it before, but never got a
>> perfect solution, and looks it is a bit hard, :-)
>
> The tags are already a bit spread out, so it's worth a shot. That would
> remove the need to do anything in the inc/dec path, as the tags already
> do that. The inlight count could be easily retrieved with
> sbitmap_weight(). The only issue here is that we need separate read and
> write counters, and the weight would obviously only get us the total
> count. But we can have a slower path for that, just iterate the tags and
> count them. The fast path only cares about total count.
>
> Let me try that out real quick.
>
> --
> Jens Axboe
>



Thanks,
Ming Lei

[-- Attachment #2: per-node-atomic-counter.patch --]
[-- Type: text/x-patch, Size: 6865 bytes --]

diff --git a/block/partition-generic.c b/block/partition-generic.c
index c5ec8246e25e..bd6644bf9643 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -140,8 +140,9 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
-		atomic_read(&p->in_flight[1]));
+	return sprintf(buf, "%8u %8u\n",
+		pnode_counter_read_all(&p->in_flight[0]),
+		pnode_counter_read_all(&p->in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 96bd13e581cd..aac0b2235410 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -521,7 +521,7 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+	pnode_counter_set(&dm_disk(md)->part0.in_flight[rw],
 		atomic_inc_return(&md->pending[rw]));
 
 	if (unlikely(dm_stats_used(&md->stats)))
@@ -550,7 +550,7 @@ static void end_io_acct(struct dm_io *io)
 	 * a flush.
 	 */
 	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+	pnode_counter_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e619fae2f037..40c9bc74a120 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
+#include <linux/pernode_counter.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -120,7 +121,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
+	struct pnode_counter in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -364,21 +365,22 @@ static inline void free_part_stats(struct hd_struct *part)
 
 static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	atomic_inc(&part->in_flight[rw]);
+	pnode_counter_inc(&part->in_flight[rw]);
 	if (part->partno)
-		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+		pnode_counter_inc(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	atomic_dec(&part->in_flight[rw]);
+	pnode_counter_dec(&part->in_flight[rw]);
 	if (part->partno)
-		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+		pnode_counter_dec(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline int part_in_flight(struct hd_struct *part)
 {
-	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
+	return pnode_counter_read_all(&part->in_flight[0]) + \
+		pnode_counter_read_all(&part->in_flight[1]);
 }
 
 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
@@ -627,11 +629,34 @@ extern ssize_t part_fail_store(struct device *dev,
 			       const char *buf, size_t count);
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+static inline int hd_counter_init(struct hd_struct *part)
+{
+	if (pnode_counter_init(&part->in_flight[0], GFP_KERNEL))
+		return -ENOMEM;
+	if (pnode_counter_init(&part->in_flight[1], GFP_KERNEL)) {
+		pnode_counter_deinit(&part->in_flight[0]);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static inline void hd_counter_deinit(struct hd_struct *part)
+{
+	pnode_counter_deinit(&part->in_flight[0]);
+	pnode_counter_deinit(&part->in_flight[1]);
+}
+
 static inline int hd_ref_init(struct hd_struct *part)
 {
+	if (hd_counter_init(part))
+		return -ENOMEM;
+
 	if (percpu_ref_init(&part->ref, __delete_partition, 0,
-				GFP_KERNEL))
+				GFP_KERNEL)) {
+		hd_counter_deinit(part);
 		return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -659,6 +684,7 @@ static inline void hd_free_part(struct hd_struct *part)
 {
 	free_part_stats(part);
 	free_part_info(part);
+	hd_counter_deinit(part);
 	percpu_ref_exit(&part->ref);
 }
 
diff --git a/include/linux/pernode_counter.h b/include/linux/pernode_counter.h
new file mode 100644
index 000000000000..127639fbc25f
--- /dev/null
+++ b/include/linux/pernode_counter.h
@@ -0,0 +1,118 @@
+#ifndef _LINUX_PERNODE_COUNTER_H
+#define _LINUX_PERNODE_COUNTER_H
+/*
+ * A simple per node atomic counter for use in block io accounting.
+ */
+
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/gfp.h>
+
+struct node_counter {
+	atomic_t counter_in_node;
+};
+
+struct pnode_counter {
+	struct node_counter * __percpu  *counter;
+	struct node_counter **nodes;
+};
+
+static inline int pnode_counter_init(struct pnode_counter *pnc, gfp_t gfp)
+{
+	struct node_counter **nodes;
+	int i, j, cpu;
+
+	nodes = kzalloc(nr_node_ids * sizeof(struct node_counter *), gfp);
+	if (!nodes)
+		goto err_nodes;
+
+	for_each_node(i) {
+		nodes[i] = kzalloc_node(sizeof(struct node_counter), gfp, i);
+		if (!nodes[i])
+			goto err_node_counter;
+	}
+
+	pnc->counter = alloc_percpu_gfp(struct node_counter *, gfp);
+	if (!pnc->counter)
+		goto err_node_counter;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(pnc->counter, cpu) = nodes[cpu_to_node(cpu)];
+
+	pnc->nodes = nodes;
+
+	return 0;
+
+ err_node_counter:
+	for (j = 0; j < i; j++)
+		kfree(nodes[j]);
+	kfree(nodes);
+ err_nodes:
+	return -ENOMEM;
+}
+
+static inline void pnode_counter_deinit(struct pnode_counter *pnc)
+{
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		struct node_counter *node = *per_cpu_ptr(pnc->counter, cpu);
+
+		kfree(node);
+		*per_cpu_ptr(pnc->counter, cpu) = NULL;
+	}
+	free_percpu(pnc->counter);
+	kfree(pnc->nodes);
+}
+
+static inline void pnode_counter_inc(struct pnode_counter *pnc)
+{
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	atomic_inc(&node->counter_in_node);
+}
+
+static inline void pnode_counter_inc_cpu(struct pnode_counter *pnc, int cpu)
+{
+	struct node_counter *node = *per_cpu_ptr(pnc->counter, cpu);
+
+	atomic_inc(&node->counter_in_node);
+}
+
+static inline void pnode_counter_dec(struct pnode_counter *pnc)
+{
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	atomic_dec(&node->counter_in_node);
+}
+
+static inline void pnode_counter_dec_cpu(struct pnode_counter *pnc, int cpu)
+{
+	struct node_counter *node = *per_cpu_ptr(pnc->counter, cpu);
+
+	atomic_dec(&node->counter_in_node);
+}
+
+static inline void pnode_counter_set(struct pnode_counter *pnc, int val)
+{
+	int i;
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	for_each_node(i)
+		atomic_set(&pnc->nodes[i]->counter_in_node, 0);
+	atomic_set(&node->counter_in_node, val);
+}
+
+static inline long pnode_counter_read_all(struct pnode_counter *pnc)
+{
+	int i;
+	long val = 0;
+
+	for_each_node(i)
+		val += atomic_read(&pnc->nodes[i]->counter_in_node);
+
+	return val;
+}
+
+#endif /* _LINUX_PERNODE_COUNTER_H */

^ permalink raw reply related	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29 16:25       ` Ming Lei
@ 2017-06-29 17:31         ` Brian King
  2017-06-30  1:08             ` Ming Lei
  0 siblings, 1 reply; 35+ messages in thread
From: Brian King @ 2017-06-29 17:31 UTC (permalink / raw)
  To: Ming Lei, Jens Axboe
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 06/29/2017 11:25 AM, Ming Lei wrote:
> On Thu, Jun 29, 2017 at 11:58 PM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>>
>>>> This has been done before, but I've never really liked it. The reason is
>>>> that it means that reading the part stat inflight count now has to
>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>> How many CPUs were configured? When I last tested this a few years ago
>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>> latencies), it was a net loss.
>>>
>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>> 96 cores, and dual numa nodes) too, the performance can be
>>> recovered basically if per numa-node counter is introduced and
>>> used in this case, but the patch was never posted out.
>>> If anyone is interested in that, I can rebase the patch on current
>>> block tree and post out. I guess the performance issue might be
>>> related with system cache coherency implementation more or less.
>>> This issue on ARM64 can be observed with the following userspace
>>> atomic counting test too:
>>>
>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>
>> How well did the per-node thing work? Doesn't seem to me like it would
> 
> Last time, on ARM64, I remembered that the IOPS was basically recovered,
> but now I don't have a such machine to test. Could Brian test the attached patch
> to see if it works on big Power machine?
> 
> And the idea is simple, just make the atomic counter per-node.

I tried loading the patch and got an oops right away on boot. I haven't been able to
debug anything yet. This is on top of an older kernel, so I'm not sure if that is
the issue or not. I can try upstream and see if I have different results...

Unable to handle kernel paging request for data at address 0xc00031313a333532
Faulting instruction address: 0xc0000000002552c4
Oops: Kernel access of bad area, sig: 11 [#1]
SMP NR_CPUS=1024 NUMA 
PowerNV
Modules linked in: dm_round_robin vmx_crypto powernv_rng leds_powernv powernv_op_panel led_class rng_core dm_multipath autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq multipath dm_mirror dm_region_hash dm_log cxlflash bnx2x mdio libcrc32c nvme nvme_core lpfc cxl crc_t10dif crct10dif_generic crct10dif_common
CPU: 9 PID: 3485 Comm: multipathd Not tainted 4.9.10-dirty #7
task: c000000fba0d0000 task.stack: c000000fb8a1c000
NIP: c0000000002552c4 LR: c000000000255274 CTR: 0000000000000000
REGS: c000000fb8a1f350 TRAP: 0300   Not tainted  (4.9.10-dirty)
MSR: 900000000280b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 24028848  XER: 00000000
CFAR: c000000000008a60 DAR: c00031313a333532 DSISR: 40000000 SOFTE: 1 
GPR00: c0000000001f8980 c000000fb8a1f5d0 c000000000f24300 c000000fc4007c00 
GPR04: 00000000024000c0 c0000000002fc0ac 000000000000025d c000000fc5d346e0 
GPR08: 0000000fc50d6000 0000000000000000 c00031313a333532 c000000fbc836240 
GPR12: 0000000000002200 c00000000ff02400 00003fff79cfebf0 0000000000000000 
GPR16: 00000000c138fd03 000001003a12bf70 00003fff79ae75e0 00003fff79d269e8 
GPR20: 0000000000000001 00003fff79cfebf0 000001003a393aa0 00003fff79d067b8 
GPR24: 0000000000000000 c000000000b04948 000000000000a1ff c0000000002fc0ac 
GPR28: 00000000024000c0 0000000000000007 c0000000002fc0ac c000000fc4007c00 
NIP [c0000000002552c4] __kmalloc_track_caller+0x94/0x2f0
LR [c000000000255274] __kmalloc_track_caller+0x44/0x2f0
Call Trace:
[c000000fb8a1f5d0] [c000000fb8a1f610] 0xc000000fb8a1f610 (unreliable)
[c000000fb8a1f620] [c0000000001f8980] kstrdup+0x50/0xb0
[c000000fb8a1f660] [c0000000002fc0ac] __kernfs_new_node+0x4c/0x140
[c000000fb8a1f6b0] [c0000000002fd9f4] kernfs_new_node+0x34/0x80
[c000000fb8a1f6e0] [c000000000300708] kernfs_create_link+0x38/0xf0
[c000000fb8a1f720] [c000000000301cb8] sysfs_do_create_link_sd.isra.0+0xa8/0x160
[c000000fb8a1f770] [c00000000054a658] device_add+0x2b8/0x740
[c000000fb8a1f830] [c00000000054ae58] device_create_groups_vargs+0x178/0x190
[c000000fb8a1f890] [c0000000001fcd70] bdi_register+0x80/0x1d0
[c000000fb8a1f8c0] [c0000000001fd244] bdi_register_owner+0x44/0x80
[c000000fb8a1f940] [c000000000453bbc] device_add_disk+0x1cc/0x500
[c000000fb8a1f9f0] [c000000000764dec] dm_create+0x33c/0x5f0
[c000000fb8a1fa90] [c00000000076d9ac] dev_create+0x8c/0x3e0
[c000000fb8a1fb40] [c00000000076d1fc] ctl_ioctl+0x38c/0x580
[c000000fb8a1fd20] [c00000000076d410] dm_ctl_ioctl+0x20/0x30
[c000000fb8a1fd40] [c0000000002799ac] do_vfs_ioctl+0xcc/0x8f0
[c000000fb8a1fde0] [c00000000027a230] SyS_ioctl+0x60/0xc0
[c000000fb8a1fe30] [c00000000000bfe0] system_call+0x38/0xfc
Instruction dump:
39290008 7cc8482a e94d0030 e9230000 7ce95214 7d49502a e9270010 2faa0000 
419e007c 2fa90000 419e0074 e93f0022 <7f4a482a> 39200000 88ad023a 992d023a 
---[ end trace dcdac2d3f668d033 ]---



> 
>> go far enough. And per CPU is too much. One potential improvement would
>> be to change the part_stat_read() to just loop online CPUs, instead of
>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>> ensure the stats are sane. Often there's a huge difference between
>> NR_CPUS configured and what the system has. As Brian states, RH ships
>> with 2048, while I doubt a lot of customers actually run that...
> 
> One observation I saw on arm64 dual socket before is that atomic inc/dec on
> counter stored in local numa node is much cheaper than cross-node, that is
> why I tried the per-node counter. And wrt. in-flight atomic counter, both inc
> and dec should happen on CPUs belonging to same numa node in case of
> blk-mq.
> 
>>
>> Outside of coming up with a more clever data structure that is fully
>> CPU topology aware, one thing that could work is just having X cache
>> line separated read/write inflight counters per node, where X is some
>> suitable value (like 4). That prevents us from having cross node
>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>> provide a nice balance between cost of incrementing the inflight
>> counting, and the cost of looping for reading it.
>>
>> And that brings me to the next part...
>>
>>>> I do agree that we should do something about it, and it's one of those
>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>> up. It's just not great as it currently stands, but I don't think per
>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>> counter.
>>>
>>> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>> 'in_flight' counter. I thought about it before, but never got a
>>> perfect solution, and looks it is a bit hard, :-)
>>
>> The tags are already a bit spread out, so it's worth a shot. That would
>> remove the need to do anything in the inc/dec path, as the tags already
>> do that. The inlight count could be easily retrieved with
>> sbitmap_weight(). The only issue here is that we need separate read and
>> write counters, and the weight would obviously only get us the total
>> count. But we can have a slower path for that, just iterate the tags and
>> count them. The fast path only cares about total count.
>>
>> Let me try that out real quick.
>>
>> --
>> Jens Axboe
>>
> 
> 
> 
> Thanks,
> Ming Lei
> 


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29 16:00       ` Jens Axboe
@ 2017-06-29 18:42         ` Jens Axboe
  2017-06-30  1:20           ` Ming Lei
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-29 18:42 UTC (permalink / raw)
  To: Ming Lei
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 06/29/2017 10:00 AM, Jens Axboe wrote:
> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4 IO/s.
>>>>
>>>> This has been done before, but I've never really liked it. The reason is
>>>> that it means that reading the part stat inflight count now has to
>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>> How many CPUs were configured? When I last tested this a few years ago
>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>> latencies), it was a net loss.
>>>
>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>> 96 cores, and dual numa nodes) too, the performance can be
>>> recovered basically if per numa-node counter is introduced and
>>> used in this case, but the patch was never posted out.
>>> If anyone is interested in that, I can rebase the patch on current
>>> block tree and post out. I guess the performance issue might be
>>> related with system cache coherency implementation more or less.
>>> This issue on ARM64 can be observed with the following userspace
>>> atomic counting test too:
>>>
>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>
>> How well did the per-node thing work? Doesn't seem to me like it would
>> go far enough. And per CPU is too much. One potential improvement would
>> be to change the part_stat_read() to just loop online CPUs, instead of
>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>> ensure the stats are sane. Often there's a huge difference between
>> NR_CPUS configured and what the system has. As Brian states, RH ships
>> with 2048, while I doubt a lot of customers actually run that...
>>
>> Outside of coming up with a more clever data structure that is fully
>> CPU topology aware, one thing that could work is just having X cache
>> line separated read/write inflight counters per node, where X is some
>> suitable value (like 4). That prevents us from having cross node
>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>> provide a nice balance between cost of incrementing the inflight
>> counting, and the cost of looping for reading it.
>>
>> And that brings me to the next part...
>>
>>>> I do agree that we should do something about it, and it's one of those
>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>> up. It's just not great as it currently stands, but I don't think per
>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>> counter.
>>>
>>> Yeah, it won't be an issue for the non-mq path, and for the blk-mq path,
>>> maybe we can use some blk-mq knowledge (tagset?) to figure out the
>>> 'in_flight' counter. I thought about it before, but never got a
>>> perfect solution, and it looks like it is a bit hard, :-)
>>
>> The tags are already a bit spread out, so it's worth a shot. That would
>> remove the need to do anything in the inc/dec path, as the tags already
>> do that. The inflight count could be easily retrieved with
>> sbitmap_weight(). The only issue here is that we need separate read and
>> write counters, and the weight would obviously only get us the total
>> count. But we can have a slower path for that, just iterate the tags and
>> count them. The fast path only cares about total count.
>>
>> Let me try that out real quick.
> 
> Well, that only works for whole disk stats, of course... There's no way
> around iterating the tags and checking for this to truly work.

Totally untested proof of concept for using the tags for this. I based
this on top of Brian's patch, so it includes his patch plus the
_double() stuff I did which is no longer really needed.


diff --git a/block/bio.c b/block/bio.c
index 9cf98b29588a..ec99d9ba0f33 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
 	part_round_stats(cpu, part);
 	part_stat_inc(cpu, part, ios[rw]);
 	part_stat_add(cpu, part, sectors[rw], sectors);
-	part_inc_in_flight(part, rw);
+	part_inc_in_flight(cpu, part, rw);
 
 	part_stat_unlock();
 }
@@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
 
 	part_stat_add(cpu, part, ticks[rw], duration);
 	part_round_stats(cpu, part);
-	part_dec_in_flight(part, rw);
+	part_dec_in_flight(cpu, part, rw);
 
 	part_stat_unlock();
 }
diff --git a/block/blk-core.c b/block/blk-core.c
index af393d5a9680..6ab2efbe940b 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
 
 		part_stat_inc(cpu, part, ios[rw]);
 		part_stat_add(cpu, part, ticks[rw], duration);
-		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rw);
+
+		if (req->q->mq_ops)
+			part_round_stats_mq(req->q, cpu, part);
+		else {
+			part_round_stats(cpu, part);
+			part_dec_in_flight(cpu, part, rw);
+		}
 
 		hd_struct_put(part);
 		part_stat_unlock();
@@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
 			part = &rq->rq_disk->part0;
 			hd_struct_get(part);
 		}
-		part_round_stats(cpu, part);
-		part_inc_in_flight(part, rw);
+		if (rq->q->mq_ops)
+			part_round_stats_mq(rq->q, cpu, part);
+		else {
+			part_round_stats(cpu, part);
+			part_inc_in_flight(cpu, part, rw);
+		}
 		rq->part = part;
 	}
 
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 99038830fb42..3b5eb2d4b964 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
 		part = req->part;
 
 		part_round_stats(cpu, part);
-		part_dec_in_flight(part, rq_data_dir(req));
+		part_dec_in_flight(cpu, part, rq_data_dir(req));
 
 		hd_struct_put(part);
 		part_stat_unlock();
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index d0be72ccb091..a7b897740c47 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
 		bitnr += tags->nr_reserved_tags;
 	rq = tags->rqs[bitnr];
 
-	if (rq->q == hctx->queue)
+	if (rq && rq->q == hctx->queue)
 		iter_data->fn(hctx, rq, iter_data->data, reserved);
 	return true;
 }
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 05dfa3f270ae..cad4d2c26285 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
 static void blk_mq_poll_stats_start(struct request_queue *q);
 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
 
+struct mq_inflight {
+	struct hd_struct *part;
+	unsigned int inflight;
+};
+
+static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
+				  struct request *rq, void *priv,
+				  bool reserved)
+{
+	struct mq_inflight *mi = priv;
+
+	if (rq->part == mi->part &&
+	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
+		mi->inflight++;
+}
+
+unsigned long part_in_flight_mq(struct request_queue *q,
+				struct hd_struct *part)
+{
+	struct mq_inflight mi = { .part = part, .inflight = 0 };
+
+	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+	return mi.inflight;
+}
+
+static void __part_round_stats_mq(struct request_queue *q, int cpu,
+				  struct hd_struct *part, unsigned long now)
+{
+	unsigned long inflight;
+
+	if (now == part->stamp)
+		return;
+
+	inflight = part_in_flight_mq(q, part);
+	if (inflight) {
+		__part_stat_add(cpu, part, time_in_queue,
+				inflight * (now - part->stamp));
+		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
+	}
+	part->stamp = now;
+}
+
+void part_round_stats_mq(struct request_queue *q, int cpu,
+			 struct hd_struct *part)
+{
+	unsigned long now = jiffies;
+
+	if (part->partno)
+		__part_round_stats_mq(q, cpu, &part_to_disk(part)->part0, now);
+	__part_round_stats_mq(q, cpu, part, now);
+}
+
 static int blk_mq_poll_stats_bkt(const struct request *rq)
 {
 	int ddir, bytes, bucket;
diff --git a/block/blk.h b/block/blk.h
index 01ebb8185f6b..4803289467ac 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -264,6 +264,11 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
 		q->last_merge = NULL;
 }
 
+extern void part_round_stats_mq(struct request_queue *q, int cpu,
+					struct hd_struct *part);
+extern unsigned long part_in_flight_mq(struct request_queue *q,
+					struct hd_struct *part);
+
 /*
  * Internal io_context interface
  */
diff --git a/block/genhd.c b/block/genhd.c
index 7f520fa25d16..8ec19773ce68 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1204,6 +1204,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	struct disk_part_iter piter;
 	struct hd_struct *hd;
 	char buf[BDEVNAME_SIZE];
+	unsigned long inflight;
 	int cpu;
 
 	/*
@@ -1217,10 +1218,17 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 	disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
 	while ((hd = disk_part_iter_next(&piter))) {
 		cpu = part_stat_lock();
-		part_round_stats(cpu, hd);
+		if (gp->queue->mq_ops)
+			part_round_stats_mq(gp->queue, cpu, hd);
+		else
+			part_round_stats(cpu, hd);
 		part_stat_unlock();
+		if (gp->queue->mq_ops)
+			inflight = part_in_flight_mq(gp->queue, hd);
+		else
+			inflight = part_in_flight(hd);
 		seq_printf(seqf, "%4d %7d %s %lu %lu %lu "
-			   "%u %lu %lu %lu %u %u %u %u\n",
+			   "%u %lu %lu %lu %u %lu %u %u\n",
 			   MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
 			   disk_name(gp, hd->partno, buf),
 			   part_stat_read(hd, ios[READ]),
@@ -1231,7 +1239,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
 			   part_stat_read(hd, merges[WRITE]),
 			   part_stat_read(hd, sectors[WRITE]),
 			   jiffies_to_msecs(part_stat_read(hd, ticks[WRITE])),
-			   part_in_flight(hd),
+			   inflight,
 			   jiffies_to_msecs(part_stat_read(hd, io_ticks)),
 			   jiffies_to_msecs(part_stat_read(hd, time_in_queue))
 			);
diff --git a/block/partition-generic.c b/block/partition-generic.c
index c5ec8246e25e..94aa92c3c010 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -19,6 +19,7 @@
 #include <linux/blktrace_api.h>
 
 #include "partitions/check.h"
+#include "blk.h"
 
 #ifdef CONFIG_BLK_DEV_MD
 extern void md_autodetect_dev(dev_t dev);
@@ -111,16 +112,27 @@ static ssize_t part_discard_alignment_show(struct device *dev,
 ssize_t part_stat_show(struct device *dev,
 		       struct device_attribute *attr, char *buf)
 {
+	struct gendisk *disk = dev_to_disk(dev);
 	struct hd_struct *p = dev_to_part(dev);
+	unsigned long inflight;
 	int cpu;
 
 	cpu = part_stat_lock();
-	part_round_stats(cpu, p);
+	if (disk->queue->mq_ops)
+		part_round_stats_mq(disk->queue, cpu, p);
+	else
+		part_round_stats(cpu, p);
 	part_stat_unlock();
+
+	if (disk->queue->mq_ops)
+		inflight = part_in_flight_mq(disk->queue, p);
+	else
+		inflight = part_in_flight(p);
+
 	return sprintf(buf,
 		"%8lu %8lu %8llu %8u "
 		"%8lu %8lu %8llu %8u "
-		"%8u %8u %8u"
+		"%8lu %8u %8u"
 		"\n",
 		part_stat_read(p, ios[READ]),
 		part_stat_read(p, merges[READ]),
@@ -130,7 +142,7 @@ ssize_t part_stat_show(struct device *dev,
 		part_stat_read(p, merges[WRITE]),
 		(unsigned long long)part_stat_read(p, sectors[WRITE]),
 		jiffies_to_msecs(part_stat_read(p, ticks[WRITE])),
-		part_in_flight(p),
+		inflight,
 		jiffies_to_msecs(part_stat_read(p, io_ticks)),
 		jiffies_to_msecs(part_stat_read(p, time_in_queue)));
 }
@@ -140,8 +152,8 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
-		atomic_read(&p->in_flight[1]));
+	return sprintf(buf, "%8lu %8lu\n", part_stat_read(p, in_flight[0]),
+		part_stat_read(p, in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 402946035308..1034abffd10d 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -517,9 +517,9 @@ static void start_io_acct(struct dm_io *io)
 
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
+	part_inc_in_flight(cpu, &dm_disk(md)->part0, rw);
+	atomic_inc(&md->pending[rw]);
 	part_stat_unlock();
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
-		atomic_inc_return(&md->pending[rw]));
 
 	if (unlikely(dm_stats_used(&md->stats)))
 		dm_stats_account_io(&md->stats, bio_data_dir(bio),
@@ -532,7 +532,7 @@ static void end_io_acct(struct dm_io *io)
 	struct mapped_device *md = io->md;
 	struct bio *bio = io->bio;
 	unsigned long duration = jiffies - io->start_time;
-	int pending;
+	int pending, cpu;
 	int rw = bio_data_dir(bio);
 
 	generic_end_io_acct(rw, &dm_disk(md)->part0, io->start_time);
@@ -546,9 +546,11 @@ static void end_io_acct(struct dm_io *io)
 	 * After this is decremented the bio must not be touched if it is
 	 * a flush.
 	 */
+	cpu = part_stat_lock();
 	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+	part_dec_in_flight(cpu, &dm_disk(md)->part0, rw);
 	pending += atomic_read(&md->pending[rw^0x1]);
+	part_stat_unlock();
 
 	/* nudge anyone waiting on suspend queue */
 	if (!pending)
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e619fae2f037..d050a509bdd4 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -87,6 +87,7 @@ struct disk_stats {
 	unsigned long ticks[2];
 	unsigned long io_ticks;
 	unsigned long time_in_queue;
+	unsigned long in_flight[2];
 };
 
 #define PARTITION_META_INFO_VOLNAMELTH	64
@@ -120,7 +121,6 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -292,6 +292,17 @@ extern struct hd_struct *disk_map_sector_rcu(struct gendisk *disk,
 #define __part_stat_add(cpu, part, field, addnd)			\
 	(per_cpu_ptr((part)->dkstats, (cpu))->field += (addnd))
 
+#define part_stat_read_double(part, field1, field2)					\
+({									\
+	typeof((part)->dkstats->field1) res = 0;				\
+	unsigned int _cpu;						\
+	for_each_possible_cpu(_cpu) {					\
+		res += per_cpu_ptr((part)->dkstats, _cpu)->field1;	\
+		res += per_cpu_ptr((part)->dkstats, _cpu)->field2;	\
+	}								\
+	res;								\
+})
+
 #define part_stat_read(part, field)					\
 ({									\
 	typeof((part)->dkstats->field) res = 0;				\
@@ -362,23 +373,23 @@ static inline void free_part_stats(struct hd_struct *part)
 #define part_stat_sub(cpu, gendiskp, field, subnd)			\
 	part_stat_add(cpu, gendiskp, field, -subnd)
 
-static inline void part_inc_in_flight(struct hd_struct *part, int rw)
+static inline void part_inc_in_flight(int cpu, struct hd_struct *part, int rw)
 {
-	atomic_inc(&part->in_flight[rw]);
+	part_stat_inc(cpu, part, in_flight[rw]);
 	if (part->partno)
-		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_inc(cpu, &part_to_disk(part)->part0, in_flight[rw]);
 }
 
-static inline void part_dec_in_flight(struct hd_struct *part, int rw)
+static inline void part_dec_in_flight(int cpu, struct hd_struct *part, int rw)
 {
-	atomic_dec(&part->in_flight[rw]);
+	part_stat_dec(cpu, part, in_flight[rw]);
 	if (part->partno)
-		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+		part_stat_dec(cpu, &part_to_disk(part)->part0, in_flight[rw]);
 }
 
-static inline int part_in_flight(struct hd_struct *part)
+static inline unsigned long part_in_flight(struct hd_struct *part)
 {
-	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
+	return part_stat_read_double(part, in_flight[0], in_flight[1]);
 }
 
 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)

-- 
Jens Axboe

^ permalink raw reply related	[flat|nested] 35+ messages in thread
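
For the separate READ/WRITE counts mentioned above (as part_inflight_show
wants), a hypothetical extension of the proof of concept, in the same
blk-mq.c context and with the same tag iterator; the names are made up and
this is only a sketch of the "slower path" Jens describes:

struct mq_inflight_rw {
	struct hd_struct *part;
	unsigned int inflight[2];	/* [0] = READ, [1] = WRITE */
};

static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				     struct request *rq, void *priv,
				     bool reserved)
{
	struct mq_inflight_rw *mi = priv;

	/* Same started-request filter as the total count above. */
	if (rq->part == mi->part &&
	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		mi->inflight[rq_data_dir(rq)]++;
}

static void part_in_flight_mq_rw(struct request_queue *q,
				 struct hd_struct *part,
				 unsigned int inflight[2])
{
	struct mq_inflight_rw mi = { .part = part, };

	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
	inflight[0] = mi.inflight[0];
	inflight[1] = mi.inflight[1];
}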

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
@ 2017-06-30  1:08             ` Ming Lei
  0 siblings, 0 replies; 35+ messages in thread
From: Ming Lei @ 2017-06-30  1:08 UTC (permalink / raw)
  To: Brian King
  Cc: Ming Lei, Jens Axboe, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On Thu, Jun 29, 2017 at 12:31:05PM -0500, Brian King wrote:
> On 06/29/2017 11:25 AM, Ming Lei wrote:
> > On Thu, Jun 29, 2017 at 11:58 PM, Jens Axboe <axboe@kernel.dk> wrote:
> >> On 06/29/2017 02:40 AM, Ming Lei wrote:
> >>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
> >>>> On 06/28/2017 03:12 PM, Brian King wrote:
> >>>>> This patch converts the in_flight counter in struct hd_struct from a
> >>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
> >>>>> of atomics from the hot path. When running this on a Power system, to
> >>>>> a single null_blk device with 80 submission queues, irq mode 0, with
> >>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4 IO/s.
> >>>>
> >>>> This has been done before, but I've never really liked it. The reason is
> >>>> that it means that reading the part stat inflight count now has to
> >>>> iterate over every possible CPU. Did you use partitions in your testing?
> >>>> How many CPUs were configured? When I last tested this a few years ago
> >>>> on even a quad core nehalem (which is notoriously shitty for cross-node
> >>>> latencies), it was a net loss.
> >>>
> >>> One year ago, I saw null_blk's IOPS can be decreased to 10%
> >>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
> >>> 96 cores, and dual numa nodes) too, the performance can be
> >>> recovered basically if per numa-node counter is introduced and
> >>> used in this case, but the patch was never posted out.
> >>> If anyone is interested in that, I can rebase the patch on current
> >>> block tree and post out. I guess the performance issue might be
> >>> related with system cache coherency implementation more or less.
> >>> This issue on ARM64 can be observed with the following userspace
> >>> atomic counting test too:
> >>>
> >>>        http://kernel.ubuntu.com/~ming/test/cache/
> >>
> >> How well did the per-node thing work? Doesn't seem to me like it would
> > 
> > Last time, on ARM64, I remembered that the IOPS was basically recovered,
> > but now I don't have a such machine to test. Could Brian test the attached patch
> > to see if it works on big Power machine?
> > 
> > And the idea is simple, just make the atomic counter per-node.
> 
> I tried loading the patch and get an oops right away on boot. Haven't been able to 
> debug anything yet. This is on top of an older kernel, so not sure if that is
> the issue or not. I can try upstream and see if i have different results...
> 
> Ubuntu 16.04 .  .  .  . Unable to handle kernel paging request for data at address 0xc00031313a333532
> Faulting instruction address: 0xc0000000002552c4
> Oops: Kernel access of bad area, sig: 11 [#1]
> SMP NR_CPUS=1024 NUMA 
> PowerNV
> Modules linked in: dm_round_robin vmx_crypto powernv_rng leds_powernv powernv_op_panel led_class rng_core dm_multipath autofs4 raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq multipath dm_mirror dm_region_hash dm_log cxlflash bnx2x mdio libcrc32c nvme nvme_core lpfc cxl crc_t10dif crct10dif_generic crct10dif_common
> CPU: 9 PID: 3485 Comm: multipathd Not tainted 4.9.10-dirty #7
> task: c000000fba0d0000 task.stack: c000000fb8a1c000
> NIP: c0000000002552c4 LR: c000000000255274 CTR: 0000000000000000
> REGS: c000000fb8a1f350 TRAP: 0300   Not tainted  (4.9.10-dirty)
> MSR: 900000000280b033 <SF,HV,VEC,VSX,EE,FP,ME,IR,DR,RI,LE>  CR: 24028848  XER: 00000000
> CFAR: c000000000008a60 DAR: c00031313a333532 DSISR: 40000000 SOFTE: 1 
> GPR00: c0000000001f8980 c000000fb8a1f5d0 c000000000f24300 c000000fc4007c00 
> GPR04: 00000000024000c0 c0000000002fc0ac 000000000000025d c000000fc5d346e0 
> GPR08: 0000000fc50d6000 0000000000000000 c00031313a333532 c000000fbc836240 
> GPR12: 0000000000002200 c00000000ff02400 00003fff79cfebf0 0000000000000000 
> GPR16: 00000000c138fd03 000001003a12bf70 00003fff79ae75e0 00003fff79d269e8 
> GPR20: 0000000000000001 00003fff79cfebf0 000001003a393aa0 00003fff79d067b8 
> GPR24: 0000000000000000 c000000000b04948 000000000000a1ff c0000000002fc0ac 
> GPR28: 00000000024000c0 0000000000000007 c0000000002fc0ac c000000fc4007c00 
> NIP [c0000000002552c4] __kmalloc_track_caller+0x94/0x2f0
> LR [c000000000255274] __kmalloc_track_caller+0x44/0x2f0
> Call Trace:
> [c000000fb8a1f5d0] [c000000fb8a1f610] 0xc000000fb8a1f610 (unreliable)
> [c000000fb8a1f620] [c0000000001f8980] kstrdup+0x50/0xb0
> [c000000fb8a1f660] [c0000000002fc0ac] __kernfs_new_node+0x4c/0x140
> [c000000fb8a1f6b0] [c0000000002fd9f4] kernfs_new_node+0x34/0x80
> [c000000fb8a1f6e0] [c000000000300708] kernfs_create_link+0x38/0xf0
> [c000000fb8a1f720] [c000000000301cb8] sysfs_do_create_link_sd.isra.0+0xa8/0x160
> [c000000fb8a1f770] [c00000000054a658] device_add+0x2b8/0x740
> [c000000fb8a1f830] [c00000000054ae58] device_create_groups_vargs+0x178/0x190
> [c000000fb8a1f890] [c0000000001fcd70] bdi_register+0x80/0x1d0
> [c000000fb8a1f8c0] [c0000000001fd244] bdi_register_owner+0x44/0x80
> [c000000fb8a1f940] [c000000000453bbc] device_add_disk+0x1cc/0x500
> [c000000fb8a1f9f0] [c000000000764dec] dm_create+0x33c/0x5f0
> [c000000fb8a1fa90] [c00000000076d9ac] dev_create+0x8c/0x3e0
> [c000000fb8a1fb40] [c00000000076d1fc] ctl_ioctl+0x38c/0x580
> [c000000fb8a1fd20] [c00000000076d410] dm_ctl_ioctl+0x20/0x30
> [c000000fb8a1fd40] [c0000000002799ac] do_vfs_ioctl+0xcc/0x8f0
> [c000000fb8a1fde0] [c00000000027a230] SyS_ioctl+0x60/0xc0
> [c000000fb8a1fe30] [c00000000000bfe0] system_call+0x38/0xfc
> Instruction dump:
> 39290008 7cc8482a e94d0030 e9230000 7ce95214 7d49502a e9270010 2faa0000 
> 419e007c 2fa90000 419e0074 e93f0022 <7f4a482a> 39200000 88ad023a 992d023a 
> ---[ end trace dcdac2d3f668d033 ]---
>

Thanks for the test!

Looks like there is a double free in the patch, could you test this new one?

---
diff --git a/block/partition-generic.c b/block/partition-generic.c
index c5ec8246e25e..bd6644bf9643 100644
--- a/block/partition-generic.c
+++ b/block/partition-generic.c
@@ -140,8 +140,9 @@ ssize_t part_inflight_show(struct device *dev,
 {
 	struct hd_struct *p = dev_to_part(dev);
 
-	return sprintf(buf, "%8u %8u\n", atomic_read(&p->in_flight[0]),
-		atomic_read(&p->in_flight[1]));
+	return sprintf(buf, "%8u %8u\n",
+		pnode_counter_read_all(&p->in_flight[0]),
+		pnode_counter_read_all(&p->in_flight[1]));
 }
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 96bd13e581cd..aac0b2235410 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -521,7 +521,7 @@ static void start_io_acct(struct dm_io *io)
 	cpu = part_stat_lock();
 	part_round_stats(cpu, &dm_disk(md)->part0);
 	part_stat_unlock();
-	atomic_set(&dm_disk(md)->part0.in_flight[rw],
+	pnode_counter_set(&dm_disk(md)->part0.in_flight[rw],
 		atomic_inc_return(&md->pending[rw]));
 
 	if (unlikely(dm_stats_used(&md->stats)))
@@ -550,7 +550,7 @@ static void end_io_acct(struct dm_io *io)
 	 * a flush.
 	 */
 	pending = atomic_dec_return(&md->pending[rw]);
-	atomic_set(&dm_disk(md)->part0.in_flight[rw], pending);
+	pnode_counter_set(&dm_disk(md)->part0.in_flight[rw], pending);
 	pending += atomic_read(&md->pending[rw^0x1]);
 
 	/* nudge anyone waiting on suspend queue */
diff --git a/include/linux/genhd.h b/include/linux/genhd.h
index e619fae2f037..40c9bc74a120 100644
--- a/include/linux/genhd.h
+++ b/include/linux/genhd.h
@@ -15,6 +15,7 @@
 #include <linux/slab.h>
 #include <linux/percpu-refcount.h>
 #include <linux/uuid.h>
+#include <linux/pernode_counter.h>
 
 #ifdef CONFIG_BLOCK
 
@@ -120,7 +121,7 @@ struct hd_struct {
 	int make_it_fail;
 #endif
 	unsigned long stamp;
-	atomic_t in_flight[2];
+	struct pnode_counter in_flight[2];
 #ifdef	CONFIG_SMP
 	struct disk_stats __percpu *dkstats;
 #else
@@ -364,21 +365,22 @@ static inline void free_part_stats(struct hd_struct *part)
 
 static inline void part_inc_in_flight(struct hd_struct *part, int rw)
 {
-	atomic_inc(&part->in_flight[rw]);
+	pnode_counter_inc(&part->in_flight[rw]);
 	if (part->partno)
-		atomic_inc(&part_to_disk(part)->part0.in_flight[rw]);
+		pnode_counter_inc(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline void part_dec_in_flight(struct hd_struct *part, int rw)
 {
-	atomic_dec(&part->in_flight[rw]);
+	pnode_counter_dec(&part->in_flight[rw]);
 	if (part->partno)
-		atomic_dec(&part_to_disk(part)->part0.in_flight[rw]);
+		pnode_counter_dec(&part_to_disk(part)->part0.in_flight[rw]);
 }
 
 static inline int part_in_flight(struct hd_struct *part)
 {
-	return atomic_read(&part->in_flight[0]) + atomic_read(&part->in_flight[1]);
+	return pnode_counter_read_all(&part->in_flight[0]) + \
+		pnode_counter_read_all(&part->in_flight[1]);
 }
 
 static inline struct partition_meta_info *alloc_part_info(struct gendisk *disk)
@@ -627,11 +629,34 @@ extern ssize_t part_fail_store(struct device *dev,
 			       const char *buf, size_t count);
 #endif /* CONFIG_FAIL_MAKE_REQUEST */
 
+static inline int hd_counter_init(struct hd_struct *part)
+{
+	if (pnode_counter_init(&part->in_flight[0], GFP_KERNEL))
+		return -ENOMEM;
+	if (pnode_counter_init(&part->in_flight[1], GFP_KERNEL)) {
+		pnode_counter_deinit(&part->in_flight[0]);
+		return -ENOMEM;
+	}
+
+	return 0;
+}
+
+static inline void hd_counter_deinit(struct hd_struct *part)
+{
+	pnode_counter_deinit(&part->in_flight[0]);
+	pnode_counter_deinit(&part->in_flight[1]);
+}
+
 static inline int hd_ref_init(struct hd_struct *part)
 {
+	if (hd_counter_init(part))
+		return -ENOMEM;
+
 	if (percpu_ref_init(&part->ref, __delete_partition, 0,
-				GFP_KERNEL))
+				GFP_KERNEL)) {
+		hd_counter_deinit(part);
 		return -ENOMEM;
+	}
 	return 0;
 }
 
@@ -659,6 +684,7 @@ static inline void hd_free_part(struct hd_struct *part)
 {
 	free_part_stats(part);
 	free_part_info(part);
+	hd_counter_deinit(part);
 	percpu_ref_exit(&part->ref);
 }
 
diff --git a/include/linux/pernode_counter.h b/include/linux/pernode_counter.h
new file mode 100644
index 000000000000..e95b5f2d1d9c
--- /dev/null
+++ b/include/linux/pernode_counter.h
@@ -0,0 +1,117 @@
+#ifndef _LINUX_PERNODE_COUNTER_H
+#define _LINUX_PERNODE_COUNTER_H
+/*
+ * A simple per node atomic counter for use in block io accounting.
+ */
+
+#include <linux/smp.h>
+#include <linux/percpu.h>
+#include <linux/types.h>
+#include <linux/gfp.h>
+
+struct node_counter {
+	atomic_t counter_in_node;
+};
+
+struct pnode_counter {
+	struct node_counter * __percpu  *counter;
+	struct node_counter **nodes;
+};
+
+static inline int pnode_counter_init(struct pnode_counter *pnc, gfp_t gfp)
+{
+	struct node_counter **nodes;
+	int i, j, cpu;
+
+	nodes = kzalloc(nr_node_ids * sizeof(struct node_counter *), gfp);
+	if (!nodes)
+		goto err_nodes;
+
+	for_each_node(i) {
+		nodes[i] = kzalloc_node(sizeof(struct node_counter), gfp, i);
+		if (!nodes[i])
+			goto err_node_counter;
+	}
+
+	pnc->counter = alloc_percpu_gfp(struct node_counter *, gfp);
+	if (!pnc->counter)
+		goto err_node_counter;
+
+	for_each_possible_cpu(cpu)
+		*per_cpu_ptr(pnc->counter, cpu) = nodes[cpu_to_node(cpu)];
+
+	pnc->nodes = nodes;
+
+	return 0;
+
+ err_node_counter:
+	for (j = 0; j < i; j++)
+		kfree(nodes[j]);
+	kfree(nodes);
+ err_nodes:
+	return -ENOMEM;
+}
+
+static inline void pnode_counter_deinit(struct pnode_counter *pnc)
+{
+	int i;
+
+	for_each_node(i) {
+		struct node_counter *node = pnc->nodes[i];
+
+		kfree(node);
+	}
+	free_percpu(pnc->counter);
+	kfree(pnc->nodes);
+}
+
+static inline void pnode_counter_inc(struct pnode_counter *pnc)
+{
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	atomic_inc(&node->counter_in_node);
+}
+
+static inline void pnode_counter_inc_cpu(struct pnode_counter *pnc, int cpu)
+{
+	struct node_counter *node = *per_cpu_ptr(pnc->counter, cpu);
+
+	atomic_inc(&node->counter_in_node);
+}
+
+static inline void pnode_counter_dec(struct pnode_counter *pnc)
+{
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	atomic_dec(&node->counter_in_node);
+}
+
+static inline void pnode_counter_dec_cpu(struct pnode_counter *pnc, int cpu)
+{
+	struct node_counter *node = *per_cpu_ptr(pnc->counter, cpu);
+
+	atomic_dec(&node->counter_in_node);
+}
+
+static inline void pnode_counter_set(struct pnode_counter *pnc, int val)
+{
+	int i;
+	struct node_counter *node = this_cpu_read(*pnc->counter);
+
+	for_each_node(i)
+		atomic_set(&pnc->nodes[i]->counter_in_node, 0);
+	atomic_set(&node->counter_in_node, val);
+}
+
+static inline long pnode_counter_read_all(struct pnode_counter *pnc)
+{
+	int i;
+	long val = 0;
+
+	for_each_node(i)
+		val += atomic_read(&pnc->nodes[i]->counter_in_node);
+
+	return val;
+}
+
+#endif /* _LINUX_PERNODE_COUNTER_H */


Thanks,
Ming

^ permalink raw reply related	[flat|nested] 35+ messages in thread
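
A minimal usage sketch of the pernode_counter API above (the caller here is
hypothetical, not part of the patch): inc/dec only touch the local node's
atomic, while the read sums one atomic per node rather than one per possible
CPU. Even when a dec lands on a different node than the matching inc, the
per-node values simply offset each other and the summed total stays correct.

#include <linux/pernode_counter.h>

static struct pnode_counter example_inflight;

static int example_setup(void)
{
	return pnode_counter_init(&example_inflight, GFP_KERNEL);
}

static void example_io_start(void)
{
	pnode_counter_inc(&example_inflight);	/* atomic_inc on this node */
}

static void example_io_done(void)
{
	pnode_counter_dec(&example_inflight);	/* atomic_dec on this node */
}

static long example_in_flight(void)
{
	return pnode_counter_read_all(&example_inflight);	/* sums one atomic per node */
}

static void example_teardown(void)
{
	pnode_counter_deinit(&example_inflight);
}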

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-29 18:42         ` Jens Axboe
@ 2017-06-30  1:20           ` Ming Lei
  2017-06-30  2:17             ` Jens Axboe
  0 siblings, 1 reply; 35+ messages in thread
From: Ming Lei @ 2017-06-30  1:20 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4 IO/s.
>>>>>
>>>>> This has been done before, but I've never really liked it. The reason is
>>>>> that it means that reading the part stat inflight count now has to
>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>> latencies), it was a net loss.
>>>>
>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>> recovered basically if per numa-node counter is introduced and
>>>> used in this case, but the patch was never posted out.
>>>> If anyone is interested in that, I can rebase the patch on current
>>>> block tree and post out. I guess the performance issue might be
>>>> related with system cache coherency implementation more or less.
>>>> This issue on ARM64 can be observed with the following userspace
>>>> atomic counting test too:
>>>>
>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>
>>> How well did the per-node thing work? Doesn't seem to me like it would
>>> go far enough. And per CPU is too much. One potential improvement would
>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>> ensure the stats are sane. Often there's a huge difference between
>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>> with 2048, while I doubt a lot of customers actually run that...
>>>
>>> Outside of coming up with a more clever data structure that is fully
>>> CPU topology aware, one thing that could work is just having X cache
>>> line separated read/write inflight counters per node, where X is some
>>> suitable value (like 4). That prevents us from having cross node
>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>> provide a nice balance between cost of incrementing the inflight
>>> counting, and the cost of looping for reading it.
>>>
>>> And that brings me to the next part...
>>>
>>>>> I do agree that we should do something about it, and it's one of those
>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>> counter.
>>>>
>>>> Yeah, it won't be an issue for the non-mq path, and for the blk-mq path,
>>>> maybe we can use some blk-mq knowledge (tagset?) to figure out the
>>>> 'in_flight' counter. I thought about it before, but never got a
>>>> perfect solution, and it looks like it is a bit hard, :-)
>>>
>>> The tags are already a bit spread out, so it's worth a shot. That would
>>> remove the need to do anything in the inc/dec path, as the tags already
>>> do that. The inflight count could be easily retrieved with
>>> sbitmap_weight(). The only issue here is that we need separate read and
>>> write counters, and the weight would obviously only get us the total
>>> count. But we can have a slower path for that, just iterate the tags and
>>> count them. The fast path only cares about total count.
>>>
>>> Let me try that out real quick.
>>
>> Well, that only works for whole disk stats, of course... There's no way
>> around iterating the tags and checking for this to truly work.
>
> Totally untested proof of concept for using the tags for this. I based
> this on top of Brian's patch, so it includes his patch plus the
> _double() stuff I did which is no longer really needed.
>
>
> diff --git a/block/bio.c b/block/bio.c
> index 9cf98b29588a..ec99d9ba0f33 100644
> --- a/block/bio.c
> +++ b/block/bio.c
> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>         part_round_stats(cpu, part);
>         part_stat_inc(cpu, part, ios[rw]);
>         part_stat_add(cpu, part, sectors[rw], sectors);
> -       part_inc_in_flight(part, rw);
> +       part_inc_in_flight(cpu, part, rw);
>
>         part_stat_unlock();
>  }
> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>
>         part_stat_add(cpu, part, ticks[rw], duration);
>         part_round_stats(cpu, part);
> -       part_dec_in_flight(part, rw);
> +       part_dec_in_flight(cpu, part, rw);
>
>         part_stat_unlock();
>  }
> diff --git a/block/blk-core.c b/block/blk-core.c
> index af393d5a9680..6ab2efbe940b 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>
>                 part_stat_inc(cpu, part, ios[rw]);
>                 part_stat_add(cpu, part, ticks[rw], duration);
> -               part_round_stats(cpu, part);
> -               part_dec_in_flight(part, rw);
> +
> +               if (req->q->mq_ops)
> +                       part_round_stats_mq(req->q, cpu, part);
> +               else {
> +                       part_round_stats(cpu, part);
> +                       part_dec_in_flight(cpu, part, rw);
> +               }
>
>                 hd_struct_put(part);
>                 part_stat_unlock();
> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>                         part = &rq->rq_disk->part0;
>                         hd_struct_get(part);
>                 }
> -               part_round_stats(cpu, part);
> -               part_inc_in_flight(part, rw);
> +               if (rq->q->mq_ops)
> +                       part_round_stats_mq(rq->q, cpu, part);
> +               else {
> +                       part_round_stats(cpu, part);
> +                       part_inc_in_flight(cpu, part, rw);
> +               }
>                 rq->part = part;
>         }
>
> diff --git a/block/blk-merge.c b/block/blk-merge.c
> index 99038830fb42..3b5eb2d4b964 100644
> --- a/block/blk-merge.c
> +++ b/block/blk-merge.c
> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>                 part = req->part;
>
>                 part_round_stats(cpu, part);
> -               part_dec_in_flight(part, rq_data_dir(req));
> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>
>                 hd_struct_put(part);
>                 part_stat_unlock();
> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
> index d0be72ccb091..a7b897740c47 100644
> --- a/block/blk-mq-tag.c
> +++ b/block/blk-mq-tag.c
> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>                 bitnr += tags->nr_reserved_tags;
>         rq = tags->rqs[bitnr];
>
> -       if (rq->q == hctx->queue)
> +       if (rq && rq->q == hctx->queue)
>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>         return true;
>  }
> diff --git a/block/blk-mq.c b/block/blk-mq.c
> index 05dfa3f270ae..cad4d2c26285 100644
> --- a/block/blk-mq.c
> +++ b/block/blk-mq.c
> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>  static void blk_mq_poll_stats_start(struct request_queue *q);
>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>
> +struct mq_inflight {
> +       struct hd_struct *part;
> +       unsigned int inflight;
> +};
> +
> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
> +                                 struct request *rq, void *priv,
> +                                 bool reserved)
> +{
> +       struct mq_inflight *mi = priv;
> +
> +       if (rq->part == mi->part &&
> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
> +               mi->inflight++;
> +}
> +
> +unsigned long part_in_flight_mq(struct request_queue *q,
> +                               struct hd_struct *part)
> +{
> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
> +
> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
> +       return mi.inflight;
> +}

Compared with the fully percpu approach, this way might help the 1:M or
N:M mappings, but it won't help the 1:1 mapping (NVMe), where each hctx
is mapped to a single CPU (especially when there are a huge number of
hw queues on a big system), :-(


Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 35+ messages in thread
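
For comparison, a hypothetical sketch of the "X cache line separated
read/write inflight counters per node" idea quoted above, with X = 4; neither
posted patch implements this and every name here is made up. A CPU hashes onto
one of its node's slots, so increments never cross nodes and rarely share a
cache line, while a read loops over nodes * 4 slots instead of every possible
CPU:

#include <linux/atomic.h>
#include <linux/cache.h>
#include <linux/nodemask.h>
#include <linux/slab.h>
#include <linux/smp.h>
#include <linux/topology.h>

#define INFLIGHT_SLOTS_PER_NODE	4

struct inflight_slot {
	atomic_t cnt[2];		/* [0] = READ, [1] = WRITE */
} ____cacheline_aligned_in_smp;

struct node_inflight {
	struct inflight_slot *slots[MAX_NUMNODES];
};

static int node_inflight_init(struct node_inflight *ni)
{
	int node;

	for_each_node(node) {
		ni->slots[node] = kzalloc_node(INFLIGHT_SLOTS_PER_NODE *
					       sizeof(struct inflight_slot),
					       GFP_KERNEL, node);
		if (!ni->slots[node])
			return -ENOMEM;	/* caller unwinds what was allocated */
	}
	return 0;
}

/* Callers are expected to have preemption disabled, as part_stat_lock() does. */
static void node_inflight_inc(struct node_inflight *ni, int rw)
{
	int slot = smp_processor_id() % INFLIGHT_SLOTS_PER_NODE;

	atomic_inc(&ni->slots[numa_node_id()][slot].cnt[rw]);
}

static void node_inflight_dec(struct node_inflight *ni, int rw)
{
	int slot = smp_processor_id() % INFLIGHT_SLOTS_PER_NODE;

	atomic_dec(&ni->slots[numa_node_id()][slot].cnt[rw]);
}

static unsigned int node_inflight_read(struct node_inflight *ni, int rw)
{
	unsigned int sum = 0;
	int node, slot;

	for_each_node(node)
		for (slot = 0; slot < INFLIGHT_SLOTS_PER_NODE; slot++)
			sum += atomic_read(&ni->slots[node][slot].cnt[rw]);
	return sum;
}

Since an I/O may be decremented from a different slot than it was incremented
on, individual slots can go negative; the summed total still comes out right.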

* Re: [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30  1:20           ` Ming Lei
@ 2017-06-30  2:17             ` Jens Axboe
  2017-06-30 13:05                 ` Brian King
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-30  2:17 UTC (permalink / raw)
  To: Ming Lei
  Cc: Brian King, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 06/29/2017 07:20 PM, Ming Lei wrote:
> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4 IO/s.
>>>>>>
>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>> that it means that reading the part stat inflight count now has to
>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>> latencies), it was a net loss.
>>>>>
>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>> recovered basically if per numa-node counter is introduced and
>>>>> used in this case, but the patch was never posted out.
>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>> block tree and post out. I guess the performance issue might be
>>>>> related with system cache coherency implementation more or less.
>>>>> This issue on ARM64 can be observed with the following userspace
>>>>> atomic counting test too:
>>>>>
>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>
>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>> go far enough. And per CPU is too much. One potential improvement would
>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>> ensure the stats are sane. Often there's a huge difference between
>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>
>>>> Outside of coming up with a more clever data structure that is fully
>>>> CPU topology aware, one thing that could work is just having X cache
>>>> line separated read/write inflight counters per node, where X is some
>>>> suitable value (like 4). That prevents us from having cross node
>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>> provide a nice balance between cost of incrementing the inflight
>>>> counting, and the cost of looping for reading it.
>>>>
>>>> And that brings me to the next part...
>>>>
>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>> counter.
>>>>>
>>>>> Yeah, it won't be an issue for the non-mq path, and for the blk-mq path,
>>>>> maybe we can use some blk-mq knowledge (tagset?) to figure out the
>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>> perfect solution, and it looks like it is a bit hard, :-)
>>>>
>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>> do that. The inflight count could be easily retrieved with
>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>> write counters, and the weight would obviously only get us the total
>>>> count. But we can have a slower path for that, just iterate the tags and
>>>> count them. The fast path only cares about total count.
>>>>
>>>> Let me try that out real quick.
>>>
>>> Well, that only works for whole disk stats, of course... There's no way
>>> around iterating the tags and checking for this to truly work.
>>
>> Totally untested proof of concept for using the tags for this. I based
>> this on top of Brian's patch, so it includes his patch plus the
>> _double() stuff I did which is no longer really needed.
>>
>>
>> diff --git a/block/bio.c b/block/bio.c
>> index 9cf98b29588a..ec99d9ba0f33 100644
>> --- a/block/bio.c
>> +++ b/block/bio.c
>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>         part_round_stats(cpu, part);
>>         part_stat_inc(cpu, part, ios[rw]);
>>         part_stat_add(cpu, part, sectors[rw], sectors);
>> -       part_inc_in_flight(part, rw);
>> +       part_inc_in_flight(cpu, part, rw);
>>
>>         part_stat_unlock();
>>  }
>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>
>>         part_stat_add(cpu, part, ticks[rw], duration);
>>         part_round_stats(cpu, part);
>> -       part_dec_in_flight(part, rw);
>> +       part_dec_in_flight(cpu, part, rw);
>>
>>         part_stat_unlock();
>>  }
>> diff --git a/block/blk-core.c b/block/blk-core.c
>> index af393d5a9680..6ab2efbe940b 100644
>> --- a/block/blk-core.c
>> +++ b/block/blk-core.c
>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>
>>                 part_stat_inc(cpu, part, ios[rw]);
>>                 part_stat_add(cpu, part, ticks[rw], duration);
>> -               part_round_stats(cpu, part);
>> -               part_dec_in_flight(part, rw);
>> +
>> +               if (req->q->mq_ops)
>> +                       part_round_stats_mq(req->q, cpu, part);
>> +               else {
>> +                       part_round_stats(cpu, part);
>> +                       part_dec_in_flight(cpu, part, rw);
>> +               }
>>
>>                 hd_struct_put(part);
>>                 part_stat_unlock();
>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>                         part = &rq->rq_disk->part0;
>>                         hd_struct_get(part);
>>                 }
>> -               part_round_stats(cpu, part);
>> -               part_inc_in_flight(part, rw);
>> +               if (rq->q->mq_ops)
>> +                       part_round_stats_mq(rq->q, cpu, part);
>> +               else {
>> +                       part_round_stats(cpu, part);
>> +                       part_inc_in_flight(cpu, part, rw);
>> +               }
>>                 rq->part = part;
>>         }
>>
>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>> index 99038830fb42..3b5eb2d4b964 100644
>> --- a/block/blk-merge.c
>> +++ b/block/blk-merge.c
>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>                 part = req->part;
>>
>>                 part_round_stats(cpu, part);
>> -               part_dec_in_flight(part, rq_data_dir(req));
>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>
>>                 hd_struct_put(part);
>>                 part_stat_unlock();
>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>> index d0be72ccb091..a7b897740c47 100644
>> --- a/block/blk-mq-tag.c
>> +++ b/block/blk-mq-tag.c
>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>                 bitnr += tags->nr_reserved_tags;
>>         rq = tags->rqs[bitnr];
>>
>> -       if (rq->q == hctx->queue)
>> +       if (rq && rq->q == hctx->queue)
>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>         return true;
>>  }
>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>> index 05dfa3f270ae..cad4d2c26285 100644
>> --- a/block/blk-mq.c
>> +++ b/block/blk-mq.c
>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>
>> +struct mq_inflight {
>> +       struct hd_struct *part;
>> +       unsigned int inflight;
>> +};
>> +
>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>> +                                 struct request *rq, void *priv,
>> +                                 bool reserved)
>> +{
>> +       struct mq_inflight *mi = priv;
>> +
>> +       if (rq->part == mi->part &&
>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>> +               mi->inflight++;
>> +}
>> +
>> +unsigned long part_in_flight_mq(struct request_queue *q,
>> +                               struct hd_struct *part)
>> +{
>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>> +
>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>> +       return mi.inflight;
>> +}
> 
> Compared with the totally percpu approach, this way might help 1:M or
> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
> each CPU(especially there are huge hw queues on a big system), :-(

Not disagreeing with that, without having some mechanism to only
loop queues that have pending requests. That would be similar to the
ctx_map for sw to hw queues. But I don't think that would be worthwhile
doing, I like your pnode approach better. However, I'm still not fully
convinced that one per node is enough to get the scalability we need.
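
To make that concrete, here is a minimal sketch (illustration only, not code
from either posted patch; the slot layout and helper names are invented for
the example) of what X cache line separated read/write counters per node
could look like:

#define NR_INFLIGHT_SLOTS	4	/* the "X" slots per node */

struct inflight_slot {
	atomic_t cnt[2];		/* read/write */
} ____cacheline_aligned_in_smp;

/*
 * slots[] would be allocated once per hd_struct, e.g. with
 * kcalloc(nr_node_ids * NR_INFLIGHT_SLOTS, sizeof(*slots), GFP_KERNEL).
 */
static inline atomic_t *inflight_cnt(struct inflight_slot *slots, int cpu,
				     int rw)
{
	int slot = cpu_to_node(cpu) * NR_INFLIGHT_SLOTS +
		   cpu % NR_INFLIGHT_SLOTS;

	return &slots[slot].cnt[rw];
}

static inline void inflight_inc(struct inflight_slot *slots, int cpu, int rw)
{
	atomic_inc(inflight_cnt(slots, cpu, rw));
}

static inline void inflight_dec(struct inflight_slot *slots, int cpu, int rw)
{
	atomic_dec(inflight_cnt(slots, cpu, rw));
}

static inline unsigned long inflight_sum(struct inflight_slot *slots, int rw)
{
	unsigned long sum = 0;
	int i;

	for (i = 0; i < nr_node_ids * NR_INFLIGHT_SLOTS; i++)
		sum += atomic_read(&slots[i].cnt[rw]);

	return sum;
}

Writers stay on their own node (and mostly on their own cache line), while a
reader sums nr_node_ids * NR_INFLIGHT_SLOTS slots instead of one counter per
possible CPU.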

Would be great if Brian could re-test with your updated patch, so we
know how it works for him at least.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30  2:17             ` Jens Axboe
@ 2017-06-30 13:05                 ` Brian King
  0 siblings, 0 replies; 35+ messages in thread
From: Brian King @ 2017-06-30 13:05 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/29/2017 09:17 PM, Jens Axboe wrote:
> On 06/29/2017 07:20 PM, Ming Lei wrote:
>> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>>>>>
>>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>>> that it means that reading the part stat inflight count now has to
>>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>>> latencies), it was a net loss.
>>>>>>
>>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>>> recovered basically if per numa-node counter is introduced and
>>>>>> used in this case, but the patch was never posted out.
>>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>>> block tree and post out. I guess the performance issue might be
>>>>>> related with system cache coherency implementation more or less.
>>>>>> This issue on ARM64 can be observed with the following userspace
>>>>>> atomic counting test too:
>>>>>>
>>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>>
>>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>>> go far enough. And per CPU is too much. One potential improvement would
>>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>>> ensure the stats are sane. Often there's a huge difference between
>>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>>
>>>>> Outside of coming up with a more clever data structure that is fully
>>>>> CPU topology aware, one thing that could work is just having X cache
>>>>> line separated read/write inflight counters per node, where X is some
>>>>> suitable value (like 4). That prevents us from having cross node
>>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>>> provide a nice balance between cost of incrementing the inflight
>>>>> counting, and the cost of looping for reading it.
>>>>>
>>>>> And that brings me to the next part...
>>>>>
>>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>>> counter.
>>>>>>
>>>>>> Yeah, it won't be an issue for non-mq path, and for blk-mq path, maybe
>>>>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>>> perfect solution, and looks it is a bit hard, :-)
>>>>>
>>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>>> do that. The inflight count could be easily retrieved with
>>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>>> write counters, and the weight would obviously only get us the total
>>>>> count. But we can have a slower path for that, just iterate the tags and
>>>>> count them. The fast path only cares about total count.
>>>>>
>>>>> Let me try that out real quick.
>>>>
>>>> Well, that only works for whole disk stats, of course... There's no way
>>>> around iterating the tags and checking for this to truly work.
>>>
>>> Totally untested proof of concept for using the tags for this. I based
>>> this on top of Brian's patch, so it includes his patch plus the
>>> _double() stuff I did which is no longer really needed.
>>>
>>>
>>> diff --git a/block/bio.c b/block/bio.c
>>> index 9cf98b29588a..ec99d9ba0f33 100644
>>> --- a/block/bio.c
>>> +++ b/block/bio.c
>>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>>         part_round_stats(cpu, part);
>>>         part_stat_inc(cpu, part, ios[rw]);
>>>         part_stat_add(cpu, part, sectors[rw], sectors);
>>> -       part_inc_in_flight(part, rw);
>>> +       part_inc_in_flight(cpu, part, rw);
>>>
>>>         part_stat_unlock();
>>>  }
>>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>>
>>>         part_stat_add(cpu, part, ticks[rw], duration);
>>>         part_round_stats(cpu, part);
>>> -       part_dec_in_flight(part, rw);
>>> +       part_dec_in_flight(cpu, part, rw);
>>>
>>>         part_stat_unlock();
>>>  }
>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>> index af393d5a9680..6ab2efbe940b 100644
>>> --- a/block/blk-core.c
>>> +++ b/block/blk-core.c
>>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>>
>>>                 part_stat_inc(cpu, part, ios[rw]);
>>>                 part_stat_add(cpu, part, ticks[rw], duration);
>>> -               part_round_stats(cpu, part);
>>> -               part_dec_in_flight(part, rw);
>>> +
>>> +               if (req->q->mq_ops)
>>> +                       part_round_stats_mq(req->q, cpu, part);
>>> +               else {
>>> +                       part_round_stats(cpu, part);
>>> +                       part_dec_in_flight(cpu, part, rw);
>>> +               }
>>>
>>>                 hd_struct_put(part);
>>>                 part_stat_unlock();
>>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>>                         part = &rq->rq_disk->part0;
>>>                         hd_struct_get(part);
>>>                 }
>>> -               part_round_stats(cpu, part);
>>> -               part_inc_in_flight(part, rw);
>>> +               if (rq->q->mq_ops)
>>> +                       part_round_stats_mq(rq->q, cpu, part);
>>> +               else {
>>> +                       part_round_stats(cpu, part);
>>> +                       part_inc_in_flight(cpu, part, rw);
>>> +               }
>>>                 rq->part = part;
>>>         }
>>>
>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>> index 99038830fb42..3b5eb2d4b964 100644
>>> --- a/block/blk-merge.c
>>> +++ b/block/blk-merge.c
>>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>>                 part = req->part;
>>>
>>>                 part_round_stats(cpu, part);
>>> -               part_dec_in_flight(part, rq_data_dir(req));
>>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>>
>>>                 hd_struct_put(part);
>>>                 part_stat_unlock();
>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>> index d0be72ccb091..a7b897740c47 100644
>>> --- a/block/blk-mq-tag.c
>>> +++ b/block/blk-mq-tag.c
>>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>                 bitnr += tags->nr_reserved_tags;
>>>         rq = tags->rqs[bitnr];
>>>
>>> -       if (rq->q == hctx->queue)
>>> +       if (rq && rq->q == hctx->queue)
>>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>>         return true;
>>>  }
>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>> index 05dfa3f270ae..cad4d2c26285 100644
>>> --- a/block/blk-mq.c
>>> +++ b/block/blk-mq.c
>>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>
>>> +struct mq_inflight {
>>> +       struct hd_struct *part;
>>> +       unsigned int inflight;
>>> +};
>>> +
>>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>>> +                                 struct request *rq, void *priv,
>>> +                                 bool reserved)
>>> +{
>>> +       struct mq_inflight *mi = priv;
>>> +
>>> +       if (rq->part == mi->part &&
>>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>>> +               mi->inflight++;
>>> +}
>>> +
>>> +unsigned long part_in_flight_mq(struct request_queue *q,
>>> +                               struct hd_struct *part)
>>> +{
>>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>>> +
>>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>>> +       return mi.inflight;
>>> +}
>>
>> Compared with the totally percpu approach, this way might help 1:M or
>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>> each CPU(especially there are huge hw queues on a big system), :-(
> 
> Not disagreeing with that, without having some mechanism to only
> loop queues that have pending requests. That would be similar to the
> ctx_map for sw to hw queues. But I don't think that would be worthwhile
> doing, I like your pnode approach better. However, I'm still not fully
> convinced that one per node is enough to get the scalability we need.
> 
> Would be great if Brian could re-test with your updated patch, so we
> know how it works for him at least.

I'll try running with both approaches today and see how they compare.

Thanks!!!

Brian



-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 13:05                 ` Brian King
  (?)
@ 2017-06-30 14:08                 ` Jens Axboe
  2017-06-30 18:33                   ` Brian King
  2017-07-01  4:17                   ` Jens Axboe
  -1 siblings, 2 replies; 35+ messages in thread
From: Jens Axboe @ 2017-06-30 14:08 UTC (permalink / raw)
  To: Brian King, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 07:05 AM, Brian King wrote:
> On 06/29/2017 09:17 PM, Jens Axboe wrote:
>> On 06/29/2017 07:20 PM, Ming Lei wrote:
>>> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>>>>>>
>>>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>>>> that it means that reading the part stat inflight count now has to
>>>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>>>> latencies), it was a net loss.
>>>>>>>
>>>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>>>> recovered basically if per numa-node counter is introduced and
>>>>>>> used in this case, but the patch was never posted out.
>>>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>>>> block tree and post out. I guess the performance issue might be
>>>>>>> related with system cache coherency implementation more or less.
>>>>>>> This issue on ARM64 can be observed with the following userspace
>>>>>>> atomic counting test too:
>>>>>>>
>>>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>>>
>>>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>>>> go far enough. And per CPU is too much. One potential improvement would
>>>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>>>> ensure the stats are sane. Often there's a huge difference between
>>>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>>>
>>>>>> Outside of coming up with a more clever data structure that is fully
>>>>>> CPU topology aware, one thing that could work is just having X cache
>>>>>> line separated read/write inflight counters per node, where X is some
>>>>>> suitable value (like 4). That prevents us from having cross node
>>>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>>>> provide a nice balance between cost of incrementing the inflight
>>>>>> counting, and the cost of looping for reading it.
>>>>>>
>>>>>> And that brings me to the next part...
>>>>>>
>>>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>>>> counter.
>>>>>>>
>>>>>>> Yeah, it won't be an issue for non-mq path, and for blk-mq path, maybe
>>>>>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>>>> perfect solution, and looks it is a bit hard, :-)
>>>>>>
>>>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>>>> do that. The inflight count could be easily retrieved with
>>>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>>>> write counters, and the weight would obviously only get us the total
>>>>>> count. But we can have a slower path for that, just iterate the tags and
>>>>>> count them. The fast path only cares about total count.
>>>>>>
>>>>>> Let me try that out real quick.
>>>>>
>>>>> Well, that only works for whole disk stats, of course... There's no way
>>>>> around iterating the tags and checking for this to truly work.
>>>>
>>>> Totally untested proof of concept for using the tags for this. I based
>>>> this on top of Brian's patch, so it includes his patch plus the
>>>> _double() stuff I did which is no longer really needed.
>>>>
>>>>
>>>> diff --git a/block/bio.c b/block/bio.c
>>>> index 9cf98b29588a..ec99d9ba0f33 100644
>>>> --- a/block/bio.c
>>>> +++ b/block/bio.c
>>>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>>>         part_round_stats(cpu, part);
>>>>         part_stat_inc(cpu, part, ios[rw]);
>>>>         part_stat_add(cpu, part, sectors[rw], sectors);
>>>> -       part_inc_in_flight(part, rw);
>>>> +       part_inc_in_flight(cpu, part, rw);
>>>>
>>>>         part_stat_unlock();
>>>>  }
>>>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>>>
>>>>         part_stat_add(cpu, part, ticks[rw], duration);
>>>>         part_round_stats(cpu, part);
>>>> -       part_dec_in_flight(part, rw);
>>>> +       part_dec_in_flight(cpu, part, rw);
>>>>
>>>>         part_stat_unlock();
>>>>  }
>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>> index af393d5a9680..6ab2efbe940b 100644
>>>> --- a/block/blk-core.c
>>>> +++ b/block/blk-core.c
>>>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>>>
>>>>                 part_stat_inc(cpu, part, ios[rw]);
>>>>                 part_stat_add(cpu, part, ticks[rw], duration);
>>>> -               part_round_stats(cpu, part);
>>>> -               part_dec_in_flight(part, rw);
>>>> +
>>>> +               if (req->q->mq_ops)
>>>> +                       part_round_stats_mq(req->q, cpu, part);
>>>> +               else {
>>>> +                       part_round_stats(cpu, part);
>>>> +                       part_dec_in_flight(cpu, part, rw);
>>>> +               }
>>>>
>>>>                 hd_struct_put(part);
>>>>                 part_stat_unlock();
>>>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>>>                         part = &rq->rq_disk->part0;
>>>>                         hd_struct_get(part);
>>>>                 }
>>>> -               part_round_stats(cpu, part);
>>>> -               part_inc_in_flight(part, rw);
>>>> +               if (rq->q->mq_ops)
>>>> +                       part_round_stats_mq(rq->q, cpu, part);
>>>> +               else {
>>>> +                       part_round_stats(cpu, part);
>>>> +                       part_inc_in_flight(cpu, part, rw);
>>>> +               }
>>>>                 rq->part = part;
>>>>         }
>>>>
>>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>>> index 99038830fb42..3b5eb2d4b964 100644
>>>> --- a/block/blk-merge.c
>>>> +++ b/block/blk-merge.c
>>>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>>>                 part = req->part;
>>>>
>>>>                 part_round_stats(cpu, part);
>>>> -               part_dec_in_flight(part, rq_data_dir(req));
>>>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>>>
>>>>                 hd_struct_put(part);
>>>>                 part_stat_unlock();
>>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>>> index d0be72ccb091..a7b897740c47 100644
>>>> --- a/block/blk-mq-tag.c
>>>> +++ b/block/blk-mq-tag.c
>>>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>>                 bitnr += tags->nr_reserved_tags;
>>>>         rq = tags->rqs[bitnr];
>>>>
>>>> -       if (rq->q == hctx->queue)
>>>> +       if (rq && rq->q == hctx->queue)
>>>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>>>         return true;
>>>>  }
>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>> index 05dfa3f270ae..cad4d2c26285 100644
>>>> --- a/block/blk-mq.c
>>>> +++ b/block/blk-mq.c
>>>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>
>>>> +struct mq_inflight {
>>>> +       struct hd_struct *part;
>>>> +       unsigned int inflight;
>>>> +};
>>>> +
>>>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>>>> +                                 struct request *rq, void *priv,
>>>> +                                 bool reserved)
>>>> +{
>>>> +       struct mq_inflight *mi = priv;
>>>> +
>>>> +       if (rq->part == mi->part &&
>>>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>>>> +               mi->inflight++;
>>>> +}
>>>> +
>>>> +unsigned long part_in_flight_mq(struct request_queue *q,
>>>> +                               struct hd_struct *part)
>>>> +{
>>>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>>>> +
>>>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>>>> +       return mi.inflight;
>>>> +}
>>>
>>> Compared with the totally percpu approach, this way might help 1:M or
>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>> each CPU(especially there are huge hw queues on a big system), :-(
>>
>> Not disagreeing with that, without having some mechanism to only
>> loop queues that have pending requests. That would be similar to the
>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>> doing, I like your pnode approach better. However, I'm still not fully
>> convinced that one per node is enough to get the scalability we need.
>>
>> Would be great if Brian could re-test with your updated patch, so we
>> know how it works for him at least.
> 
> I'll try running with both approaches today and see how they compare.

Focus on Ming's, a variant of that is the most likely path forward,
imho. It'd be great to do a quick run on mine as well, just to establish
how it compares to mainline, though.
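
For readers skimming the quoted proof of concept: it is trimmed before the
read-side caller, so purely as a hypothetical illustration (the helper below
is not part of the posted diff), the stats code would end up picking between
the legacy counters and the tag walk along these lines:

/*
 * Hypothetical sketch, not from the posted PoC: readers of the in-flight
 * count dispatch on whether the queue is driven by blk-mq.
 */
static unsigned long part_in_flight_any(struct request_queue *q,
					struct hd_struct *part)
{
	if (q->mq_ops)				/* blk-mq: count started tags */
		return part_in_flight_mq(q, part);

	return part_in_flight(part);		/* legacy path: summed counters */
}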


-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 14:08                 ` [dm-devel] " Jens Axboe
@ 2017-06-30 18:33                   ` Brian King
  2017-06-30 23:23                     ` Ming Lei
  2017-07-01  4:17                   ` Jens Axboe
  1 sibling, 1 reply; 35+ messages in thread
From: Brian King @ 2017-06-30 18:33 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 09:08 AM, Jens Axboe wrote:
>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>
>>> Not disagreeing with that, without having some mechanism to only
>>> loop queues that have pending requests. That would be similar to the
>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>> doing, I like your pnode approach better. However, I'm still not fully
>>> convinced that one per node is enough to get the scalability we need.
>>>
>>> Would be great if Brian could re-test with your updated patch, so we
>>> know how it works for him at least.
>>
>> I'll try running with both approaches today and see how they compare.
> 
> Focus on Ming's, a variant of that is the most likely path forward,
> imho. It'd be great to do a quick run on mine as well, just to establish
> how it compares to mainline, though.

On my initial runs, the one from you, Jens, appears to perform a bit better, although
both are a huge improvement from what I was seeing before.

I ran 4k random reads using fio to nullblk in two configurations on my 20 core
system with 4 NUMA nodes and 4-way SMT, so 80 logical CPUs. I ran both 80 threads
to a single null_blk as well as 80 threads to 80 null_block devices, so one thread
per null_blk. This is what I saw on this machine:

Using the Per node atomic change from Ming Lei
1 null_blk, 80 threads
iops=9376.5K

80 null_blk, 1 thread
iops=9523.5K


Using the alternate patch from Jens using the tags
1 null_blk, 80 threads
iops=9725.8K

80 null_blk, 1 thread
iops=9569.4K

It's interesting that with this change the single device, 80 threads scenario
actually got better than the 80 null_blk scenario. I'll try on a larger machine
as well. I've got a 32 core machine I can try this on too. Next week I can
work with our performance team on running this on a system with a bunch of nvme
devices so we can then test the disk partition case as well and see if there is
any noticeable overhead.

Thanks,

Brian


-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 18:33                   ` Brian King
@ 2017-06-30 23:23                     ` Ming Lei
  2017-06-30 23:26                       ` Jens Axboe
  0 siblings, 1 reply; 35+ messages in thread
From: Ming Lei @ 2017-06-30 23:23 UTC (permalink / raw)
  To: Brian King
  Cc: Jens Axboe, linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

Hi Brian,

On Sat, Jul 1, 2017 at 2:33 AM, Brian King <brking@linux.vnet.ibm.com> wrote:
> On 06/30/2017 09:08 AM, Jens Axboe wrote:
>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>
>>>> Not disagreeing with that, without having some mechanism to only
>>>> loop queues that have pending requests. That would be similar to the
>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>> convinced that one per node is enough to get the scalability we need.
>>>>
>>>> Would be great if Brian could re-test with your updated patch, so we
>>>> know how it works for him at least.
>>>
>>> I'll try running with both approaches today and see how they compare.
>>
>> Focus on Ming's, a variant of that is the most likely path forward,
>> imho. It'd be great to do a quick run on mine as well, just to establish
>> how it compares to mainline, though.
>
> On my initial runs, the one from you Jens, appears to perform a bit better, although
> both are a huge improvement from what I was seeing before.
>
> I ran 4k random reads using fio to nullblk in two configurations on my 20 core
> system with 4 NUMA nodes and 4-way SMT, so 80 logical CPUs. I ran both 80 threads
> to a single null_blk as well as 80 threads to 80 null_block devices, so one thread

Could you share what the '80 null_block devices' is?  It means you
create 80 null_blk
devices? Or you create one null_blk and make its hw queue number as 80
via module
parameter of "submit_queues"?

I guess we should focus on multi-queue case since it is the normal way of NVMe.

> per null_blk. This is what I saw on this machine:
>
> Using the Per node atomic change from Ming Lei
> 1 null_blk, 80 threads
> iops=9376.5K
>
> 80 null_blk, 1 thread
> iops=9523.5K
>
>
> Using the alternate patch from Jens using the tags
> 1 null_blk, 80 threads
> iops=9725.8K
>
> 80 null_blk, 1 thread
> iops=9569.4K

If 1 thread means single fio job, looks the number is too too high, that means
one random IO can complete in about 0.1us(100ns) on single CPU, not sure if it
is possible, :-)
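
(For reference, that figure is just the inverted throughput if the ~9.57M
IOPS result is read as coming from a single job: 1 / (9.57 * 10^6 IO/s) is
roughly 104 ns, i.e. about 0.1 us per IO.)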


Thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 23:23                     ` Ming Lei
@ 2017-06-30 23:26                       ` Jens Axboe
  2017-07-01  2:18                         ` Brian King
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-06-30 23:26 UTC (permalink / raw)
  To: Ming Lei, Brian King
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 05:23 PM, Ming Lei wrote:
> Hi Brian,
> 
> On Sat, Jul 1, 2017 at 2:33 AM, Brian King <brking@linux.vnet.ibm.com> wrote:
>> On 06/30/2017 09:08 AM, Jens Axboe wrote:
>>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>>
>>>>> Not disagreeing with that, without having some mechanism to only
>>>>> loop queues that have pending requests. That would be similar to the
>>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>>> convinced that one per node is enough to get the scalability we need.
>>>>>
>>>>> Would be great if Brian could re-test with your updated patch, so we
>>>>> know how it works for him at least.
>>>>
>>>> I'll try running with both approaches today and see how they compare.
>>>
>>> Focus on Ming's, a variant of that is the most likely path forward,
>>> imho. It'd be great to do a quick run on mine as well, just to establish
>>> how it compares to mainline, though.
>>
>> On my initial runs, the one from you, Jens, appears to perform a bit better, although
>> both are a huge improvement from what I was seeing before.
>>
>> I ran 4k random reads using fio to nullblk in two configurations on my 20 core
>> system with 4 NUMA nodes and 4-way SMT, so 80 logical CPUs. I ran both 80 threads
>> to a single null_blk as well as 80 threads to 80 null_block devices, so one thread
> 
> Could you share what the '80 null_block devices' is?  It means you
> create 80 null_blk
> devices? Or you create one null_blk and make its hw queue number as 80
> via module
> parameter of "submit_queues"?

That's a valid question, was going to ask that too. But I assumed that Brian
used submit_queues to set as many queues as he has logical CPUs in the system.
> 
> I guess we should focus on multi-queue case since it is the normal way of NVMe.
> 
>> per null_blk. This is what I saw on this machine:
>>
>> Using the Per node atomic change from Ming Lei
>> 1 null_blk, 80 threads
>> iops=9376.5K
>>
>> 80 null_blk, 1 thread
>> iops=9523.5K
>>
>>
>> Using the alternate patch from Jens using the tags
>> 1 null_blk, 80 threads
>> iops=9725.8K
>>
>> 80 null_blk, 1 thread
>> iops=9569.4K
> 
> If 1 thread means single fio job, looks the number is too too high, that means
> one random IO can complete in about 0.1us(100ns) on single CPU, not sure if it
> is possible, :-)

It means either 1 null_blk device, 80 threads running IO to it. Or 80 null_blk
devices, each with a thread running IO to it. See above, he details that it's
80 threads on 80 devices for that case.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 23:26                       ` Jens Axboe
@ 2017-07-01  2:18                         ` Brian King
  2017-07-04  1:20                           ` Ming Lei
  0 siblings, 1 reply; 35+ messages in thread
From: Brian King @ 2017-07-01  2:18 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 06:26 PM, Jens Axboe wrote:
> On 06/30/2017 05:23 PM, Ming Lei wrote:
>> Hi Bian,
>>
>> On Sat, Jul 1, 2017 at 2:33 AM, Brian King <brking@linux.vnet.ibm.com> wrote:
>>> On 06/30/2017 09:08 AM, Jens Axboe wrote:
>>>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>>>
>>>>>> Not disagreeing with that, without having some mechanism to only
>>>>>> loop queues that have pending requests. That would be similar to the
>>>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>>>> convinced that one per node is enough to get the scalability we need.
>>>>>>
>>>>>> Would be great if Brian could re-test with your updated patch, so we
>>>>>> know how it works for him at least.
>>>>>
>>>>> I'll try running with both approaches today and see how they compare.
>>>>
>>>> Focus on Ming's, a variant of that is the most likely path forward,
>>>> imho. It'd be great to do a quick run on mine as well, just to establish
>>>> how it compares to mainline, though.
>>>
>>> On my initial runs, the one from you, Jens, appears to perform a bit better, although
>>> both are a huge improvement from what I was seeing before.
>>>
>>> I ran 4k random reads using fio to nullblk in two configurations on my 20 core
>>> system with 4 NUMA nodes and 4-way SMT, so 80 logical CPUs. I ran both 80 threads
>>> to a single null_blk as well as 80 threads to 80 null_block devices, so one thread
>>
>> Could you share what the '80 null_block devices' is?  It means you
>> create 80 null_blk
>> devices? Or you create one null_blk and make its hw queue number as 80
>> via module
>> parameter of "submit_queues"?
> 
> That's a valid question, was going to ask that too. But I assumed that Brian
> used submit_queues to set as many queues as he has logical CPUs in the system.
>>
>> I guess we should focus on multi-queue case since it is the normal way of NVMe.
>>
>>> per null_blk. This is what I saw on this machine:
>>>
>>> Using the Per node atomic change from Ming Lei
>>> 1 null_blk, 80 threads
>>> iops=9376.5K
>>>
>>> 80 null_blk, 1 thread
>>> iops=9523.5K
>>>
>>>
>>> Using the alternate patch from Jens using the tags
>>> 1 null_blk, 80 threads
>>> iops=9725.8K
>>>
>>> 80 null_blk, 1 thread
>>> iops=9569.4K
>>
>> If 1 thread means single fio job, looks the number is too too high, that means
>> one random IO can complete in about 0.1us(100ns) on single CPU, not sure if it
>> is possible, :-)
> 
> It means either 1 null_blk device, 80 threads running IO to it. Or 80 null_blk
> devices, each with a thread running IO to it. See above, he details that it's
> 80 threads on 80 devices for that case.

Right. So the two modes I'm running in are:

1. 1 null_blk device, with 80 submit_queues, 80 fio jobs, 80 logical CPUs.
2. 80 null_blk devices, each with one submit_queue, with one fio job per null_blk device,
   so 80 threads total. 80 logical CPUs

In theory, the two should result in similar numbers. 
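
(Both setups drive 80 hardware queues with one fio job each; the single
device case just funnels all of them through one gendisk's shared
accounting, which is what makes it the interesting one for the in_flight
counters being discussed.)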

Here are the commands and fio configurations:

Scenario #1
modprobe null_blk submit_queues=80 nr_devices=1 irqmode=0

FIO config:
[global]
buffered=0
invalidate=1
bs=4k
iodepth=64
numjobs=80
group_reporting=1
rw=randrw
rwmixread=100
rwmixwrite=0
ioengine=libaio
runtime=60
time_based

[job1]
filename=/dev/nullb0


Scenario #2
modprobe null_blk submit_queues=1 nr_devices=80 irqmode=0

FIO config
[global]
buffered=0
invalidate=1
bs=4k
iodepth=64
numjobs=1
group_reporting=1
rw=randrw
rwmixread=100
rwmixwrite=0
ioengine=libaio
runtime=60
time_based

[job1]
filename=/dev/nullb0
[job2]
filename=/dev/nullb1
[job3]
filename=/dev/nullb2
[job4]
filename=/dev/nullb3
[job5]
filename=/dev/nullb4
[job6]
filename=/dev/nullb5
[job7]
filename=/dev/nullb6
[job8]
filename=/dev/nullb7
[job9]
filename=/dev/nullb8
[job10]
filename=/dev/nullb9
[job11]
filename=/dev/nullb10
[job12]
filename=/dev/nullb11
[job13]
filename=/dev/nullb12
[job14]
filename=/dev/nullb13
[job15]
filename=/dev/nullb14
[job16]
filename=/dev/nullb15
[job17]
filename=/dev/nullb16
[job18]
filename=/dev/nullb17
[job19]
filename=/dev/nullb18
[job20]
filename=/dev/nullb19
[job21]
filename=/dev/nullb20
[job22]
filename=/dev/nullb21
[job23]
filename=/dev/nullb22
[job24]
filename=/dev/nullb23
[job25]
filename=/dev/nullb24
[job26]
filename=/dev/nullb25
[job27]
filename=/dev/nullb26
[job28]
filename=/dev/nullb27
[job29]
filename=/dev/nullb28
[job30]
filename=/dev/nullb29
[job31]
filename=/dev/nullb30
[job32]
filename=/dev/nullb31
[job33]
filename=/dev/nullb32
[job34]
filename=/dev/nullb33
[job35]
filename=/dev/nullb34
[job36]
filename=/dev/nullb35
[job37]
filename=/dev/nullb36
[job38]
filename=/dev/nullb37
[job39]
filename=/dev/nullb38
[job40]
filename=/dev/nullb39
[job41]
filename=/dev/nullb40
[job42]
filename=/dev/nullb41
[job43]
filename=/dev/nullb42
[job44]
filename=/dev/nullb43
[job45]
filename=/dev/nullb44
[job46]
filename=/dev/nullb45
[job47]
filename=/dev/nullb46
[job48]
filename=/dev/nullb47
[job49]
filename=/dev/nullb48
[job50]
filename=/dev/nullb49
[job51]
filename=/dev/nullb50
[job52]
filename=/dev/nullb51
[job53]
filename=/dev/nullb52
[job54]
filename=/dev/nullb53
[job55]
filename=/dev/nullb54
[job56]
filename=/dev/nullb55
[job57]
filename=/dev/nullb56
[job58]
filename=/dev/nullb57
[job59]
filename=/dev/nullb58
[job60]
filename=/dev/nullb59
[job61]
filename=/dev/nullb60
[job62]
filename=/dev/nullb61
[job63]
filename=/dev/nullb62
[job64]
filename=/dev/nullb63
[job65]
filename=/dev/nullb64
[job66]
filename=/dev/nullb65
[job67]
filename=/dev/nullb66
[job68]
filename=/dev/nullb67
[job69]
filename=/dev/nullb68
[job70]
filename=/dev/nullb69
[job71]
filename=/dev/nullb70
[job72]
filename=/dev/nullb71
[job73]
filename=/dev/nullb72
[job74]
filename=/dev/nullb73
[job75]
filename=/dev/nullb74
[job76]
filename=/dev/nullb75
[job77]
filename=/dev/nullb76
[job78]
filename=/dev/nullb77
[job79]
filename=/dev/nullb78
[job80]
filename=/dev/nullb79





-Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-06-30 14:08                 ` [dm-devel] " Jens Axboe
  2017-06-30 18:33                   ` Brian King
@ 2017-07-01  4:17                   ` Jens Axboe
  2017-07-01  4:59                     ` Jens Axboe
  1 sibling, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-07-01  4:17 UTC (permalink / raw)
  To: Brian King, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 08:08 AM, Jens Axboe wrote:
> On 06/30/2017 07:05 AM, Brian King wrote:
>> On 06/29/2017 09:17 PM, Jens Axboe wrote:
>>> On 06/29/2017 07:20 PM, Ming Lei wrote:
>>>> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>>>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>>>>>>>
>>>>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>>>>> that it means that reading the part stat inflight count now has to
>>>>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>>>>> latencies), it was a net loss.
>>>>>>>>
>>>>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>>>>> recovered basically if per numa-node counter is introduced and
>>>>>>>> used in this case, but the patch was never posted out.
>>>>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>>>>> block tree and post out. I guess the performance issue might be
>>>>>>>> related with system cache coherency implementation more or less.
>>>>>>>> This issue on ARM64 can be observed with the following userspace
>>>>>>>> atomic counting test too:
>>>>>>>>
>>>>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>>>>
>>>>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>>>>> go far enough. And per CPU is too much. One potential improvement would
>>>>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>>>>> ensure the stats are sane. Often there's a huge difference between
>>>>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>>>>
>>>>>>> Outside of coming up with a more clever data structure that is fully
>>>>>>> CPU topology aware, one thing that could work is just having X cache
>>>>>>> line separated read/write inflight counters per node, where X is some
>>>>>>> suitable value (like 4). That prevents us from having cross node
>>>>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>>>>> provide a nice balance between cost of incrementing the inflight
>>>>>>> counting, and the cost of looping for reading it.
>>>>>>>
>>>>>>> And that brings me to the next part...
>>>>>>>
>>>>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>>>>> counter.
>>>>>>>>
>>>>>>>> Yeah, it won't be an issue for non-mq path, and for blk-mq path, maybe
>>>>>>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>>>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>>>>> perfect solution, and looks it is a bit hard, :-)
>>>>>>>
>>>>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>>>>> do that. The inflight count could be easily retrieved with
>>>>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>>>>> write counters, and the weight would obviously only get us the total
>>>>>>> count. But we can have a slower path for that, just iterate the tags and
>>>>>>> count them. The fast path only cares about total count.
>>>>>>>
>>>>>>> Let me try that out real quick.
>>>>>>
>>>>>> Well, that only works for whole disk stats, of course... There's no way
>>>>>> around iterating the tags and checking for this to truly work.
>>>>>
>>>>> Totally untested proof of concept for using the tags for this. I based
>>>>> this on top of Brian's patch, so it includes his patch plus the
>>>>> _double() stuff I did which is no longer really needed.
>>>>>
>>>>>
>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>> index 9cf98b29588a..ec99d9ba0f33 100644
>>>>> --- a/block/bio.c
>>>>> +++ b/block/bio.c
>>>>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>>>>         part_round_stats(cpu, part);
>>>>>         part_stat_inc(cpu, part, ios[rw]);
>>>>>         part_stat_add(cpu, part, sectors[rw], sectors);
>>>>> -       part_inc_in_flight(part, rw);
>>>>> +       part_inc_in_flight(cpu, part, rw);
>>>>>
>>>>>         part_stat_unlock();
>>>>>  }
>>>>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>>>>
>>>>>         part_stat_add(cpu, part, ticks[rw], duration);
>>>>>         part_round_stats(cpu, part);
>>>>> -       part_dec_in_flight(part, rw);
>>>>> +       part_dec_in_flight(cpu, part, rw);
>>>>>
>>>>>         part_stat_unlock();
>>>>>  }
>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>> index af393d5a9680..6ab2efbe940b 100644
>>>>> --- a/block/blk-core.c
>>>>> +++ b/block/blk-core.c
>>>>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>>>>
>>>>>                 part_stat_inc(cpu, part, ios[rw]);
>>>>>                 part_stat_add(cpu, part, ticks[rw], duration);
>>>>> -               part_round_stats(cpu, part);
>>>>> -               part_dec_in_flight(part, rw);
>>>>> +
>>>>> +               if (req->q->mq_ops)
>>>>> +                       part_round_stats_mq(req->q, cpu, part);
>>>>> +               else {
>>>>> +                       part_round_stats(cpu, part);
>>>>> +                       part_dec_in_flight(cpu, part, rw);
>>>>> +               }
>>>>>
>>>>>                 hd_struct_put(part);
>>>>>                 part_stat_unlock();
>>>>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>>>>                         part = &rq->rq_disk->part0;
>>>>>                         hd_struct_get(part);
>>>>>                 }
>>>>> -               part_round_stats(cpu, part);
>>>>> -               part_inc_in_flight(part, rw);
>>>>> +               if (rq->q->mq_ops)
>>>>> +                       part_round_stats_mq(rq->q, cpu, part);
>>>>> +               else {
>>>>> +                       part_round_stats(cpu, part);
>>>>> +                       part_inc_in_flight(cpu, part, rw);
>>>>> +               }
>>>>>                 rq->part = part;
>>>>>         }
>>>>>
>>>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>>>> index 99038830fb42..3b5eb2d4b964 100644
>>>>> --- a/block/blk-merge.c
>>>>> +++ b/block/blk-merge.c
>>>>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>>>>                 part = req->part;
>>>>>
>>>>>                 part_round_stats(cpu, part);
>>>>> -               part_dec_in_flight(part, rq_data_dir(req));
>>>>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>>>>
>>>>>                 hd_struct_put(part);
>>>>>                 part_stat_unlock();
>>>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>>>> index d0be72ccb091..a7b897740c47 100644
>>>>> --- a/block/blk-mq-tag.c
>>>>> +++ b/block/blk-mq-tag.c
>>>>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>>>                 bitnr += tags->nr_reserved_tags;
>>>>>         rq = tags->rqs[bitnr];
>>>>>
>>>>> -       if (rq->q == hctx->queue)
>>>>> +       if (rq && rq->q == hctx->queue)
>>>>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>>>>         return true;
>>>>>  }
>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>> index 05dfa3f270ae..cad4d2c26285 100644
>>>>> --- a/block/blk-mq.c
>>>>> +++ b/block/blk-mq.c
>>>>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>>
>>>>> +struct mq_inflight {
>>>>> +       struct hd_struct *part;
>>>>> +       unsigned int inflight;
>>>>> +};
>>>>> +
>>>>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>>>>> +                                 struct request *rq, void *priv,
>>>>> +                                 bool reserved)
>>>>> +{
>>>>> +       struct mq_inflight *mi = priv;
>>>>> +
>>>>> +       if (rq->part == mi->part &&
>>>>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>>>>> +               mi->inflight++;
>>>>> +}
>>>>> +
>>>>> +unsigned long part_in_flight_mq(struct request_queue *q,
>>>>> +                               struct hd_struct *part)
>>>>> +{
>>>>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>>>>> +
>>>>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>>>>> +       return mi.inflight;
>>>>> +}
>>>>
>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>> N:M mappings, but won't help the 1:1 mapping (NVMe), where an hctx is
>>>> mapped to each CPU (especially when there are huge numbers of hw queues
>>>> on a big system), :-(
>>>
>>> Not disagreeing with that, without having some mechanism to only
>>> loop queues that have pending requests. That would be similar to the
>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>> doing, I like your pnode approach better. However, I'm still not fully
>>> convinced that one per node is enough to get the scalability we need.
>>>
>>> Would be great if Brian could re-test with your updated patch, so we
>>> know how it works for him at least.
>>
>> I'll try running with both approaches today and see how they compare.
> 
> Focus on Ming's, a variant of that is the most likely path forward,
> imho. It'd be great to do a quick run on mine as well, just to establish
> how it compares to mainline, though.

Here's a cleaned up series:

http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight

(it's against mainline for now, I will update it to be against
for-4.13/block in a rebase).

One optimization I want to do on top of this is to iterate only once,
even for a partition - pass in both parts and increment two separate
counts. If we collapse the two per-part time stamps, that's doable.
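
To make that concrete, the single-pass version would look something like
the below - an untested sketch against the proof of concept earlier in the
thread, so take the names as illustrative rather than what ends up in the
branch:

struct mq_inflight {
	struct hd_struct *part;
	unsigned int *inflight;
};

static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
				  struct request *rq, void *priv,
				  bool reserved)
{
	struct mq_inflight *mi = priv;

	if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		return;

	/*
	 * inflight[0] counts the partition we were asked about,
	 * inflight[1] counts the whole device. One walk of the tags
	 * fills both, so a partition no longer costs a second pass.
	 */
	if (rq->part == mi->part)
		mi->inflight[0]++;
	if (mi->part->partno)
		mi->inflight[1]++;
}

void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part,
		      unsigned int inflight[2])
{
	struct mq_inflight mi = { .part = part, .inflight = inflight };

	inflight[0] = inflight[1] = 0;
	blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
}

The nice property is that reading a partition's count then costs exactly the
same as reading the whole-disk count: one walk of the busy tags.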

Essentially this series makes the inc/dec a noop, since we don't have to
do anything. The reading is basically no worse than a cpu online
iteration, since we never have more queues than online CPUs. That's an
improvement over per-cpu for-each-possible loops. For a lot of cases,
it's much less, since we have fewer queues than CPUs. I'll need an hour
or two to hone this a bit more, but then it would be great if you can
retest. I'll send out an email when that's done, it'll be some time over
this weekend.
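
For reference, the part_round_stats_mq() called in the proof of concept isn't
in the hunks I pasted. It would essentially be part_round_stats_single() with
the in-flight count taken from the tags instead of the per-part counters -
roughly this (sketch only, the branch has the real version):

static void part_round_stats_single_mq(struct request_queue *q, int cpu,
				       struct hd_struct *part,
				       unsigned long now)
{
	unsigned long inflight;

	if (now == part->stamp)
		return;

	/* the in-flight count comes from the tags, not part->in_flight */
	inflight = part_in_flight_mq(q, part);
	if (inflight) {
		__part_stat_add(cpu, part, time_in_queue,
				inflight * (now - part->stamp));
		__part_stat_add(cpu, part, io_ticks, (now - part->stamp));
	}
	part->stamp = now;
}

void part_round_stats_mq(struct request_queue *q, int cpu,
			 struct hd_struct *part)
{
	part_round_stats_single_mq(q, cpu, part, jiffies);
}

The only difference from part_round_stats_single() is where the in-flight
count comes from.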

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-01  4:17                   ` Jens Axboe
@ 2017-07-01  4:59                     ` Jens Axboe
  2017-07-01 16:43                       ` Jens Axboe
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-07-01  4:59 UTC (permalink / raw)
  To: Brian King, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 10:17 PM, Jens Axboe wrote:
> On 06/30/2017 08:08 AM, Jens Axboe wrote:
>> On 06/30/2017 07:05 AM, Brian King wrote:
>>> On 06/29/2017 09:17 PM, Jens Axboe wrote:
>>>> On 06/29/2017 07:20 PM, Ming Lei wrote:
>>>>> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>>>>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>>>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4M IO/s.
>>>>>>>>>>
>>>>>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>>>>>> that it means that reading the part stat inflight count now has to
>>>>>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>>>>>> latencies), it was a net loss.
>>>>>>>>>
>>>>>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>>>>>> recovered basically if per numa-node counter is introduced and
>>>>>>>>> used in this case, but the patch was never posted out.
>>>>>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>>>>>> block tree and post out. I guess the performance issue might be
>>>>>>>>> related with system cache coherency implementation more or less.
>>>>>>>>> This issue on ARM64 can be observed with the following userspace
>>>>>>>>> atomic counting test too:
>>>>>>>>>
>>>>>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>>>>>
>>>>>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>>>>>> go far enough. And per CPU is too much. One potential improvement would
>>>>>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>>>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>>>>>> ensure the stats are sane. Often there's a huge difference between
>>>>>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>>>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>>>>>
>>>>>>>> Outside of coming up with a more clever data structure that is fully
>>>>>>>> CPU topology aware, one thing that could work is just having X cache
>>>>>>>> line separated read/write inflight counters per node, where X is some
>>>>>>>> suitable value (like 4). That prevents us from having cross node
>>>>>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>>>>>> provide a nice balance between cost of incrementing the inflight
>>>>>>>> counting, and the cost of looping for reading it.
>>>>>>>>
>>>>>>>> And that brings me to the next part...
>>>>>>>>
>>>>>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>>>>>> counter.
>>>>>>>>>
>>>>>>>>> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
>>>>>>>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>>>>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>>>>>> perfect solution, and looks it is a bit hard, :-)
>>>>>>>>
>>>>>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>>>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>>>>>> do that. The inlight count could be easily retrieved with
>>>>>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>>>>>> write counters, and the weight would obviously only get us the total
>>>>>>>> count. But we can have a slower path for that, just iterate the tags and
>>>>>>>> count them. The fast path only cares about total count.
>>>>>>>>
>>>>>>>> Let me try that out real quick.
>>>>>>>
>>>>>>> Well, that only works for whole disk stats, of course... There's no way
>>>>>>> around iterating the tags and checking for this to truly work.
>>>>>>
>>>>>> Totally untested proof of concept for using the tags for this. I based
>>>>>> this on top of Brian's patch, so it includes his patch plus the
>>>>>> _double() stuff I did which is no longer really needed.
>>>>>>
>>>>>>
>>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>>> index 9cf98b29588a..ec99d9ba0f33 100644
>>>>>> --- a/block/bio.c
>>>>>> +++ b/block/bio.c
>>>>>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>>>>>         part_round_stats(cpu, part);
>>>>>>         part_stat_inc(cpu, part, ios[rw]);
>>>>>>         part_stat_add(cpu, part, sectors[rw], sectors);
>>>>>> -       part_inc_in_flight(part, rw);
>>>>>> +       part_inc_in_flight(cpu, part, rw);
>>>>>>
>>>>>>         part_stat_unlock();
>>>>>>  }
>>>>>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>>>>>
>>>>>>         part_stat_add(cpu, part, ticks[rw], duration);
>>>>>>         part_round_stats(cpu, part);
>>>>>> -       part_dec_in_flight(part, rw);
>>>>>> +       part_dec_in_flight(cpu, part, rw);
>>>>>>
>>>>>>         part_stat_unlock();
>>>>>>  }
>>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>>> index af393d5a9680..6ab2efbe940b 100644
>>>>>> --- a/block/blk-core.c
>>>>>> +++ b/block/blk-core.c
>>>>>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>>>>>
>>>>>>                 part_stat_inc(cpu, part, ios[rw]);
>>>>>>                 part_stat_add(cpu, part, ticks[rw], duration);
>>>>>> -               part_round_stats(cpu, part);
>>>>>> -               part_dec_in_flight(part, rw);
>>>>>> +
>>>>>> +               if (req->q->mq_ops)
>>>>>> +                       part_round_stats_mq(req->q, cpu, part);
>>>>>> +               else {
>>>>>> +                       part_round_stats(cpu, part);
>>>>>> +                       part_dec_in_flight(cpu, part, rw);
>>>>>> +               }
>>>>>>
>>>>>>                 hd_struct_put(part);
>>>>>>                 part_stat_unlock();
>>>>>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>>>>>                         part = &rq->rq_disk->part0;
>>>>>>                         hd_struct_get(part);
>>>>>>                 }
>>>>>> -               part_round_stats(cpu, part);
>>>>>> -               part_inc_in_flight(part, rw);
>>>>>> +               if (rq->q->mq_ops)
>>>>>> +                       part_round_stats_mq(rq->q, cpu, part);
>>>>>> +               else {
>>>>>> +                       part_round_stats(cpu, part);
>>>>>> +                       part_inc_in_flight(cpu, part, rw);
>>>>>> +               }
>>>>>>                 rq->part = part;
>>>>>>         }
>>>>>>
>>>>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>>>>> index 99038830fb42..3b5eb2d4b964 100644
>>>>>> --- a/block/blk-merge.c
>>>>>> +++ b/block/blk-merge.c
>>>>>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>>>>>                 part = req->part;
>>>>>>
>>>>>>                 part_round_stats(cpu, part);
>>>>>> -               part_dec_in_flight(part, rq_data_dir(req));
>>>>>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>>>>>
>>>>>>                 hd_struct_put(part);
>>>>>>                 part_stat_unlock();
>>>>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>>>>> index d0be72ccb091..a7b897740c47 100644
>>>>>> --- a/block/blk-mq-tag.c
>>>>>> +++ b/block/blk-mq-tag.c
>>>>>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>>>>                 bitnr += tags->nr_reserved_tags;
>>>>>>         rq = tags->rqs[bitnr];
>>>>>>
>>>>>> -       if (rq->q == hctx->queue)
>>>>>> +       if (rq && rq->q == hctx->queue)
>>>>>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>>>>>         return true;
>>>>>>  }
>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>> index 05dfa3f270ae..cad4d2c26285 100644
>>>>>> --- a/block/blk-mq.c
>>>>>> +++ b/block/blk-mq.c
>>>>>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>>>
>>>>>> +struct mq_inflight {
>>>>>> +       struct hd_struct *part;
>>>>>> +       unsigned int inflight;
>>>>>> +};
>>>>>> +
>>>>>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>>>>>> +                                 struct request *rq, void *priv,
>>>>>> +                                 bool reserved)
>>>>>> +{
>>>>>> +       struct mq_inflight *mi = priv;
>>>>>> +
>>>>>> +       if (rq->part == mi->part &&
>>>>>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>>>>>> +               mi->inflight++;
>>>>>> +}
>>>>>> +
>>>>>> +unsigned long part_in_flight_mq(struct request_queue *q,
>>>>>> +                               struct hd_struct *part)
>>>>>> +{
>>>>>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>>>>>> +
>>>>>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>>>>>> +       return mi.inflight;
>>>>>> +}
>>>>>
>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>
>>>> Not disagreeing with that, without having some mechanism to only
>>>> loop queues that have pending requests. That would be similar to the
>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>> convinced that one per node is enough to get the scalability we need.
>>>>
>>>> Would be great if Brian could re-test with your updated patch, so we
>>>> know how it works for him at least.
>>>
>>> I'll try running with both approaches today and see how they compare.
>>
>> Focus on Ming's, a variant of that is the most likely path forward,
>> imho. It'd be great to do a quick run on mine as well, just to establish
>> how it compares to mainline, though.
> 
> Here's a cleaned up series:
> 
> http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight
> 
> (it's against mainline for now, I will update it to be against
> for-4.13/block in a rebase).
> 
> One optimization on top of this I want to do is to only iterate once,
> even for a partition - pass in both parts, and increment two different
> counts. If we collapse the two part time stamps, then that's doable, and
> it means we only have to iterate once.
> 
> Essentially this series makes the inc/dec a noop, since we don't have to
> do anything. The reading is basically no worse than a cpu online
> iteration, since we never have more queues than online CPUs. That's an
> improvement over per-cpu for-each-possible loops. For a lot of cases,
> it's much less, since we have fewer queues than CPUs. I'll need an hour
> or two to hone this a bit more, but then it would be great if you can
> retest. I'll send out an email when that's done, it'll be some time over
> this weekend.

Did the double-read with one iteration change, it was pretty trivial:

http://git.kernel.dk/cgit/linux-block/commit/?h=mq-inflight&id=b841804f826df072f706ae86d0eb533342f0297a

And updated the branch here:

http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight

to include that, and be based on top of for-4.13/block. If you prefer just
pulling a branch, pull:

git://git.kernel.dk/linux-block mq-inflight
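
To show what that commit buys us, the diskstats read side then reduces to
roughly the following - a sketch using the two-count blk_mq_in_flight() I
sketched earlier in the thread; the helper name and call site are
illustrative, the branch is authoritative:

static void part_get_inflight(struct gendisk *gp, struct hd_struct *hd,
			      unsigned int inflight[2])
{
	if (gp->queue->mq_ops) {
		/* one walk of the tags fills partition and whole-disk counts */
		blk_mq_in_flight(gp->queue, hd, inflight);
	} else {
		/* the legacy path still reads the per-part counters directly */
		inflight[0] = part_in_flight(hd);
		inflight[1] = part_in_flight(&part_to_disk(hd)->part0);
	}
}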

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-01  4:59                     ` Jens Axboe
@ 2017-07-01 16:43                       ` Jens Axboe
  2017-07-04 20:55                         ` Brian King
  0 siblings, 1 reply; 35+ messages in thread
From: Jens Axboe @ 2017-07-01 16:43 UTC (permalink / raw)
  To: Brian King, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On 06/30/2017 10:59 PM, Jens Axboe wrote:
> On 06/30/2017 10:17 PM, Jens Axboe wrote:
>> On 06/30/2017 08:08 AM, Jens Axboe wrote:
>>> On 06/30/2017 07:05 AM, Brian King wrote:
>>>> On 06/29/2017 09:17 PM, Jens Axboe wrote:
>>>>> On 06/29/2017 07:20 PM, Ming Lei wrote:
>>>>>> On Fri, Jun 30, 2017 at 2:42 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>> On 06/29/2017 10:00 AM, Jens Axboe wrote:
>>>>>>>> On 06/29/2017 09:58 AM, Jens Axboe wrote:
>>>>>>>>> On 06/29/2017 02:40 AM, Ming Lei wrote:
>>>>>>>>>> On Thu, Jun 29, 2017 at 5:49 AM, Jens Axboe <axboe@kernel.dk> wrote:
>>>>>>>>>>> On 06/28/2017 03:12 PM, Brian King wrote:
>>>>>>>>>>>> This patch converts the in_flight counter in struct hd_struct from a
>>>>>>>>>>>> pair of atomics to a pair of percpu counters. This eliminates a couple
>>>>>>>>>>>> of atomics from the hot path. When running this on a Power system, to
>>>>>>>>>>>> a single null_blk device with 80 submission queues, irq mode 0, with
>>>>>>>>>>>> 80 fio jobs, I saw IOPs go from 1.5M IO/s to 11.4 IO/s.
>>>>>>>>>>>
>>>>>>>>>>> This has been done before, but I've never really liked it. The reason is
>>>>>>>>>>> that it means that reading the part stat inflight count now has to
>>>>>>>>>>> iterate over every possible CPU. Did you use partitions in your testing?
>>>>>>>>>>> How many CPUs were configured? When I last tested this a few years ago
>>>>>>>>>>> on even a quad core nehalem (which is notoriously shitty for cross-node
>>>>>>>>>>> latencies), it was a net loss.
>>>>>>>>>>
>>>>>>>>>> One year ago, I saw null_blk's IOPS can be decreased to 10%
>>>>>>>>>> of non-RQF_IO_STAT on a dual socket ARM64(each CPU has
>>>>>>>>>> 96 cores, and dual numa nodes) too, the performance can be
>>>>>>>>>> recovered basically if per numa-node counter is introduced and
>>>>>>>>>> used in this case, but the patch was never posted out.
>>>>>>>>>> If anyone is interested in that, I can rebase the patch on current
>>>>>>>>>> block tree and post out. I guess the performance issue might be
>>>>>>>>>> related with system cache coherency implementation more or less.
>>>>>>>>>> This issue on ARM64 can be observed with the following userspace
>>>>>>>>>> atomic counting test too:
>>>>>>>>>>
>>>>>>>>>>        http://kernel.ubuntu.com/~ming/test/cache/
>>>>>>>>>
>>>>>>>>> How well did the per-node thing work? Doesn't seem to me like it would
>>>>>>>>> go far enough. And per CPU is too much. One potential improvement would
>>>>>>>>> be to change the part_stat_read() to just loop online CPUs, instead of
>>>>>>>>> all possible CPUs. When CPUs go on/offline, use that as the slow path to
>>>>>>>>> ensure the stats are sane. Often there's a huge difference between
>>>>>>>>> NR_CPUS configured and what the system has. As Brian states, RH ships
>>>>>>>>> with 2048, while I doubt a lot of customers actually run that...
>>>>>>>>>
>>>>>>>>> Outside of coming up with a more clever data structure that is fully
>>>>>>>>> CPU topology aware, one thing that could work is just having X cache
>>>>>>>>> line separated read/write inflight counters per node, where X is some
>>>>>>>>> suitable value (like 4). That prevents us from having cross node
>>>>>>>>> traffic, and it also keeps the cross cpu traffic fairly low. That should
>>>>>>>>> provide a nice balance between cost of incrementing the inflight
>>>>>>>>> counting, and the cost of looping for reading it.
>>>>>>>>>
>>>>>>>>> And that brings me to the next part...
>>>>>>>>>
>>>>>>>>>>> I do agree that we should do something about it, and it's one of those
>>>>>>>>>>> items I've highlighted in talks about blk-mq on pending issues to fix
>>>>>>>>>>> up. It's just not great as it currently stands, but I don't think per
>>>>>>>>>>> CPU counters is the right way to fix it, at least not for the inflight
>>>>>>>>>>> counter.
>>>>>>>>>>
>>>>>>>>>> Yeah, it won't be a issue for non-mq path, and for blk-mq path, maybe
>>>>>>>>>> we can use some blk-mq knowledge(tagset?) to figure out the
>>>>>>>>>> 'in_flight' counter. I thought about it before, but never got a
>>>>>>>>>> perfect solution, and looks it is a bit hard, :-)
>>>>>>>>>
>>>>>>>>> The tags are already a bit spread out, so it's worth a shot. That would
>>>>>>>>> remove the need to do anything in the inc/dec path, as the tags already
>>>>>>>>> do that. The inlight count could be easily retrieved with
>>>>>>>>> sbitmap_weight(). The only issue here is that we need separate read and
>>>>>>>>> write counters, and the weight would obviously only get us the total
>>>>>>>>> count. But we can have a slower path for that, just iterate the tags and
>>>>>>>>> count them. The fast path only cares about total count.
>>>>>>>>>
>>>>>>>>> Let me try that out real quick.
>>>>>>>>
>>>>>>>> Well, that only works for whole disk stats, of course... There's no way
>>>>>>>> around iterating the tags and checking for this to truly work.
>>>>>>>
>>>>>>> Totally untested proof of concept for using the tags for this. I based
>>>>>>> this on top of Brian's patch, so it includes his patch plus the
>>>>>>> _double() stuff I did which is no longer really needed.
>>>>>>>
>>>>>>>
>>>>>>> diff --git a/block/bio.c b/block/bio.c
>>>>>>> index 9cf98b29588a..ec99d9ba0f33 100644
>>>>>>> --- a/block/bio.c
>>>>>>> +++ b/block/bio.c
>>>>>>> @@ -1737,7 +1737,7 @@ void generic_start_io_acct(int rw, unsigned long sectors,
>>>>>>>         part_round_stats(cpu, part);
>>>>>>>         part_stat_inc(cpu, part, ios[rw]);
>>>>>>>         part_stat_add(cpu, part, sectors[rw], sectors);
>>>>>>> -       part_inc_in_flight(part, rw);
>>>>>>> +       part_inc_in_flight(cpu, part, rw);
>>>>>>>
>>>>>>>         part_stat_unlock();
>>>>>>>  }
>>>>>>> @@ -1751,7 +1751,7 @@ void generic_end_io_acct(int rw, struct hd_struct *part,
>>>>>>>
>>>>>>>         part_stat_add(cpu, part, ticks[rw], duration);
>>>>>>>         part_round_stats(cpu, part);
>>>>>>> -       part_dec_in_flight(part, rw);
>>>>>>> +       part_dec_in_flight(cpu, part, rw);
>>>>>>>
>>>>>>>         part_stat_unlock();
>>>>>>>  }
>>>>>>> diff --git a/block/blk-core.c b/block/blk-core.c
>>>>>>> index af393d5a9680..6ab2efbe940b 100644
>>>>>>> --- a/block/blk-core.c
>>>>>>> +++ b/block/blk-core.c
>>>>>>> @@ -2434,8 +2434,13 @@ void blk_account_io_done(struct request *req)
>>>>>>>
>>>>>>>                 part_stat_inc(cpu, part, ios[rw]);
>>>>>>>                 part_stat_add(cpu, part, ticks[rw], duration);
>>>>>>> -               part_round_stats(cpu, part);
>>>>>>> -               part_dec_in_flight(part, rw);
>>>>>>> +
>>>>>>> +               if (req->q->mq_ops)
>>>>>>> +                       part_round_stats_mq(req->q, cpu, part);
>>>>>>> +               else {
>>>>>>> +                       part_round_stats(cpu, part);
>>>>>>> +                       part_dec_in_flight(cpu, part, rw);
>>>>>>> +               }
>>>>>>>
>>>>>>>                 hd_struct_put(part);
>>>>>>>                 part_stat_unlock();
>>>>>>> @@ -2492,8 +2497,12 @@ void blk_account_io_start(struct request *rq, bool new_io)
>>>>>>>                         part = &rq->rq_disk->part0;
>>>>>>>                         hd_struct_get(part);
>>>>>>>                 }
>>>>>>> -               part_round_stats(cpu, part);
>>>>>>> -               part_inc_in_flight(part, rw);
>>>>>>> +               if (rq->q->mq_ops)
>>>>>>> +                       part_round_stats_mq(rq->q, cpu, part);
>>>>>>> +               else {
>>>>>>> +                       part_round_stats(cpu, part);
>>>>>>> +                       part_inc_in_flight(cpu, part, rw);
>>>>>>> +               }
>>>>>>>                 rq->part = part;
>>>>>>>         }
>>>>>>>
>>>>>>> diff --git a/block/blk-merge.c b/block/blk-merge.c
>>>>>>> index 99038830fb42..3b5eb2d4b964 100644
>>>>>>> --- a/block/blk-merge.c
>>>>>>> +++ b/block/blk-merge.c
>>>>>>> @@ -634,7 +634,7 @@ static void blk_account_io_merge(struct request *req)
>>>>>>>                 part = req->part;
>>>>>>>
>>>>>>>                 part_round_stats(cpu, part);
>>>>>>> -               part_dec_in_flight(part, rq_data_dir(req));
>>>>>>> +               part_dec_in_flight(cpu, part, rq_data_dir(req));
>>>>>>>
>>>>>>>                 hd_struct_put(part);
>>>>>>>                 part_stat_unlock();
>>>>>>> diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
>>>>>>> index d0be72ccb091..a7b897740c47 100644
>>>>>>> --- a/block/blk-mq-tag.c
>>>>>>> +++ b/block/blk-mq-tag.c
>>>>>>> @@ -214,7 +214,7 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
>>>>>>>                 bitnr += tags->nr_reserved_tags;
>>>>>>>         rq = tags->rqs[bitnr];
>>>>>>>
>>>>>>> -       if (rq->q == hctx->queue)
>>>>>>> +       if (rq && rq->q == hctx->queue)
>>>>>>>                 iter_data->fn(hctx, rq, iter_data->data, reserved);
>>>>>>>         return true;
>>>>>>>  }
>>>>>>> diff --git a/block/blk-mq.c b/block/blk-mq.c
>>>>>>> index 05dfa3f270ae..cad4d2c26285 100644
>>>>>>> --- a/block/blk-mq.c
>>>>>>> +++ b/block/blk-mq.c
>>>>>>> @@ -43,6 +43,58 @@ static LIST_HEAD(all_q_list);
>>>>>>>  static void blk_mq_poll_stats_start(struct request_queue *q);
>>>>>>>  static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
>>>>>>>
>>>>>>> +struct mq_inflight {
>>>>>>> +       struct hd_struct *part;
>>>>>>> +       unsigned int inflight;
>>>>>>> +};
>>>>>>> +
>>>>>>> +static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
>>>>>>> +                                 struct request *rq, void *priv,
>>>>>>> +                                 bool reserved)
>>>>>>> +{
>>>>>>> +       struct mq_inflight *mi = priv;
>>>>>>> +
>>>>>>> +       if (rq->part == mi->part &&
>>>>>>> +           test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
>>>>>>> +               mi->inflight++;
>>>>>>> +}
>>>>>>> +
>>>>>>> +unsigned long part_in_flight_mq(struct request_queue *q,
>>>>>>> +                               struct hd_struct *part)
>>>>>>> +{
>>>>>>> +       struct mq_inflight mi = { .part = part, .inflight = 0 };
>>>>>>> +
>>>>>>> +       blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
>>>>>>> +       return mi.inflight;
>>>>>>> +}
>>>>>>
>>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>>
>>>>> Not disagreeing with that, without having some mechanism to only
>>>>> loop queues that have pending requests. That would be similar to the
>>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>>> convinced that one per node is enough to get the scalability we need.
>>>>>
>>>>> Would be great if Brian could re-test with your updated patch, so we
>>>>> know how it works for him at least.
>>>>
>>>> I'll try running with both approaches today and see how they compare.
>>>
>>> Focus on Ming's, a variant of that is the most likely path forward,
>>> imho. It'd be great to do a quick run on mine as well, just to establish
>>> how it compares to mainline, though.
>>
>> Here's a cleaned up series:
>>
>> http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight
>>
>> (it's against mainline for now, I will update it to be against
>> for-4.13/block in a rebase).
>>
>> One optimization on top of this I want to do is to only iterate once,
>> even for a partition - pass in both parts, and increment two different
>> counts. If we collapse the two part time stamps, then that's doable, and
>> it means we only have to iterate once.
>>
>> Essentially this series makes the inc/dec a noop, since we don't have to
>> do anything. The reading is basically no worse than a cpu online
>> iteration, since we never have more queues than online CPUs. That's an
>> improvement over per-cpu for-each-possible loops. For a lot of cases,
>> it's much less, since we have fewer queues than CPUs. I'll need an hour
>> or two to hone this a bit more, but then it would be great if you can
>> retest. I'll send out an email when that's done, it'll be some time over
>> this weekend.
> 
> Did the double-read with one iteration change, it was pretty trivial:
> 
> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-inflight&id=b841804f826df072f706ae86d0eb533342f0297a

Now:

http://git.kernel.dk/cgit/linux-block/commit/?h=mq-inflight&id=87f73ef2b9edb6834001df8f7cb48c7a116e8cd3

> And updated the branch here:
> 
> http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight
> 
> to include that, and be based on top of for-4.13/block. If you prefer just
> pulling a branch, pull:
> 
> git://git.kernel.dk/linux-block mq-inflight

I've tested it, and made a few adjustments and fixes. The branches are
the same, just rebased. Works fine for me, even with partitions now.

Let me know how it works for you.
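
One note on the partition handling: where we need the read/write split rather
than a single total (the sysfs 'inflight' file, for instance), the same tag
walk can simply bucket by request direction. Something along these lines - a
sketch, not the literal code in the branch:

struct mq_inflight_rw {
	struct hd_struct *part;
	unsigned int inflight[2];	/* [0] = READ, [1] = WRITE */
};

static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
				     struct request *rq, void *priv,
				     bool reserved)
{
	struct mq_inflight_rw *mi = priv;

	if (rq->part == mi->part &&
	    test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
		mi->inflight[rq_data_dir(rq)]++;
}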

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-01  2:18                         ` Brian King
@ 2017-07-04  1:20                           ` Ming Lei
  2017-07-04 20:58                             ` Brian King
  0 siblings, 1 reply; 35+ messages in thread
From: Ming Lei @ 2017-07-04  1:20 UTC (permalink / raw)
  To: Brian King
  Cc: Jens Axboe, linux-block, open list:DEVICE-MAPPER (LVM),
	Alasdair Kergon, Mike Snitzer

On Sat, Jul 1, 2017 at 10:18 AM, Brian King <brking@linux.vnet.ibm.com> wrote:
> On 06/30/2017 06:26 PM, Jens Axboe wrote:
>> On 06/30/2017 05:23 PM, Ming Lei wrote:
>>> Hi Brian,
>>>
>>> On Sat, Jul 1, 2017 at 2:33 AM, Brian King <brking@linux.vnet.ibm.com> wrote:
>>>> On 06/30/2017 09:08 AM, Jens Axboe wrote:
>>>>>>>> Compared with the totally percpu approach, this way might help 1:M or
>>>>>>>> N:M mapping, but won't help 1:1 map(NVMe), when hctx is mapped to
>>>>>>>> each CPU(especially there are huge hw queues on a big system), :-(
>>>>>>>
>>>>>>> Not disagreeing with that, without having some mechanism to only
>>>>>>> loop queues that have pending requests. That would be similar to the
>>>>>>> ctx_map for sw to hw queues. But I don't think that would be worthwhile
>>>>>>> doing, I like your pnode approach better. However, I'm still not fully
>>>>>>> convinced that one per node is enough to get the scalability we need.
>>>>>>>
>>>>>>> Would be great if Brian could re-test with your updated patch, so we
>>>>>>> know how it works for him at least.
>>>>>>
>>>>>> I'll try running with both approaches today and see how they compare.
>>>>>
>>>>> Focus on Ming's, a variant of that is the most likely path forward,
>>>>> imho. It'd be great to do a quick run on mine as well, just to establish
>>>>> how it compares to mainline, though.
>>>>
>>>> On my initial runs, the one from you Jens, appears to perform a bit better, although
>>>> both are a huge improvement from what I was seeing before.
>>>>
>>>> I ran 4k random reads using fio to nullblk in two configurations on my 20 core
>>>> system with 4 NUMA nodes and 4-way SMT, so 80 logical CPUs. I ran both 80 threads
>>>> to a single null_blk as well as 80 threads to 80 null_block devices, so one thread
>>>
>>> Could you share what the '80 null_block devices' means? Do you create 80
>>> null_blk devices, or do you create one null_blk and set its hw queue count
>>> to 80 via the 'submit_queues' module parameter?
>>
>> That's a valid question, was going to ask that too. But I assumed that Brian
>> used submit_queues to set as many queues as he has logical CPUs in the system.
>>>
>>> I guess we should focus on multi-queue case since it is the normal way of NVMe.
>>>
>>>> per null_blk. This is what I saw on this machine:
>>>>
>>>> Using the Per node atomic change from Ming Lei
>>>> 1 null_blk, 80 threads
>>>> iops=9376.5K
>>>>
>>>> 80 null_blk, 1 thread
>>>> iops=9523.5K
>>>>
>>>>
>>>> Using the alternate patch from Jens using the tags
>>>> 1 null_blk, 80 threads
>>>> iops=9725.8K
>>>>
>>>> 80 null_blk, 1 thread
>>>> iops=9569.4K
>>>
>>> If 1 thread means a single fio job, the number looks far too high - it would
>>> mean one random IO completes in about 0.1us (100ns) on a single CPU; not sure
>>> that is possible, :-)
>>
>> It means either 1 null_blk device, 80 threads running IO to it. Or 80 null_blk
>> devices, each with a thread running IO to it. See above, he details that it's
>> 80 threads on 80 devices for that case.
>
> Right. So the two modes I'm running in are:
>
> 1. 1 null_blk device, with 80 submit_queues, 80 fio jobs, 80 logical CPUs.
> 2. 80 null_blk devices, each with one submit_queue, with one fio job per null_blk device,
>    so 80 threads total. 80 logical CPUs
>
> In theory, the two should result in similar numbers.
>
> Here are the commands and fio configurations:
>
> Scenario #1
> modprobe null_blk submit_queues=80 nr_devices=1 irqmode=0
>
> FIO config:
> [global]
> buffered=0
> invalidate=1
> bs=4k
> iodepth=64
> numjobs=80
> group_reporting=1
> rw=randrw
> rwmixread=100
> rwmixwrite=0
> ioengine=libaio
> runtime=60
> time_based
>
> [job1]
> filename=/dev/nullb0
>
>
> Scenario #2
> modprobe null_blk submit_queues=1 nr_devices=80 irqmode=0
>
> FIO config
> [global]
> buffered=0
> invalidate=1
> bs=4k
> iodepth=64
> numjobs=1
> group_reporting=1
> rw=randrw
> rwmixread=100
> rwmixwrite=0
> ioengine=libaio
> runtime=60
> time_based
>
> [job1]
> filename=/dev/nullb0
> [job2]
> filename=/dev/nullb1
> [job3]
> filename=/dev/nullb2
> [job4]
> filename=/dev/nullb3
> [job5]
> filename=/dev/nullb4
> [job6]
> filename=/dev/nullb5
> [job7]
> filename=/dev/nullb6
> [job8]
> filename=/dev/nullb7
> [job9]
> filename=/dev/nullb8
> [job10]
> filename=/dev/nullb9
> [job11]
> filename=/dev/nullb10
> [job12]
> filename=/dev/nullb11
> [job13]
> filename=/dev/nullb12
> [job14]
> filename=/dev/nullb13
> [job15]
> filename=/dev/nullb14
> [job16]
> filename=/dev/nullb15
> [job17]
> filename=/dev/nullb16
> [job18]
> filename=/dev/nullb17
> [job19]
> filename=/dev/nullb18
> [job20]
> filename=/dev/nullb19
> [job21]
> filename=/dev/nullb20
> [job22]
> filename=/dev/nullb21
> [job23]
> filename=/dev/nullb22
> [job24]
> filename=/dev/nullb23
> [job25]
> filename=/dev/nullb24
> [job26]
> filename=/dev/nullb25
> [job27]
> filename=/dev/nullb26
> [job28]
> filename=/dev/nullb27
> [job29]
> filename=/dev/nullb28
> [job30]
> filename=/dev/nullb29
> [job31]
> filename=/dev/nullb30
> [job32]
> filename=/dev/nullb31
> [job33]
> filename=/dev/nullb32
> [job34]
> filename=/dev/nullb33
> [job35]
> filename=/dev/nullb34
> [job36]
> filename=/dev/nullb35
> [job37]
> filename=/dev/nullb36
> [job38]
> filename=/dev/nullb37
> [job39]
> filename=/dev/nullb38
> [job40]
> filename=/dev/nullb39
> [job41]
> filename=/dev/nullb40
> [job42]
> filename=/dev/nullb41
> [job43]
> filename=/dev/nullb42
> [job44]
> filename=/dev/nullb43
> [job45]
> filename=/dev/nullb44
> [job46]
> filename=/dev/nullb45
> [job47]
> filename=/dev/nullb46
> [job48]
> filename=/dev/nullb47
> [job49]
> filename=/dev/nullb48
> [job50]
> filename=/dev/nullb49
> [job51]
> filename=/dev/nullb50
> [job52]
> filename=/dev/nullb51
> [job53]
> filename=/dev/nullb52
> [job54]
> filename=/dev/nullb53
> [job55]
> filename=/dev/nullb54
> [job56]
> filename=/dev/nullb55
> [job57]
> filename=/dev/nullb56
> [job58]
> filename=/dev/nullb57
> [job59]
> filename=/dev/nullb58
> [job60]
> filename=/dev/nullb59
> [job61]
> filename=/dev/nullb60
> [job62]
> filename=/dev/nullb61
> [job63]
> filename=/dev/nullb62
> [job64]
> filename=/dev/nullb63
> [job65]
> filename=/dev/nullb64
> [job66]
> filename=/dev/nullb65
> [job67]
> filename=/dev/nullb66
> [job68]
> filename=/dev/nullb67
> [job69]
> filename=/dev/nullb68
> [job70]
> filename=/dev/nullb69
> [job71]
> filename=/dev/nullb70
> [job72]
> filename=/dev/nullb71
> [job73]
> filename=/dev/nullb72
> [job74]
> filename=/dev/nullb73
> [job75]
> filename=/dev/nullb74
> [job76]
> filename=/dev/nullb75
> [job77]
> filename=/dev/nullb76
> [job78]
> filename=/dev/nullb77
> [job79]
> filename=/dev/nullb78
> [job80]
> filename=/dev/nullb79

IMO it is more reasonable to use a single null_blk with 80 queues, by
setting submit_queues to 80, than simply 80 null_blks. So I suggest switching
to testing the 80-queue case in your future tests.


thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-01 16:43                       ` Jens Axboe
@ 2017-07-04 20:55                         ` Brian King
  2017-07-04 21:57                           ` Jens Axboe
  0 siblings, 1 reply; 35+ messages in thread
From: Brian King @ 2017-07-04 20:55 UTC (permalink / raw)
  To: Jens Axboe, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 07/01/2017 11:43 AM, Jens Axboe wrote:
> Now:
> 
> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-inflight&id=87f73ef2b9edb6834001df8f7cb48c7a116e8cd3
> 
>> And updated the branch here:
>>
>> http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight
>>
>> to include that, and be based on top of for-4.13/block. If you prefer just
>> pulling a branch, pull:
>>
>> git://git.kernel.dk/linux-block mq-inflight
> 
> I've tested it, and made a few adjustments and fixes. The branches are
> the same, just rebased. Works fine for me, even with partitions now.
> 
> Let me know how it works for you.

This is looking very nice. This is what I'm seeing now:

1 null_blk device / 80 submit_queues / fio numjobs=80
9.4M IOPs

80 null_blk devices / 1 submit_queue per null_blk / fio numjobs=1 (1 per null_blk)
9.3M IOPs

1 null_blk device / 160 submit_queues / fio numjobs=160
9.8M IOPs

160 null_blk devices / 1 submit_queue per null_blk / fio numjobs=1 (1 per null_blk)
8.9M IOPs

What is queued in mq-inflight looks to resolve the issue I've been seeing.

Thanks!!!

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-04  1:20                           ` Ming Lei
@ 2017-07-04 20:58                             ` Brian King
  0 siblings, 0 replies; 35+ messages in thread
From: Brian King @ 2017-07-04 20:58 UTC (permalink / raw)
  To: Ming Lei
  Cc: Jens Axboe, linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 07/03/2017 08:20 PM, Ming Lei wrote:
>> Right. So the two modes I'm running in are:
>>
>> 1. 80 null_blk devices, each with one submit_queue, with one fio job per null_blk device,
>>    so 80 threads total. 80 logical CPUs
>> 2. 1 null_blk device, with 80 submit_queues, 80 fio jobs, 80 logical CPUs.
>>
>> In theory, the two should result in similar numbers.
>>
>> Here are the commands and fio configurations:
>>
>> Scenario #1
>> modprobe null_blk submit_queues=80 nr_devices=1 irqmode=0
>>
>> FIO config:
>> [global]
>> buffered=0
>> invalidate=1
>> bs=4k
>> iodepth=64
>> numjobs=80
>> group_reporting=1
>> rw=randrw
>> rwmixread=100
>> rwmixwrite=0
>> ioengine=libaio
>> runtime=60
>> time_based
>>
>> [job1]
>> filename=/dev/nullb0

> 
> IMO it should be more reasonable to use single null_blk with 80 queues
> via setting submit_queues as 80 than simply 80 null_blks.  So suggest to switch
> to test 80 queues in your future test.

That should be what my Scenario #1 covers. 

Thanks,

Brian

-- 
Brian King
Power Linux I/O
IBM Linux Technology Center

^ permalink raw reply	[flat|nested] 35+ messages in thread

* Re: [dm-devel] [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu
  2017-07-04 20:55                         ` Brian King
@ 2017-07-04 21:57                           ` Jens Axboe
  0 siblings, 0 replies; 35+ messages in thread
From: Jens Axboe @ 2017-07-04 21:57 UTC (permalink / raw)
  To: Brian King, Ming Lei
  Cc: linux-block, open list:DEVICE-MAPPER (LVM),
	Mike Snitzer, Alasdair Kergon

On 07/04/2017 02:55 PM, Brian King wrote:
> On 07/01/2017 11:43 AM, Jens Axboe wrote:
>> Now:
>>
>> http://git.kernel.dk/cgit/linux-block/commit/?h=mq-inflight&id=87f73ef2b9edb6834001df8f7cb48c7a116e8cd3
>>
>>> And updated the branch here:
>>>
>>> http://git.kernel.dk/cgit/linux-block/log/?h=mq-inflight
>>>
>>> to include that, and be based on top of for-4.13/block. If you prefer just
>>> pulling a branch, pull:
>>>
>>> git://git.kernel.dk/linux-block mq-inflight
>>
>> I've tested it, and made a few adjustments and fixes. The branches are
>> the same, just rebased. Works fine for me, even with partitions now.
>>
>> Let me know how it works for you.
> 
> This is looking very nice. This is what I'm seeing now:
> 
> 1 null_blk device / 80 submit_queues / fio numjobs=80
> 9.4M IOPs
> 
> 80 null_blk devices / 1 submit_queue per null_blk / fio numjobs=1 (1 per null_blk)
> 9.3M IOPs
> 
> 1 null_blk device / 160 submit_queues / fio numjobs=160
> 9.8M IOPs
> 
> 160 null_blk devices / 1 submit_queue per null_blk / fio numjobs=1 (1 per null_blk)
> 8.9M IOPs
> 
> What is queued in mq-inflight looks to resolve the issue I've been seeing.

Good to hear! I'll test some corner cases and post a "real" version
for review. Thanks for all your testing.

-- 
Jens Axboe

^ permalink raw reply	[flat|nested] 35+ messages in thread

end of thread, other threads:[~2017-07-04 21:57 UTC | newest]

Thread overview: 35+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2017-06-28 21:12 [PATCH 1/1] block: Convert hd_struct in_flight from atomic to percpu Brian King
2017-06-28 21:49 ` Jens Axboe
2017-06-28 21:49   ` Jens Axboe
2017-06-28 22:04   ` Brian King
2017-06-28 22:04     ` Brian King
2017-06-29  8:40   ` Ming Lei
2017-06-29 15:58     ` Jens Axboe
2017-06-29 16:00       ` Jens Axboe
2017-06-29 18:42         ` Jens Axboe
2017-06-30  1:20           ` Ming Lei
2017-06-30  2:17             ` Jens Axboe
2017-06-30 13:05               ` [dm-devel] " Brian King
2017-06-30 13:05                 ` Brian King
2017-06-30 14:08                 ` [dm-devel] " Jens Axboe
2017-06-30 18:33                   ` Brian King
2017-06-30 23:23                     ` Ming Lei
2017-06-30 23:26                       ` Jens Axboe
2017-07-01  2:18                         ` Brian King
2017-07-04  1:20                           ` Ming Lei
2017-07-04 20:58                             ` Brian King
2017-07-01  4:17                   ` Jens Axboe
2017-07-01  4:59                     ` Jens Axboe
2017-07-01 16:43                       ` Jens Axboe
2017-07-04 20:55                         ` Brian King
2017-07-04 21:57                           ` Jens Axboe
2017-06-29 16:25       ` Ming Lei
2017-06-29 17:31         ` Brian King
2017-06-30  1:08           ` Ming Lei
2017-06-30  1:08             ` Ming Lei
2017-06-28 21:54 ` Jens Axboe
2017-06-28 21:54   ` Jens Axboe
2017-06-28 21:59   ` Jens Axboe
2017-06-28 22:07     ` [dm-devel] " Brian King
2017-06-28 22:19       ` Jens Axboe
2017-06-29 12:59         ` Brian King
