linux-block.vger.kernel.org archive mirror
* [PATCH RFC] block: defer task/vm accounting until successful
From: Jens Axboe @ 2020-08-28  2:41 UTC
  To: linux-block

We currently increment the task/vm counts when we first attempt to queue a
bio. But this isn't necessarily correct - if the request allocation fails
with -EAGAIN, for example, and the caller retries, then we over-account
once per retry.

This can happen for polled IO, where we cannot wait for requests, so
retries can get aggressive if we're running out of requests. When that
happens, the IO rates reported in vmstat are incorrect, as every issue
attempt is counted as successful, potentially inflating the stats by
quite a lot.

Abstract out the accounting function, and move the blk-mq accounting into
blk_mq_submit_bio() once we know we got a request. For the non-mq path,
just do it when the bio is queued.

Signed-off-by: Jens Axboe <axboe@kernel.dk>

---

diff --git a/block/blk-core.c b/block/blk-core.c
index d9d632639bd1..aabd016faf79 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -28,7 +28,6 @@
 #include <linux/slab.h>
 #include <linux/swap.h>
 #include <linux/writeback.h>
-#include <linux/task_io_accounting_ops.h>
 #include <linux/fault-inject.h>
 #include <linux/list_sort.h>
 #include <linux/delay.h>
@@ -1113,6 +1112,8 @@ static blk_qc_t __submit_bio_noacct(struct bio *bio)
 	struct bio_list bio_list_on_stack[2];
 	blk_qc_t ret = BLK_QC_T_NONE;
 
+	blk_account_bio(bio);
+
 	BUG_ON(bio->bi_next);
 
 	bio_list_init(&bio_list_on_stack[0]);
@@ -1232,35 +1233,6 @@ blk_qc_t submit_bio(struct bio *bio)
 	if (blkcg_punt_bio_submit(bio))
 		return BLK_QC_T_NONE;
 
-	/*
-	 * If it's a regular read/write or a barrier with data attached,
-	 * go through the normal accounting stuff before submission.
-	 */
-	if (bio_has_data(bio)) {
-		unsigned int count;
-
-		if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
-			count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
-		else
-			count = bio_sectors(bio);
-
-		if (op_is_write(bio_op(bio))) {
-			count_vm_events(PGPGOUT, count);
-		} else {
-			task_io_account_read(bio->bi_iter.bi_size);
-			count_vm_events(PGPGIN, count);
-		}
-
-		if (unlikely(block_dump)) {
-			char b[BDEVNAME_SIZE];
-			printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
-			current->comm, task_pid_nr(current),
-				op_is_write(bio_op(bio)) ? "WRITE" : "READ",
-				(unsigned long long)bio->bi_iter.bi_sector,
-				bio_devname(bio, b), count);
-		}
-	}
-
 	/*
 	 * If we're reading data that is part of the userspace workingset, count
 	 * submission time as memory stall.  When the device is congested, or
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 0015a1892153..b77c66dfc19c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -27,6 +27,7 @@
 #include <linux/crash_dump.h>
 #include <linux/prefetch.h>
 #include <linux/blk-crypto.h>
+#include <linux/task_io_accounting_ops.h>
 
 #include <trace/events/block.h>
 
@@ -2111,6 +2112,39 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
 	}
 }
 
+void blk_account_bio(struct bio *bio)
+{
+	unsigned int count;
+
+	/*
+	 * If it's a regular read/write or a barrier with data attached,
+	 * go through the normal accounting stuff before submission.
+	 */
+	if (unlikely(!bio_has_data(bio)))
+		return;
+
+	if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
+		count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
+	else
+		count = bio_sectors(bio);
+
+	if (op_is_write(bio_op(bio))) {
+		count_vm_events(PGPGOUT, count);
+	} else {
+		task_io_account_read(bio->bi_iter.bi_size);
+		count_vm_events(PGPGIN, count);
+	}
+
+	if (unlikely(block_dump)) {
+		char b[BDEVNAME_SIZE];
+		printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
+			current->comm, task_pid_nr(current),
+			op_is_write(bio_op(bio)) ? "WRITE" : "READ",
+			(unsigned long long)bio->bi_iter.bi_sector,
+			bio_devname(bio, b), count);
+	}
+}
+
 /**
  * blk_mq_submit_bio - Create and send a request to block device.
  * @bio: Bio pointer.
@@ -2165,6 +2199,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
 		goto queue_exit;
 	}
 
+	blk_account_bio(bio);
 	trace_block_getrq(q, bio, bio->bi_opf);
 
 	rq_qos_track(q, rq, bio);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 863a2f3346d4..10e66e190fac 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -253,4 +253,6 @@ static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
 	return NULL;
 }
 
+void blk_account_bio(struct bio *bio);
+
 #endif
-- 
Jens Axboe
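
To make the over-accounting concrete, here is a toy userspace model of the
problem (not kernel code - the function names and the exhaustion model are
made up for illustration): accounting before an allocation that can fail
with -EAGAIN bumps the counter once per attempt, while the deferred scheme
bumps it once per IO.

#include <stdio.h>
#include <stdbool.h>

static unsigned long pgpgout;	/* stands in for the vmstat counter */

/* Pretend the tag pool is exhausted for the first three attempts. */
static bool alloc_request(int attempt)
{
	return attempt >= 3;
}

static int submit(int attempt, unsigned int sectors, bool account_early)
{
	if (account_early)
		pgpgout += sectors;	/* old scheme: bumped on every attempt */
	if (!alloc_request(attempt))
		return -1;		/* -EAGAIN, caller retries */
	if (!account_early)
		pgpgout += sectors;	/* new scheme: bumped once, on success */
	return 0;
}

int main(void)
{
	int attempt;

	pgpgout = 0;
	for (attempt = 0; submit(attempt, 8, true) != 0; attempt++)
		;
	printf("account per attempt: %lu sectors\n", pgpgout);	/* 32 */

	pgpgout = 0;
	for (attempt = 0; submit(attempt, 8, false) != 0; attempt++)
		;
	printf("account on success: %lu sectors\n", pgpgout);	/* 8 */
	return 0;
}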



* Re: [PATCH RFC] block: defer task/vm accounting until successful
From: Ming Lei @ 2020-08-28  7:48 UTC
  To: Jens Axboe; +Cc: linux-block

On Thu, Aug 27, 2020 at 08:41:30PM -0600, Jens Axboe wrote:
> [...]
> 
> @@ -2165,6 +2199,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>  		goto queue_exit;
>  	}
>  
> +	blk_account_bio(bio);

A bio merged into the plug list or by the I/O scheduler will not be
accounted this way.


Thanks,
Ming
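
For context, the merge paths Ming points at look roughly like this in the
v5.9-era blk_mq_submit_bio() (a simplified, non-compiling excerpt - locals
and error handling elided). Both goto queue_exit branches fire before a
request is allocated, so accounting placed after them never runs for a
merged bio:

blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
	/* ... splitting, integrity prep, etc ... */

	/* merged into a request already sitting on a task plug? */
	if (!is_flush_fua && !blk_queue_nomerges(q) &&
	    blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
		goto queue_exit;

	/* merged into a request held by the I/O scheduler? */
	if (blk_mq_sched_bio_merge(q, bio, nr_segs))
		goto queue_exit;

	rq = __blk_mq_alloc_request(&data);	/* only now do we hold a request */

	/* ... the patch calls blk_account_bio(bio) after this point ... */
}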



* Re: [PATCH RFC] block: defer task/vm accounting until successful
From: Jens Axboe @ 2020-08-28 14:24 UTC
  To: Ming Lei; +Cc: linux-block

On 8/28/20 1:48 AM, Ming Lei wrote:
> On Thu, Aug 27, 2020 at 08:41:30PM -0600, Jens Axboe wrote:
>> [...]
>> 
>> @@ -2165,6 +2199,7 @@ blk_qc_t blk_mq_submit_bio(struct bio *bio)
>>  		goto queue_exit;
>>  	}
>>  
>> +	blk_account_bio(bio);
> 
> A bio merged into the plug list or by the I/O scheduler will not be
> accounted this way.

Indeed, it's either one or the other... The only alternative I could
think of is to mark the bio as already accounted, and only do the
accounting on the first queue attempt. That might be a saner way to go.

-- 
Jens Axboe
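
A minimal sketch of the flag idea floated above (BIO_ACCOUNTED is a
hypothetical flag, invented here for illustration - no such bio flag
exists in this tree): blk_account_bio() becomes idempotent, so it can be
called from every queue attempt, including the merge paths, and still
account each bio exactly once.

void blk_account_bio(struct bio *bio)
{
	if (unlikely(!bio_has_data(bio)))
		return;

	/* BIO_ACCOUNTED is hypothetical: skip bios counted on a prior attempt */
	if (bio_flagged(bio, BIO_ACCOUNTED))
		return;
	bio_set_flag(bio, BIO_ACCOUNTED);

	/* ... existing task/vm accounting from the patch ... */
}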

