linux-kernel.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH] block: add io_error stat for block device
@ 2020-09-10  2:20 zhenwei pi
  2020-09-25  3:55 ` PING: " zhenwei pi
  2020-10-22  2:45 ` zhenwei pi
  0 siblings, 2 replies; 3+ messages in thread
From: zhenwei pi @ 2020-09-10  2:20 UTC (permalink / raw)
  To: axboe; +Cc: pizhenwei, linux-block, linux-kernel

Currently, when a block request fails, the block layer only prints a
rate-limited error message. A monitoring agent then has to parse the
kernel log to find out what happened.

This patch adds per-disk read/write/discard/flush error counters to
record I/O errors and exposes them through a new read-only "io_error"
sysfs attribute.

Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
---
 block/blk-core.c          | 14 +++++++++++---
 block/genhd.c             | 19 +++++++++++++++++++
 include/linux/part_stat.h |  1 +
 3 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/block/blk-core.c b/block/blk-core.c
index 10c08ac50697..8f1424835700 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error,
 		req->q->integrity.profile->complete_fn(req, nr_bytes);
 #endif
 
-	if (unlikely(error && !blk_rq_is_passthrough(req) &&
-		     !(req->rq_flags & RQF_QUIET)))
-		print_req_error(req, error, __func__);
+	if (unlikely(error && !blk_rq_is_passthrough(req))) {
+		if (op_is_flush(req_op(req)))
+			part_stat_inc(&req->rq_disk->part0,
+				io_errors[STAT_FLUSH]);
+		else
+			part_stat_inc(&req->rq_disk->part0,
+				io_errors[op_stat_group(req_op(req))]);
+
+		if (!(req->rq_flags & RQF_QUIET))
+			print_req_error(req, error, __func__);
+	}
 
 	blk_account_io_completion(req, nr_bytes);
 
diff --git a/block/genhd.c b/block/genhd.c
index 99c64641c314..852035095485 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
 			stat->sectors[group] += ptr->sectors[group];
 			stat->ios[group] += ptr->ios[group];
 			stat->merges[group] += ptr->merges[group];
+			stat->io_errors[group] += ptr->io_errors[group];
 		}
 
 		stat->io_ticks += ptr->io_ticks;
@@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
 	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
 }
 
+static ssize_t io_error_show(struct device *dev,
+		      struct device_attribute *attr, char *buf)
+{
+	struct hd_struct *p = dev_to_part(dev);
+	struct disk_stats stat;
+
+	part_stat_read_all(p, &stat);
+
+	return sprintf(buf,
+		"%8lu %8lu %8lu %8lu\n",
+		stat.io_errors[STAT_READ],
+		stat.io_errors[STAT_WRITE],
+		stat.io_errors[STAT_DISCARD],
+		stat.io_errors[STAT_FLUSH]);
+}
+
 static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
 static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
 static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
@@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
 static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
 static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
 static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
+static DEVICE_ATTR(io_error, 0444, io_error_show, NULL);
 
 #ifdef CONFIG_FAIL_MAKE_REQUEST
 ssize_t part_fail_show(struct device *dev,
@@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = {
 #ifdef CONFIG_FAIL_IO_TIMEOUT
 	&dev_attr_fail_timeout.attr,
 #endif
+	&dev_attr_io_error.attr,
 	NULL
 };
 
diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
index 24125778ef3e..4fe3836d2308 100644
--- a/include/linux/part_stat.h
+++ b/include/linux/part_stat.h
@@ -9,6 +9,7 @@ struct disk_stats {
 	unsigned long sectors[NR_STAT_GROUPS];
 	unsigned long ios[NR_STAT_GROUPS];
 	unsigned long merges[NR_STAT_GROUPS];
+	unsigned long io_errors[NR_STAT_GROUPS];
 	unsigned long io_ticks;
 	local_t in_flight[2];
 };
-- 
2.11.0


^ permalink raw reply related	[flat|nested] 3+ messages in thread

* PING: [PATCH] block: add io_error stat for block device
  2020-09-10  2:20 [PATCH] block: add io_error stat for block device zhenwei pi
@ 2020-09-25  3:55 ` zhenwei pi
  2020-10-22  2:45 ` zhenwei pi
  1 sibling, 0 replies; 3+ messages in thread
From: zhenwei pi @ 2020-09-25  3:55 UTC (permalink / raw)
  To: axboe; +Cc: linux-block, linux-kernel

Hi, Jens

What do you think about adding an error stat for block devices?

On 9/10/20 10:20 AM, zhenwei pi wrote:
> Currently if hitting block req error, block layer only prints error
> log with a rate limitation. Then agent has to parse kernel log to
> record what happens.
> 
> In this patch, add read/write/discard/flush stat counter to record
> io errors.
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>   block/blk-core.c          | 14 +++++++++++---
>   block/genhd.c             | 19 +++++++++++++++++++
>   include/linux/part_stat.h |  1 +
>   3 files changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 10c08ac50697..8f1424835700 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error,
>   		req->q->integrity.profile->complete_fn(req, nr_bytes);
>   #endif
>   
> -	if (unlikely(error && !blk_rq_is_passthrough(req) &&
> -		     !(req->rq_flags & RQF_QUIET)))
> -		print_req_error(req, error, __func__);
> +	if (unlikely(error && !blk_rq_is_passthrough(req))) {
> +		if (op_is_flush(req_op(req)))
> +			part_stat_inc(&req->rq_disk->part0,
> +				io_errors[STAT_FLUSH]);
> +		else
> +			part_stat_inc(&req->rq_disk->part0,
> +				io_errors[op_stat_group(req_op(req))]);
> +
> +		if (!(req->rq_flags & RQF_QUIET))
> +			print_req_error(req, error, __func__);
> +	}
>   
>   	blk_account_io_completion(req, nr_bytes);
>   
> diff --git a/block/genhd.c b/block/genhd.c
> index 99c64641c314..852035095485 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
>   			stat->sectors[group] += ptr->sectors[group];
>   			stat->ios[group] += ptr->ios[group];
>   			stat->merges[group] += ptr->merges[group];
> +			stat->io_errors[group] += ptr->io_errors[group];
>   		}
>   
>   		stat->io_ticks += ptr->io_ticks;
> @@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
>   	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
>   }
>   
> +static ssize_t io_error_show(struct device *dev,
> +		      struct device_attribute *attr, char *buf)
> +{
> +	struct hd_struct *p = dev_to_part(dev);
> +	struct disk_stats stat;
> +
> +	part_stat_read_all(p, &stat);
> +
> +	return sprintf(buf,
> +		"%8lu %8lu %8lu %8lu\n",
> +		stat.io_errors[STAT_READ],
> +		stat.io_errors[STAT_WRITE],
> +		stat.io_errors[STAT_DISCARD],
> +		stat.io_errors[STAT_FLUSH]);
> +}
> +
>   static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
>   static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
>   static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
> @@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
>   static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
>   static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
>   static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
> +static DEVICE_ATTR(io_error, 0444, io_error_show, NULL);
>   
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   ssize_t part_fail_show(struct device *dev,
> @@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = {
>   #ifdef CONFIG_FAIL_IO_TIMEOUT
>   	&dev_attr_fail_timeout.attr,
>   #endif
> +	&dev_attr_io_error.attr,
>   	NULL
>   };
>   
> diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
> index 24125778ef3e..4fe3836d2308 100644
> --- a/include/linux/part_stat.h
> +++ b/include/linux/part_stat.h
> @@ -9,6 +9,7 @@ struct disk_stats {
>   	unsigned long sectors[NR_STAT_GROUPS];
>   	unsigned long ios[NR_STAT_GROUPS];
>   	unsigned long merges[NR_STAT_GROUPS];
> +	unsigned long io_errors[NR_STAT_GROUPS];
>   	unsigned long io_ticks;
>   	local_t in_flight[2];
>   };
> 

-- 
zhenwei pi

^ permalink raw reply	[flat|nested] 3+ messages in thread

* PING: [PATCH] block: add io_error stat for block device
  2020-09-10  2:20 [PATCH] block: add io_error stat for block device zhenwei pi
  2020-09-25  3:55 ` PING: " zhenwei pi
@ 2020-10-22  2:45 ` zhenwei pi
  1 sibling, 0 replies; 3+ messages in thread
From: zhenwei pi @ 2020-10-22  2:45 UTC (permalink / raw)
  To: axboe; +Cc: linux-block, linux-kernel

Hi, Jens

What do you think about this? Is adding an I/O error stat for block
devices reasonable?

On 9/10/20 10:20 AM, zhenwei pi wrote:
> Currently if hitting block req error, block layer only prints error
> log with a rate limitation. Then agent has to parse kernel log to
> record what happens.
> 
> In this patch, add read/write/discard/flush stat counter to record
> io errors.
> 
> Signed-off-by: zhenwei pi <pizhenwei@bytedance.com>
> ---
>   block/blk-core.c          | 14 +++++++++++---
>   block/genhd.c             | 19 +++++++++++++++++++
>   include/linux/part_stat.h |  1 +
>   3 files changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/block/blk-core.c b/block/blk-core.c
> index 10c08ac50697..8f1424835700 100644
> --- a/block/blk-core.c
> +++ b/block/blk-core.c
> @@ -1558,9 +1558,17 @@ bool blk_update_request(struct request *req, blk_status_t error,
>   		req->q->integrity.profile->complete_fn(req, nr_bytes);
>   #endif
>   
> -	if (unlikely(error && !blk_rq_is_passthrough(req) &&
> -		     !(req->rq_flags & RQF_QUIET)))
> -		print_req_error(req, error, __func__);
> +	if (unlikely(error && !blk_rq_is_passthrough(req))) {
> +		if (op_is_flush(req_op(req)))
> +			part_stat_inc(&req->rq_disk->part0,
> +				io_errors[STAT_FLUSH]);
> +		else
> +			part_stat_inc(&req->rq_disk->part0,
> +				io_errors[op_stat_group(req_op(req))]);
> +
> +		if (!(req->rq_flags & RQF_QUIET))
> +			print_req_error(req, error, __func__);
> +	}
>   
>   	blk_account_io_completion(req, nr_bytes);
>   
> diff --git a/block/genhd.c b/block/genhd.c
> index 99c64641c314..852035095485 100644
> --- a/block/genhd.c
> +++ b/block/genhd.c
> @@ -104,6 +104,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
>   			stat->sectors[group] += ptr->sectors[group];
>   			stat->ios[group] += ptr->ios[group];
>   			stat->merges[group] += ptr->merges[group];
> +			stat->io_errors[group] += ptr->io_errors[group];
>   		}
>   
>   		stat->io_ticks += ptr->io_ticks;
> @@ -1374,6 +1375,22 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
>   	return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
>   }
>   
> +static ssize_t io_error_show(struct device *dev,
> +		      struct device_attribute *attr, char *buf)
> +{
> +	struct hd_struct *p = dev_to_part(dev);
> +	struct disk_stats stat;
> +
> +	part_stat_read_all(p, &stat);
> +
> +	return sprintf(buf,
> +		"%8lu %8lu %8lu %8lu\n",
> +		stat.io_errors[STAT_READ],
> +		stat.io_errors[STAT_WRITE],
> +		stat.io_errors[STAT_DISCARD],
> +		stat.io_errors[STAT_FLUSH]);
> +}
> +
>   static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
>   static DEVICE_ATTR(ext_range, 0444, disk_ext_range_show, NULL);
>   static DEVICE_ATTR(removable, 0444, disk_removable_show, NULL);
> @@ -1386,6 +1403,7 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
>   static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
>   static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
>   static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
> +static DEVICE_ATTR(io_error, 0444, io_error_show, NULL);
>   
>   #ifdef CONFIG_FAIL_MAKE_REQUEST
>   ssize_t part_fail_show(struct device *dev,
> @@ -1437,6 +1455,7 @@ static struct attribute *disk_attrs[] = {
>   #ifdef CONFIG_FAIL_IO_TIMEOUT
>   	&dev_attr_fail_timeout.attr,
>   #endif
> +	&dev_attr_io_error.attr,
>   	NULL
>   };
>   
> diff --git a/include/linux/part_stat.h b/include/linux/part_stat.h
> index 24125778ef3e..4fe3836d2308 100644
> --- a/include/linux/part_stat.h
> +++ b/include/linux/part_stat.h
> @@ -9,6 +9,7 @@ struct disk_stats {
>   	unsigned long sectors[NR_STAT_GROUPS];
>   	unsigned long ios[NR_STAT_GROUPS];
>   	unsigned long merges[NR_STAT_GROUPS];
> +	unsigned long io_errors[NR_STAT_GROUPS];
>   	unsigned long io_ticks;
>   	local_t in_flight[2];
>   };
> 

-- 
zhenwei pi

^ permalink raw reply	[flat|nested] 3+ messages in thread

end of thread, other threads:[~2020-10-22  2:45 UTC | newest]

Thread overview: 3+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2020-09-10  2:20 [PATCH] block: add io_error stat for block device zhenwei pi
2020-09-25  3:55 ` PING: " zhenwei pi
2020-10-22  2:45 ` zhenwei pi

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).