linux-block.vger.kernel.org archive mirror
 help / color / mirror / Atom feed
* [PATCH 0/2] loop: Better discard for block devices
@ 2018-10-30 23:06 Evan Green
  2018-10-30 23:06 ` [PATCH 1/2] loop: Report EOPNOTSUPP properly Evan Green
                   ` (2 more replies)
  0 siblings, 3 replies; 23+ messages in thread
From: Evan Green @ 2018-10-30 23:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Gwendal Grignou, Alexis Savery, Evan Green, linux-block, linux-kernel

This series addresses some errors seen when using the loop
device directly backed by a block device. The first change plumbs
out the correct error message, and the second change prevents the
error from occurring in many cases.


Evan Green (2):
  loop: Report EOPNOTSUPP properly
  loop: Better discard support for block devices

 drivers/block/loop.c | 75 ++++++++++++++++++++++++++++++++++++----------------
 1 file changed, 52 insertions(+), 23 deletions(-)

-- 
2.16.4

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 1/2] loop: Report EOPNOTSUPP properly
  2018-10-30 23:06 [PATCH 0/2] loop: Better discard for block devices Evan Green
@ 2018-10-30 23:06 ` Evan Green
  2018-11-28  1:06   ` Ming Lei
  2018-10-30 23:06 ` [PATCH 2/2] loop: Better discard support for block devices Evan Green
  2018-10-30 23:50 ` [PATCH 0/2] loop: Better discard " Bart Van Assche
  2 siblings, 1 reply; 23+ messages in thread
From: Evan Green @ 2018-10-30 23:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Gwendal Grignou, Alexis Savery, Evan Green, linux-block, linux-kernel

Properly plumb out EOPNOTSUPP from loop driver operations, which may
get returned when for instance a discard operation is attempted but not
supported by the underlying block device. Before this change, everything
was reported in the log as an I/O error, which is scary and not
helpful in debugging.

Signed-off-by: Evan Green <evgreen@chromium.org>
---

 drivers/block/loop.c | 14 +++++++++++---
 1 file changed, 11 insertions(+), 3 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index abad6d15f9563..28990fc94841a 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -458,8 +458,13 @@ static void lo_complete_rq(struct request *rq)
 
 	if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
 	    req_op(rq) != REQ_OP_READ) {
-		if (cmd->ret < 0)
-			ret = BLK_STS_IOERR;
+		if (cmd->ret < 0) {
+			if (cmd->ret == -EOPNOTSUPP)
+				ret = BLK_STS_NOTSUPP;
+			else
+				ret = BLK_STS_IOERR;
+		}
+
 		goto end_io;
 	}
 
@@ -1788,7 +1793,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
  failed:
 	/* complete non-aio request */
 	if (!cmd->use_aio || ret) {
-		cmd->ret = ret ? -EIO : 0;
+		if (ret == -EOPNOTSUPP)
+			cmd->ret = ret;
+		else
+			cmd->ret = ret ? -EIO : 0;
 		blk_mq_complete_request(rq);
 	}
 }
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* [PATCH 2/2] loop: Better discard support for block devices
  2018-10-30 23:06 [PATCH 0/2] loop: Better discard for block devices Evan Green
  2018-10-30 23:06 ` [PATCH 1/2] loop: Report EOPNOTSUPP properly Evan Green
@ 2018-10-30 23:06 ` Evan Green
  2018-11-26 18:53   ` Evan Green
  2018-11-28  1:26   ` Ming Lei
  2018-10-30 23:50 ` [PATCH 0/2] loop: Better discard " Bart Van Assche
  2 siblings, 2 replies; 23+ messages in thread
From: Evan Green @ 2018-10-30 23:06 UTC (permalink / raw)
  To: Jens Axboe
  Cc: Gwendal Grignou, Alexis Savery, Evan Green, linux-block, linux-kernel

If the backing device for a loop device is a block device,
then mirror the discard properties of the underlying block
device into the loop device. While in there, differentiate
between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
different for block devices, but which the loop device had
just been lumping together.

Signed-off-by: Evan Green <evgreen@chromium.org>
---

 drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
 1 file changed, 41 insertions(+), 20 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 28990fc94841a..176e65101c4ef 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
 	return ret;
 }
 
-static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
+static int lo_discard(struct loop_device *lo, struct request *rq,
+		int mode, loff_t pos)
 {
-	/*
-	 * We use punch hole to reclaim the free space used by the
-	 * image a.k.a. discard. However we do not support discard if
-	 * encryption is enabled, because it may give an attacker
-	 * useful information.
-	 */
 	struct file *file = lo->lo_backing_file;
-	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+	struct request_queue *q = lo->lo_queue;
 	int ret;
 
-	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+	if (!blk_queue_discard(q)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
 	case REQ_OP_FLUSH:
 		return lo_req_flush(lo, rq);
 	case REQ_OP_DISCARD:
+		return lo_discard(lo, rq,
+			FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
+
 	case REQ_OP_WRITE_ZEROES:
-		return lo_discard(lo, rq, pos);
+		return lo_discard(lo, rq,
+			FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
+
 	case REQ_OP_WRITE:
 		if (lo->transfer)
 			return lo_write_transfer(lo, rq, pos);
@@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
 	struct file *file = lo->lo_backing_file;
 	struct inode *inode = file->f_mapping->host;
 	struct request_queue *q = lo->lo_queue;
+	struct request_queue *backingq;
+
+	/*
+	 * If the backing device is a block device, mirror its discard
+	 * capabilities.
+	 */
+	if (S_ISBLK(inode->i_mode)) {
+		backingq = bdev_get_queue(inode->i_bdev);
+		blk_queue_max_discard_sectors(q,
+			backingq->limits.max_discard_sectors);
+
+		blk_queue_max_write_zeroes_sectors(q,
+			backingq->limits.max_write_zeroes_sectors);
+
+		q->limits.discard_granularity =
+			backingq->limits.discard_granularity;
+
+		q->limits.discard_alignment =
+			backingq->limits.discard_alignment;
 
 	/*
 	 * We use punch hole to reclaim the free space used by the
@@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
 	 * encryption is enabled, because it may give an attacker
 	 * useful information.
 	 */
-	if ((!file->f_op->fallocate) ||
-	    lo->lo_encrypt_key_size) {
+	} else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
 		q->limits.discard_granularity = 0;
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
-		return;
-	}
 
-	q->limits.discard_granularity = inode->i_sb->s_blocksize;
-	q->limits.discard_alignment = 0;
+	} else {
+		q->limits.discard_granularity = inode->i_sb->s_blocksize;
+		q->limits.discard_alignment = 0;
+
+		blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+		blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
+	}
 
-	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
-	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 }
 
 static void loop_unprepare_queue(struct loop_device *lo)
-- 
2.16.4

^ permalink raw reply related	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-10-30 23:06 [PATCH 0/2] loop: Better discard for block devices Evan Green
  2018-10-30 23:06 ` [PATCH 1/2] loop: Report EOPNOTSUPP properly Evan Green
  2018-10-30 23:06 ` [PATCH 2/2] loop: Better discard support for block devices Evan Green
@ 2018-10-30 23:50 ` Bart Van Assche
  2018-11-01 18:15   ` Evan Green
  2 siblings, 1 reply; 23+ messages in thread
From: Bart Van Assche @ 2018-10-30 23:50 UTC (permalink / raw)
  To: Evan Green, Jens Axboe
  Cc: Gwendal Grignou, Alexis Savery, linux-block, linux-kernel

On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> This series addresses some errors seen when using the loop
> device directly backed by a block device. The first change plumbs
> out the correct error message, and the second change prevents the
> error from occurring in many cases.

Hi Evan,

Can you provide some information about the use case? Why do you think that
it would be useful to support backing a loop device by a block device? Why
to use the loop driver instead of dm-linear for this use case?

Thanks,

Bart.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-10-30 23:50 ` [PATCH 0/2] loop: Better discard " Bart Van Assche
@ 2018-11-01 18:15   ` Evan Green
  2018-11-01 22:41     ` Gwendal Grignou
  2018-11-01 22:44     ` Gwendal Grignou
  0 siblings, 2 replies; 23+ messages in thread
From: Evan Green @ 2018-11-01 18:15 UTC (permalink / raw)
  To: bvanassche; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Tue, Oct 30, 2018 at 4:50 PM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> > This series addresses some errors seen when using the loop
> > device directly backed by a block device. The first change plumbs
> > out the correct error message, and the second change prevents the
> > error from occurring in many cases.
>
> Hi Evan,
>
> Can you provide some information about the use case? Why do you think that
> it would be useful to support backing a loop device by a block device? Why
> to use the loop driver instead of dm-linear for this use case?
>

Hi Bart,
In our case, the Chrome OS installer uses the loop device to map
slices of the disk that will ultimately represent partitions [1]. I
believe it has been doing install this way for a very long time, and
has been working well. It actually continues to work, but on block
devices that don't support discard operations, things are a tiny bit
bumpy. This series is meant to smooth out those bumps. As far as I
knew this was a supported scenario.

-Evan
[1] https://chromium.googlesource.com/chromiumos/platform/installer/+/master/chromeos-install

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-11-01 18:15   ` Evan Green
@ 2018-11-01 22:41     ` Gwendal Grignou
  2018-11-01 22:44     ` Gwendal Grignou
  1 sibling, 0 replies; 23+ messages in thread
From: Gwendal Grignou @ 2018-11-01 22:41 UTC (permalink / raw)
  To: evgreen; +Cc: bvanassche, axboe, asavery, linux-block, linux-kernel

[-- Attachment #1: Type: text/plain, Size: 1887 bytes --]

On Thu, Nov 1, 2018 at 11:15 AM Evan Green <evgreen@chromium.org> wrote:

> On Tue, Oct 30, 2018 at 4:50 PM Bart Van Assche <bvanassche@acm.org>
> wrote:
> >
> > On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> > > This series addresses some errors seen when using the loop
> > > device directly backed by a block device. The first change plumbs
> > > out the correct error message, and the second change prevents the
> > > error from occurring in many cases.
> >
> > Hi Evan,
> >
> > Can you provide some information about the use case? Why do you think
> that
> > it would be useful to support backing a loop device by a block device?
> Why
> > to use the loop driver instead of dm-linear for this use case?
> >
>
> Hi Bart,
> In our case, the Chrome OS installer uses the loop device to map
> slices of the disk that will ultimately represent partitions [1]. I
> believe it has been doing install this way for a very long time, and
> has been working well. It actually continues to work, but on block
> devices that don't support discard operations, things are a tiny bit
> bumpy. This series is meant to smooth out those bumps. As far as I
> knew this was a supported scenario.
>
> -Evan
> [1]
> https://chromium.googlesource.com/chromiumos/platform/installer/+/master/chromeos-install



The code has moved to
https://chromium.googlesource.com/chromiumos/platform2/+/master/installer/chromeos-install
but the idea is the same. We create a loop device to abstract the
persistent destination. The destination can be a block device or a file.
The later case is used for creating master images to be flashed on memory
chip before soldering on the production line.
It is handy when the final device is 4K block aligned but the builder is
using 512b block aligned device, we can mount a device over a file that
will behave like the real device we will flash the image on.

Gwendal.

[-- Attachment #2: Type: text/html, Size: 2712 bytes --]

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-11-01 18:15   ` Evan Green
  2018-11-01 22:41     ` Gwendal Grignou
@ 2018-11-01 22:44     ` Gwendal Grignou
  2018-11-02 16:02       ` Bart Van Assche
  1 sibling, 1 reply; 23+ messages in thread
From: Gwendal Grignou @ 2018-11-01 22:44 UTC (permalink / raw)
  To: evgreen; +Cc: bvanassche, axboe, asavery, linux-block, linux-kernel

On Thu, Nov 1, 2018 at 11:15 AM Evan Green <evgreen@chromium.org> wrote:
>
> On Tue, Oct 30, 2018 at 4:50 PM Bart Van Assche <bvanassche@acm.org> wrote:
> >
> > On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> > > This series addresses some errors seen when using the loop
> > > device directly backed by a block device. The first change plumbs
> > > out the correct error message, and the second change prevents the
> > > error from occurring in many cases.
> >
> > Hi Evan,
> >
> > Can you provide some information about the use case? Why do you think that
> > it would be useful to support backing a loop device by a block device? Why
> > to use the loop driver instead of dm-linear for this use case?
> >
>
> Hi Bart,
> In our case, the Chrome OS installer uses the loop device to map
> slices of the disk that will ultimately represent partitions [1]. I
> believe it has been doing install this way for a very long time, and
> has been working well. It actually continues to work, but on block
> devices that don't support discard operations, things are a tiny bit
> bumpy. This series is meant to smooth out those bumps. As far as I
> knew this was a supported scenario.
>
> -Evan
> [1] https://chromium.googlesource.com/chromiumos/platform/installer/+/master/chromeos-install

The code has moved to
https://chromium.googlesource.com/chromiumos/platform2/+/master/installer/chromeos-install
but the idea is the same. We create a loop device to abstract the
persistent destination. The destination can be a block device or a
file. The later case is used for creating master images to be flashed
on memory chip before soldering on the production line.
It is handy when the final device is 4K block aligned but the builder
is using 512b block aligned device, we can mount a device over a file
that will behave like the real device we will flash the image on.

Gwendal.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-11-01 22:44     ` Gwendal Grignou
@ 2018-11-02 16:02       ` Bart Van Assche
  2018-11-05 20:35         ` Evan Green
  0 siblings, 1 reply; 23+ messages in thread
From: Bart Van Assche @ 2018-11-02 16:02 UTC (permalink / raw)
  To: Gwendal Grignou, evgreen; +Cc: axboe, asavery, linux-block, linux-kernel

On Thu, 2018-11-01 at 15:44 -0700, Gwendal Grignou wrote:
> On Thu, Nov 1, 2018 at 11:15 AM Evan Green <evgreen@chromium.org> wrote:
> > 
> > On Tue, Oct 30, 2018 at 4:50 PM Bart Van Assche <bvanassche@acm.org> wrote:
> > > 
> > > On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> > > > This series addresses some errors seen when using the loop
> > > > device directly backed by a block device. The first change plumbs
> > > > out the correct error message, and the second change prevents the
> > > > error from occurring in many cases.
> > > 
> > > Hi Evan,
> > > 
> > > Can you provide some information about the use case? Why do you think that
> > > it would be useful to support backing a loop device by a block device? Why
> > > to use the loop driver instead of dm-linear for this use case?
> > > 
> > 
> > Hi Bart,
> > In our case, the Chrome OS installer uses the loop device to map
> > slices of the disk that will ultimately represent partitions [1]. I
> > believe it has been doing install this way for a very long time, and
> > has been working well. It actually continues to work, but on block
> > devices that don't support discard operations, things are a tiny bit
> > bumpy. This series is meant to smooth out those bumps. As far as I
> > knew this was a supported scenario.
> > 
> > -Evan
> > [1] https://chromium.googlesource.com/chromiumos/platform/installer/+/master/chromeos-install
> 
> The code has moved to
> https://chromium.googlesource.com/chromiumos/platform2/+/master/installer/chromeos-install
> but the idea is the same. We create a loop device to abstract the
> persistent destination. The destination can be a block device or a
> file. The later case is used for creating master images to be flashed
> on memory chip before soldering on the production line.
> It is handy when the final device is 4K block aligned but the builder
> is using 512b block aligned device, we can mount a device over a file
> that will behave like the real device we will flash the image on.

Hi Evan and Gwendal,

Since this is a new use case for the loop driver you may want to add a test
for this use case to the blktests project. Many block layer contributors run
these tests to verify their own block layer changes. Contributing a blktests
test for this new use case will make it easier for others to verify that
their changes do not break your use case.

Bart.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 0/2] loop: Better discard for block devices
  2018-11-02 16:02       ` Bart Van Assche
@ 2018-11-05 20:35         ` Evan Green
  0 siblings, 0 replies; 23+ messages in thread
From: Evan Green @ 2018-11-05 20:35 UTC (permalink / raw)
  To: bvanassche; +Cc: Gwendal Grignou, axboe, asavery, linux-block, linux-kernel

On Fri, Nov 2, 2018 at 9:02 AM Bart Van Assche <bvanassche@acm.org> wrote:
>
> On Thu, 2018-11-01 at 15:44 -0700, Gwendal Grignou wrote:
> > On Thu, Nov 1, 2018 at 11:15 AM Evan Green <evgreen@chromium.org> wrote:
> > >
> > > On Tue, Oct 30, 2018 at 4:50 PM Bart Van Assche <bvanassche@acm.org> wrote:
> > > >
> > > > On Tue, 2018-10-30 at 16:06 -0700, Evan Green wrote:
> > > > > This series addresses some errors seen when using the loop
> > > > > device directly backed by a block device. The first change plumbs
> > > > > out the correct error message, and the second change prevents the
> > > > > error from occurring in many cases.
> > > >
> > > > Hi Evan,
> > > >
> > > > Can you provide some information about the use case? Why do you think that
> > > > it would be useful to support backing a loop device by a block device? Why
> > > > to use the loop driver instead of dm-linear for this use case?
> > > >
> > >
> > > Hi Bart,
> > > In our case, the Chrome OS installer uses the loop device to map
> > > slices of the disk that will ultimately represent partitions [1]. I
> > > believe it has been doing install this way for a very long time, and
> > > has been working well. It actually continues to work, but on block
> > > devices that don't support discard operations, things are a tiny bit
> > > bumpy. This series is meant to smooth out those bumps. As far as I
> > > knew this was a supported scenario.
> > >
> > > -Evan
> > > [1] https://chromium.googlesource.com/chromiumos/platform/installer/+/master/chromeos-install
> >
> > The code has moved to
> > https://chromium.googlesource.com/chromiumos/platform2/+/master/installer/chromeos-install
> > but the idea is the same. We create a loop device to abstract the
> > persistent destination. The destination can be a block device or a
> > file. The later case is used for creating master images to be flashed
> > on memory chip before soldering on the production line.
> > It is handy when the final device is 4K block aligned but the builder
> > is using 512b block aligned device, we can mount a device over a file
> > that will behave like the real device we will flash the image on.
>
> Hi Evan and Gwendal,
>
> Since this is a new use case for the loop driver you may want to add a test
> for this use case to the blktests project. Many block layer contributors run
> these tests to verify their own block layer changes. Contributing a blktests
> test for this new use case will make it easier for others to verify that
> their changes do not break your use case.
>

Good idea. Thanks Bart.

> Bart.

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-10-30 23:06 ` [PATCH 2/2] loop: Better discard support for block devices Evan Green
@ 2018-11-26 18:53   ` Evan Green
  2018-11-27  2:55     ` Ming Lei
  2018-11-28  1:26   ` Ming Lei
  1 sibling, 1 reply; 23+ messages in thread
From: Evan Green @ 2018-11-26 18:53 UTC (permalink / raw)
  To: axboe; +Cc: Gwendal Grignou, asavery, linux-block, linux-kernel

On Tue, Oct 30, 2018 at 4:06 PM Evan Green <evgreen@chromium.org> wrote:
>
> If the backing device for a loop device is a block device,
> then mirror the discard properties of the underlying block
> device into the loop device. While in there, differentiate
> between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> different for block devices, but which the loop device had
> just been lumping together.
>
> Signed-off-by: Evan Green <evgreen@chromium.org>

Any thoughts on this patch? This fixes issues for us when using a loop
device backed by a block device, where we see many logs like:

[  372.767286] print_req_error: I/O error, dev loop5, sector 88125696

-Evan

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-11-26 18:53   ` Evan Green
@ 2018-11-27  2:55     ` Ming Lei
  2018-11-27 23:34       ` Evan Green
  0 siblings, 1 reply; 23+ messages in thread
From: Ming Lei @ 2018-11-27  2:55 UTC (permalink / raw)
  To: evgreen
  Cc: Jens Axboe, gwendal, asavery, linux-block, Linux Kernel Mailing List

On Tue, Nov 27, 2018 at 2:55 AM Evan Green <evgreen@chromium.org> wrote:
>
> On Tue, Oct 30, 2018 at 4:06 PM Evan Green <evgreen@chromium.org> wrote:
> >
> > If the backing device for a loop device is a block device,

This shouldn't be a very common use case wrt. loop.

> > then mirror the discard properties of the underlying block
> > device into the loop device. While in there, differentiate
> > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > different for block devices, but which the loop device had
> > just been lumping together.
> >
> > Signed-off-by: Evan Green <evgreen@chromium.org>
>
> Any thoughts on this patch? This fixes issues for us when using a loop
> device backed by a block device, where we see many logs like:
>
> [  372.767286] print_req_error: I/O error, dev loop5, sector 88125696

Seems not see any explanation about this IO error and the fix in your patch.
Could you describe it a bit more?


thanks,
Ming Lei

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-11-27  2:55     ` Ming Lei
@ 2018-11-27 23:34       ` Evan Green
  2018-11-28  1:28         ` Ming Lei
  0 siblings, 1 reply; 23+ messages in thread
From: Evan Green @ 2018-11-27 23:34 UTC (permalink / raw)
  To: tom.leiming; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

Hi Ming,

On Mon, Nov 26, 2018 at 6:55 PM Ming Lei <tom.leiming@gmail.com> wrote:
>
> On Tue, Nov 27, 2018 at 2:55 AM Evan Green <evgreen@chromium.org> wrote:
> >
> > On Tue, Oct 30, 2018 at 4:06 PM Evan Green <evgreen@chromium.org> wrote:
> > >
> > > If the backing device for a loop device is a block device,
>
> This shouldn't be a very common use case wrt. loop.

Yeah, I'm starting to gather that. Or maybe I'm just the first one to
mention it on the kernel lists ;) We've used this in our Chrome OS
installer, I believe for many years. Gwendal piped in with a few
reasons we do it this way on the cover letter, but in general I think
it allows us to have a unified set of functions to install to a file,
disk, or prepare an image that may have a different block size than
those on the running system.

>
> > > then mirror the discard properties of the underlying block
> > > device into the loop device. While in there, differentiate
> > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > different for block devices, but which the loop device had
> > > just been lumping together.
> > >
> > > Signed-off-by: Evan Green <evgreen@chromium.org>
> >
> > Any thoughts on this patch? This fixes issues for us when using a loop
> > device backed by a block device, where we see many logs like:
> >
> > [  372.767286] print_req_error: I/O error, dev loop5, sector 88125696
>
> Seems not see any explanation about this IO error and the fix in your patch.
> Could you describe it a bit more?

Sure, I probably should have included more context with the series.

The loop device always reports that it supports discard, by setting up
the max_discard_sectors and max_write_zeroes_sectors in the blk queue.
When the loop device gets a discard or write-zeroes request, it turns
around and calls fallocate on the underlying device with the
PUNCH_HOLE flag. This makes sense when you're backed by a file and
hoping to just deallocate the space, but may fail when you're backed
by a block device that doesn't support discard, or doesn't write
zeroes to discarded sectors. Weirdly, lo_discard already had some code
for preserving EOPNOTSUPP, but then later the error is smashed into
EIO. Patch 1 pipes out EOPNOTSUPP properly, so it doesn't get squashed
into EIO.

Patch 2 reflects the discard characteristics of the underlying device
into the loop device. That way, if you're backed by a file or a block
device that does support discard, everything works great, and user
mode can even see and use the correct discard and write zero
granularities. If you're backed by a block device that does not
support discard, this is exposed to user mode, which then usually
avoids calling fallocate, and doesn't feel betrayed that their
requests are unexpectedly failing.

-Evan

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 1/2] loop: Report EOPNOTSUPP properly
  2018-10-30 23:06 ` [PATCH 1/2] loop: Report EOPNOTSUPP properly Evan Green
@ 2018-11-28  1:06   ` Ming Lei
  0 siblings, 0 replies; 23+ messages in thread
From: Ming Lei @ 2018-11-28  1:06 UTC (permalink / raw)
  To: Evan Green
  Cc: Jens Axboe, Gwendal Grignou, Alexis Savery, linux-block, linux-kernel

On Tue, Oct 30, 2018 at 04:06:23PM -0700, Evan Green wrote:
> Properly plumb out EOPNOTSUPP from loop driver operations, which may
> get returned when for instance a discard operation is attempted but not
> supported by the underlying block device. Before this change, everything
> was reported in the log as an I/O error, which is scary and not
> helpful in debugging.
> 
> Signed-off-by: Evan Green <evgreen@chromium.org>
> ---
> 
>  drivers/block/loop.c | 14 +++++++++++---
>  1 file changed, 11 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index abad6d15f9563..28990fc94841a 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -458,8 +458,13 @@ static void lo_complete_rq(struct request *rq)
>  
>  	if (!cmd->use_aio || cmd->ret < 0 || cmd->ret == blk_rq_bytes(rq) ||
>  	    req_op(rq) != REQ_OP_READ) {
> -		if (cmd->ret < 0)
> -			ret = BLK_STS_IOERR;
> +		if (cmd->ret < 0) {
> +			if (cmd->ret == -EOPNOTSUPP)
> +				ret = BLK_STS_NOTSUPP;
> +			else
> +				ret = BLK_STS_IOERR;
> +		}
> +
>  		goto end_io;
>  	}
>  
> @@ -1788,7 +1793,10 @@ static void loop_handle_cmd(struct loop_cmd *cmd)
>   failed:
>  	/* complete non-aio request */
>  	if (!cmd->use_aio || ret) {
> -		cmd->ret = ret ? -EIO : 0;
> +		if (ret == -EOPNOTSUPP)
> +			cmd->ret = ret;
> +		else
> +			cmd->ret = ret ? -EIO : 0;
>  		blk_mq_complete_request(rq);
>  	}
>  }
> -- 
> 2.16.4
> 

Looks fine:

Reviewed-by: Ming Lei <ming.lei@redhat.com>

Thanks,
Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-10-30 23:06 ` [PATCH 2/2] loop: Better discard support for block devices Evan Green
  2018-11-26 18:53   ` Evan Green
@ 2018-11-28  1:26   ` Ming Lei
  2018-12-04 22:19     ` Evan Green
  1 sibling, 1 reply; 23+ messages in thread
From: Ming Lei @ 2018-11-28  1:26 UTC (permalink / raw)
  To: Evan Green
  Cc: Jens Axboe, Gwendal Grignou, Alexis Savery, linux-block, linux-kernel

On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> If the backing device for a loop device is a block device,
> then mirror the discard properties of the underlying block
> device into the loop device. While in there, differentiate
> between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> different for block devices, but which the loop device had
> just been lumping together.
> 
> Signed-off-by: Evan Green <evgreen@chromium.org>
> ---
> 
>  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
>  1 file changed, 41 insertions(+), 20 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 28990fc94841a..176e65101c4ef 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
>  	return ret;
>  }
>  
> -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> +static int lo_discard(struct loop_device *lo, struct request *rq,
> +		int mode, loff_t pos)
>  {
> -	/*
> -	 * We use punch hole to reclaim the free space used by the
> -	 * image a.k.a. discard. However we do not support discard if
> -	 * encryption is enabled, because it may give an attacker
> -	 * useful information.
> -	 */
>  	struct file *file = lo->lo_backing_file;
> -	int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> +	struct request_queue *q = lo->lo_queue;
>  	int ret;
>  
> -	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> +	if (!blk_queue_discard(q)) {
>  		ret = -EOPNOTSUPP;
>  		goto out;
>  	}
> @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
>  	case REQ_OP_FLUSH:
>  		return lo_req_flush(lo, rq);
>  	case REQ_OP_DISCARD:
> +		return lo_discard(lo, rq,
> +			FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> +
>  	case REQ_OP_WRITE_ZEROES:
> -		return lo_discard(lo, rq, pos);
> +		return lo_discard(lo, rq,
> +			FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> +
>  	case REQ_OP_WRITE:
>  		if (lo->transfer)
>  			return lo_write_transfer(lo, rq, pos);
> @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
>  	struct file *file = lo->lo_backing_file;
>  	struct inode *inode = file->f_mapping->host;
>  	struct request_queue *q = lo->lo_queue;
> +	struct request_queue *backingq;
> +
> +	/*
> +	 * If the backing device is a block device, mirror its discard
> +	 * capabilities.
> +	 */
> +	if (S_ISBLK(inode->i_mode)) {
> +		backingq = bdev_get_queue(inode->i_bdev);
> +		blk_queue_max_discard_sectors(q,
> +			backingq->limits.max_discard_sectors);
> +
> +		blk_queue_max_write_zeroes_sectors(q,
> +			backingq->limits.max_write_zeroes_sectors);
> +
> +		q->limits.discard_granularity =
> +			backingq->limits.discard_granularity;
> +
> +		q->limits.discard_alignment =
> +			backingq->limits.discard_alignment;

I think it isn't necessary to mirror backing queue's discard/write_zeros
capabilities, given either fs of the underlying queue can deal with well.

>  
>  	/*
>  	 * We use punch hole to reclaim the free space used by the
> @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
>  	 * encryption is enabled, because it may give an attacker
>  	 * useful information.
>  	 */
> -	if ((!file->f_op->fallocate) ||
> -	    lo->lo_encrypt_key_size) {
> +	} else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
>  		q->limits.discard_granularity = 0;
>  		q->limits.discard_alignment = 0;
>  		blk_queue_max_discard_sectors(q, 0);
>  		blk_queue_max_write_zeroes_sectors(q, 0);
> -		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> -		return;
> -	}
>  
> -	q->limits.discard_granularity = inode->i_sb->s_blocksize;
> -	q->limits.discard_alignment = 0;
> +	} else {
> +		q->limits.discard_granularity = inode->i_sb->s_blocksize;
> +		q->limits.discard_alignment = 0;
> +
> +		blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> +		blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> +	}
>  
> -	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> -	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> -	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> +	if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> +		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> +	else
> +		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
>  }

Looks it should work just by mirroring backing queue's discard
capability to loop queue in case that the loop is backed by
block device, doesn't it? Meantime the unified discard limits &
write_zeros limits can be kept.

thanks,
Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-11-27 23:34       ` Evan Green
@ 2018-11-28  1:28         ` Ming Lei
  0 siblings, 0 replies; 23+ messages in thread
From: Ming Lei @ 2018-11-28  1:28 UTC (permalink / raw)
  To: Evan Green
  Cc: tom.leiming, axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Tue, Nov 27, 2018 at 03:34:04PM -0800, Evan Green wrote:
> Hi Ming,
> 
> On Mon, Nov 26, 2018 at 6:55 PM Ming Lei <tom.leiming@gmail.com> wrote:
> >
> > On Tue, Nov 27, 2018 at 2:55 AM Evan Green <evgreen@chromium.org> wrote:
> > >
> > > On Tue, Oct 30, 2018 at 4:06 PM Evan Green <evgreen@chromium.org> wrote:
> > > >
> > > > If the backing device for a loop device is a block device,
> >
> > This shouldn't be a very common use case wrt. loop.
> 
> Yeah, I'm starting to gather that. Or maybe I'm just the first one to
> mention it on the kernel lists ;) We've used this in our Chrome OS
> installer, I believe for many years. Gwendal piped in with a few
> reasons we do it this way on the cover letter, but in general I think
> it allows us to have a unified set of functions to install to a file,
> disk, or prepare an image that may have a different block size than
> those on the running system.

OK, got it, it makes sense.

> 
> >
> > > > then mirror the discard properties of the underlying block
> > > > device into the loop device. While in there, differentiate
> > > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > > different for block devices, but which the loop device had
> > > > just been lumping together.
> > > >
> > > > Signed-off-by: Evan Green <evgreen@chromium.org>
> > >
> > > Any thoughts on this patch? This fixes issues for us when using a loop
> > > device backed by a block device, where we see many logs like:
> > >
> > > [  372.767286] print_req_error: I/O error, dev loop5, sector 88125696
> >
> > Seems not see any explanation about this IO error and the fix in your patch.
> > Could you describe it a bit more?
> 
> Sure, I probably should have included more context with the series.
> 
> The loop device always reports that it supports discard, by setting up
> the max_discard_sectors and max_write_zeroes_sectors in the blk queue.
> When the loop device gets a discard or write-zeroes request, it turns
> around and calls fallocate on the underlying device with the
> PUNCH_HOLE flag. This makes sense when you're backed by a file and
> hoping to just deallocate the space, but may fail when you're backed
> by a block device that doesn't support discard, or doesn't write
> zeroes to discarded sectors. Weirdly, lo_discard already had some code
> for preserving EOPNOTSUPP, but then later the error is smashed into
> EIO. Patch 1 pipes out EOPNOTSUPP properly, so it doesn't get squashed
> into EIO.
> 
> Patch 2 reflects the discard characteristics of the underlying device
> into the loop device. That way, if you're backed by a file or a block
> device that does support discard, everything works great, and user
> mode can even see and use the correct discard and write zero
> granularities. If you're backed by a block device that does not
> support discard, this is exposed to user mode, which then usually
> avoids calling fallocate, and doesn't feel betrayed that their
> requests are unexpectedly failing.

Thanks for your detailed explanation, and I think we need to fix it.

Thanks,
Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-11-28  1:26   ` Ming Lei
@ 2018-12-04 22:19     ` Evan Green
  2018-12-05  1:10       ` Ming Lei
  0 siblings, 1 reply; 23+ messages in thread
From: Evan Green @ 2018-12-04 22:19 UTC (permalink / raw)
  To: ming.lei; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

Hi Ming,

On Tue, Nov 27, 2018 at 5:26 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> > If the backing device for a loop device is a block device,
> > then mirror the discard properties of the underlying block
> > device into the loop device. While in there, differentiate
> > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > different for block devices, but which the loop device had
> > just been lumping together.
> >
> > Signed-off-by: Evan Green <evgreen@chromium.org>
> > ---
> >
> >  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
> >  1 file changed, 41 insertions(+), 20 deletions(-)
> >
> > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > index 28990fc94841a..176e65101c4ef 100644
> > --- a/drivers/block/loop.c
> > +++ b/drivers/block/loop.c
> > @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
> >       return ret;
> >  }
> >
> > -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> > +static int lo_discard(struct loop_device *lo, struct request *rq,
> > +             int mode, loff_t pos)
> >  {
> > -     /*
> > -      * We use punch hole to reclaim the free space used by the
> > -      * image a.k.a. discard. However we do not support discard if
> > -      * encryption is enabled, because it may give an attacker
> > -      * useful information.
> > -      */
> >       struct file *file = lo->lo_backing_file;
> > -     int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> > +     struct request_queue *q = lo->lo_queue;
> >       int ret;
> >
> > -     if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > +     if (!blk_queue_discard(q)) {
> >               ret = -EOPNOTSUPP;
> >               goto out;
> >       }
> > @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> >       case REQ_OP_FLUSH:
> >               return lo_req_flush(lo, rq);
> >       case REQ_OP_DISCARD:
> > +             return lo_discard(lo, rq,
> > +                     FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> > +
> >       case REQ_OP_WRITE_ZEROES:
> > -             return lo_discard(lo, rq, pos);
> > +             return lo_discard(lo, rq,
> > +                     FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> > +
> >       case REQ_OP_WRITE:
> >               if (lo->transfer)
> >                       return lo_write_transfer(lo, rq, pos);
> > @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
> >       struct file *file = lo->lo_backing_file;
> >       struct inode *inode = file->f_mapping->host;
> >       struct request_queue *q = lo->lo_queue;
> > +     struct request_queue *backingq;
> > +
> > +     /*
> > +      * If the backing device is a block device, mirror its discard
> > +      * capabilities.
> > +      */
> > +     if (S_ISBLK(inode->i_mode)) {
> > +             backingq = bdev_get_queue(inode->i_bdev);
> > +             blk_queue_max_discard_sectors(q,
> > +                     backingq->limits.max_discard_sectors);
> > +
> > +             blk_queue_max_write_zeroes_sectors(q,
> > +                     backingq->limits.max_write_zeroes_sectors);
> > +
> > +             q->limits.discard_granularity =
> > +                     backingq->limits.discard_granularity;
> > +
> > +             q->limits.discard_alignment =
> > +                     backingq->limits.discard_alignment;
>
> I think it isn't necessary to mirror backing queue's discard/write_zeros
> capabilities, given either fs of the underlying queue can deal with well.
>
> >
> >       /*
> >        * We use punch hole to reclaim the free space used by the
> > @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
> >        * encryption is enabled, because it may give an attacker
> >        * useful information.
> >        */
> > -     if ((!file->f_op->fallocate) ||
> > -         lo->lo_encrypt_key_size) {
> > +     } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> >               q->limits.discard_granularity = 0;
> >               q->limits.discard_alignment = 0;
> >               blk_queue_max_discard_sectors(q, 0);
> >               blk_queue_max_write_zeroes_sectors(q, 0);
> > -             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > -             return;
> > -     }
> >
> > -     q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > -     q->limits.discard_alignment = 0;
> > +     } else {
> > +             q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > +             q->limits.discard_alignment = 0;
> > +
> > +             blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > +             blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > +     }
> >
> > -     blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > -     blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > -     blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > +     if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> > +             blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > +     else
> > +             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> >  }
>
> Looks it should work just by mirroring backing queue's discard
> capability to loop queue in case that the loop is backed by
> block device, doesn't it? Meantime the unified discard limits &
> write_zeros limits can be kept.

I tested this out, and you're right that I could just flip the
QUEUE_FLAG_DISCARD based on whether its a block device, and leave
everything else alone, to completely disable discard support for loop
devices backed by block devices. This seems to work for programs like
mkfs.ext4, but still leaves problems for coreutils cp.

But is that really the right call? With this change, we're not only
able to use loop devices in this way, but we're able to use the
discard and zero functionality of the underlying block device by
simply passing it through. To me that seemed better than disabling all
discard support for block devices, which would severely slow us down
on some devices.
-Evan

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-04 22:19     ` Evan Green
@ 2018-12-05  1:10       ` Ming Lei
  2018-12-05 19:35         ` Evan Green
  0 siblings, 1 reply; 23+ messages in thread
From: Ming Lei @ 2018-12-05  1:10 UTC (permalink / raw)
  To: Evan Green; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Tue, Dec 04, 2018 at 02:19:46PM -0800, Evan Green wrote:
> Hi Ming,
> 
> On Tue, Nov 27, 2018 at 5:26 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> > > If the backing device for a loop device is a block device,
> > > then mirror the discard properties of the underlying block
> > > device into the loop device. While in there, differentiate
> > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > different for block devices, but which the loop device had
> > > just been lumping together.
> > >
> > > Signed-off-by: Evan Green <evgreen@chromium.org>
> > > ---
> > >
> > >  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
> > >  1 file changed, 41 insertions(+), 20 deletions(-)
> > >
> > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > > index 28990fc94841a..176e65101c4ef 100644
> > > --- a/drivers/block/loop.c
> > > +++ b/drivers/block/loop.c
> > > @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
> > >       return ret;
> > >  }
> > >
> > > -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> > > +static int lo_discard(struct loop_device *lo, struct request *rq,
> > > +             int mode, loff_t pos)
> > >  {
> > > -     /*
> > > -      * We use punch hole to reclaim the free space used by the
> > > -      * image a.k.a. discard. However we do not support discard if
> > > -      * encryption is enabled, because it may give an attacker
> > > -      * useful information.
> > > -      */
> > >       struct file *file = lo->lo_backing_file;
> > > -     int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> > > +     struct request_queue *q = lo->lo_queue;
> > >       int ret;
> > >
> > > -     if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > +     if (!blk_queue_discard(q)) {
> > >               ret = -EOPNOTSUPP;
> > >               goto out;
> > >       }
> > > @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> > >       case REQ_OP_FLUSH:
> > >               return lo_req_flush(lo, rq);
> > >       case REQ_OP_DISCARD:
> > > +             return lo_discard(lo, rq,
> > > +                     FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> > > +
> > >       case REQ_OP_WRITE_ZEROES:
> > > -             return lo_discard(lo, rq, pos);
> > > +             return lo_discard(lo, rq,
> > > +                     FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> > > +
> > >       case REQ_OP_WRITE:
> > >               if (lo->transfer)
> > >                       return lo_write_transfer(lo, rq, pos);
> > > @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
> > >       struct file *file = lo->lo_backing_file;
> > >       struct inode *inode = file->f_mapping->host;
> > >       struct request_queue *q = lo->lo_queue;
> > > +     struct request_queue *backingq;
> > > +
> > > +     /*
> > > +      * If the backing device is a block device, mirror its discard
> > > +      * capabilities.
> > > +      */
> > > +     if (S_ISBLK(inode->i_mode)) {
> > > +             backingq = bdev_get_queue(inode->i_bdev);
> > > +             blk_queue_max_discard_sectors(q,
> > > +                     backingq->limits.max_discard_sectors);
> > > +
> > > +             blk_queue_max_write_zeroes_sectors(q,
> > > +                     backingq->limits.max_write_zeroes_sectors);
> > > +
> > > +             q->limits.discard_granularity =
> > > +                     backingq->limits.discard_granularity;
> > > +
> > > +             q->limits.discard_alignment =
> > > +                     backingq->limits.discard_alignment;
> >
> > I think it isn't necessary to mirror backing queue's discard/write_zeros
> > capabilities, given either fs of the underlying queue can deal with well.
> >
> > >
> > >       /*
> > >        * We use punch hole to reclaim the free space used by the
> > > @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
> > >        * encryption is enabled, because it may give an attacker
> > >        * useful information.
> > >        */
> > > -     if ((!file->f_op->fallocate) ||
> > > -         lo->lo_encrypt_key_size) {
> > > +     } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > >               q->limits.discard_granularity = 0;
> > >               q->limits.discard_alignment = 0;
> > >               blk_queue_max_discard_sectors(q, 0);
> > >               blk_queue_max_write_zeroes_sectors(q, 0);
> > > -             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > -             return;
> > > -     }
> > >
> > > -     q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > -     q->limits.discard_alignment = 0;
> > > +     } else {
> > > +             q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > +             q->limits.discard_alignment = 0;
> > > +
> > > +             blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > +             blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > +     }
> > >
> > > -     blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > -     blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > -     blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > +     if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> > > +             blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > +     else
> > > +             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > >  }
> >
> > Looks it should work just by mirroring backing queue's discard
> > capability to loop queue in case that the loop is backed by
> > block device, doesn't it? Meantime the unified discard limits &
> > write_zeros limits can be kept.
> 
> I tested this out, and you're right that I could just flip the
> QUEUE_FLAG_DISCARD based on whether its a block device, and leave

What I meant actually is to do the following discard config:

	bool discard;
	if (S_ISBLK(inode->i_mode)) {
		struct request_queue *backingq = bdev_get_queue(inode->i_bdev);
		discard = blk_queue_discard(backingq);
	} else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size)
		discard = false;
	else
		discard = true;

	if (discard) {
		blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
		blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
	} else {
		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
	}

> everything else alone, to completely disable discard support for loop
> devices backed by block devices. This seems to work for programs like
> mkfs.ext4, but still leaves problems for coreutils cp.
> 
> But is that really the right call? With this change, we're not only
> able to use loop devices in this way, but we're able to use the
> discard and zero functionality of the underlying block device by
> simply passing it through. To me that seemed better than disabling all
> discard support for block devices, which would severely slow us down
> on some devices.

I guess the above approach can do the same job with yours, but simpler.

thanks,
Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-05  1:10       ` Ming Lei
@ 2018-12-05 19:35         ` Evan Green
  2018-12-06  0:22           ` Ming Lei
  2018-12-06  3:15           ` Martin K. Petersen
  0 siblings, 2 replies; 23+ messages in thread
From: Evan Green @ 2018-12-05 19:35 UTC (permalink / raw)
  To: ming.lei; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

Hi Ming,

On Tue, Dec 4, 2018 at 5:11 PM Ming Lei <ming.lei@redhat.com> wrote:
>
> On Tue, Dec 04, 2018 at 02:19:46PM -0800, Evan Green wrote:
> > Hi Ming,
> >
> > On Tue, Nov 27, 2018 at 5:26 PM Ming Lei <ming.lei@redhat.com> wrote:
> > >
> > > On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> > > > If the backing device for a loop device is a block device,
> > > > then mirror the discard properties of the underlying block
> > > > device into the loop device. While in there, differentiate
> > > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > > different for block devices, but which the loop device had
> > > > just been lumping together.
> > > >
> > > > Signed-off-by: Evan Green <evgreen@chromium.org>
> > > > ---
> > > >
> > > >  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
> > > >  1 file changed, 41 insertions(+), 20 deletions(-)
> > > >
> > > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > > > index 28990fc94841a..176e65101c4ef 100644
> > > > --- a/drivers/block/loop.c
> > > > +++ b/drivers/block/loop.c
> > > > @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
> > > >       return ret;
> > > >  }
> > > >
> > > > -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> > > > +static int lo_discard(struct loop_device *lo, struct request *rq,
> > > > +             int mode, loff_t pos)
> > > >  {
> > > > -     /*
> > > > -      * We use punch hole to reclaim the free space used by the
> > > > -      * image a.k.a. discard. However we do not support discard if
> > > > -      * encryption is enabled, because it may give an attacker
> > > > -      * useful information.
> > > > -      */
> > > >       struct file *file = lo->lo_backing_file;
> > > > -     int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> > > > +     struct request_queue *q = lo->lo_queue;
> > > >       int ret;
> > > >
> > > > -     if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > > +     if (!blk_queue_discard(q)) {
> > > >               ret = -EOPNOTSUPP;
> > > >               goto out;
> > > >       }
> > > > @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> > > >       case REQ_OP_FLUSH:
> > > >               return lo_req_flush(lo, rq);
> > > >       case REQ_OP_DISCARD:
> > > > +             return lo_discard(lo, rq,
> > > > +                     FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> > > > +
> > > >       case REQ_OP_WRITE_ZEROES:
> > > > -             return lo_discard(lo, rq, pos);
> > > > +             return lo_discard(lo, rq,
> > > > +                     FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> > > > +
> > > >       case REQ_OP_WRITE:
> > > >               if (lo->transfer)
> > > >                       return lo_write_transfer(lo, rq, pos);
> > > > @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
> > > >       struct file *file = lo->lo_backing_file;
> > > >       struct inode *inode = file->f_mapping->host;
> > > >       struct request_queue *q = lo->lo_queue;
> > > > +     struct request_queue *backingq;
> > > > +
> > > > +     /*
> > > > +      * If the backing device is a block device, mirror its discard
> > > > +      * capabilities.
> > > > +      */
> > > > +     if (S_ISBLK(inode->i_mode)) {
> > > > +             backingq = bdev_get_queue(inode->i_bdev);
> > > > +             blk_queue_max_discard_sectors(q,
> > > > +                     backingq->limits.max_discard_sectors);
> > > > +
> > > > +             blk_queue_max_write_zeroes_sectors(q,
> > > > +                     backingq->limits.max_write_zeroes_sectors);
> > > > +
> > > > +             q->limits.discard_granularity =
> > > > +                     backingq->limits.discard_granularity;
> > > > +
> > > > +             q->limits.discard_alignment =
> > > > +                     backingq->limits.discard_alignment;
> > >
> > > I think it isn't necessary to mirror backing queue's discard/write_zeros
> > > capabilities, given either fs of the underlying queue can deal with well.
> > >
> > > >
> > > >       /*
> > > >        * We use punch hole to reclaim the free space used by the
> > > > @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
> > > >        * encryption is enabled, because it may give an attacker
> > > >        * useful information.
> > > >        */
> > > > -     if ((!file->f_op->fallocate) ||
> > > > -         lo->lo_encrypt_key_size) {
> > > > +     } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > >               q->limits.discard_granularity = 0;
> > > >               q->limits.discard_alignment = 0;
> > > >               blk_queue_max_discard_sectors(q, 0);
> > > >               blk_queue_max_write_zeroes_sectors(q, 0);
> > > > -             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > > -             return;
> > > > -     }
> > > >
> > > > -     q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > -     q->limits.discard_alignment = 0;
> > > > +     } else {
> > > > +             q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > +             q->limits.discard_alignment = 0;
> > > > +
> > > > +             blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > +             blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > +     }
> > > >
> > > > -     blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > -     blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > -     blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > +     if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> > > > +             blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > +     else
> > > > +             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > >  }
> > >
> > > Looks it should work just by mirroring backing queue's discard
> > > capability to loop queue in case that the loop is backed by
> > > block device, doesn't it? Meantime the unified discard limits &
> > > write_zeros limits can be kept.
> >
> > I tested this out, and you're right that I could just flip the
> > QUEUE_FLAG_DISCARD based on whether its a block device, and leave
>
> What I meant actually is to do the following discard config:
>
>         bool discard;
>         if (S_ISBLK(inode->i_mode)) {
>                 struct request_queue *backingq = bdev_get_queue(inode->i_bdev);
>                 discard = blk_queue_discard(backingq);
>         } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size)
>                 discard = false;
>         else
>                 discard = true;
>
>         if (discard) {
>                 blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
>                 blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
>                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
>         } else {
>                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
>         }

Ah, I see. But I think it's useful to reflect max_discard_sectors,
max_write_zeroes_sectors, discard_granularity, and discard_alignment
from the block device to the loop device. With the exception of
discard_alignment, these parameters are visible via sysfs, so usermode
can actually use these to make more intelligent use of fallocate.
Without this part of it, I still see issues with GNU cp.

I'm not totally sure about discard_alignment, that seems to be useful
in cases of merging blk requests. So I can stop mirroring that one if
it's harmful or not helpful. But unless it's a nak, I'd really love to
keep most of the mirroring. In which case the bool doesn't do a whole
lot of simplifying.

>
> > everything else alone, to completely disable discard support for loop
> > devices backed by block devices. This seems to work for programs like
> > mkfs.ext4, but still leaves problems for coreutils cp.
> >
> > But is that really the right call? With this change, we're not only
> > able to use loop devices in this way, but we're able to use the
> > discard and zero functionality of the underlying block device by
> > simply passing it through. To me that seemed better than disabling all
> > discard support for block devices, which would severely slow us down
> > on some devices.
>
> I guess the above approach can do the same job with yours, but simpler.
>
> thanks,
> Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-05 19:35         ` Evan Green
@ 2018-12-06  0:22           ` Ming Lei
  2018-12-06  3:15           ` Martin K. Petersen
  1 sibling, 0 replies; 23+ messages in thread
From: Ming Lei @ 2018-12-06  0:22 UTC (permalink / raw)
  To: Evan Green; +Cc: axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Wed, Dec 05, 2018 at 11:35:57AM -0800, Evan Green wrote:
> Hi Ming,
> 
> On Tue, Dec 4, 2018 at 5:11 PM Ming Lei <ming.lei@redhat.com> wrote:
> >
> > On Tue, Dec 04, 2018 at 02:19:46PM -0800, Evan Green wrote:
> > > Hi Ming,
> > >
> > > On Tue, Nov 27, 2018 at 5:26 PM Ming Lei <ming.lei@redhat.com> wrote:
> > > >
> > > > On Tue, Oct 30, 2018 at 04:06:24PM -0700, Evan Green wrote:
> > > > > If the backing device for a loop device is a block device,
> > > > > then mirror the discard properties of the underlying block
> > > > > device into the loop device. While in there, differentiate
> > > > > between REQ_OP_DISCARD and REQ_OP_WRITE_ZEROES, which are
> > > > > different for block devices, but which the loop device had
> > > > > just been lumping together.
> > > > >
> > > > > Signed-off-by: Evan Green <evgreen@chromium.org>
> > > > > ---
> > > > >
> > > > >  drivers/block/loop.c | 61 +++++++++++++++++++++++++++++++++++-----------------
> > > > >  1 file changed, 41 insertions(+), 20 deletions(-)
> > > > >
> > > > > diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> > > > > index 28990fc94841a..176e65101c4ef 100644
> > > > > --- a/drivers/block/loop.c
> > > > > +++ b/drivers/block/loop.c
> > > > > @@ -417,19 +417,14 @@ static int lo_read_transfer(struct loop_device *lo, struct request *rq,
> > > > >       return ret;
> > > > >  }
> > > > >
> > > > > -static int lo_discard(struct loop_device *lo, struct request *rq, loff_t pos)
> > > > > +static int lo_discard(struct loop_device *lo, struct request *rq,
> > > > > +             int mode, loff_t pos)
> > > > >  {
> > > > > -     /*
> > > > > -      * We use punch hole to reclaim the free space used by the
> > > > > -      * image a.k.a. discard. However we do not support discard if
> > > > > -      * encryption is enabled, because it may give an attacker
> > > > > -      * useful information.
> > > > > -      */
> > > > >       struct file *file = lo->lo_backing_file;
> > > > > -     int mode = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
> > > > > +     struct request_queue *q = lo->lo_queue;
> > > > >       int ret;
> > > > >
> > > > > -     if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > > > +     if (!blk_queue_discard(q)) {
> > > > >               ret = -EOPNOTSUPP;
> > > > >               goto out;
> > > > >       }
> > > > > @@ -603,8 +598,13 @@ static int do_req_filebacked(struct loop_device *lo, struct request *rq)
> > > > >       case REQ_OP_FLUSH:
> > > > >               return lo_req_flush(lo, rq);
> > > > >       case REQ_OP_DISCARD:
> > > > > +             return lo_discard(lo, rq,
> > > > > +                     FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, pos);
> > > > > +
> > > > >       case REQ_OP_WRITE_ZEROES:
> > > > > -             return lo_discard(lo, rq, pos);
> > > > > +             return lo_discard(lo, rq,
> > > > > +                     FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE, pos);
> > > > > +
> > > > >       case REQ_OP_WRITE:
> > > > >               if (lo->transfer)
> > > > >                       return lo_write_transfer(lo, rq, pos);
> > > > > @@ -859,6 +859,25 @@ static void loop_config_discard(struct loop_device *lo)
> > > > >       struct file *file = lo->lo_backing_file;
> > > > >       struct inode *inode = file->f_mapping->host;
> > > > >       struct request_queue *q = lo->lo_queue;
> > > > > +     struct request_queue *backingq;
> > > > > +
> > > > > +     /*
> > > > > +      * If the backing device is a block device, mirror its discard
> > > > > +      * capabilities.
> > > > > +      */
> > > > > +     if (S_ISBLK(inode->i_mode)) {
> > > > > +             backingq = bdev_get_queue(inode->i_bdev);
> > > > > +             blk_queue_max_discard_sectors(q,
> > > > > +                     backingq->limits.max_discard_sectors);
> > > > > +
> > > > > +             blk_queue_max_write_zeroes_sectors(q,
> > > > > +                     backingq->limits.max_write_zeroes_sectors);
> > > > > +
> > > > > +             q->limits.discard_granularity =
> > > > > +                     backingq->limits.discard_granularity;
> > > > > +
> > > > > +             q->limits.discard_alignment =
> > > > > +                     backingq->limits.discard_alignment;
> > > >
> > > > I think it isn't necessary to mirror backing queue's discard/write_zeros
> > > > capabilities, given either fs of the underlying queue can deal with well.
> > > >
> > > > >
> > > > >       /*
> > > > >        * We use punch hole to reclaim the free space used by the
> > > > > @@ -866,22 +885,24 @@ static void loop_config_discard(struct loop_device *lo)
> > > > >        * encryption is enabled, because it may give an attacker
> > > > >        * useful information.
> > > > >        */
> > > > > -     if ((!file->f_op->fallocate) ||
> > > > > -         lo->lo_encrypt_key_size) {
> > > > > +     } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
> > > > >               q->limits.discard_granularity = 0;
> > > > >               q->limits.discard_alignment = 0;
> > > > >               blk_queue_max_discard_sectors(q, 0);
> > > > >               blk_queue_max_write_zeroes_sectors(q, 0);
> > > > > -             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > > > -             return;
> > > > > -     }
> > > > >
> > > > > -     q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > > -     q->limits.discard_alignment = 0;
> > > > > +     } else {
> > > > > +             q->limits.discard_granularity = inode->i_sb->s_blocksize;
> > > > > +             q->limits.discard_alignment = 0;
> > > > > +
> > > > > +             blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > > +             blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > > +     }
> > > > >
> > > > > -     blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> > > > > -     blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> > > > > -     blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > > +     if (q->limits.max_discard_sectors || q->limits.max_write_zeroes_sectors)
> > > > > +             blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> > > > > +     else
> > > > > +             blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> > > > >  }
> > > >
> > > > Looks it should work just by mirroring backing queue's discard
> > > > capability to loop queue in case that the loop is backed by
> > > > block device, doesn't it? Meantime the unified discard limits &
> > > > write_zeros limits can be kept.
> > >
> > > I tested this out, and you're right that I could just flip the
> > > QUEUE_FLAG_DISCARD based on whether its a block device, and leave
> >
> > What I meant actually is to do the following discard config:
> >
> >         bool discard;
> >         if (S_ISBLK(inode->i_mode)) {
> >                 struct request_queue *backingq = bdev_get_queue(inode->i_bdev);
> >                 discard = blk_queue_discard(backingq);
> >         } else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size)
> >                 discard = false;
> >         else
> >                 discard = true;
> >
> >         if (discard) {
> >                 blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
> >                 blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
> >                 blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
> >         } else {
> >                 blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
> >         }
> 
> Ah, I see. But I think it's useful to reflect max_discard_sectors,
> max_write_zeroes_sectors, discard_granularity, and discard_alignment
> from the block device to the loop device. With the exception of
> discard_alignment, these parameters are visible via sysfs, so usermode
> can actually use these to make more intelligent use of fallocate.

Could you share us what the intelligent use of fallocate is?

The block layer code of blk_bio_discard_split() can deal with all the
magic limits.

The unified discard limits is simpler from implement view of loop, or
from userspace view.

> Without this part of it, I still see issues with GNU cp.

Could you investigate the cause of the GNU cp issue?

Thanks,
Ming

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-05 19:35         ` Evan Green
  2018-12-06  0:22           ` Ming Lei
@ 2018-12-06  3:15           ` Martin K. Petersen
  2018-12-10 17:31             ` Evan Green
  1 sibling, 1 reply; 23+ messages in thread
From: Martin K. Petersen @ 2018-12-06  3:15 UTC (permalink / raw)
  To: Evan Green
  Cc: ming.lei, axboe, Gwendal Grignou, asavery, linux-block, linux-kernel


Evan,

> Ah, I see. But I think it's useful to reflect max_discard_sectors,
> max_write_zeroes_sectors, discard_granularity, and discard_alignment
> from the block device to the loop device. With the exception of
> discard_alignment, these parameters are visible via sysfs,

discard_alignment is visible in sysfs, just not in the queue directory
since alignment can be different on a per-partition basis. So there is
one discard_alignment at the root of each /sys/block/foo directory and
one for each partition subdirectory. This mirrors the alignment_offset
variable which indicates a partitions alignment wrt. the underlying
physical block size.

That said, there are not many devices that actually report a non-zero
discard alignment so it's not as useful as the device manufacturers
(that were looking for an implementation shortcut) envisioned.

> I'm not totally sure about discard_alignment, that seems to be useful
> in cases of merging blk requests. So I can stop mirroring that one if
> it's harmful or not helpful. But unless it's a nak, I'd really love to
> keep most of the mirroring. In which case the bool doesn't do a whole
> lot of simplifying.

I think it's fine to export these. The block device topology was
explicitly designed to be stackable like this.

-- 
Martin K. Petersen	Oracle Linux Engineering

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-06  3:15           ` Martin K. Petersen
@ 2018-12-10 17:31             ` Evan Green
  2018-12-18 23:48               ` Evan Green
  0 siblings, 1 reply; 23+ messages in thread
From: Evan Green @ 2018-12-10 17:31 UTC (permalink / raw)
  To: martin.petersen
  Cc: ming.lei, axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Wed, Dec 5, 2018 at 7:15 PM Martin K. Petersen
<martin.petersen@oracle.com> wrote:
>
>
> Evan,
>
> > Ah, I see. But I think it's useful to reflect max_discard_sectors,
> > max_write_zeroes_sectors, discard_granularity, and discard_alignment
> > from the block device to the loop device. With the exception of
> > discard_alignment, these parameters are visible via sysfs,
>
> discard_alignment is visible in sysfs, just not in the queue directory
> since alignment can be different on a per-partition basis. So there is
> one discard_alignment at the root of each /sys/block/foo directory and
> one for each partition subdirectory. This mirrors the alignment_offset
> variable which indicates a partitions alignment wrt. the underlying
> physical block size.
>
> That said, there are not many devices that actually report a non-zero
> discard alignment so it's not as useful as the device manufacturers
> (that were looking for an implementation shortcut) envisioned.

Ah ok, thanks.

>
> > I'm not totally sure about discard_alignment, that seems to be useful
> > in cases of merging blk requests. So I can stop mirroring that one if
> > it's harmful or not helpful. But unless it's a nak, I'd really love to
> > keep most of the mirroring. In which case the bool doesn't do a whole
> > lot of simplifying.
>
> I think it's fine to export these. The block device topology was
> explicitly designed to be stackable like this.

Yeah, it seemed to fall in pretty naturally, which is why I was hoping
it might not be so controversial. Thanks Martin.
-Evan

^ permalink raw reply	[flat|nested] 23+ messages in thread

* Re: [PATCH 2/2] loop: Better discard support for block devices
  2018-12-10 17:31             ` Evan Green
@ 2018-12-18 23:48               ` Evan Green
  0 siblings, 0 replies; 23+ messages in thread
From: Evan Green @ 2018-12-18 23:48 UTC (permalink / raw)
  To: martin.petersen
  Cc: ming.lei, axboe, Gwendal Grignou, asavery, linux-block, linux-kernel

On Mon, Dec 10, 2018 at 9:31 AM Evan Green <evgreen@chromium.org> wrote:
>
> On Wed, Dec 5, 2018 at 7:15 PM Martin K. Petersen
> <martin.petersen@oracle.com> wrote:
> >
> >
> > Evan,
> >
> > > Ah, I see. But I think it's useful to reflect max_discard_sectors,
> > > max_write_zeroes_sectors, discard_granularity, and discard_alignment
> > > from the block device to the loop device. With the exception of
> > > discard_alignment, these parameters are visible via sysfs,
> >
> > discard_alignment is visible in sysfs, just not in the queue directory
> > since alignment can be different on a per-partition basis. So there is
> > one discard_alignment at the root of each /sys/block/foo directory and
> > one for each partition subdirectory. This mirrors the alignment_offset
> > variable which indicates a partitions alignment wrt. the underlying
> > physical block size.
> >
> > That said, there are not many devices that actually report a non-zero
> > discard alignment so it's not as useful as the device manufacturers
> > (that were looking for an implementation shortcut) envisioned.
>
> Ah ok, thanks.
>
> >
> > > I'm not totally sure about discard_alignment, that seems to be useful
> > > in cases of merging blk requests. So I can stop mirroring that one if
> > > it's harmful or not helpful. But unless it's a nak, I'd really love to
> > > keep most of the mirroring. In which case the bool doesn't do a whole
> > > lot of simplifying.
> >
> > I think it's fine to export these. The block device topology was
> > explicitly designed to be stackable like this.
>
> Yeah, it seemed to fall in pretty naturally, which is why I was hoping
> it might not be so controversial. Thanks Martin.

Any other thoughts about this series?

Ming, I did go back and experiment a little. The pseudocode you had
proposed earlier would work, and solve the error prints I was seeing.
But I still would like to keep the reflection of the block device
properties, since that allows discard to be used on block devices that
do support it.
-Evan

^ permalink raw reply	[flat|nested] 23+ messages in thread

* [PATCH 2/2] loop: Better discard support for block devices
  2020-03-17 15:11 [PATCH 0/2] loop: Better discard support " Andrzej Pietrasiewicz
@ 2020-03-17 15:11 ` Andrzej Pietrasiewicz
  0 siblings, 0 replies; 23+ messages in thread
From: Andrzej Pietrasiewicz @ 2020-03-17 15:11 UTC (permalink / raw)
  To: linux-block; +Cc: Jens Axboe, kernel, andrzej.p, Evan Green

From: Evan Green <evgreen@chromium.org>

If the backing device for a loop device is a block device,
then mirror the "write zeroes" capabilities of the underlying
block device into the loop device. Copy this capability into both
max_write_zeroes_sectors and max_discard_sectors of the loop device.

The reason for this is that REQ_OP_DISCARD on a loop device translates
into blkdev_issue_zeroout(), rather than blkdev_issue_discard(). This
presents a consistent interface for loop devices (that discarded data
is zeroed), regardless of the backing device type of the loop device.
There should be no behavior change for loop devices backed by regular
files.

This change fixes blktest block/003, and removes an extraneous
error print in block/013 when testing on a loop device backed
by a block device that does not support discard.

Signed-off-by: Evan Green <evgreen@chromium.org>
[rebase onto v5.6-rc6]
Signed-off-by: Andrzej Pietrasiewicz <andrzej.p@collabora.com>

---

Changes in v5:
- Don't mirror discard if lo_encrypt_key_size is non-zero (Gwendal)

Changes in v4:
- Mirror blkdev's write_zeroes into loopdev's discard_sectors.

Changes in v3:
- Updated commit description

Changes in v2: None
---
 drivers/block/loop.c | 42 +++++++++++++++++++++++++++++++-----------
 1 file changed, 31 insertions(+), 11 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 87307454f768..8de3b2b098d3 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -427,11 +427,12 @@ static int lo_fallocate(struct loop_device *lo, struct request *rq, loff_t pos,
 	 * information.
 	 */
 	struct file *file = lo->lo_backing_file;
+	struct request_queue *q = lo->lo_queue;
 	int ret;
 
 	mode |= FALLOC_FL_KEEP_SIZE;
 
-	if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
+	if (!blk_queue_discard(q)) {
 		ret = -EOPNOTSUPP;
 		goto out;
 	}
@@ -867,28 +868,47 @@ static void loop_config_discard(struct loop_device *lo)
 	struct inode *inode = file->f_mapping->host;
 	struct request_queue *q = lo->lo_queue;
 
+
+	/*
+	 * If the backing device is a block device, mirror its zeroing
+	 * capability. REQ_OP_DISCARD translates to a zero-out even when backed
+	 * by block devices to keep consistent behavior with file-backed loop
+	 * devices.
+	 */
+	if (S_ISBLK(inode->i_mode) && !lo->lo_encrypt_key_size) {
+		struct request_queue *backingq;
+
+		backingq = bdev_get_queue(inode->i_bdev);
+		blk_queue_max_discard_sectors(q,
+			backingq->limits.max_write_zeroes_sectors);
+
+		blk_queue_max_write_zeroes_sectors(q,
+			backingq->limits.max_write_zeroes_sectors);
+
 	/*
 	 * We use punch hole to reclaim the free space used by the
 	 * image a.k.a. discard. However we do not support discard if
 	 * encryption is enabled, because it may give an attacker
 	 * useful information.
 	 */
-	if ((!file->f_op->fallocate) ||
-	    lo->lo_encrypt_key_size) {
+	} else if ((!file->f_op->fallocate) || lo->lo_encrypt_key_size) {
 		q->limits.discard_granularity = 0;
 		q->limits.discard_alignment = 0;
 		blk_queue_max_discard_sectors(q, 0);
 		blk_queue_max_write_zeroes_sectors(q, 0);
-		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
-		return;
-	}
 
-	q->limits.discard_granularity = inode->i_sb->s_blocksize;
-	q->limits.discard_alignment = 0;
+	} else {
+		q->limits.discard_granularity = inode->i_sb->s_blocksize;
+		q->limits.discard_alignment = 0;
+
+		blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
+		blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
+	}
 
-	blk_queue_max_discard_sectors(q, UINT_MAX >> 9);
-	blk_queue_max_write_zeroes_sectors(q, UINT_MAX >> 9);
-	blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	if (q->limits.max_write_zeroes_sectors)
+		blk_queue_flag_set(QUEUE_FLAG_DISCARD, q);
+	else
+		blk_queue_flag_clear(QUEUE_FLAG_DISCARD, q);
 }
 
 static void loop_unprepare_queue(struct loop_device *lo)
-- 
2.17.1


^ permalink raw reply related	[flat|nested] 23+ messages in thread

end of thread, other threads:[~2020-03-17 15:11 UTC | newest]

Thread overview: 23+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2018-10-30 23:06 [PATCH 0/2] loop: Better discard for block devices Evan Green
2018-10-30 23:06 ` [PATCH 1/2] loop: Report EOPNOTSUPP properly Evan Green
2018-11-28  1:06   ` Ming Lei
2018-10-30 23:06 ` [PATCH 2/2] loop: Better discard support for block devices Evan Green
2018-11-26 18:53   ` Evan Green
2018-11-27  2:55     ` Ming Lei
2018-11-27 23:34       ` Evan Green
2018-11-28  1:28         ` Ming Lei
2018-11-28  1:26   ` Ming Lei
2018-12-04 22:19     ` Evan Green
2018-12-05  1:10       ` Ming Lei
2018-12-05 19:35         ` Evan Green
2018-12-06  0:22           ` Ming Lei
2018-12-06  3:15           ` Martin K. Petersen
2018-12-10 17:31             ` Evan Green
2018-12-18 23:48               ` Evan Green
2018-10-30 23:50 ` [PATCH 0/2] loop: Better discard " Bart Van Assche
2018-11-01 18:15   ` Evan Green
2018-11-01 22:41     ` Gwendal Grignou
2018-11-01 22:44     ` Gwendal Grignou
2018-11-02 16:02       ` Bart Van Assche
2018-11-05 20:35         ` Evan Green
2020-03-17 15:11 [PATCH 0/2] loop: Better discard support " Andrzej Pietrasiewicz
2020-03-17 15:11 ` [PATCH 2/2] " Andrzej Pietrasiewicz

This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).