* [PATCH] [RFC] target/file: add support of direct and async I/O
@ 2018-03-09  0:42 Andrei Vagin
  2018-03-15 14:26 ` Bryant G. Ly
                   ` (11 more replies)
  0 siblings, 12 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-09  0:42 UTC (permalink / raw)
  To: target-devel

Direct I/O allows us to avoid polluting the write-back cache, which is
the expected behavior when a non-buffered mode is used.

Async I/O allows the target to handle several commands concurrently, so
it shows better performance:

Mode: O_DSYNC Async: 1
$ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sda --runtime=20 --numjobs=2
  WRITE: bw=45.9MiB/s (48.1MB/s), 21.9MiB/s-23.0MiB/s (22.0MB/s-25.2MB/s), io=919MiB (963MB), run=20002-20020msec

Mode: O_DSYNC Async: 0
$ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sdb --runtime=20 --numjobs=2
  WRITE: bw=1607KiB/s (1645kB/s), 802KiB/s-805KiB/s (821kB/s-824kB/s), io=31.8MiB (33.4MB), run=20280-20295msec

Known issue:

DIF (PI) emulation doesn't work when a target uses async I/O, because
DIF metadata is saved in a separate file, and synchronizing writes to
two files so that a subsequent read always returns consistent metadata
for a given block is a non-trivial task.

Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
Signed-off-by: Andrei Vagin <avagin@openvz.org>
---
 drivers/target/target_core_file.c | 124 ++++++++++++++++++++++++++++++++++++--
 drivers/target/target_core_file.h |   1 +
 2 files changed, 120 insertions(+), 5 deletions(-)

diff --git a/drivers/target/target_core_file.c b/drivers/target/target_core_file.c
index 9b2c0c773022..e8c07a0f7084 100644
--- a/drivers/target/target_core_file.c
+++ b/drivers/target/target_core_file.c
@@ -250,6 +250,96 @@ static void fd_destroy_device(struct se_device *dev)
 	}
 }
 
+struct target_core_file_cmd {
+	atomic_t	ref;
+	unsigned long	len;
+	long		ret;
+	struct se_cmd	*cmd;
+	struct kiocb	iocb;
+	struct bio_vec	bvec[0];
+};
+
+static void cmd_rw_aio_do_completion(struct target_core_file_cmd *cmd)
+{
+	if (!atomic_dec_and_test(&cmd->ref))
+		return;
+
+	if (cmd->ret != cmd->len)
+		target_complete_cmd(cmd->cmd, SAM_STAT_CHECK_CONDITION);
+	else
+		target_complete_cmd(cmd->cmd, SAM_STAT_GOOD);
+
+	kfree(cmd);
+}
+
+static void cmd_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
+{
+	struct target_core_file_cmd *cmd;
+
+	cmd = container_of(iocb, struct target_core_file_cmd, iocb);
+
+	cmd->ret = ret;
+	cmd_rw_aio_do_completion(cmd);
+}
+
+static int fd_do_aio_rw(struct se_cmd *cmd, struct fd_dev *fd_dev,
+		    u32 block_size, struct scatterlist *sgl,
+		    u32 sgl_nents, u32 data_length, int is_write)
+{
+	struct file *file = fd_dev->fd_file;
+	struct target_core_file_cmd *aio_cmd;
+	struct scatterlist *sg;
+	struct iov_iter iter = {};
+	struct bio_vec *bvec;
+	ssize_t len = 0;
+	loff_t pos = (cmd->t_task_lba * block_size);
+	int ret = 0, i;
+
+	aio_cmd = kmalloc(sizeof(struct target_core_file_cmd) +
+			  sgl_nents * sizeof(struct bio_vec),
+			  GFP_KERNEL | __GFP_ZERO);
+	if (!aio_cmd)
+		return -ENOMEM;
+
+	bvec = aio_cmd->bvec;
+
+	for_each_sg(sgl, sg, sgl_nents, i) {
+		bvec[i].bv_page = sg_page(sg);
+		bvec[i].bv_len = sg->length;
+		bvec[i].bv_offset = sg->offset;
+
+		len += sg->length;
+	}
+
+	iov_iter_bvec(&iter, ITER_BVEC | is_write, bvec, sgl_nents, len);
+
+	atomic_set(&aio_cmd->ref, 2);
+
+	aio_cmd->cmd = cmd;
+	aio_cmd->len = len;
+	aio_cmd->iocb.ki_pos = pos;
+	aio_cmd->iocb.ki_filp = file;
+	aio_cmd->iocb.ki_complete = cmd_rw_aio_complete;
+	aio_cmd->iocb.ki_flags = 0;
+
+	if (!(fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE))
+		aio_cmd->iocb.ki_flags |= IOCB_DIRECT;
+	if (is_write && (cmd->se_cmd_flags & SCF_FUA))
+		aio_cmd->iocb.ki_flags |= IOCB_DSYNC;
+
+	if (is_write)
+		ret = call_write_iter(file, &aio_cmd->iocb, &iter);
+	else
+		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
+
+	cmd_rw_aio_do_completion(aio_cmd);
+
+	if (ret != -EIOCBQUEUED)
+		aio_cmd->iocb.ki_complete(&aio_cmd->iocb, ret, 0);
+
+	return 0;
+}
+
 static int fd_do_rw(struct se_cmd *cmd, struct file *fd,
 		    u32 block_size, struct scatterlist *sgl,
 		    u32 sgl_nents, u32 data_length, int is_write)
@@ -536,6 +626,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	struct file *pfile = fd_dev->fd_prot_file;
 	sense_reason_t rc;
 	int ret = 0;
+	int aio = fd_dev->fbd_flags & FDBD_HAS_ASYNC_IO;
 	/*
 	 * We are currently limited by the number of iovecs (2048) per
 	 * single vfs_[writev,readv] call.
@@ -550,7 +641,11 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	 * Call vectorized fileio functions to map struct scatterlist
 	 * physical memory addresses to struct iovec virtual memory.
 	 */
-	if (data_direction == DMA_FROM_DEVICE) {
+	if (aio) {
+		ret = fd_do_aio_rw(cmd, fd_dev, dev->dev_attrib.block_size,
+			       sgl, sgl_nents, cmd->data_length,
+				!(data_direction == DMA_FROM_DEVICE));
+	} else if (data_direction == DMA_FROM_DEVICE) {
 		if (cmd->prot_type && dev->dev_attrib.pi_prot_type) {
 			ret = fd_do_rw(cmd, pfile, dev->prot_length,
 				       cmd->t_prot_sg, cmd->t_prot_nents,
@@ -616,18 +711,21 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
 	if (ret < 0)
 		return TCM_LOGICAL_UNIT_COMMUNICATION_FAILURE;
 
-	target_complete_cmd(cmd, SAM_STAT_GOOD);
+	if (!aio)
+		target_complete_cmd(cmd, SAM_STAT_GOOD);
 	return 0;
 }
 
 enum {
-	Opt_fd_dev_name, Opt_fd_dev_size, Opt_fd_buffered_io, Opt_err
+	Opt_fd_dev_name, Opt_fd_dev_size, Opt_fd_buffered_io,
+	Opt_fd_async_io, Opt_err
 };
 
 static match_table_t tokens = {
 	{Opt_fd_dev_name, "fd_dev_name=%s"},
 	{Opt_fd_dev_size, "fd_dev_size=%s"},
 	{Opt_fd_buffered_io, "fd_buffered_io=%d"},
+	{Opt_fd_async_io, "fd_async_io=%d"},
 	{Opt_err, NULL}
 };
 
@@ -693,6 +791,21 @@ static ssize_t fd_set_configfs_dev_params(struct se_device *dev,
 
 			fd_dev->fbd_flags |= FDBD_HAS_BUFFERED_IO_WCE;
 			break;
+		case Opt_fd_async_io:
+			ret = match_int(args, &arg);
+			if (ret)
+				goto out;
+			if (arg != 1) {
+				pr_err("bogus fd_async_io=%d value\n", arg);
+				ret = -EINVAL;
+				goto out;
+			}
+
+			pr_debug("FILEIO: Using async I/O"
+				" operations for struct fd_dev\n");
+
+			fd_dev->fbd_flags |= FDBD_HAS_ASYNC_IO;
+			break;
 		default:
 			break;
 		}
@@ -709,10 +822,11 @@ static ssize_t fd_show_configfs_dev_params(struct se_device *dev, char *b)
 	ssize_t bl = 0;
 
 	bl = sprintf(b + bl, "TCM FILEIO ID: %u", fd_dev->fd_dev_id);
-	bl += sprintf(b + bl, "        File: %s  Size: %llu  Mode: %s\n",
+	bl += sprintf(b + bl, "        File: %s  Size: %llu  Mode: %s Async: %d\n",
 		fd_dev->fd_dev_name, fd_dev->fd_dev_size,
 		(fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE) ?
-		"Buffered-WCE" : "O_DSYNC");
+		"Buffered-WCE" : "O_DSYNC",
+		!!(fd_dev->fbd_flags & FDBD_HAS_ASYNC_IO));
 	return bl;
 }
 
diff --git a/drivers/target/target_core_file.h b/drivers/target/target_core_file.h
index 53be5ffd3261..929b1ecd544e 100644
--- a/drivers/target/target_core_file.h
+++ b/drivers/target/target_core_file.h
@@ -22,6 +22,7 @@
 #define FBDF_HAS_PATH		0x01
 #define FBDF_HAS_SIZE		0x02
 #define FDBD_HAS_BUFFERED_IO_WCE 0x04
+#define FDBD_HAS_ASYNC_IO	 0x08
 #define FDBD_FORMAT_UNIT_SIZE	2048
 
 struct fd_dev {
-- 
2.13.6


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
@ 2018-03-15 14:26 ` Bryant G. Ly
  2018-03-15 17:04 ` Andrei Vagin
                   ` (10 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Bryant G. Ly @ 2018-03-15 14:26 UTC (permalink / raw)
  To: target-devel

On 3/8/18 6:42 PM, Andrei Vagin wrote:

> Direct I/O allows us to avoid polluting the write-back cache, which is
> the expected behavior when a non-buffered mode is used.
>
> Async I/O allows the target to handle several commands concurrently, so
> it shows better performance:
>
> Mode: O_DSYNC Async: 1
> $ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sda --runtime=20 --numjobs=2
>   WRITE: bw=45.9MiB/s (48.1MB/s), 21.9MiB/s-23.0MiB/s (22.0MB/s-25.2MB/s), io=919MiB (963MB), run=20002-20020msec
>
> Mode: O_DSYNC Async: 0
> $ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sdb --runtime=20 --numjobs=2
>   WRITE: bw=1607KiB/s (1645kB/s), 802KiB/s-805KiB/s (821kB/s-824kB/s), io=31.8MiB (33.4MB), run=20280-20295msec
>
> Known issue:
>
> DIF (PI) emulation doesn't work when a target uses async I/O, because
> DIF metadata is saved in a separate file, and synchronizing writes to
> two files so that a subsequent read always returns consistent metadata
> for a given block is a non-trivial task.
>
> Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
> Signed-off-by: Andrei Vagin <avagin@openvz.org>
> ---
>  drivers/target/target_core_file.c | 124 ++++++++++++++++++++++++++++++++++++--
>  drivers/target/target_core_file.h |   1 +
>  2 files changed, 120 insertions(+), 5 deletions(-)
>
>
Tested-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>

Patch looks good to me - Thanks for the performance enhancement! 

Btw I have been running I/O tests with HTX against this patch for 24 hrs and have no problems.

-Bryant


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
  2018-03-15 14:26 ` Bryant G. Ly
@ 2018-03-15 17:04 ` Andrei Vagin
  2018-03-16  7:50 ` Christoph Hellwig
                   ` (9 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-15 17:04 UTC (permalink / raw)
  To: target-devel

On Thu, Mar 15, 2018 at 09:26:57AM -0500, Bryant G. Ly wrote:
> On 3/8/18 6:42 PM, Andrei Vagin wrote:
> 
> > Direct I/O allows us to avoid polluting the write-back cache, which is
> > the expected behavior when a non-buffered mode is used.
> >
> > Async I/O allows the target to handle several commands concurrently, so
> > it shows better performance:
> >
> > Mode: O_DSYNC Async: 1
> > $ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sda --runtime=20 --numjobs=2
> >   WRITE: bw=45.9MiB/s (48.1MB/s), 21.9MiB/s-23.0MiB/s (22.0MB/s-25.2MB/s), io=919MiB (963MB), run=20002-20020msec
> >
> > Mode: O_DSYNC Async: 0
> > $ ./fio --bs=4K --direct=1 --rw=randwrite --ioengine=libaio --iodepth=64 --name=/dev/sdb --runtime=20 --numjobs=2
> >   WRITE: bw=1607KiB/s (1645kB/s), 802KiB/s-805KiB/s (821kB/s-824kB/s), io=31.8MiB (33.4MB), run=20280-20295msec
> >
> > Known issue:
> >
> > DIF (PI) emulation doesn't work when a target uses async I/O, because
> > DIF metadata is saved in a separate file, and synchronizing writes to
> > two files so that a subsequent read always returns consistent metadata
> > for a given block is a non-trivial task.
> >
> > Cc: "Nicholas A. Bellinger" <nab@linux-iscsi.org>
> > Signed-off-by: Andrei Vagin <avagin@openvz.org>
> > ---
> >  drivers/target/target_core_file.c | 124 ++++++++++++++++++++++++++++++++++++--
> >  drivers/target/target_core_file.h |   1 +
> >  2 files changed, 120 insertions(+), 5 deletions(-)
> >
> >
> Tested-by: Bryant G. Ly <bryantly@linux.vnet.ibm.com>
> 
> Patch looks good to me - Thanks for the performance enhancement! 
> 
> Btw I have been running I/O tests with HTX against this patch for 24 hrs and have no problems.

Bryant, thank you for the feedback.

> 
> -Bryant
> 

* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
  2018-03-15 14:26 ` Bryant G. Ly
  2018-03-15 17:04 ` Andrei Vagin
@ 2018-03-16  7:50 ` Christoph Hellwig
  2018-03-17  0:13 ` Andrei Vagin
                   ` (8 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-03-16  7:50 UTC (permalink / raw)
  To: target-devel

> DIF (PI) emulation doesn't work when a target uses async I/O, because
> DIF metadata is saved in a separate file, and synchronizing writes to
> two files so that a subsequent read always returns consistent metadata
> for a given block is a non-trivial task.

There literally is no way to do that, even without aio.  The file
DIF implementation should probably be regarded as an early bringup /
prototype tool, not something really usable.

> +static void cmd_rw_aio_do_completion(struct target_core_file_cmd *cmd)
> +{
> +	if (!atomic_dec_and_test(&cmd->ref))
> +		return;

There is no need for reference counting.  If the read_iter/write iter
method returns -EIOCBQUEUED the completion callback needs to complete
the I/O and free the structure, else the method caller.
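
A minimal sketch of the convention being described (hypothetical
fd_submit_aio()/fd_aio_complete() helpers that reuse the struct from the
RFC patch; this is not code from the thread):

static void fd_aio_complete(struct kiocb *iocb, long ret, long ret2)
{
	struct target_core_file_cmd *aio_cmd =
		container_of(iocb, struct target_core_file_cmd, iocb);

	target_complete_cmd(aio_cmd->cmd, ret == aio_cmd->len ?
			    SAM_STAT_GOOD : SAM_STAT_CHECK_CONDITION);
	kfree(aio_cmd);
}

static int fd_submit_aio(struct file *file,
			 struct target_core_file_cmd *aio_cmd,
			 struct iov_iter *iter, bool is_write)
{
	ssize_t ret;

	ret = is_write ? file->f_op->write_iter(&aio_cmd->iocb, iter) :
			 file->f_op->read_iter(&aio_cmd->iocb, iter);

	/*
	 * On -EIOCBQUEUED the completion callback owns aio_cmd: it completes
	 * the se_cmd and frees the structure.  Otherwise the I/O finished
	 * (or failed) synchronously, so complete it right here.
	 */
	if (ret != -EIOCBQUEUED)
		fd_aio_complete(&aio_cmd->iocb, ret, 0);
	return 0;
}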

> +	if (!(fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE))
> +		aio_cmd->iocb.ki_flags |= IOCB_DIRECT;

aio without IOCB_DIRECT doesn't make any sense. But the WCE flag
really has nothing to do with buffered vs direct I/O anyway.

> +	if (is_write)
> +		ret = call_write_iter(file, &aio_cmd->iocb, &iter);
> +	else
> +		ret = call_read_iter(file, &aio_cmd->iocb, &iter);

Please call the methods directly instead of through the wrappers.

> +
>  static int fd_do_rw(struct se_cmd *cmd, struct file *fd,
>  		    u32 block_size, struct scatterlist *sgl,
>  		    u32 sgl_nents, u32 data_length, int is_write)
> @@ -536,6 +626,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
>  	struct file *pfile = fd_dev->fd_prot_file;
>  	sense_reason_t rc;
>  	int ret = 0;
> +	int aio = fd_dev->fbd_flags & FDBD_HAS_ASYNC_IO;
>  	/*
>  	 * We are currently limited by the number of iovecs (2048) per
>  	 * single vfs_[writev,readv] call.
> @@ -550,7 +641,11 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
>  	 * Call vectorized fileio functions to map struct scatterlist
>  	 * physical memory addresses to struct iovec virtual memory.
>  	 */
> -	if (data_direction == DMA_FROM_DEVICE) {
> +	if (aio) {

fd_execute_rw shares basically no code with the aio case.  I'd rather
have a very high level wrapper here:

static sense_reason_t
fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
		enum dma_data_direction data_direction)
{
	if (FD_DEV(cmd->se_dev)->fbd_flags & FDBD_HAS_ASYNC_IO)
		return fd_execute_rw_aio(cmd, sgl, sgl_nents, dma_direction);
	return fd_execute_rw_buffered(cmd, sgl, sgl_nents, dma_direction);
}

and keep the code separate.



* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (2 preceding siblings ...)
  2018-03-16  7:50 ` Christoph Hellwig
@ 2018-03-17  0:13 ` Andrei Vagin
  2018-03-17  8:25 ` Christoph Hellwig
                   ` (7 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-17  0:13 UTC (permalink / raw)
  To: target-devel

Hi Christoph,

Thank you for the review. All comments look reasonable. I will fix them
and send a final version soon.

Please answer one inline question below.

On Fri, Mar 16, 2018 at 12:50:27AM -0700, Christoph Hellwig wrote:
> > DIF (PI) emulation doesn't work when a target uses async I/O, because
> > DIF metadata is saved in a separate file, and synchronizing writes to
> > two files so that a subsequent read always returns consistent metadata
> > for a given block is a non-trivial task.
> 
> There literally is no way to do that, even without aio.  The file
> DIF implementation should probably be regarded as an early bringup /
> prototype tool, not something really usable.
> 
> > +static void cmd_rw_aio_do_completion(struct target_core_file_cmd *cmd)
> > +{
> > +	if (!atomic_dec_and_test(&cmd->ref))
> > +		return;
> 
> There is no need for reference counting.  If the read_iter/write iter
> method returns -EIOCBQUEUED the completion callback needs to complete
> the I/O and free the structure, else the method caller.
> 
> > +	if (!(fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE))
> > +		aio_cmd->iocb.ki_flags |= IOCB_DIRECT;
> 
> aio without IOCB_DIRECT doesn't make any sense. But the WCE flag
> really has nothing to do with buffered vs direct I/O anyway.
> 
> > +	if (is_write)
> > +		ret = call_write_iter(file, &aio_cmd->iocb, &iter);
> > +	else
> > +		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
> 
> Please call the methods directly instead of through the wrappers.

Do you mean to call file->f_op->write_iter(kio, iter) instead of
call_write_iter()? What is wrong with these wrappers?

Thanks,
Andrei

> 
> > +
> >  static int fd_do_rw(struct se_cmd *cmd, struct file *fd,
> >  		    u32 block_size, struct scatterlist *sgl,
> >  		    u32 sgl_nents, u32 data_length, int is_write)
> > @@ -536,6 +626,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> >  	struct file *pfile = fd_dev->fd_prot_file;
> >  	sense_reason_t rc;
> >  	int ret = 0;
> > +	int aio = fd_dev->fbd_flags & FDBD_HAS_ASYNC_IO;
> >  	/*
> >  	 * We are currently limited by the number of iovecs (2048) per
> >  	 * single vfs_[writev,readv] call.
> > @@ -550,7 +641,11 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> >  	 * Call vectorized fileio functions to map struct scatterlist
> >  	 * physical memory addresses to struct iovec virtual memory.
> >  	 */
> > -	if (data_direction == DMA_FROM_DEVICE) {
> > +	if (aio) {
> 
> fd_execute_rw shares basically no code with the aio case.  I'd rather
> have a very high level wrapper here:
> 
> static sense_reason_t
> fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> 		enum dma_data_direction data_direction)
> {
> 	if (FD_DEV(cmd->se_dev)->fbd_flags & FDBD_HAS_ASYNC_IO)
> 		return fd_execute_rw_aio(cmd, sgl, sgl_nents, dma_direction);
> 	return fd_execute_rw_buffered(cmd, sgl, sgl_nents, dma_direction);
> }
> 
> and keep the code separate.
> 

^ permalink raw reply	[flat|nested] 13+ messages in thread

* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (3 preceding siblings ...)
  2018-03-17  0:13 ` Andrei Vagin
@ 2018-03-17  8:25 ` Christoph Hellwig
  2018-03-20  7:54 ` Andrei Vagin
                   ` (6 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-03-17  8:25 UTC (permalink / raw)
  To: target-devel

On Fri, Mar 16, 2018 at 05:13:25PM -0700, Andrei Vagin wrote:
> > Please call the methods directly instead of through the wrappers.
> 
> Do you mean to call file->f_op->write_iter(kio, iter) instead of
> call_write_iter()? What is wrong with these wrappers?

Yes. They are completely pointless and just obfuscate the code.
I plan to remove them eventually.
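
For reference, the wrappers in question are approximately these one-line
indirections from include/linux/fs.h (a sketch of this era's definitions,
not quoted verbatim):

static inline ssize_t call_read_iter(struct file *file, struct kiocb *kio,
				     struct iov_iter *iter)
{
	return file->f_op->read_iter(kio, iter);
}

static inline ssize_t call_write_iter(struct file *file, struct kiocb *kio,
				      struct iov_iter *iter)
{
	return file->f_op->write_iter(kio, iter);
}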


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (4 preceding siblings ...)
  2018-03-17  8:25 ` Christoph Hellwig
@ 2018-03-20  7:54 ` Andrei Vagin
  2018-03-20  8:47 ` Christoph Hellwig
                   ` (5 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-20  7:54 UTC (permalink / raw)
  To: target-devel

On Fri, Mar 16, 2018 at 12:50:27AM -0700, Christoph Hellwig wrote:
> > DIF (PI) emulation doesn't work when a target uses async I/O, because
> > DIF metadata is saved in a separate file, and synchronizing writes to
> > two files so that a subsequent read always returns consistent metadata
> > for a given block is a non-trivial task.
> 
> There literally is no way to do that, even without aio.  The file
> DIF implementation should probably be regarded as an early bringup /
> prototype tool, not something really usable.
> 
> > +static void cmd_rw_aio_do_completion(struct target_core_file_cmd *cmd)
> > +{
> > +	if (!atomic_dec_and_test(&cmd->ref))
> > +		return;
> 
> There is no need for reference counting.  If the read_iter/write iter
> method returns -EIOCBQUEUED the completion callback needs to complete
> the I/O and free the structure, else the method caller.

I was about to send a final version, but I decided to investigate how the
reference counter appeared in drivers/block/loop.c:

commit 92d773324b7edbd36bf0c28c1e0157763aeccc92
Author: Shaohua Li <shli@fb.com>
Date:   Fri Sep 1 11:15:17 2017 -0700

    block/loop: fix use after free
    
    lo_rw_aio->call_read_iter->
    1       aops->direct_IO
    2       iov_iter_revert
    lo_rw_aio_complete could happen between 1 and 2, the bio and bvec could
    be freed before 2, which accesses bvec.
    
    Signed-off-by: Shaohua Li <shli@fb.com>
    Signed-off-by: Jens Axboe <axboe@kernel.dk>

This commit looks reasonable, doesn't it? In our case, bvec-s are
freed from the callback too.
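
For context, this is the shape of the pattern that commit introduced (a
condensed sketch with a hypothetical free_cmd_and_complete() helper, not
the actual loop.c code):

	/* One reference for the submit path, one for the completion path. */
	atomic_set(&cmd->ref, 2);

	ret = call_read_iter(file, &cmd->iocb, &iter);

	/*
	 * Drop the submit-path reference only after ->read_iter() has
	 * returned, so a completion that fires inside the call (e.g. before
	 * iov_iter_revert()) cannot free the bvec under us.
	 */
	if (atomic_dec_and_test(&cmd->ref))
		free_cmd_and_complete(cmd);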

> 
> > +	if (!(fd_dev->fbd_flags & FDBD_HAS_BUFFERED_IO_WCE))
> > +		aio_cmd->iocb.ki_flags |= IOCB_DIRECT;
> 
> > aio without IOCB_DIRECT doesn't make any sense. But the WCE flag
> > really has nothing to do with buffered vs direct I/O anyway.
> 
> > +	if (is_write)
> > +		ret = call_write_iter(file, &aio_cmd->iocb, &iter);
> > +	else
> > +		ret = call_read_iter(file, &aio_cmd->iocb, &iter);
> 
> Please call the methods directly instead of through the wrappers.
> 
> > +
> >  static int fd_do_rw(struct se_cmd *cmd, struct file *fd,
> >  		    u32 block_size, struct scatterlist *sgl,
> >  		    u32 sgl_nents, u32 data_length, int is_write)
> > @@ -536,6 +626,7 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> >  	struct file *pfile = fd_dev->fd_prot_file;
> >  	sense_reason_t rc;
> >  	int ret = 0;
> > +	int aio = fd_dev->fbd_flags & FDBD_HAS_ASYNC_IO;
> >  	/*
> >  	 * We are currently limited by the number of iovecs (2048) per
> >  	 * single vfs_[writev,readv] call.
> > @@ -550,7 +641,11 @@ fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> >  	 * Call vectorized fileio functions to map struct scatterlist
> >  	 * physical memory addresses to struct iovec virtual memory.
> >  	 */
> > -	if (data_direction == DMA_FROM_DEVICE) {
> > +	if (aio) {
> 
> fd_execute_rw shares basically no code with the aio case.  I'd rather
> have a very high level wrapper here:
> 
> static sense_reason_t
> fd_execute_rw(struct se_cmd *cmd, struct scatterlist *sgl, u32 sgl_nents,
> 		enum dma_data_direction data_direction)
> {
> 	if (FD_DEV(cmd->se_dev)->fbd_flags & FDBD_HAS_ASYNC_IO)
> 		return fd_execute_rw_aio(cmd, sgl, sgl_nents, dma_direction);
> 	return fd_execute_rw_buffered(cmd, sgl, sgl_nents, dma_direction);
> }
> 
> and keep the code separate.
> 


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (5 preceding siblings ...)
  2018-03-20  7:54 ` Andrei Vagin
@ 2018-03-20  8:47 ` Christoph Hellwig
  2018-03-20 20:41 ` Andrei Vagin
                   ` (4 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-03-20  8:47 UTC (permalink / raw)
  To: target-devel

On Tue, Mar 20, 2018 at 12:54:58AM -0700, Andrei Vagin wrote:
> commit 92d773324b7edbd36bf0c28c1e0157763aeccc92
> Author: Shaohua Li <shli@fb.com>
> Date:   Fri Sep 1 11:15:17 2017 -0700
> 
>     block/loop: fix use after free
>     
>     lo_rw_aio->call_read_iter->
>     1       aops->direct_IO
>     2       iov_iter_revert
>     lo_rw_aio_complete could happen between 1 and 2, the bio and bvec could
>     be freed before 2, which accesses bvec.
>     
>     Signed-off-by: Shaohua Li <shli@fb.com>
>     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> 
> This commit looks reasonable, doesn't it? In our case, bvec-s are
> freed from the callback too.

It looks completely bogus, but it papers over a valid issue. The
iovec/bvec passed to ->read_iter/->write_iter is caller-allocated
and freed, so we should always free it after the calls.  The actual
I/O completion is separate from that.  Below is the real fix for the
loop driver:

---
From 101f857c5b67492cfd75212d104699813f1bd345 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 20 Mar 2018 09:35:55 +0100
Subject: loop: remove loop_cmd refcounting

Instead of introducing an unneeded refcount free the bvec after calling
into ->read_iter and ->write_iter, which follows the the normal read/write
path conventions.

Fixes: 92d77332 ("block/loop: fix use after free")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 14 ++------------
 drivers/block/loop.h |  1 -
 2 files changed, 2 insertions(+), 13 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ee62d2d517bf..2578a39640b8 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -463,15 +463,6 @@ static void lo_complete_rq(struct request *rq)
 	blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
 }
 
-static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
-{
-	if (!atomic_dec_and_test(&cmd->ref))
-		return;
-	kfree(cmd->bvec);
-	cmd->bvec = NULL;
-	blk_mq_complete_request(cmd->rq);
-}
-
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
@@ -479,7 +470,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 	if (cmd->css)
 		css_put(cmd->css);
 	cmd->ret = ret;
-	lo_rw_aio_do_completion(cmd);
+	blk_mq_complete_request(cmd->rq);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -527,7 +518,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 		segments = bio_segments(bio);
 	}
-	atomic_set(&cmd->ref, 2);
 
 	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
 		      segments, blk_rq_bytes(rq));
@@ -545,7 +535,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	else
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
-	lo_rw_aio_do_completion(cmd);
+	kfree(cmd->bvec);
 	kthread_associate_blkcg(NULL);
 
 	if (ret != -EIOCBQUEUED)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 0f45416e4fcf..ba65848c7c33 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -68,7 +68,6 @@ struct loop_cmd {
 	struct kthread_work work;
 	struct request *rq;
 	bool use_aio; /* use AIO interface to handle I/O */
-	atomic_t ref; /* only for aio */
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-- 
2.14.2



* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (6 preceding siblings ...)
  2018-03-20  8:47 ` Christoph Hellwig
@ 2018-03-20 20:41 ` Andrei Vagin
  2018-03-21  7:23 ` Christoph Hellwig
                   ` (3 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-20 20:41 UTC (permalink / raw)
  To: target-devel

On Tue, Mar 20, 2018 at 01:47:01AM -0700, Christoph Hellwig wrote:
> On Tue, Mar 20, 2018 at 12:54:58AM -0700, Andrei Vagin wrote:
> > commit 92d773324b7edbd36bf0c28c1e0157763aeccc92
> > Author: Shaohua Li <shli@fb.com>
> > Date:   Fri Sep 1 11:15:17 2017 -0700
> > 
> >     block/loop: fix use after free
> >     
> >     lo_rw_aio->call_read_iter->
> >     1       aops->direct_IO
> >     2       iov_iter_revert
> >     lo_rw_aio_complete could happen between 1 and 2, the bio and bvec
> > could
> >     be freed before 2, which accesses bvec.
> >     
> >     Signed-off-by: Shaohua Li <shli@fb.com>
> >     Signed-off-by: Jens Axboe <axboe@kernel.dk>
> > 
> > This commit looks reasonable, doesn't it? In out case, bvec-s are
> > freed from the callback too.
> 
> It looks completely bogus, but it papers over a valid issue. The
> iovec/bvec passed to ->read_iter/->write_iter is caller allocated
> and freed, so we should always free it after the calls.  The actual
> I/O completion is separate from that.  Below is the real fix for the
> loop driver:

I'm afraid this patch doesn't work, because we do not always allocate the
bvec; sometimes we get it from a bio. In that case, we have to call
blk_mq_complete_request after read_iter/write_iter.

[  102.559165] =================================
[  102.560396] BUG: KASAN: use-after-free in iov_iter_revert+0xa7/0x440
[  102.561331] Read of size 4 at addr ffff880111704850 by task loop0/690

[  102.562501] CPU: 1 PID: 690 Comm: loop0 Not tainted 4.16.0-rc6-00030-gcf309ac88b27-dirty #503
[  102.563768] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014
[  102.564999] Call Trace:
[  102.565370]  dump_stack+0xda/0x16f
[  102.565857]  ? _atomic_dec_and_lock+0x101/0x101
[  102.566508]  ? del_timer_sync+0xac/0xd0
[  102.567203]  print_address_description+0x6a/0x270
[  102.567963]  kasan_report+0x258/0x380
[  102.568572]  ? iov_iter_revert+0xa7/0x440
[  102.569243]  iov_iter_revert+0xa7/0x440
[  102.569879]  generic_file_read_iter+0xfd0/0x12f0
[  102.570646]  ? match_held_lock+0x7f/0x340
[  102.571257]  ? __save_stack_trace+0x82/0x100
[  102.571909]  ? filemap_range_has_page+0x1b0/0x1b0
[  102.572562]  ? ret_from_fork+0x3a/0x50
[  102.573102]  ? fuse_simple_request+0x1a7/0x280
[  102.573760]  ? save_stack+0x89/0xb0
[  102.574305]  ? find_held_lock+0x6d/0xd0
[  102.574985]  ? fuse_change_attributes+0x243/0x350
[  102.576072]  ? lock_downgrade+0x320/0x320
[  102.576855]  ? map_id_range_down+0x19a/0x1c0
[  102.577685]  ? do_raw_spin_unlock+0x147/0x220
[  102.578657]  ? do_raw_spin_trylock+0x100/0x100
[  102.579464]  ? do_raw_write_trylock+0x100/0x100
[  102.580304]  ? _raw_spin_unlock+0x24/0x30
[  102.581014]  ? fuse_change_attributes+0x32e/0x350
[  102.581854]  ? fuse_change_attributes_common+0x300/0x300
[  102.582810]  ? fuse_simple_request+0x1a7/0x280
[  102.583600]  ? fuse_simple_request+0x1a7/0x280
[  102.584415]  ? fuse_do_getattr+0x27b/0x6e0
[  102.585149]  ? fuse_dentry_revalidate+0x640/0x640
[  102.585913]  ? find_held_lock+0x6d/0xd0
[  102.586763]  ? lock_release+0x4f0/0x4f0
[  102.587634]  ? match_held_lock+0x7f/0x340
[  102.588854]  lo_rw_aio+0x928/0xd20
[  102.589517]  ? save_trace+0x1e0/0x1e0
[  102.590141]  ? lo_compat_ioctl+0xe0/0xe0
[  102.590888]  ? clear_buddies+0x42/0x210
[  102.591650]  ? debug_check_no_locks_freed+0x1b0/0x1b0
[  102.592611]  ? finish_task_switch+0x181/0x500
[  102.593496]  ? do_raw_spin_unlock+0x147/0x220
[  102.594387]  loop_queue_work+0x10aa/0x1900
[  102.595191]  ? _raw_spin_unlock_irq+0x29/0x40
[  102.596229]  ? _raw_spin_unlock_irq+0x29/0x40
[  102.597865]  ? finish_task_switch+0x181/0x500
[  102.599310]  ? finish_task_switch+0x1d5/0x500
[  102.600118]  ? lo_rw_aio+0xd20/0xd20
[  102.600760]  ? set_load_weight+0x110/0x110
[  102.601465]  ? __switch_to_asm+0x34/0x70
[  102.602165]  ? __switch_to_asm+0x34/0x70
[  102.602867]  ? __switch_to_asm+0x40/0x70
[  102.603571]  ? __switch_to_asm+0x34/0x70
[  102.604279]  ? __switch_to_asm+0x40/0x70
[  102.604982]  ? __switch_to_asm+0x34/0x70
[  102.605856]  ? __switch_to_asm+0x40/0x70
[  102.606698]  ? __switch_to_asm+0x34/0x70
[  102.607473]  ? __switch_to_asm+0x34/0x70
[  102.608206]  ? __switch_to_asm+0x40/0x70
[  102.608980]  ? __switch_to_asm+0x34/0x70
[  102.609877]  ? __switch_to_asm+0x40/0x70
[  102.610632]  ? __switch_to_asm+0x34/0x70
[  102.611368]  ? __switch_to_asm+0x40/0x70
[  102.612074]  ? __schedule+0x50e/0x11b0
[  102.612791]  ? mark_held_locks+0x1c/0x90
[  102.613503]  ? _raw_spin_unlock_irq+0x29/0x40
[  102.614282]  ? match_held_lock+0x7f/0x340
[  102.615001]  ? save_trace+0x1e0/0x1e0
[  102.615661]  ? finish_task_switch+0x181/0x500
[  102.616434]  ? finish_task_switch+0x10d/0x500
[  102.617203]  ? __switch_to_asm+0x34/0x70
[  102.617902]  ? __switch_to_asm+0x34/0x70
[  102.618609]  ? set_load_weight+0x110/0x110
[  102.619350]  ? __switch_to_asm+0x34/0x70
[  102.620136]  ? __switch_to_asm+0x34/0x70
[  102.620859]  ? __switch_to_asm+0x40/0x70
[  102.621584]  ? find_held_lock+0x6d/0xd0
[  102.622328]  ? kthread_worker_fn+0x229/0x520
[  102.623033]  ? lock_downgrade+0x320/0x320
[  102.623683]  ? do_raw_spin_unlock+0x147/0x220
[  102.624443]  ? do_raw_spin_trylock+0x100/0x100
[  102.625550]  ? rcu_note_context_switch+0x4e0/0x4e0
[  102.626597]  ? match_held_lock+0x7f/0x340
[  102.627390]  ? mark_held_locks+0x1c/0x90
[  102.628016]  ? _raw_spin_unlock_irq+0x29/0x40
[  102.628732]  kthread_worker_fn+0x272/0x520
[  102.629429]  ? kthread+0x1e0/0x1e0
[  102.630110]  ? find_held_lock+0x6d/0xd0
[  102.630847]  ? lock_downgrade+0x320/0x320
[  102.631554]  ? __schedule+0x11b0/0x11b0
[  102.632222]  ? do_raw_spin_unlock+0x147/0x220
[  102.633128]  ? do_raw_spin_trylock+0x100/0x100
[  102.634013]  ? __lockdep_init_map+0x98/0x2a0
[  102.634845]  ? __init_waitqueue_head+0xbe/0xf0
[  102.635703]  ? mark_held_locks+0x1c/0x90
[  102.636315]  ? _raw_spin_unlock_irqrestore+0x32/0x60
[  102.637063]  ? loop_get_status64+0x100/0x100
[  102.637695]  kthread+0x1b7/0x1e0
[  102.638243]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[  102.639212]  ret_from_fork+0x3a/0x50

[  102.640354] Allocated by task 691:
[  102.641024]  kasan_kmalloc+0xa0/0xd0
[  102.641718]  kmem_cache_alloc+0xca/0x2d0
[  102.642375]  mempool_alloc+0x117/0x2f0
[  102.642946]  bio_alloc_bioset+0x34d/0x4a0
[  102.643516]  mpage_alloc.isra.8+0x45/0x110
[  102.644082]  do_mpage_readpage+0xa70/0xc60
[  102.644664]  mpage_readpages+0x2d1/0x400
[  102.645209]  __do_page_cache_readahead+0x528/0x740
[  102.645873]  force_page_cache_readahead+0x10b/0x170
[  102.646560]  generic_file_read_iter+0xeb2/0x12f0
[  102.647198]  __vfs_read+0x2a9/0x3c0
[  102.647705]  vfs_read+0xb7/0x1b0
[  102.648161]  SyS_read+0xb6/0x140
[  102.648619]  do_syscall_64+0x193/0x470
[  102.649142]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  102.650063] Freed by task 693:
[  102.650502]  __kasan_slab_free+0x136/0x180
[  102.651074]  kmem_cache_free+0xc6/0x310
[  102.651711]  bio_put+0xdb/0xf0
[  102.652146]  bio_endio+0x23b/0x400
[  102.652672]  blk_update_request+0x12a/0x660
[  102.653252]  blk_mq_end_request+0x26/0x80
[  102.653826]  flush_smp_call_function_queue+0x23d/0x350
[  102.654544]  smp_call_function_single_interrupt+0xdc/0x460

[  102.655537] The buggy address belongs to the object at ffff8801117047c0
                which belongs to the cache bio-0 of size 200
[  102.657169] The buggy address is located 144 bytes inside of
                200-byte region [ffff8801117047c0, ffff880111704888)
[  102.658773] The buggy address belongs to the page:
[  102.659459] page:ffffea000445c100 count:1 mapcount:0 mapping:0000000000000000 index:0x0 compound_mapcount: 0
[  102.660947] flags: 0x2fffc000008100(slab|head)
[  102.661724] raw: 002fffc000008100 0000000000000000 0000000000000000 0000000100150015
[  102.662833] raw: dead000000000100 dead000000000200 ffff880119a58700 0000000000000000
[  102.663908] page dumped because: kasan: bad access detected

[  102.664948] Memory state around the buggy address:
[  102.665626]  ffff880111704700: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  102.666814]  ffff880111704780: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
[  102.668278] >ffff880111704800: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  102.669706]                                                  ^
[  102.670707]  ffff880111704880: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  102.672119]  ffff880111704900: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
[  102.673514] =================================


> 
> ---
> From 101f857c5b67492cfd75212d104699813f1bd345 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Tue, 20 Mar 2018 09:35:55 +0100
> Subject: loop: remove loop_cmd refcounting
> 
> Instead of introducing an unneeded refcount, free the bvec after calling
> into ->read_iter and ->write_iter, which follows the normal read/write
> path conventions.
> 
> Fixes: 92d77332 ("block/loop: fix use after free")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/loop.c | 14 ++------------
>  drivers/block/loop.h |  1 -
>  2 files changed, 2 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index ee62d2d517bf..2578a39640b8 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -463,15 +463,6 @@ static void lo_complete_rq(struct request *rq)
>  	blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
>  }
>  
> -static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
> -{
> -	if (!atomic_dec_and_test(&cmd->ref))
> -		return;
> -	kfree(cmd->bvec);
> -	cmd->bvec = NULL;
> -	blk_mq_complete_request(cmd->rq);
> -}
> -
>  static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
>  {
>  	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> @@ -479,7 +470,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
>  	if (cmd->css)
>  		css_put(cmd->css);
>  	cmd->ret = ret;
> -	lo_rw_aio_do_completion(cmd);
> +	blk_mq_complete_request(cmd->rq);
>  }
>  
>  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> @@ -527,7 +518,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
>  		segments = bio_segments(bio);
>  	}
> -	atomic_set(&cmd->ref, 2);
>  
>  	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
>  		      segments, blk_rq_bytes(rq));
> @@ -545,7 +535,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> -	lo_rw_aio_do_completion(cmd);
> +	kfree(cmd->bvec);
>  	kthread_associate_blkcg(NULL);
>  
>  	if (ret != -EIOCBQUEUED)
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index 0f45416e4fcf..ba65848c7c33 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -68,7 +68,6 @@ struct loop_cmd {
>  	struct kthread_work work;
>  	struct request *rq;
>  	bool use_aio; /* use AIO interface to handle I/O */
> -	atomic_t ref; /* only for aio */
>  	long ret;
>  	struct kiocb iocb;
>  	struct bio_vec *bvec;
> -- 
> 2.14.2
> 


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (7 preceding siblings ...)
  2018-03-20 20:41 ` Andrei Vagin
@ 2018-03-21  7:23 ` Christoph Hellwig
  2018-03-21 18:16 ` Andrei Vagin
                   ` (2 subsequent siblings)
  11 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-03-21  7:23 UTC (permalink / raw)
  To: target-devel

> I'm afraid this patch doesn't work, because we do not always allocate the
> bvec; sometimes we get it from a bio. In that case, we have to call
> blk_mq_complete_request after read_iter/write_iter.

The issue there is that we really need to NULL ->bvec after freeing it,
which normally is an anti-pattern but seems to be needed here (and warrants
a comment).  Updated patch below:


From b8eed7302071023852c00f8296165c2e9bbc51f4 Mon Sep 17 00:00:00 2001
From: Christoph Hellwig <hch@lst.de>
Date: Tue, 20 Mar 2018 09:35:55 +0100
Subject: loop: remove loop_cmd refcounting

Instead of introducing an unneeded refcount, free the bvec after calling
into ->read_iter and ->write_iter, which follows the normal read/write
path conventions.

Fixes: 92d77332 ("block/loop: fix use after free")
Signed-off-by: Christoph Hellwig <hch@lst.de>
---
 drivers/block/loop.c | 19 +++++++------------
 drivers/block/loop.h |  1 -
 2 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index ee62d2d517bf..9cb6078cd4f0 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -463,15 +463,6 @@ static void lo_complete_rq(struct request *rq)
 	blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
 }
 
-static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
-{
-	if (!atomic_dec_and_test(&cmd->ref))
-		return;
-	kfree(cmd->bvec);
-	cmd->bvec = NULL;
-	blk_mq_complete_request(cmd->rq);
-}
-
 static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 {
 	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
@@ -479,7 +470,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
 	if (cmd->css)
 		css_put(cmd->css);
 	cmd->ret = ret;
-	lo_rw_aio_do_completion(cmd);
+	blk_mq_complete_request(cmd->rq);
 }
 
 static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
@@ -527,7 +518,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
 		segments = bio_segments(bio);
 	}
-	atomic_set(&cmd->ref, 2);
 
 	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
 		      segments, blk_rq_bytes(rq));
@@ -545,7 +535,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	else
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
-	lo_rw_aio_do_completion(cmd);
+	/*
+	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
+	 * the multiple bios per request case and we want a clean slate here.
+	 */
+	kfree(cmd->bvec);
+	cmd->bvec = NULL;
 	kthread_associate_blkcg(NULL);
 
 	if (ret != -EIOCBQUEUED)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index 0f45416e4fcf..ba65848c7c33 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -68,7 +68,6 @@ struct loop_cmd {
 	struct kthread_work work;
 	struct request *rq;
 	bool use_aio; /* use AIO interface to handle I/O */
-	atomic_t ref; /* only for aio */
 	long ret;
 	struct kiocb iocb;
 	struct bio_vec *bvec;
-- 
2.14.2



* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (8 preceding siblings ...)
  2018-03-21  7:23 ` Christoph Hellwig
@ 2018-03-21 18:16 ` Andrei Vagin
  2018-03-21 18:47 ` Christoph Hellwig
  2018-03-21 20:25 ` Andrei Vagin
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-21 18:16 UTC (permalink / raw)
  To: target-devel

On Wed, Mar 21, 2018 at 12:23:40AM -0700, Christoph Hellwig wrote:
> > I'm afraid this patch doesn't work, because we do not always allocate the
> > bvec; sometimes we get it from a bio. In that case, we have to call
> > blk_mq_complete_request after read_iter/write_iter.
> 
> The issue there is that we really need to NULL ->bvec after freeing it.
> Which normally is an anti-pattern but seems to be needed here (and warrants
> a comment).  Updated patch below:

At first, I thought so too, but if we look at the kasan output, we can
find that this bvec was allocated in bio_alloc_bioset().

[  102.640354] Allocated by task 691:
[  102.641024]  kasan_kmalloc+0xa0/0xd0
[  102.641718]  kmem_cache_alloc+0xca/0x2d0
[  102.642375]  mempool_alloc+0x117/0x2f0
[  102.642946]  bio_alloc_bioset+0x34d/0x4a0
[  102.643516]  mpage_alloc.isra.8+0x45/0x110
[  102.644082]  do_mpage_readpage+0xa70/0xc60
[  102.644664]  mpage_readpages+0x2d1/0x400
[  102.645209]  __do_page_cache_readahead+0x528/0x740
[  102.645873]  force_page_cache_readahead+0x10b/0x170
[  102.646560]  generic_file_read_iter+0xeb2/0x12f0
[  102.647198]  __vfs_read+0x2a9/0x3c0
[  102.647705]  vfs_read+0xb7/0x1b0
[  102.648161]  SyS_read+0xb6/0x140
[  102.648619]  do_syscall_64+0x193/0x470
[  102.649142]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  102.650063] Freed by task 693:
[  102.650502]  __kasan_slab_free+0x136/0x180
[  102.651074]  kmem_cache_free+0xc6/0x310
[  102.651711]  bio_put+0xdb/0xf0
[  102.652146]  bio_endio+0x23b/0x400
[  102.652672]  blk_update_request+0x12a/0x660
[  102.653252]  blk_mq_end_request+0x26/0x80
[  102.653826]  flush_smp_call_function_queue+0x23d/0x350
[  102.654544]  smp_call_function_single_interrupt+0xdc/0x460

If we look at lo_rw_aio, we can see that the bvec is either allocated or
taken from a bio. I think the problem is with the second case.

static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
                     loff_t pos, bool rw)
{
...
        if (rq->bio != rq->biotail) {
		...
                bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
		...
	} else {
		...
                bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);

BTW: I tried your patch and I still get the same kasan warning.


> 
> 
> From b8eed7302071023852c00f8296165c2e9bbc51f4 Mon Sep 17 00:00:00 2001
> From: Christoph Hellwig <hch@lst.de>
> Date: Tue, 20 Mar 2018 09:35:55 +0100
> Subject: loop: remove loop_cmd refcounting
> 
> Instead of introducing an unneeded refcount, free the bvec after calling
> into ->read_iter and ->write_iter, which follows the normal read/write
> path conventions.
> 
> Fixes: 92d77332 ("block/loop: fix use after free")
> Signed-off-by: Christoph Hellwig <hch@lst.de>
> ---
>  drivers/block/loop.c | 19 +++++++------------
>  drivers/block/loop.h |  1 -
>  2 files changed, 7 insertions(+), 13 deletions(-)
> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index ee62d2d517bf..9cb6078cd4f0 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -463,15 +463,6 @@ static void lo_complete_rq(struct request *rq)
>  	blk_mq_end_request(rq, cmd->ret < 0 ? BLK_STS_IOERR : BLK_STS_OK);
>  }
>  
> -static void lo_rw_aio_do_completion(struct loop_cmd *cmd)
> -{
> -	if (!atomic_dec_and_test(&cmd->ref))
> -		return;
> -	kfree(cmd->bvec);
> -	cmd->bvec = NULL;

You can remove bvec from loop_cmd; this was the only place outside of
lo_rw_aio where it is used.


> -	blk_mq_complete_request(cmd->rq);
> -}
> -
>  static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
>  {
>  	struct loop_cmd *cmd = container_of(iocb, struct loop_cmd, iocb);
> @@ -479,7 +470,7 @@ static void lo_rw_aio_complete(struct kiocb *iocb, long ret, long ret2)
>  	if (cmd->css)
>  		css_put(cmd->css);
>  	cmd->ret = ret;
> -	lo_rw_aio_do_completion(cmd);
> +	blk_mq_complete_request(cmd->rq);
>  }
>  
>  static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> @@ -527,7 +518,6 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
>  		segments = bio_segments(bio);
>  	}
> -	atomic_set(&cmd->ref, 2);
>  
>  	iov_iter_bvec(&iter, ITER_BVEC | rw, bvec,
>  		      segments, blk_rq_bytes(rq));
> @@ -545,7 +535,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> -	lo_rw_aio_do_completion(cmd);
> +	/*
> +	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
> +	 * the multiple bios per request case and we want a clean slate here.
> +	 */
> +	kfree(cmd->bvec);
> +	cmd->bvec = NULL;
>  	kthread_associate_blkcg(NULL);
>  
>  	if (ret != -EIOCBQUEUED)
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index 0f45416e4fcf..ba65848c7c33 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -68,7 +68,6 @@ struct loop_cmd {
>  	struct kthread_work work;
>  	struct request *rq;
>  	bool use_aio; /* use AIO interface to handle I/O */
> -	atomic_t ref; /* only for aio */
>  	long ret;
>  	struct kiocb iocb;
>  	struct bio_vec *bvec;
> -- 
> 2.14.2
> 


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (9 preceding siblings ...)
  2018-03-21 18:16 ` Andrei Vagin
@ 2018-03-21 18:47 ` Christoph Hellwig
  2018-03-21 20:25 ` Andrei Vagin
  11 siblings, 0 replies; 13+ messages in thread
From: Christoph Hellwig @ 2018-03-21 18:47 UTC (permalink / raw)
  To: target-devel

On Wed, Mar 21, 2018 at 11:16:09AM -0700, Andrei Vagin wrote:
> > If we look at lo_rw_aio, we can see that the bvec is either allocated or
> > taken from a bio. I think the problem is with the second case.
> > 
> > static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> >                      loff_t pos, bool rw)
> > {
> > ...
> >         if (rq->bio != rq->biotail) {
> > 		...
> >                 bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
> > 		...
> > 	} else {
> > 		...
> >                 bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);

That is the local 'bvec' variable, not cmd->bvec, which is only
assigned the kmalloc return value.

> BTW: I tried your patch and I still get the same kasan warning.

Strange, as the trace can't really happen with the patch applied.
In fact the ->bvec field isn't even needed, e.g. we should do something
like this on top of the previous patch:

diff --git a/drivers/block/loop.c b/drivers/block/loop.c
index 9cb6078cd4f0..0401fade3d60 100644
--- a/drivers/block/loop.c
+++ b/drivers/block/loop.c
@@ -477,7 +477,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		     loff_t pos, bool rw)
 {
 	struct iov_iter iter;
-	struct bio_vec *bvec;
+	struct bio_vec *bvec, *alloc_bvec = NULL;
 	struct request *rq = cmd->rq;
 	struct bio *bio = rq->bio;
 	struct file *file = lo->lo_backing_file;
@@ -491,10 +491,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 
 		__rq_for_each_bio(bio, rq)
 			segments += bio_segments(bio);
-		bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
-		if (!bvec)
+		alloc_bvec = kmalloc_array(segments,
+				sizeof(struct bio_vec), GFP_NOIO);
+		if (!alloc_bvec)
 			return -EIO;
-		cmd->bvec = bvec;
 
 		/*
 		 * The bios of the request may be started from the middle of
@@ -502,11 +502,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
 		 * API will take care of all details for us.
 		 */
+		bvec = alloc_bvec;
 		rq_for_each_segment(tmp, rq, iter) {
 			*bvec = tmp;
 			bvec++;
 		}
-		bvec = cmd->bvec;
+		bvec = alloc_bvec;
 		offset = 0;
 	} else {
 		/*
@@ -535,12 +536,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
 	else
 		ret = call_read_iter(file, &cmd->iocb, &iter);
 
-	/*
-	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
-	 * the multiple bios per request case and we want a clean slate here.
-	 */
-	kfree(cmd->bvec);
-	cmd->bvec = NULL;
+	kfree(alloc_bvec);
 	kthread_associate_blkcg(NULL);
 
 	if (ret != -EIOCBQUEUED)
diff --git a/drivers/block/loop.h b/drivers/block/loop.h
index ba65848c7c33..b8eef1977f64 100644
--- a/drivers/block/loop.h
+++ b/drivers/block/loop.h
@@ -70,7 +70,6 @@ struct loop_cmd {
 	bool use_aio; /* use AIO interface to handle I/O */
 	long ret;
 	struct kiocb iocb;
-	struct bio_vec *bvec;
 	struct cgroup_subsys_state *css;
 };
 


* Re: [PATCH] [RFC] target/file: add support of direct and async I/O
  2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
                   ` (10 preceding siblings ...)
  2018-03-21 18:47 ` Christoph Hellwig
@ 2018-03-21 20:25 ` Andrei Vagin
  11 siblings, 0 replies; 13+ messages in thread
From: Andrei Vagin @ 2018-03-21 20:25 UTC (permalink / raw)
  To: target-devel

On Wed, Mar 21, 2018 at 11:47:02AM -0700, Christoph Hellwig wrote:
> On Wed, Mar 21, 2018 at 11:16:09AM -0700, Andrei Vagin wrote:
> > If we look at lo_rw_aio, we can see that the bvec is either allocated or
> > taken from a bio. I think the problem is with the second case.
> > 
> > static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
> >                      loff_t pos, bool rw)
> > {
> > ...
> >         if (rq->bio != rq->biotail) {
> > 		...
> >                 bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
> > 		...
> > 	} else {
> > 		...
> >                 bvec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
> 
> That is the local 'bvec' variable, not cmd->bvec, which is only
> assigned the kmalloc return value.
> 
> > BTW: I tried your patch and I still get the same kasan warning.
> 
> Strange, as the trace can't really happen with the patch applied.

In the trace, we see that the bvec is accessed from iov_iter_revert(), but
it has already been released. This means that the bvec was not allocated in
lo_rw_aio(). In the third trace, we can see that this bvec was released
in lo_rw_aio_complete()->bio_endio(). This means that we used the bvec
from the bio.

[  101.436741] Freed by task 693:
[  101.437187]  __kasan_slab_free+0x136/0x180
[  101.437790]  kmem_cache_free+0xc6/0x310
[  101.438360]  bio_put+0xdb/0xf0
[  101.438844]  bio_endio+0x23b/0x400
[  101.439385]  blk_update_request+0x12a/0x660
[  101.440139]  blk_mq_end_request+0x26/0x80
[  101.440724]  __blk_mq_complete_request+0x30a/0x400
[  101.441406]  blk_mq_complete_request+0x110/0x160
[  101.442071]  lo_rw_aio_complete+0xbe/0x240
[  101.442664]  fuse_aio_complete+0x177/0x220


Below you can find traces from the kernel with the last two patches applied.

[  101.335363] =================================
[  101.339372] BUG: KASAN: use-after-free in iov_iter_revert+0xa7/0x440
[  101.340612] Read of size 4 at addr ffff8800b95003d0 by task loop0/697

[  101.342050] CPU: 0 PID: 697 Comm: loop0 Not tainted 4.16.0-rc6-00035-g5b7b2f4a8aba #510
[  101.343481] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1.fc26 04/01/2014
[  101.344762] Call Trace:
[  101.345140]  dump_stack+0xda/0x16f
[  101.345669]  ? _atomic_dec_and_lock+0x101/0x101
[  101.346394]  ? del_timer_sync+0xac/0xd0
[  101.347157]  print_address_description+0x6a/0x270
[  101.348014]  kasan_report+0x258/0x380
[  101.348671]  ? iov_iter_revert+0xa7/0x440
[  101.349511]  iov_iter_revert+0xa7/0x440
[  101.350362]  generic_file_read_iter+0xfd0/0x12f0
[  101.351224]  ? match_held_lock+0x7f/0x340
[  101.351958]  ? __save_stack_trace+0x82/0x100
[  101.352883]  ? filemap_range_has_page+0x1b0/0x1b0
[  101.353788]  ? ret_from_fork+0x3a/0x50
[  101.354716]  ? fuse_simple_request+0x1a7/0x280
[  101.355966]  ? save_stack+0x89/0xb0
[  101.356918]  ? find_held_lock+0x6d/0xd0
[  101.357735]  ? fuse_change_attributes+0x243/0x350
[  101.358676]  ? lock_downgrade+0x320/0x320
[  101.360030]  ? map_id_range_down+0x19a/0x1c0
[  101.361880]  ? do_raw_spin_unlock+0x147/0x220
[  101.363089]  ? do_raw_spin_trylock+0x100/0x100
[  101.363960]  ? do_raw_write_trylock+0x100/0x100
[  101.364816]  ? _raw_spin_unlock+0x24/0x30
[  101.365644]  ? fuse_change_attributes+0x32e/0x350
[  101.366567]  ? fuse_change_attributes_common+0x300/0x300
[  101.367539]  ? fuse_simple_request+0x1a7/0x280
[  101.368339]  ? fuse_simple_request+0x1a7/0x280
[  101.369173]  ? fuse_do_getattr+0x27b/0x6e0
[  101.369950]  ? fuse_dentry_revalidate+0x640/0x640
[  101.370822]  ? find_held_lock+0x6d/0xd0
[  101.371638]  ? lock_release+0x4f0/0x4f0
[  101.372351]  ? match_held_lock+0x7f/0x340
[  101.373130]  lo_rw_aio+0x942/0xd10
[  101.373841]  ? lo_rw_aio_complete+0x240/0x240
[  101.374665]  ? match_held_lock+0x7f/0x340
[  101.375413]  ? clear_buddies+0x42/0x210
[  101.376330]  ? debug_check_no_locks_freed+0x1b0/0x1b0
[  101.377266]  ? finish_task_switch+0x181/0x500
[  101.378099]  ? do_raw_spin_unlock+0x147/0x220
[  101.378872]  loop_queue_work+0x10aa/0x1900
[  101.379635]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.380268]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.380902]  ? finish_task_switch+0x181/0x500
[  101.381534]  ? finish_task_switch+0x10d/0x500
[  101.382165]  ? lo_compat_ioctl+0xe0/0xe0
[  101.382754]  ? set_load_weight+0x110/0x110
[  101.383503]  ? __switch_to_asm+0x34/0x70
[  101.384211]  ? __switch_to_asm+0x34/0x70
[  101.384920]  ? __switch_to_asm+0x40/0x70
[  101.385628]  ? __switch_to_asm+0x34/0x70
[  101.386713]  ? __switch_to_asm+0x40/0x70
[  101.387564]  ? __switch_to_asm+0x34/0x70
[  101.389006]  ? __switch_to_asm+0x40/0x70
[  101.389927]  ? __switch_to_asm+0x34/0x70
[  101.390752]  ? __switch_to_asm+0x40/0x70
[  101.391522]  ? __switch_to_asm+0x34/0x70
[  101.392687]  ? __switch_to_asm+0x40/0x70
[  101.394136]  ? __switch_to_asm+0x34/0x70
[  101.394955]  ? __switch_to_asm+0x40/0x70
[  101.395532]  ? __switch_to_asm+0x34/0x70
[  101.396092]  ? __switch_to_asm+0x40/0x70
[  101.396673]  ? __schedule+0x50e/0x11b0
[  101.397241]  ? mark_held_locks+0x1c/0x90
[  101.397896]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.398839]  ? match_held_lock+0x7f/0x340
[  101.399564]  ? save_trace+0x1e0/0x1e0
[  101.400233]  ? finish_task_switch+0x181/0x500
[  101.401026]  ? finish_task_switch+0x10d/0x500
[  101.401818]  ? __switch_to_asm+0x34/0x70
[  101.402536]  ? __switch_to_asm+0x34/0x70
[  101.403255]  ? set_load_weight+0x110/0x110
[  101.403996]  ? __switch_to_asm+0x34/0x70
[  101.404730]  ? __switch_to_asm+0x34/0x70
[  101.405424]  ? __switch_to_asm+0x40/0x70
[  101.406150]  ? find_held_lock+0x6d/0xd0
[  101.406869]  ? kthread_worker_fn+0x229/0x520
[  101.407635]  ? lock_downgrade+0x320/0x320
[  101.408356]  ? do_raw_spin_unlock+0x147/0x220
[  101.409316]  ? do_raw_spin_trylock+0x100/0x100
[  101.409969]  ? rcu_note_context_switch+0x4e0/0x4e0
[  101.410729]  ? match_held_lock+0x7f/0x340
[  101.411440]  ? mark_held_locks+0x1c/0x90
[  101.412150]  ? _raw_spin_unlock_irq+0x29/0x40
[  101.412959]  kthread_worker_fn+0x272/0x520
[  101.413648]  ? kthread+0x1e0/0x1e0
[  101.414482]  ? find_held_lock+0x6d/0xd0
[  101.415400]  ? lock_downgrade+0x320/0x320
[  101.416319]  ? __schedule+0x11b0/0x11b0
[  101.417130]  ? do_raw_spin_unlock+0x147/0x220
[  101.417847]  ? do_raw_spin_trylock+0x100/0x100
[  101.418560]  ? __lockdep_init_map+0x98/0x2a0
[  101.419446]  ? __init_waitqueue_head+0xbe/0xf0
[  101.420333]  ? mark_held_locks+0x1c/0x90
[  101.421444]  ? _raw_spin_unlock_irqrestore+0x32/0x60
[  101.422726]  ? loop_get_status64+0x100/0x100
[  101.423629]  kthread+0x1b7/0x1e0
[  101.424212]  ? kthread_create_worker_on_cpu+0xc0/0xc0
[  101.425137]  ret_from_fork+0x3a/0x50

[  101.426481] Allocated by task 699:
[  101.427260]  kasan_kmalloc+0xa0/0xd0
[  101.427986]  kmem_cache_alloc+0xca/0x2d0
[  101.428659]  mempool_alloc+0x117/0x2f0
[  101.429330]  bio_alloc_bioset+0x34d/0x4a0
[  101.430147]  mpage_alloc.isra.8+0x45/0x110
[  101.430652]  do_mpage_readpage+0xa70/0xc60
[  101.431158]  mpage_readpages+0x2d1/0x400
[  101.431688]  __do_page_cache_readahead+0x528/0x740
[  101.432381]  force_page_cache_readahead+0x10b/0x170
[  101.433091]  generic_file_read_iter+0xeb2/0x12f0
[  101.433763]  __vfs_read+0x2a9/0x3c0
[  101.434288]  vfs_read+0xb7/0x1b0
[  101.434759]  SyS_read+0xb6/0x140
[  101.435231]  do_syscall_64+0x193/0x470
[  101.435786]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  101.436741] Freed by task 693:
[  101.437187]  __kasan_slab_free+0x136/0x180
[  101.437790]  kmem_cache_free+0xc6/0x310
[  101.438360]  bio_put+0xdb/0xf0
[  101.438844]  bio_endio+0x23b/0x400
[  101.439385]  blk_update_request+0x12a/0x660
[  101.440139]  blk_mq_end_request+0x26/0x80
[  101.440724]  __blk_mq_complete_request+0x30a/0x400
[  101.441406]  blk_mq_complete_request+0x110/0x160
[  101.442071]  lo_rw_aio_complete+0xbe/0x240
[  101.442664]  fuse_aio_complete+0x177/0x220
[  101.443241]  request_end+0x237/0x470
[  101.443814]  fuse_dev_do_write+0x1325/0x1910
[  101.444477]  fuse_dev_write+0x11d/0x180
[  101.445037]  do_iter_readv_writev+0x2a5/0x370
[  101.445663]  do_iter_write+0xd1/0x280
[  101.446187]  vfs_writev+0x145/0x230
[  101.446690]  do_writev+0xc0/0x1b0
[  101.447170]  do_syscall_64+0x193/0x470
[  101.447711]  entry_SYSCALL_64_after_hwframe+0x42/0xb7

[  101.448672] The buggy address belongs to the object at ffff8800b9500340
                which belongs to the cache bio-0 of size 200
[  101.450534] The buggy address is located 144 bytes inside of
                200-byte region [ffff8800b9500340, ffff8800b9500408)
[  101.452229] The buggy address belongs to the page:
[  101.452939] page:ffffea0002e54000 count:1 mapcount:0 mapping:0000000000000000 index:0xffff8800b95004c0 compound_mapcount: 0
[  101.454468] flags: 0x1fffc000008100(slab|head)
[  101.455149] raw: 001fffc000008100 0000000000000000 ffff8800b95004c0 0000000100150006
[  101.456525] raw: ffffea00045105a0 ffff8801198e2580 ffff880119a58700 0000000000000000
[  101.457560] page dumped because: kasan: bad access detected

[  101.458526] Memory state around the buggy address:
[  101.459174]  ffff8800b9500280: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.460319]  ffff8800b9500300: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb
[  101.461412] >ffff8800b9500380: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
[  101.462647]                                                  ^
[  101.463898]  ffff8800b9500400: fb fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.465217]  ffff8800b9500480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc
[  101.466250] =================================
[  101.467443] Disabling lock debugging due to kernel taint


> In fact the ->bvec field isn't even needed, e.g. we should do something
> like this on top of the previous patch:

I said this in a previous mail ;)

> 
> diff --git a/drivers/block/loop.c b/drivers/block/loop.c
> index 9cb6078cd4f0..0401fade3d60 100644
> --- a/drivers/block/loop.c
> +++ b/drivers/block/loop.c
> @@ -477,7 +477,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		     loff_t pos, bool rw)
>  {
>  	struct iov_iter iter;
> -	struct bio_vec *bvec;
> +	struct bio_vec *bvec, *alloc_bvec = NULL;
>  	struct request *rq = cmd->rq;
>  	struct bio *bio = rq->bio;
>  	struct file *file = lo->lo_backing_file;
> @@ -491,10 +491,10 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  
>  		__rq_for_each_bio(bio, rq)
>  			segments += bio_segments(bio);
> -		bvec = kmalloc(sizeof(struct bio_vec) * segments, GFP_NOIO);
> -		if (!bvec)
> +		alloc_bvec = kmalloc_array(segments,
> +				sizeof(struct bio_vec), GFP_NOIO);
> +		if (!alloc_bvec)
>  			return -EIO;
> -		cmd->bvec = bvec;
>  
>  		/*
>  		 * The bios of the request may be started from the middle of
> @@ -502,11 +502,12 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  		 * copy bio->bi_iov_vec to new bvec. The rq_for_each_segment
>  		 * API will take care of all details for us.
>  		 */
> +		bvec = alloc_bvec;
>  		rq_for_each_segment(tmp, rq, iter) {
>  			*bvec = tmp;
>  			bvec++;
>  		}
> -		bvec = cmd->bvec;
> +		bvec = alloc_bvec;
>  		offset = 0;
>  	} else {
>  		/*
> @@ -535,12 +536,7 @@ static int lo_rw_aio(struct loop_device *lo, struct loop_cmd *cmd,
>  	else
>  		ret = call_read_iter(file, &cmd->iocb, &iter);
>  
> -	/*
> -	 * Clear bvec to NULL as lo_rw_aio only allocates and sets it for
> -	 * the multiple bios per request case and we want a clean slate here.
> -	 */
> -	kfree(cmd->bvec);
> -	cmd->bvec = NULL;
> +	kfree(alloc_bvec);
>  	kthread_associate_blkcg(NULL);
>  
>  	if (ret != -EIOCBQUEUED)
> diff --git a/drivers/block/loop.h b/drivers/block/loop.h
> index ba65848c7c33..b8eef1977f64 100644
> --- a/drivers/block/loop.h
> +++ b/drivers/block/loop.h
> @@ -70,7 +70,6 @@ struct loop_cmd {
>  	bool use_aio; /* use AIO interface to handle I/O */
>  	long ret;
>  	struct kiocb iocb;
> -	struct bio_vec *bvec;
>  	struct cgroup_subsys_state *css;
>  };
>  
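
A nice side effect of the kmalloc_array() hunk above is that it also
guards against integer overflow of the segments * sizeof(struct bio_vec)
multiplication. Roughly (a simplified sketch of the slab helper, not the
exact include/linux/slab.h code):

	static inline void *kmalloc_array(size_t n, size_t size, gfp_t flags)
	{
		if (size != 0 && n > SIZE_MAX / size)
			return NULL;	/* n * size would overflow */
		return kmalloc(n * size, flags);
	}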



Thread overview: 13+ messages
2018-03-09  0:42 [PATCH] [RFC] target/file: add support of direct and async I/O Andrei Vagin
2018-03-15 14:26 ` Bryant G. Ly
2018-03-15 17:04 ` Andrei Vagin
2018-03-16  7:50 ` Christoph Hellwig
2018-03-17  0:13 ` Andrei Vagin
2018-03-17  8:25 ` Christoph Hellwig
2018-03-20  7:54 ` Andrei Vagin
2018-03-20  8:47 ` Christoph Hellwig
2018-03-20 20:41 ` Andrei Vagin
2018-03-21  7:23 ` Christoph Hellwig
2018-03-21 18:16 ` Andrei Vagin
2018-03-21 18:47 ` Christoph Hellwig
2018-03-21 20:25 ` Andrei Vagin
