Re: [PATCH] block/file-posix: Optimize for macOS

From: Stefan Hajnoczi <stefanha@redhat.com>
To: Akihiko Odaki <akihiko.odaki@gmail.com>
Cc: Kevin Wolf <kwolf@redhat.com>, Fam Zheng <fam@euphon.net>,
	qemu-block@nongnu.org, Markus Armbruster <armbru@redhat.com>,
	qemu-devel@nongnu.org, Max Reitz <mreitz@redhat.com>,
	John Snow <jsnow@redhat.com>
Subject: Re: [PATCH] block/file-posix: Optimize for macOS
Date: Wed, 24 Feb 2021 16:27:45 +0000	[thread overview]
Message-ID: <YDZ+gVgfvNejLfFQ@stefanha-x1.localdomain> (raw)
In-Reply-To: <20210219085148.90545-1-akihiko.odaki@gmail.com>

[-- Attachment #1: Type: text/plain, Size: 7322 bytes --]

On Fri, Feb 19, 2021 at 05:51:48PM +0900, Akihiko Odaki wrote:
> This commit introduces "punch hole" operation and optimizes transfer
> block size for macOS.
> 
> This commit introduces two additional members,
> discard_granularity and opt_io to BlockSizes type in
> include/block/block.h. Also, the members of the type are now
> optional. Set -1 to discard_granularity and 0 to other members
> for the default values.

I remember BlockSizes was added specifically for s390 DASD devices.
Normally QEMU does not automatically expose details of the underlying
hardware to the guest because doing so breaks live migration compatibility.

If a VM is running on host A, where the probed value happens to be 512
bytes, and is migrated to host B, where the probed value happens to be
4 KB, then:

1. The value reported to the guest by the storage device will change
   unexpectedly and the guest software is probably not prepared for this
   to happen.

2. I/O requests that violate the constraint imposed by host B's value
   will suddenly start failing and the VM may no longer be operational.

I think there was an argument that DASDs are passthrough devices and the
user always knows what they are doing, so we can ignore this problem for
DASDs.

This reasoning does not apply to POSIX files on macOS hosts, so I think
we need to figure out what to do here. The easiest option is not to
merge this patch series. If this feature is important to you, though, we
need to think about how to either extend the block size probing to be
live-migration friendly or change the QEMU command line so that it
supports this use case without unexpected live migration breakage.

CCing Kevin Wolf and Markus Armbruster.

> Signed-off-by: Akihiko Odaki <akihiko.odaki@gmail.com>
> ---
>  block/file-posix.c    | 40 ++++++++++++++++++++++++++++++++++++++--
>  block/nvme.c          |  2 ++
>  hw/block/block.c      | 12 ++++++++++--
>  include/block/block.h |  2 ++
>  4 files changed, 52 insertions(+), 4 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 00cdaaa2d41..defbc04c8e7 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -44,6 +44,7 @@
>  #if defined(__APPLE__) && (__MACH__)
>  #include <paths.h>
>  #include <sys/param.h>
> +#include <sys/mount.h>
>  #include <IOKit/IOKitLib.h>
>  #include <IOKit/IOBSD.h>
>  #include <IOKit/storage/IOMediaBSDClient.h>
> @@ -1274,6 +1275,8 @@ static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
>      if (check_for_dasd(s->fd) < 0) {
>          return -ENOTSUP;
>      }
> +    bsz->opt_io = 0;
> +    bsz->discard_granularity = -1;
>      ret = probe_logical_blocksize(s->fd, &bsz->log);
>      if (ret < 0) {
>          return ret;
> @@ -1568,6 +1571,7 @@ out:
>      }
>  }
>  
> +G_GNUC_UNUSED
>  static int translate_err(int err)
>  {
>      if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP ||
> @@ -1777,16 +1781,27 @@ static int handle_aiocb_discard(void *opaque)
>              }
>          } while (errno == EINTR);
>  
> -        ret = -errno;
> +        ret = translate_err(-errno);
>  #endif
>      } else {
>  #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
>          ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
>                             aiocb->aio_offset, aiocb->aio_nbytes);
> +        ret = translate_err(-errno);
> +#elif defined(__APPLE__) && (__MACH__)
> +        fpunchhole_t fpunchhole;
> +        fpunchhole.fp_flags = 0;
> +        fpunchhole.reserved = 0;
> +        fpunchhole.fp_offset = aiocb->aio_offset;
> +        fpunchhole.fp_length = aiocb->aio_nbytes;
> +        if (fcntl(s->fd, F_PUNCHHOLE, &fpunchhole) == -1) {
> +            ret = errno == ENODEV ? -ENOTSUP : -errno;
> +        } else {
> +            ret = 0;
> +        }
>  #endif
>      }
>  
> -    ret = translate_err(ret);
>      if (ret == -ENOTSUP) {
>          s->has_discard = false;
>      }
> @@ -2095,6 +2110,26 @@ static int raw_co_flush_to_disk(BlockDriverState *bs)
>      return raw_thread_pool_submit(bs, handle_aiocb_flush, &acb);
>  }
>  
> +static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
> +{
> +#if defined(__APPLE__) && (__MACH__)
> +    BDRVRawState *s = bs->opaque;
> +    struct statfs buf;
> +
> +    if (!fstatfs(s->fd, &buf)) {
> +        bsz->phys = 0;
> +        bsz->log = 0;
> +        bsz->opt_io = buf.f_iosize;
> +        bsz->discard_granularity = buf.f_bsize;
> +        return 0;
> +    }
> +
> +    return -errno;
> +#else
> +    return -ENOTSUP;
> +#endif
> +}
> +
>  static void raw_aio_attach_aio_context(BlockDriverState *bs,
>                                         AioContext *new_context)
>  {
> @@ -3229,6 +3264,7 @@ BlockDriver bdrv_file = {
>      .bdrv_refresh_limits = raw_refresh_limits,
>      .bdrv_io_plug = raw_aio_plug,
>      .bdrv_io_unplug = raw_aio_unplug,
> +    .bdrv_probe_blocksizes = raw_probe_blocksizes,
>      .bdrv_attach_aio_context = raw_aio_attach_aio_context,
>  
>      .bdrv_co_truncate = raw_co_truncate,
> diff --git a/block/nvme.c b/block/nvme.c
> index 5a6fbacf4a5..6d84bbdb632 100644
> --- a/block/nvme.c
> +++ b/block/nvme.c
> @@ -983,6 +983,8 @@ static int nvme_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz)
>      uint32_t blocksize = nvme_get_blocksize(bs);
>      bsz->phys = blocksize;
>      bsz->log = blocksize;
> +    bsz->opt_io = 0;
> +    bsz->discard_granularity = -1;
>      return 0;
>  }
>  
> diff --git a/hw/block/block.c b/hw/block/block.c
> index 1e34573da71..c907e5a7722 100644
> --- a/hw/block/block.c
> +++ b/hw/block/block.c
> @@ -70,19 +70,27 @@ bool blkconf_blocksizes(BlockConf *conf, Error **errp)
>      backend_ret = blk_probe_blocksizes(blk, &blocksizes);
>      /* fill in detected values if they are not defined via qemu command line */
>      if (!conf->physical_block_size) {
> -        if (!backend_ret) {
> +        if (!backend_ret && blocksizes.phys) {
>             conf->physical_block_size = blocksizes.phys;
>          } else {
>              conf->physical_block_size = BDRV_SECTOR_SIZE;
>          }
>      }
>      if (!conf->logical_block_size) {
> -        if (!backend_ret) {
> +        if (!backend_ret && blocksizes.log) {
>              conf->logical_block_size = blocksizes.log;
>          } else {
>              conf->logical_block_size = BDRV_SECTOR_SIZE;
>          }
>      }
> +    if (!backend_ret) {
> +        if (!conf->opt_io_size) {
> +            conf->opt_io_size = blocksizes.opt_io;
> +        }
> +        if (conf->discard_granularity == -1) {
> +            conf->discard_granularity = blocksizes.discard_granularity;
> +        }
> +    }
>  
>      if (conf->logical_block_size > conf->physical_block_size) {
>          error_setg(errp,
> diff --git a/include/block/block.h b/include/block/block.h
> index a193545b6ab..31bf3acacad 100644
> --- a/include/block/block.h
> +++ b/include/block/block.h
> @@ -91,6 +91,8 @@ typedef enum {
>  typedef struct BlockSizes {
>      uint32_t phys;
>      uint32_t log;
> +    uint32_t discard_granularity;
> +    uint32_t opt_io;
>  } BlockSizes;
>  
>  typedef struct HDGeometry {
> -- 
> 2.24.3 (Apple Git-128)
> 

[-- Attachment #2: signature.asc --]
[-- Type: application/pgp-signature, Size: 488 bytes --]