From mboxrd@z Thu Jan 1 00:00:00 1970 Received: from eggs.gnu.org ([2001:4830:134:3::10]:59533) by lists.gnu.org with esmtp (Exim 4.71) (envelope-from ) id 1d3Nps-0000Jb-Rb for qemu-devel@nongnu.org; Wed, 26 Apr 2017 10:22:31 -0400 Received: from Debian-exim by eggs.gnu.org with spam-scanned (Exim 4.71) (envelope-from ) id 1d3Npr-0000hF-02 for qemu-devel@nongnu.org; Wed, 26 Apr 2017 10:22:28 -0400 Date: Wed, 26 Apr 2017 16:22:16 +0200 From: Kevin Wolf Message-ID: <20170426142216.GI4538@noname.str.redhat.com> References: <20170426033413.17192-1-famz@redhat.com> <20170426033413.17192-21-famz@redhat.com> MIME-Version: 1.0 Content-Type: text/plain; charset=us-ascii Content-Disposition: inline In-Reply-To: <20170426033413.17192-21-famz@redhat.com> Subject: Re: [Qemu-devel] [PATCH v15 20/21] file-posix: Add image locking to perm operations List-Id: List-Unsubscribe: , List-Archive: List-Post: List-Help: List-Subscribe: , To: Fam Zheng Cc: qemu-devel@nongnu.org, eblake@redhat.com, Max Reitz , qemu-block@nongnu.org Am 26.04.2017 um 05:34 hat Fam Zheng geschrieben: > This extends the permission bits of op blocker API to external using > Linux OFD locks. > > Each permission in @perm and @shared_perm is represented by a locked > byte in the image file. Requesting a permission in @perm is translated > to a shared lock of the corresponding byte; rejecting to share the same > permission is translated to a shared lock of a separate byte. With that, > we use 2x number of bytes of distinct permission types. > > virtlockd in libvirt locks the first byte, so we do locking from a > higher offset. > > Suggested-by: Kevin Wolf > Signed-off-by: Fam Zheng > --- > block/file-posix.c | 267 ++++++++++++++++++++++++++++++++++++++++++++++++++++- > 1 file changed, 264 insertions(+), 3 deletions(-) > > diff --git a/block/file-posix.c b/block/file-posix.c > index 2114720..b92fdc3 100644 > --- a/block/file-posix.c > +++ b/block/file-posix.c > @@ -129,12 +129,28 @@ do { \ > > #define MAX_BLOCKSIZE 4096 > > +/* Posix file locking bytes. Libvirt takes byte 0, we start from higher bytes, > + * leaving a few more bytes for its future use. */ > +#define RAW_LOCK_PERM_BASE 100 > +#define RAW_LOCK_SHARED_BASE 200 > +#ifdef F_OFD_SETLK > +#define RAW_LOCK_SUPPORTED 1 > +#else > +#define RAW_LOCK_SUPPORTED 0 > +#endif > + > typedef struct BDRVRawState { > int fd; > + int lock_fd; > + bool use_lock; > int type; > int open_flags; > size_t buf_align; > > + /* The current permissions. */ > + uint64_t perm; > + uint64_t shared_perm; > + > #ifdef CONFIG_XFS > bool is_xfs:1; > #endif > @@ -440,6 +456,8 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, > } > s->use_linux_aio = (aio == BLOCKDEV_AIO_OPTIONS_NATIVE); > > + s->use_lock = qemu_opt_get_bool(opts, "locking", true); > + > s->open_flags = open_flags; > raw_parse_flags(bdrv_flags, &s->open_flags); > > @@ -455,6 +473,21 @@ static int raw_open_common(BlockDriverState *bs, QDict *options, > } > s->fd = fd; > > + s->lock_fd = -1; > + fd = qemu_open(filename, O_RDONLY); Note that with /dev/fdset there can be cases where we can open a file O_RDWR, but not O_RDONLY. Should we better just use the same flags as for the s->fd? > + if (fd < 0) { > + if (RAW_LOCK_SUPPORTED) { > + ret = -errno; > + error_setg_errno(errp, errno, "Could not open '%s' for locking", > + filename); > + qemu_close(s->fd); > + goto fail; > + } > + } You still open the fd and possibly error out even with s->use_lock == false. Should the code starting from qemu_open() to here be conditional on s->use_lock? > + s->lock_fd = fd; > + s->perm = 0; > + s->shared_perm = BLK_PERM_ALL; > + > #ifdef CONFIG_LINUX_AIO > /* Currently Linux does AIO only for files opened with O_DIRECT */ > if (s->use_linux_aio && !(s->open_flags & O_DIRECT)) { > @@ -542,6 +575,156 @@ static int raw_open(BlockDriverState *bs, QDict *options, int flags, > return raw_open_common(bs, options, flags, 0, errp); > } > > +typedef enum { > + RAW_PL_PREPARE, > + RAW_PL_COMMIT, > + RAW_PL_ABORT, > +} RawPermLockOp; > + > +/* Lock wanted bytes by @perm and ~@shared_perm in the file; if @unlock == This function doesn't take @perm or @shared_perm. This comment is especially confusing because shared_perm_lock_bits == ~shared_perm. We should be explicit here that shared_perm_lock_bits is the mask of all permissions that _cannot_ be shared. > + * true, also unlock the unneeded bytes. */ > +static int raw_apply_lock_bytes(BDRVRawState *s, > + uint64_t perm_lock_bits, > + uint64_t shared_perm_lock_bits, > + bool unlock, Error **errp) > +{ > + int ret; > + int i; > + > + for (i = 0; i < BLK_PERM_MAX; ++i) { > + int off = RAW_LOCK_PERM_BASE + i; > + if (perm_lock_bits & (1ULL << i)) { > + ret = qemu_lock_fd(s->lock_fd, off, 1, false); > + if (ret) { > + error_setg(errp, "Failed to lock byte %d", off); Should bdrv_perm_names() be used in this function, too? Maybe not that important because we never expect this to fail (we don't do any exclusive locks). > + return ret; > + } > + } else if (unlock) { > + ret = qemu_unlock_fd(s->lock_fd, off, 1); > + if (ret) { > + error_setg(errp, "Failed to unlock byte %d", off); > + return ret; > + } > + } > + } > + for (i = 0; i < BLK_PERM_MAX; ++i) { > + int off = RAW_LOCK_SHARED_BASE + i; > + if (shared_perm_lock_bits & (1ULL << i)) { > + ret = qemu_lock_fd(s->lock_fd, off, 1, false); > + if (ret) { > + error_setg(errp, "Failed to lock byte %d", off); > + return ret; > + } > + } else if (unlock) { > + ret = qemu_unlock_fd(s->lock_fd, off, 1); > + if (ret) { > + error_setg(errp, "Failed to unlock byte %d", off); > + return ret; > + } > + } > + } > + return 0; > +} Note: If this function returns an error, we may have acquired some of the locks. The caller probably wants to call it again to reset the permissions to the old mask. > +/* Check "unshared" bytes implied by @perm and ~@shared_perm in the file. */ > +static int raw_check_lock_bytes(BDRVRawState *s, > + uint64_t perm, uint64_t shared_perm, > + Error **errp) > +{ > + int ret; > + int i; > + > + for (i = 0; i < BLK_PERM_MAX; ++i) { > + int off = RAW_LOCK_SHARED_BASE + i; > + uint64_t p = 1ULL << i; > + if (perm & p) { > + ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); > + if (ret) { > + error_setg(errp, > + "Failed to check byte %d for \"%s\" permission", > + off, bdrv_perm_names(p)); bdrv_perm_names() returns a malloced string, which is leaked here. > + error_append_hint(errp, > + "Is another process using the image?\n"); We probably need to tweak the error messages a bit. When I saw this one this morning, I felt that if I didn't know how you were going to implement the locking, it would make zero sense to me: $ ./qemu-img snapshot -l /tmp/test.qcow2 qemu-img: Could not open '/tmp/test.qcow2': Failed to check byte 101 for shared "write" permission Is another process using the image? Something shorter and less technical like 'Failed to get "write" lock' is probably the friendlier message. > + return ret; > + } > + } > + } > + for (i = 0; i < BLK_PERM_MAX; ++i) { > + int off = RAW_LOCK_PERM_BASE + i; > + uint64_t p = 1ULL << i; > + if (!(shared_perm & p)) { > + ret = qemu_lock_fd_test(s->lock_fd, off, 1, true); > + if (ret) { > + error_setg(errp, > + "Failed to check byte %d for shared \"%s\" permission", > + off, bdrv_perm_names(p)); > + error_append_hint(errp, > + "Is another process using the image?\n"); > + return ret; > + } > + } > + } > + return 0; > +} > + > +static int raw_handle_perm_lock(BlockDriverState *bs, > + RawPermLockOp op, > + uint64_t new_perm, uint64_t new_shared, > + Error **errp) > +{ > + BDRVRawState *s = bs->opaque; > + int ret = 0; > + Error *local_err = NULL; > + > + if (!RAW_LOCK_SUPPORTED) { > + return 0; > + } > + > + if (!s->use_lock) { > + return 0; > + } > + > + if (bdrv_get_flags(bs) & BDRV_O_INACTIVE) { > + return 0; > + } I don't like the handling of BDRV_O_INACTIVE here in the file-posix driver. In theory, the users of the node already shouldn't be requesting any problematic permissions while the image is inactive. What are the actual problems that we're solving with this and the .bdrv_invalidate_cache/.bdrv_inactivate callbacks? > + assert(s->lock_fd > 0); > + > + switch (op) { > + case RAW_PL_PREPARE: > + ret = raw_apply_lock_bytes(s, s->perm | new_perm, > + ~s->shared_perm | ~new_shared, > + false, errp); > + if (!ret) { > + ret = raw_check_lock_bytes(s, new_perm, new_shared, errp); > + if (!ret) { > + return 0; > + } > + } > + op = RAW_PL_ABORT; op isn't used after this place, I wouldn't be surprised if some compiler or static analyser complained about it. > + /* fall through to unlock bytes. */ Good, this undoes whatever raw_apply_lock_bytes() already has done. > + case RAW_PL_ABORT: > + raw_apply_lock_bytes(s, s->perm, ~s->shared_perm, true, &local_err); > + if (local_err) { > + /* Theoretically the above call only unlocks bytes and it cannot > + * fail. Something weird happened, report it. > + */ > + error_report_err(local_err); > + } > + break; > + case RAW_PL_COMMIT: > + raw_apply_lock_bytes(s, new_perm, ~new_shared, true, &local_err); > + if (local_err) { > + /* Theoretically the above call only unlocks bytes and it cannot > + * fail. Something weird happened, report it. > + */ > + error_report_err(local_err); > + } > + break; > + } > + return ret; > +} > + > static int raw_reopen_prepare(BDRVReopenState *state, > BlockReopenQueue *queue, Error **errp) > { > @@ -549,6 +732,8 @@ static int raw_reopen_prepare(BDRVReopenState *state, > BDRVRawReopenState *rs; > int ret = 0; > Error *local_err = NULL; > + uint64_t clear_perms = state->flags & BDRV_O_RDWR ? 0 : > + BLK_PERM_WRITE | BLK_PERM_RESIZE | BLK_PERM_WRITE_UNCHANGED; > > assert(state != NULL); > assert(state->bs != NULL); > @@ -613,13 +798,22 @@ static int raw_reopen_prepare(BDRVReopenState *state, > if (rs->fd != -1) { > raw_probe_alignment(state->bs, rs->fd, &local_err); > if (local_err) { > - qemu_close(rs->fd); > - rs->fd = -1; > error_propagate(errp, local_err); > ret = -EINVAL; > + goto fail; > } > } > > + ret = raw_handle_perm_lock(state->bs, RAW_PL_PREPARE, > + s->perm & ~clear_perms, > + s->shared_perm, errp); Is this a workaround for bdrv_can_set_read_only() not checking whether there are still writers attached? Because I think in theory, we should be able to assert() that clear_perms are already clear. > + if (ret) { > + goto fail; > + } > + return 0; > +fail: > + qemu_close(rs->fd); > + rs->fd = -1; > return ret; > } > > @@ -627,6 +821,8 @@ static void raw_reopen_commit(BDRVReopenState *state) > { > BDRVRawReopenState *rs = state->opaque; > BDRVRawState *s = state->bs->opaque; > + uint64_t clear_perms = state->flags & BDRV_O_RDWR ? 0 : > + BLK_PERM_WRITE | BLK_PERM_RESIZE | BLK_PERM_WRITE_UNCHANGED; > > s->open_flags = rs->open_flags; > > @@ -635,12 +831,17 @@ static void raw_reopen_commit(BDRVReopenState *state) > > g_free(state->opaque); > state->opaque = NULL; > + raw_handle_perm_lock(state->bs, RAW_PL_COMMIT, s->perm & ~clear_perms, > + s->shared_perm, NULL); Shouldn't we update s->perm here? Or probably move the update into raw_handle_perm_lock(). Or even more probably, as said above, replace the permission handling in raw_reopen_* by checking permissions in the generic block layer beforehand. > } > > > static void raw_reopen_abort(BDRVReopenState *state) > { > + BDRVRawState *s = state->bs->opaque; > BDRVRawReopenState *rs = state->opaque; > + uint64_t clear_perms = state->flags & BDRV_O_RDWR ? 0 : > + BLK_PERM_WRITE | BLK_PERM_RESIZE | BLK_PERM_WRITE_UNCHANGED; > > /* nothing to do if NULL, we didn't get far enough */ > if (rs == NULL) { > @@ -653,6 +854,8 @@ static void raw_reopen_abort(BDRVReopenState *state) > } > g_free(state->opaque); > state->opaque = NULL; > + raw_handle_perm_lock(state->bs, RAW_PL_ABORT, s->perm & ~clear_perms, > + s->shared_perm, NULL); > } > > static int hdev_get_max_transfer_length(BlockDriverState *bs, int fd) > @@ -1410,6 +1613,10 @@ static void raw_close(BlockDriverState *bs) > qemu_close(s->fd); > s->fd = -1; > } > + if (s->lock_fd >= 0) { > + qemu_close(s->lock_fd); > + s->lock_fd = -1; > + } > } > > static int raw_truncate(BlockDriverState *bs, int64_t offset) > @@ -1947,6 +2154,56 @@ static QemuOptsList raw_create_opts = { > } > }; > > +static int raw_check_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared, > + Error **errp) > +{ > + return raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, errp); > +} > + > +static void raw_set_perm(BlockDriverState *bs, uint64_t perm, uint64_t shared) > +{ > + BDRVRawState *s = bs->opaque; > + raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); > + s->perm = perm; > + s->shared_perm = shared; > +} > + > +static void raw_abort_perm_update(BlockDriverState *bs) > +{ > + BDRVRawState *s = bs->opaque; > + > + raw_handle_perm_lock(bs, RAW_PL_ABORT, s->perm, s->shared_perm, NULL); The parameters are supposed to be the new permissions, which are obviously ignored in the case of RAW_PL_ABORT. Would passing 0 for both be more obvious? > +} > + > +static int raw_inactivate(BlockDriverState *bs) > +{ > + int ret; > + uint64_t perm = 0; > + uint64_t shared = BLK_PERM_ALL; > + > + ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, perm, shared, NULL); > + if (ret) { > + return ret; > + } > + raw_handle_perm_lock(bs, RAW_PL_COMMIT, perm, shared, NULL); > + return 0; > +} > + > + > +static void raw_invalidate_cache(BlockDriverState *bs, Error **errp) > +{ > + BDRVRawState *s = bs->opaque; > + int ret; > + > + assert(!(bdrv_get_flags(bs) & BDRV_O_INACTIVE)); > + ret = raw_handle_perm_lock(bs, RAW_PL_PREPARE, s->perm, s->shared_perm, > + errp); > + if (ret) { > + return; > + } > + raw_handle_perm_lock(bs, RAW_PL_COMMIT, s->perm, s->shared_perm, NULL); > +} Looks okay if we really need BDRV_O_INACTIVE handling in file-posix. Kevin