* [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
@ 2019-01-24 14:17 Kevin Wolf
  2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
                   ` (2 more replies)
  0 siblings, 3 replies; 15+ messages in thread
From: Kevin Wolf @ 2019-01-24 14:17 UTC (permalink / raw)
  To: qemu-block; +Cc: kwolf, vsementsov, mreitz, eblake, qemu-devel

Depending on the exact image layout and the storage backend (tmpfs is
known to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
save us a lot of time e.g. during a mirror block job or qemu-img convert
with a fragmented source image (.bdrv_co_block_status on the protocol
layer can be called for every single cluster in the extreme case).

We may only cache data regions because of possible concurrent writers.
This means that we can later treat a recently punched hole as data, but
this is safe. We can't cache holes because then we might treat recently
written data as holes, which can cause corruption.

Signed-off-by: Kevin Wolf <kwolf@redhat.com>
---
 block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 2 deletions(-)

diff --git a/block/file-posix.c b/block/file-posix.c
index 8aee7a3fb8..7272c7c99d 100644
--- a/block/file-posix.c
+++ b/block/file-posix.c
@@ -168,6 +168,12 @@ typedef struct BDRVRawState {
     bool needs_alignment;
     bool check_cache_dropped;
 
+    struct seek_data_cache {
+        bool        valid;
+        uint64_t    start;
+        uint64_t    end;
+    } seek_data_cache;
+
     PRManager *pr_mgr;
 } BDRVRawState;
 
@@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque)
 {
     RawPosixAIOData *aiocb = opaque;
     BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
+    struct seek_data_cache *sdc;
     int ret;
 
+    /* Invalidate seek_data_cache if it overlaps */
+    sdc = &s->seek_data_cache;
+    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
+                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
+    {
+        sdc->valid = false;
+    }
+
     /* First try to write zeros and unmap at the same time */
 
 #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
@@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
     RawPosixAIOData *aiocb = opaque;
     int ret = -EOPNOTSUPP;
     BDRVRawState *s = aiocb->bs->opaque;
+    struct seek_data_cache *sdc;
 
     if (!s->has_discard) {
         return -ENOTSUP;
     }
 
+    /* Invalidate seek_data_cache if it overlaps */
+    sdc = &s->seek_data_cache;
+    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
+                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
+    {
+        sdc->valid = false;
+    }
+
     if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
 #ifdef BLKDISCARD
         do {
@@ -2424,6 +2448,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
                                             int64_t *map,
                                             BlockDriverState **file)
 {
+    BDRVRawState *s = bs->opaque;
+    struct seek_data_cache *sdc;
     off_t data = 0, hole = 0;
     int ret;
 
@@ -2439,6 +2465,14 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
         return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
     }
 
+    sdc = &s->seek_data_cache;
+    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
+        *pnum = MIN(bytes, sdc->end - offset);
+        *map = offset;
+        *file = bs;
+        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
+    }
+
     ret = find_allocation(bs, offset, &data, &hole);
     if (ret == -ENXIO) {
         /* Trailing hole */
@@ -2451,14 +2485,27 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
     } else if (data == offset) {
         /* On a data extent, compute bytes to the end of the extent,
          * possibly including a partial sector at EOF. */
-        *pnum = MIN(bytes, hole - offset);
+        *pnum = hole - offset;
         ret = BDRV_BLOCK_DATA;
     } else {
         /* On a hole, compute bytes to the beginning of the next extent.  */
         assert(hole == offset);
-        *pnum = MIN(bytes, data - offset);
+        *pnum = data - offset;
         ret = BDRV_BLOCK_ZERO;
     }
+
+    /* Caching allocated ranges is okay even if another process writes to the
+     * same file because we allow declaring things allocated even if there is a
+     * hole. However, we cannot cache holes without risking corruption. */
+    if (ret == BDRV_BLOCK_DATA) {
+        *sdc = (struct seek_data_cache) {
+            .valid  = true,
+            .start  = offset,
+            .end    = offset + *pnum,
+        };
+    }
+
+    *pnum = MIN(*pnum, bytes);
     *map = offset;
     *file = bs;
     return ret | BDRV_BLOCK_OFFSET_VALID;
-- 
2.20.1


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 14:17 [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions Kevin Wolf
@ 2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
  2019-01-24 15:11   ` Kevin Wolf
  2019-01-24 15:56 ` Eric Blake
  2019-01-24 16:18 ` Vladimir Sementsov-Ogievskiy
  2 siblings, 1 reply; 15+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2019-01-24 14:40 UTC (permalink / raw)
  To: Kevin Wolf, qemu-block; +Cc: mreitz, eblake, qemu-devel

24.01.2019 17:17, Kevin Wolf wrote:
> Depending on the exact image layout and the storage backend (tmpfs is
> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> save us a lot of time e.g. during a mirror block job or qemu-img convert
> with a fragmented source image (.bdrv_co_block_status on the protocol
> layer can be called for every single cluster in the extreme case).
> 
> We may only cache data regions because of possible concurrent writers.
> This means that we can later treat a recently punched hole as data, but
> this is safe. We can't cache holes because then we might treat recently
> written data as holes, which can cause corruption.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 49 insertions(+), 2 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 8aee7a3fb8..7272c7c99d 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -168,6 +168,12 @@ typedef struct BDRVRawState {
>       bool needs_alignment;
>       bool check_cache_dropped;
>   
> +    struct seek_data_cache {
> +        bool        valid;
> +        uint64_t    start;
> +        uint64_t    end;
> +    } seek_data_cache;

Should we have some mutex-locking to protect it?

-- 
Best regards,
Vladimir


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
@ 2019-01-24 15:11   ` Kevin Wolf
  2019-01-24 15:22     ` Vladimir Sementsov-Ogievskiy
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2019-01-24 15:11 UTC (permalink / raw)
  To: Vladimir Sementsov-Ogievskiy; +Cc: qemu-block, mreitz, eblake, qemu-devel

Am 24.01.2019 um 15:40 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 24.01.2019 17:17, Kevin Wolf wrote:
> > Depending on the exact image layout and the storage backend (tmpfs is
> > konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> > save us a lot of time e.g. during a mirror block job or qemu-img convert
> > with a fragmented source image (.bdrv_co_block_status on the protocol
> > layer can be called for every single cluster in the extreme case).
> > 
> > We may only cache data regions because of possible concurrent writers.
> > This means that we can later treat a recently punched hole as data, but
> > this is safe. We can't cache holes because then we might treat recently
> > written data as holes, which can cause corruption.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> > ---
> >   block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
> >   1 file changed, 49 insertions(+), 2 deletions(-)
> > 
> > diff --git a/block/file-posix.c b/block/file-posix.c
> > index 8aee7a3fb8..7272c7c99d 100644
> > --- a/block/file-posix.c
> > +++ b/block/file-posix.c
> > @@ -168,6 +168,12 @@ typedef struct BDRVRawState {
> >       bool needs_alignment;
> >       bool check_cache_dropped;
> >   
> > +    struct seek_data_cache {
> > +        bool        valid;
> > +        uint64_t    start;
> > +        uint64_t    end;
> > +    } seek_data_cache;
> 
> Should we have some mutex-locking to protect it?

It is protected by the AioContext lock, like everything else in
BDRVRawState.

Kevin


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 15:11   ` Kevin Wolf
@ 2019-01-24 15:22     ` Vladimir Sementsov-Ogievskiy
  2019-01-24 15:42       ` Kevin Wolf
  0 siblings, 1 reply; 15+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2019-01-24 15:22 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-block, mreitz, eblake, qemu-devel, Paolo Bonzini

24.01.2019 18:11, Kevin Wolf wrote:
> Am 24.01.2019 um 15:40 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 24.01.2019 17:17, Kevin Wolf wrote:
>>> Depending on the exact image layout and the storage backend (tmpfs is
>>> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
>>> save us a lot of time e.g. during a mirror block job or qemu-img convert
>>> with a fragmented source image (.bdrv_co_block_status on the protocol
>>> layer can be called for every single cluster in the extreme case).
>>>
>>> We may only cache data regions because of possible concurrent writers.
>>> This means that we can later treat a recently punched hole as data, but
>>> this is safe. We can't cache holes because then we might treat recently
>>> written data as holes, which can cause corruption.
>>>
>>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
>>> ---
>>>    block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>>>    1 file changed, 49 insertions(+), 2 deletions(-)
>>>
>>> diff --git a/block/file-posix.c b/block/file-posix.c
>>> index 8aee7a3fb8..7272c7c99d 100644
>>> --- a/block/file-posix.c
>>> +++ b/block/file-posix.c
>>> @@ -168,6 +168,12 @@ typedef struct BDRVRawState {
>>>        bool needs_alignment;
>>>        bool check_cache_dropped;
>>>    
>>> +    struct seek_data_cache {
>>> +        bool        valid;
>>> +        uint64_t    start;
>>> +        uint64_t    end;
>>> +    } seek_data_cache;
>>
>> Should we have some mutex-locking to protect it?
> 
> It is protected by the AioContext lock, like everything else in
> BDRVRawState.
> 

Recently Paolo asked me not to add more users of AioContext lock. Unfortunately
I don't understand the whole picture around it.. Doesn't this apply here?
https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg03410.html



-- 
Best regards,
Vladimir


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 15:22     ` Vladimir Sementsov-Ogievskiy
@ 2019-01-24 15:42       ` Kevin Wolf
  2019-01-25 10:10         ` Paolo Bonzini
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2019-01-24 15:42 UTC (permalink / raw)
  To: Vladimir Sementsov-Ogievskiy
  Cc: qemu-block, mreitz, eblake, qemu-devel, Paolo Bonzini

Am 24.01.2019 um 16:22 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 24.01.2019 18:11, Kevin Wolf wrote:
> > Am 24.01.2019 um 15:40 hat Vladimir Sementsov-Ogievskiy geschrieben:
> >> 24.01.2019 17:17, Kevin Wolf wrote:
> >>> Depending on the exact image layout and the storage backend (tmpfs is
> >>> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> >>> save us a lot of time e.g. during a mirror block job or qemu-img convert
> >>> with a fragmented source image (.bdrv_co_block_status on the protocol
> >>> layer can be called for every single cluster in the extreme case).
> >>>
> >>> We may only cache data regions because of possible concurrent writers.
> >>> This means that we can later treat a recently punched hole as data, but
> >>> this is safe. We can't cache holes because then we might treat recently
> >>> written data as holes, which can cause corruption.
> >>>
> >>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> >>> ---
> >>>    block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
> >>>    1 file changed, 49 insertions(+), 2 deletions(-)
> >>>
> >>> diff --git a/block/file-posix.c b/block/file-posix.c
> >>> index 8aee7a3fb8..7272c7c99d 100644
> >>> --- a/block/file-posix.c
> >>> +++ b/block/file-posix.c
> >>> @@ -168,6 +168,12 @@ typedef struct BDRVRawState {
> >>>        bool needs_alignment;
> >>>        bool check_cache_dropped;
> >>>    
> >>> +    struct seek_data_cache {
> >>> +        bool        valid;
> >>> +        uint64_t    start;
> >>> +        uint64_t    end;
> >>> +    } seek_data_cache;
> >>
> >> Should we have some mutex-locking to protect it?
> > 
> > It is protected by the AioContext lock, like everything else in
> > BDRVRawState.
> 
> Recently Paolo asked me not to add more users of AioContext lock. Unfortunately
> I don't understand the whole picture around it.. Doesn't this apply here?
> https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg03410.html

I don't know. Honestly I feel nobody except Paolo knows, because we
don't know his patches yet. But raw doesn't have an s->lock yet, so I
think removing the AioContext lock involves some work on it anyway and
adding this doesn't really change the amount of work.

Kevin


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 14:17 [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions Kevin Wolf
  2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
@ 2019-01-24 15:56 ` Eric Blake
  2019-01-29 10:56   ` Kevin Wolf
  2019-01-24 16:18 ` Vladimir Sementsov-Ogievskiy
  2 siblings, 1 reply; 15+ messages in thread
From: Eric Blake @ 2019-01-24 15:56 UTC (permalink / raw)
  To: Kevin Wolf, qemu-block; +Cc: vsementsov, mreitz, qemu-devel


On 1/24/19 8:17 AM, Kevin Wolf wrote:
> Depending on the exact image layout and the storage backend (tmpfs is
> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> save us a lot of time e.g. during a mirror block job or qemu-img convert
> with a fragmented source image (.bdrv_co_block_status on the protocol
> layer can be called for every single cluster in the extreme case).
> 
> We may only cache data regions because of possible concurrent writers.
> This means that we can later treat a recently punched hole as data, but
> this is safe. We can't cache holes because then we might treat recently
> written data as holes, which can cause corruption.

gluster copies heavily from file-posix's implementation; should it also
copy this cache of known-data?  Should NBD also cache known-data when
NBD_CMD_BLOCK_STATUS is available?

> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>  block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>  1 file changed, 49 insertions(+), 2 deletions(-)
> 

>  
> +    /* Invalidate seek_data_cache if it overlaps */
> +    sdc = &s->seek_data_cache;
> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
> +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
> +    {
> +        sdc->valid = false;
> +    }

Worth a helper function for this repeated action?
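
As a rough sketch of what such a helper could look like (illustration
only, not part of the posted patch; the name raw_invalidate_seek_cache
is invented):

static void raw_invalidate_seek_cache(BDRVRawState *s, uint64_t offset,
                                      uint64_t bytes)
{
    struct seek_data_cache *sdc = &s->seek_data_cache;

    /* Conservative overlap test as in the posted patch; a stricter
     * half-open check is discussed later in the thread. */
    if (sdc->valid && !(sdc->end < offset || sdc->start > offset + bytes)) {
        sdc->valid = false;
    }
}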

Reviewed-by: Eric Blake <eblake@redhat.com>

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org




* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 14:17 [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions Kevin Wolf
  2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
  2019-01-24 15:56 ` Eric Blake
@ 2019-01-24 16:18 ` Vladimir Sementsov-Ogievskiy
  2019-01-24 16:36   ` Kevin Wolf
  2 siblings, 1 reply; 15+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2019-01-24 16:18 UTC (permalink / raw)
  To: Kevin Wolf, qemu-block; +Cc: mreitz, eblake, qemu-devel

24.01.2019 17:17, Kevin Wolf wrote:
> Depending on the exact image layout and the storage backend (tmpfs is
> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> save us a lot of time e.g. during a mirror block job or qemu-img convert
> with a fragmented source image (.bdrv_co_block_status on the protocol
> layer can be called for every single cluster in the extreme case).
> 
> We may only cache data regions because of possible concurrent writers.
> This means that we can later treat a recently punched hole as data, but
> this is safe. We can't cache holes because then we might treat recently
> written data as holes, which can cause corruption.
> 
> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> ---
>   block/file-posix.c | 51 ++++++++++++++++++++++++++++++++++++++++++++--
>   1 file changed, 49 insertions(+), 2 deletions(-)
> 
> diff --git a/block/file-posix.c b/block/file-posix.c
> index 8aee7a3fb8..7272c7c99d 100644
> --- a/block/file-posix.c
> +++ b/block/file-posix.c
> @@ -168,6 +168,12 @@ typedef struct BDRVRawState {
>       bool needs_alignment;
>       bool check_cache_dropped;
>   
> +    struct seek_data_cache {
> +        bool        valid;
> +        uint64_t    start;
> +        uint64_t    end;
> +    } seek_data_cache;
> +
>       PRManager *pr_mgr;
>   } BDRVRawState;
>   
> @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque)
>   {
>       RawPosixAIOData *aiocb = opaque;
>       BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
> +    struct seek_data_cache *sdc;
>       int ret;
>   
> +    /* Invalidate seek_data_cache if it overlaps */
> +    sdc = &s->seek_data_cache;
> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
> +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))

to be precise: <= and >=
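
I.e., treating the cached range as half-open [start, end), the test would
become (sketch only):

    if (sdc->valid && !(sdc->end <= aiocb->aio_offset ||
                        sdc->start >= aiocb->aio_offset + aiocb->aio_nbytes))
    {
        sdc->valid = false;
    }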

> +    {
> +        sdc->valid = false;
> +    }
> +
>       /* First try to write zeros and unmap at the same time */
>   


Why not drop the cache in handle_aiocb_write_zeroes() as well? Otherwise, we'll return DATA
for these regions, which may be unallocated and read as zero, if I'm not mistaken.


>   #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
> @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
>       RawPosixAIOData *aiocb = opaque;
>       int ret = -EOPNOTSUPP;
>       BDRVRawState *s = aiocb->bs->opaque;
> +    struct seek_data_cache *sdc;
>   
>       if (!s->has_discard) {
>           return -ENOTSUP;
>       }
>   
> +    /* Invalidate seek_data_cache if it overlaps */
> +    sdc = &s->seek_data_cache;
> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
> +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))

and <= and >=

and if we add the same to handle_aiocb_write_zeroes(), then it is worth creating a helper
function to invalidate the cache.

> +    {
> +        sdc->valid = false;
> +    }
> +
>       if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
>   #ifdef BLKDISCARD
>           do {
> @@ -2424,6 +2448,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>                                               int64_t *map,
>                                               BlockDriverState **file)
>   {
> +    BDRVRawState *s = bs->opaque;
> +    struct seek_data_cache *sdc;
>       off_t data = 0, hole = 0;
>       int ret;
>   
> @@ -2439,6 +2465,14 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>           return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>       }
>   
> +    sdc = &s->seek_data_cache;
> +    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
> +        *pnum = MIN(bytes, sdc->end - offset);
> +        *map = offset;
> +        *file = bs;
> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> +    }
> +
>       ret = find_allocation(bs, offset, &data, &hole);
>       if (ret == -ENXIO) {
>           /* Trailing hole */
> @@ -2451,14 +2485,27 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>       } else if (data == offset) {
>           /* On a data extent, compute bytes to the end of the extent,
>            * possibly including a partial sector at EOF. */
> -        *pnum = MIN(bytes, hole - offset);
> +        *pnum = hole - offset;

hmm, why? At least you didn't mention it in the commit message...

>           ret = BDRV_BLOCK_DATA;
>       } else {
>           /* On a hole, compute bytes to the beginning of the next extent.  */
>           assert(hole == offset);
> -        *pnum = MIN(bytes, data - offset);
> +        *pnum = data - offset;
>           ret = BDRV_BLOCK_ZERO;
>       }
> +
> +    /* Caching allocated ranges is okay even if another process writes to the
> +     * same file because we allow declaring things allocated even if there is a
> +     * hole. However, we cannot cache holes without risking corruption. */
> +    if (ret == BDRV_BLOCK_DATA) {
> +        *sdc = (struct seek_data_cache) {
> +            .valid  = true,
> +            .start  = offset,
> +            .end    = offset + *pnum,
> +        };
> +    }
> +
> +    *pnum = MIN(*pnum, bytes);
>       *map = offset;
>       *file = bs;
>       return ret | BDRV_BLOCK_OFFSET_VALID;
> 


-- 
Best regards,
Vladimir


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 16:18 ` Vladimir Sementsov-Ogievskiy
@ 2019-01-24 16:36   ` Kevin Wolf
  2019-01-25  9:13     ` Vladimir Sementsov-Ogievskiy
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2019-01-24 16:36 UTC (permalink / raw)
  To: Vladimir Sementsov-Ogievskiy; +Cc: qemu-block, mreitz, eblake, qemu-devel

Am 24.01.2019 um 17:18 hat Vladimir Sementsov-Ogievskiy geschrieben:
> 24.01.2019 17:17, Kevin Wolf wrote:
> > Depending on the exact image layout and the storage backend (tmpfs is
> > konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> > save us a lot of time e.g. during a mirror block job or qemu-img convert
> > with a fragmented source image (.bdrv_co_block_status on the protocol
> > layer can be called for every single cluster in the extreme case).
> > 
> > We may only cache data regions because of possible concurrent writers.
> > This means that we can later treat a recently punched hole as data, but
> > this is safe. We can't cache holes because then we might treat recently
> > written data as holes, which can cause corruption.
> > 
> > Signed-off-by: Kevin Wolf <kwolf@redhat.com>

> > @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque)
> >   {
> >       RawPosixAIOData *aiocb = opaque;
> >       BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
> > +    struct seek_data_cache *sdc;
> >       int ret;
> >   
> > +    /* Invalidate seek_data_cache if it overlaps */
> > +    sdc = &s->seek_data_cache;
> > +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
> > +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
> 
> to be presize: <= and >=

Yes, you're right.

> > +    {
> > +        sdc->valid = false;
> > +    }
> > +
> >       /* First try to write zeros and unmap at the same time */
> >   
> 
> 
> Why not to drop cache on handle_aiocb_write_zeroes()? Otherwise, we'll return DATA
> for these regions which may unallocated read-as-zero, if I'm not mistaken.

handle_aiocb_write_zeroes() is not allowed to unmap things, so we don't
need to invalidate the cache there.

> >   #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
> > @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
> >       RawPosixAIOData *aiocb = opaque;
> >       int ret = -EOPNOTSUPP;
> >       BDRVRawState *s = aiocb->bs->opaque;
> > +    struct seek_data_cache *sdc;
> >   
> >       if (!s->has_discard) {
> >           return -ENOTSUP;
> >       }
> >   
> > +    /* Invalidate seek_data_cache if it overlaps */
> > +    sdc = &s->seek_data_cache;
> > +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
> > +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
> 
> and <= and >=
> 
> and if add same to handle_aiocb_write_zeroes(), then it worth to
> create helper function to invalidate cache.

Ok.

> > +    {
> > +        sdc->valid = false;
> > +    }
> > +
> >       if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
> >   #ifdef BLKDISCARD
> >           do {
> > @@ -2424,6 +2448,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> >                                               int64_t *map,
> >                                               BlockDriverState **file)
> >   {
> > +    BDRVRawState *s = bs->opaque;
> > +    struct seek_data_cache *sdc;
> >       off_t data = 0, hole = 0;
> >       int ret;
> >   
> > @@ -2439,6 +2465,14 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> >           return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> >       }
> >   
> > +    sdc = &s->seek_data_cache;
> > +    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
> > +        *pnum = MIN(bytes, sdc->end - offset);
> > +        *map = offset;
> > +        *file = bs;
> > +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
> > +    }
> > +
> >       ret = find_allocation(bs, offset, &data, &hole);
> >       if (ret == -ENXIO) {
> >           /* Trailing hole */
> > @@ -2451,14 +2485,27 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
> >       } else if (data == offset) {
> >           /* On a data extent, compute bytes to the end of the extent,
> >            * possibly including a partial sector at EOF. */
> > -        *pnum = MIN(bytes, hole - offset);
> > +        *pnum = hole - offset;
> 
> hmm, why? At least you didn't mention it in commit-message..

We want to cache the whole range returned by lseek(), not just whatever
the raw_co_block_status() caller wanted to know.

For the returned value, *pnum is adjusted to MIN(bytes, *pnum) below...

> >           ret = BDRV_BLOCK_DATA;
> >       } else {
> >           /* On a hole, compute bytes to the beginning of the next extent.  */
> >           assert(hole == offset);
> > -        *pnum = MIN(bytes, data - offset);
> > +        *pnum = data - offset;
> >           ret = BDRV_BLOCK_ZERO;
> >       }
> > +
> > +    /* Caching allocated ranges is okay even if another process writes to the
> > +     * same file because we allow declaring things allocated even if there is a
> > +     * hole. However, we cannot cache holes without risking corruption. */
> > +    if (ret == BDRV_BLOCK_DATA) {
> > +        *sdc = (struct seek_data_cache) {
> > +            .valid  = true,
> > +            .start  = offset,
> > +            .end    = offset + *pnum,
> > +        };
> > +    }
> > +
> > +    *pnum = MIN(*pnum, bytes);

...here.

So what we return doesn't change.

> >       *map = offset;
> >       *file = bs;
> >       return ret | BDRV_BLOCK_OFFSET_VALID;

Kevin


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 16:36   ` Kevin Wolf
@ 2019-01-25  9:13     ` Vladimir Sementsov-Ogievskiy
  2019-01-25 13:26       ` Eric Blake
  0 siblings, 1 reply; 15+ messages in thread
From: Vladimir Sementsov-Ogievskiy @ 2019-01-25  9:13 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-block, mreitz, eblake, qemu-devel


24.01.2019 19:36, Kevin Wolf wrote:
> Am 24.01.2019 um 17:18 hat Vladimir Sementsov-Ogievskiy geschrieben:
>> 24.01.2019 17:17, Kevin Wolf wrote:
>>> Depending on the exact image layout and the storage backend (tmpfs is
>>> konwn to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
>>> save us a lot of time e.g. during a mirror block job or qemu-img convert
>>> with a fragmented source image (.bdrv_co_block_status on the protocol
>>> layer can be called for every single cluster in the extreme case).
>>>
>>> We may only cache data regions because of possible concurrent writers.
>>> This means that we can later treat a recently punched hole as data, but
>>> this is safe. We can't cache holes because then we might treat recently
>>> written data as holes, which can cause corruption.
>>>
>>> Signed-off-by: Kevin Wolf <kwolf@redhat.com>
> 
>>> @@ -1555,8 +1561,17 @@ static int handle_aiocb_write_zeroes_unmap(void *opaque)
>>>    {
>>>        RawPosixAIOData *aiocb = opaque;
>>>        BDRVRawState *s G_GNUC_UNUSED = aiocb->bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>        int ret;
>>>    
>>> +    /* Invalidate seek_data_cache if it overlaps */
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
>>
>> to be presize: <= and >=
> 
> Yes, you're right.
> 
>>> +    {
>>> +        sdc->valid = false;
>>> +    }
>>> +
>>>        /* First try to write zeros and unmap at the same time */
>>>    
>>
>>
>> Why not to drop cache on handle_aiocb_write_zeroes()? Otherwise, we'll return DATA
>> for these regions which may unallocated read-as-zero, if I'm not mistaken.
> 
> handle_aiocb_write_zeroes() is not allowed to unmap things, so we don't
> need to invalidate the cache there.

So, you are saying that for fallocated regions we always return just _DATA, without _ZERO?
If so, that's of course bad: it means that convert will have to copy (or at least read
and detect zeroes by hand, if that is enabled) write-zeroed-without-unmap areas.

Let's check (hmm, I had to make qemu-img map usable inside qemu-io, a patch for that is
attached; I also added printf("%s\n", __func__) to handle_aiocb_write_zeroes_unmap and
handle_aiocb_write_zeroes):

Let's test:
]# cat test
./qemu-img create -f raw x 1M

./qemu-io -f raw x <<CMDS
write 0 1M
map
write -z 100K 100K
map
write -z -u 500K 100K
map
CMDS

rm -rf x



before your patch:
]# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0523 sec (19.093 MiB/sec and 19.0927 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0165 sec (5.898 MiB/sec and 60.3974 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
{ "start": 204800, "length": 843776, "depth": 0, "zero": false, "data": true, "offset": 204800}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0001 sec (545.566 MiB/sec and 5586.5922 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, "offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, "offset": 614400}]



after your patch:
# ./test
Formatting 'x', fmt=raw size=1048576
qemu-io> wrote 1048576/1048576 bytes at offset 0
1 MiB, 1 ops; 0.0768 sec (13.019 MiB/sec and 13.0195 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes
wrote 102400/102400 bytes at offset 102400
100 KiB, 1 ops; 0.0166 sec (5.883 MiB/sec and 60.2410 ops/sec)
qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]
qemu-io> handle_aiocb_write_zeroes_unmap
wrote 102400/102400 bytes at offset 512000
100 KiB, 1 ops; 0.0002 sec (469.501 MiB/sec and 4807.6923 ops/sec)
qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
{ "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
{ "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, "offset": 204800},
{ "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 512000},
{ "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, "offset": 614400}]


So, you've changed the behavior of block_status after write_zeroes without UNMAP for the worse.

Hmm, should I prepare a patch for qemu-io? qemu-img map is definitely better.

> 
>>>    #ifdef CONFIG_FALLOCATE_PUNCH_HOLE
>>> @@ -1634,11 +1649,20 @@ static int handle_aiocb_discard(void *opaque)
>>>        RawPosixAIOData *aiocb = opaque;
>>>        int ret = -EOPNOTSUPP;
>>>        BDRVRawState *s = aiocb->bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>    
>>>        if (!s->has_discard) {
>>>            return -ENOTSUP;
>>>        }
>>>    
>>> +    /* Invalidate seek_data_cache if it overlaps */
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && !(sdc->end < aiocb->aio_offset ||
>>> +                        sdc->start > aiocb->aio_offset + aiocb->aio_nbytes))
>>
>> and <= and >=
>>
>> and if add same to handle_aiocb_write_zeroes(), then it worth to
>> create helper function to invalidate cache.
> 
> Ok.
> 
>>> +    {
>>> +        sdc->valid = false;
>>> +    }
>>> +
>>>        if (aiocb->aio_type & QEMU_AIO_BLKDEV) {
>>>    #ifdef BLKDISCARD
>>>            do {
>>> @@ -2424,6 +2448,8 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>>>                                                int64_t *map,
>>>                                                BlockDriverState **file)
>>>    {
>>> +    BDRVRawState *s = bs->opaque;
>>> +    struct seek_data_cache *sdc;
>>>        off_t data = 0, hole = 0;
>>>        int ret;
>>>    
>>> @@ -2439,6 +2465,14 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>>>            return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>>        }
>>>    
>>> +    sdc = &s->seek_data_cache;
>>> +    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
>>> +        *pnum = MIN(bytes, sdc->end - offset);
>>> +        *map = offset;
>>> +        *file = bs;
>>> +        return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID;
>>> +    }
>>> +
>>>        ret = find_allocation(bs, offset, &data, &hole);
>>>        if (ret == -ENXIO) {
>>>            /* Trailing hole */
>>> @@ -2451,14 +2485,27 @@ static int coroutine_fn raw_co_block_status(BlockDriverState *bs,
>>>        } else if (data == offset) {
>>>            /* On a data extent, compute bytes to the end of the extent,
>>>             * possibly including a partial sector at EOF. */
>>> -        *pnum = MIN(bytes, hole - offset);
>>> +        *pnum = hole - offset;
>>
>> hmm, why? At least you didn't mention it in commit-message..
> 
> We want to cache the whole range returned by lseek(), not just whatever
> the raw_co_block_status() caller wanted to know.
> 
> For the returned value, *pnum is adjusted to MIN(bytes, *pnum) below...

Oops, that was a stupid question, sorry :(

> 
>>>            ret = BDRV_BLOCK_DATA;
>>>        } else {
>>>            /* On a hole, compute bytes to the beginning of the next extent.  */
>>>            assert(hole == offset);
>>> -        *pnum = MIN(bytes, data - offset);
>>> +        *pnum = data - offset;
>>>            ret = BDRV_BLOCK_ZERO;
>>>        }
>>> +
>>> +    /* Caching allocated ranges is okay even if another process writes to the
>>> +     * same file because we allow declaring things allocated even if there is a
>>> +     * hole. However, we cannot cache holes without risking corruption. */
>>> +    if (ret == BDRV_BLOCK_DATA) {
>>> +        *sdc = (struct seek_data_cache) {
>>> +            .valid  = true,
>>> +            .start  = offset,
>>> +            .end    = offset + *pnum,
>>> +        };
>>> +    }
>>> +
>>> +    *pnum = MIN(*pnum, bytes);
> 
> ...here.
> 
> So what we return doesn't change.
> 
>>>        *map = offset;
>>>        *file = bs;
>>>        return ret | BDRV_BLOCK_OFFSET_VALID;
> 
> Kevin
> 


-- 
Best regards,
Vladimir

[-- Attachment #2: 0001-my.patch --]
[-- Type: text/x-patch; name="0001-my.patch", Size: 13440 bytes --]

From 8f52aa40514a290d7770ef2eba3ee3c8b93c1cab Mon Sep 17 00:00:00 2001
From: Vladimir Sementsov-Ogievskiy <vsementsov@virtuozzo.com>
Date: Fri, 25 Jan 2019 11:48:05 +0300
Subject: [PATCH 1/2] my

---
 include/qemu-io.h |   7 ++
 qemu-img.c        | 157 +--------------------------------------
 qemu-io-cmds.c    | 182 ++++++++++++++++++++++++++++++++++------------
 3 files changed, 145 insertions(+), 201 deletions(-)

diff --git a/include/qemu-io.h b/include/qemu-io.h
index 7433239372..2abbbb4acf 100644
--- a/include/qemu-io.h
+++ b/include/qemu-io.h
@@ -54,4 +54,11 @@ void qemuio_complete_command(const char *input,
                              void (*fn)(const char *cmd, void *opaque),
                              void *opaque);
 
+typedef enum OutputFormat {
+    OFORMAT_JSON,
+    OFORMAT_HUMAN,
+} OutputFormat;
+
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format);
+
 #endif /* QEMU_IO_H */
diff --git a/qemu-img.c b/qemu-img.c
index ad04f59565..ecd708af2d 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -45,6 +45,7 @@
 #include "block/qapi.h"
 #include "crypto/init.h"
 #include "trace/control.h"
+#include "qemu-io.h"
 
 #define QEMU_IMG_VERSION "qemu-img version " QEMU_FULL_VERSION \
                           "\n" QEMU_COPYRIGHT "\n"
@@ -68,11 +69,6 @@ enum {
     OPTION_SHRINK = 266,
 };
 
-typedef enum OutputFormat {
-    OFORMAT_JSON,
-    OFORMAT_HUMAN,
-} OutputFormat;
-
 /* Default to cache=writeback as data integrity is not important for qemu-img */
 #define BDRV_DEFAULT_CACHE "writeback"
 
@@ -2740,128 +2736,12 @@ static int img_info(int argc, char **argv)
     return 0;
 }
 
-static void dump_map_entry(OutputFormat output_format, MapEntry *e,
-                           MapEntry *next)
-{
-    switch (output_format) {
-    case OFORMAT_HUMAN:
-        if (e->data && !e->has_offset) {
-            error_report("File contains external, encrypted or compressed clusters.");
-            exit(1);
-        }
-        if (e->data && !e->zero) {
-            printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
-                   e->start, e->length,
-                   e->has_offset ? e->offset : 0,
-                   e->has_filename ? e->filename : "");
-        }
-        /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
-         * Modify the flags here to allow more coalescing.
-         */
-        if (next && (!next->data || next->zero)) {
-            next->data = false;
-            next->zero = true;
-        }
-        break;
-    case OFORMAT_JSON:
-        printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
-               " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
-               (e->start == 0 ? "[" : ",\n"),
-               e->start, e->length, e->depth,
-               e->zero ? "true" : "false",
-               e->data ? "true" : "false");
-        if (e->has_offset) {
-            printf(", \"offset\": %"PRId64"", e->offset);
-        }
-        putchar('}');
-
-        if (!next) {
-            printf("]\n");
-        }
-        break;
-    }
-}
-
-static int get_block_status(BlockDriverState *bs, int64_t offset,
-                            int64_t bytes, MapEntry *e)
-{
-    int ret;
-    int depth;
-    BlockDriverState *file;
-    bool has_offset;
-    int64_t map;
-
-    /* As an optimization, we could cache the current range of unallocated
-     * clusters in each file of the chain, and avoid querying the same
-     * range repeatedly.
-     */
-
-    depth = 0;
-    for (;;) {
-        ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
-        if (ret < 0) {
-            return ret;
-        }
-        assert(bytes);
-        if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
-            break;
-        }
-        bs = backing_bs(bs);
-        if (bs == NULL) {
-            ret = 0;
-            break;
-        }
-
-        depth++;
-    }
-
-    has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
-
-    *e = (MapEntry) {
-        .start = offset,
-        .length = bytes,
-        .data = !!(ret & BDRV_BLOCK_DATA),
-        .zero = !!(ret & BDRV_BLOCK_ZERO),
-        .offset = map,
-        .has_offset = has_offset,
-        .depth = depth,
-        .has_filename = file && has_offset,
-        .filename = file && has_offset ? file->filename : NULL,
-    };
-
-    return 0;
-}
-
-static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
-{
-    if (curr->length == 0) {
-        return false;
-    }
-    if (curr->zero != next->zero ||
-        curr->data != next->data ||
-        curr->depth != next->depth ||
-        curr->has_filename != next->has_filename ||
-        curr->has_offset != next->has_offset) {
-        return false;
-    }
-    if (curr->has_filename && strcmp(curr->filename, next->filename)) {
-        return false;
-    }
-    if (curr->has_offset && curr->offset + curr->length != next->offset) {
-        return false;
-    }
-    return true;
-}
-
 static int img_map(int argc, char **argv)
 {
     int c;
     OutputFormat output_format = OFORMAT_HUMAN;
     BlockBackend *blk;
-    BlockDriverState *bs;
     const char *filename, *fmt, *output;
-    int64_t length;
-    MapEntry curr = { .length = 0 }, next;
     int ret = 0;
     bool image_opts = false;
     bool force_share = false;
@@ -2940,41 +2820,10 @@ static int img_map(int argc, char **argv)
     if (!blk) {
         return 1;
     }
-    bs = blk_bs(blk);
-
-    if (output_format == OFORMAT_HUMAN) {
-        printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
-    }
 
-    length = blk_getlength(blk);
-    while (curr.start + curr.length < length) {
-        int64_t offset = curr.start + curr.length;
-        int64_t n;
-
-        /* Probe up to 1 GiB at a time.  */
-        n = MIN(1 << 30, length - offset);
-        ret = get_block_status(bs, offset, n, &next);
-
-        if (ret < 0) {
-            error_report("Could not read file metadata: %s", strerror(-ret));
-            goto out;
-        }
-
-        if (entry_mergeable(&curr, &next)) {
-            curr.length += next.length;
-            continue;
-        }
-
-        if (curr.length > 0) {
-            dump_map_entry(output_format, &curr, &next);
-        }
-        curr = next;
-    }
-
-    dump_map_entry(output_format, &curr, NULL);
-
-out:
+    ret = qemu_img_do_map(blk, output_format);
     blk_unref(blk);
+
     return ret < 0;
 }
 
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 2c39124036..791cedd96d 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -1852,77 +1852,165 @@ static const cmdinfo_t alloc_cmd = {
     .oneline    = "checks if offset is allocated in the file",
 };
 
+static void dump_map_entry(OutputFormat output_format, MapEntry *e,
+                           MapEntry *next)
+{
+    switch (output_format) {
+    case OFORMAT_HUMAN:
+        if (e->data && !e->has_offset) {
+            error_report("File contains external, encrypted or compressed clusters.");
+            exit(1);
+        }
+        if (e->data && !e->zero) {
+            printf("%#-16"PRIx64"%#-16"PRIx64"%#-16"PRIx64"%s\n",
+                   e->start, e->length,
+                   e->has_offset ? e->offset : 0,
+                   e->has_filename ? e->filename : "");
+        }
+        /* This format ignores the distinction between 0, ZERO and ZERO|DATA.
+         * Modify the flags here to allow more coalescing.
+         */
+        if (next && (!next->data || next->zero)) {
+            next->data = false;
+            next->zero = true;
+        }
+        break;
+    case OFORMAT_JSON:
+        printf("%s{ \"start\": %"PRId64", \"length\": %"PRId64","
+               " \"depth\": %"PRId64", \"zero\": %s, \"data\": %s",
+               (e->start == 0 ? "[" : ",\n"),
+               e->start, e->length, e->depth,
+               e->zero ? "true" : "false",
+               e->data ? "true" : "false");
+        if (e->has_offset) {
+            printf(", \"offset\": %"PRId64"", e->offset);
+        }
+        putchar('}');
 
-static int map_is_allocated(BlockDriverState *bs, int64_t offset,
-                            int64_t bytes, int64_t *pnum)
-{
-    int64_t num;
-    int num_checked;
-    int ret, firstret;
-
-    num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
-    ret = bdrv_is_allocated(bs, offset, num_checked, &num);
-    if (ret < 0) {
-        return ret;
+        if (!next) {
+            printf("]\n");
+        }
+        break;
     }
+}
 
-    firstret = ret;
-    *pnum = num;
-
-    while (bytes > 0 && ret == firstret) {
-        offset += num;
-        bytes -= num;
+static int get_block_status(BlockDriverState *bs, int64_t offset,
+                            int64_t bytes, MapEntry *e)
+{
+    int ret;
+    int depth;
+    BlockDriverState *file;
+    bool has_offset;
+    int64_t map;
+
+    /* As an optimization, we could cache the current range of unallocated
+     * clusters in each file of the chain, and avoid querying the same
+     * range repeatedly.
+     */
 
-        num_checked = MIN(bytes, BDRV_REQUEST_MAX_BYTES);
-        ret = bdrv_is_allocated(bs, offset, num_checked, &num);
-        if (ret == firstret && num) {
-            *pnum += num;
-        } else {
+    depth = 0;
+    for (;;) {
+        ret = bdrv_block_status(bs, offset, bytes, &bytes, &map, &file);
+        if (ret < 0) {
+            return ret;
+        }
+        assert(bytes);
+        if (ret & (BDRV_BLOCK_ZERO|BDRV_BLOCK_DATA)) {
             break;
         }
+        bs = backing_bs(bs);
+        if (bs == NULL) {
+            ret = 0;
+            break;
+        }
+
+        depth++;
     }
 
-    return firstret;
+    has_offset = !!(ret & BDRV_BLOCK_OFFSET_VALID);
+
+    *e = (MapEntry) {
+        .start = offset,
+        .length = bytes,
+        .data = !!(ret & BDRV_BLOCK_DATA),
+        .zero = !!(ret & BDRV_BLOCK_ZERO),
+        .offset = map,
+        .has_offset = has_offset,
+        .depth = depth,
+        .has_filename = file && has_offset,
+        .filename = file && has_offset ? file->filename : NULL,
+    };
+
+    return 0;
 }
 
-static int map_f(BlockBackend *blk, int argc, char **argv)
+static inline bool entry_mergeable(const MapEntry *curr, const MapEntry *next)
 {
-    int64_t offset, bytes;
-    char s1[64], s2[64];
-    int64_t num;
-    int ret;
-    const char *retstr;
+    if (curr->length == 0) {
+        return false;
+    }
+    if (curr->zero != next->zero ||
+        curr->data != next->data ||
+        curr->depth != next->depth ||
+        curr->has_filename != next->has_filename ||
+        curr->has_offset != next->has_offset) {
+        return false;
+    }
+    if (curr->has_filename && strcmp(curr->filename, next->filename)) {
+        return false;
+    }
+    if (curr->has_offset && curr->offset + curr->length != next->offset) {
+        return false;
+    }
+    return true;
+}
 
-    offset = 0;
-    bytes = blk_getlength(blk);
-    if (bytes < 0) {
-        error_report("Failed to query image length: %s", strerror(-bytes));
-        return bytes;
+int qemu_img_do_map(BlockBackend *blk, OutputFormat output_format)
+{
+    BlockDriverState *bs = blk_bs(blk);
+    MapEntry curr = { .length = 0 }, next;
+    int64_t length;
+
+    if (output_format == OFORMAT_HUMAN) {
+        printf("%-16s%-16s%-16s%s\n", "Offset", "Length", "Mapped to", "File");
     }
 
-    while (bytes) {
-        ret = map_is_allocated(blk_bs(blk), offset, bytes, &num);
+    length = blk_getlength(blk);
+    while (curr.start + curr.length < length) {
+        int64_t offset = curr.start + curr.length;
+        int64_t n;
+        int ret;
+
+        /* Probe up to 1 GiB at a time.  */
+        n = MIN(1 << 30, length - offset);
+        ret = get_block_status(bs, offset, n, &next);
+
         if (ret < 0) {
-            error_report("Failed to get allocation status: %s", strerror(-ret));
+            error_report("Could not read file metadata: %s", strerror(-ret));
             return ret;
-        } else if (!num) {
-            error_report("Unexpected end of image");
-            return -EIO;
         }
 
-        retstr = ret ? "    allocated" : "not allocated";
-        cvtstr(num, s1, sizeof(s1));
-        cvtstr(offset, s2, sizeof(s2));
-        printf("%s (0x%" PRIx64 ") bytes %s at offset %s (0x%" PRIx64 ")\n",
-               s1, num, retstr, s2, offset);
+        if (entry_mergeable(&curr, &next)) {
+            curr.length += next.length;
+            continue;
+        }
 
-        offset += num;
-        bytes -= num;
+        if (curr.length > 0) {
+            dump_map_entry(output_format, &curr, &next);
+        }
+        curr = next;
     }
 
+    dump_map_entry(output_format, &curr, NULL);
+
     return 0;
 }
 
+static int map_f(BlockBackend *blk, int argc, char **argv)
+{
+    return qemu_img_do_map(blk, OFORMAT_JSON);
+}
+
 static const cmdinfo_t map_cmd = {
        .name           = "map",
        .argmin         = 0,
-- 
2.18.0



* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 15:42       ` Kevin Wolf
@ 2019-01-25 10:10         ` Paolo Bonzini
  2019-01-25 10:30           ` Kevin Wolf
  0 siblings, 1 reply; 15+ messages in thread
From: Paolo Bonzini @ 2019-01-25 10:10 UTC (permalink / raw)
  To: Kevin Wolf, Vladimir Sementsov-Ogievskiy
  Cc: qemu-block, mreitz, eblake, qemu-devel

On 24/01/19 16:42, Kevin Wolf wrote:
>> Recently Paolo asked me not to add more users of AioContext lock. Unfortunately
>> I don't understand the whole picture around it.. Doesn't this apply here?
>> https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg03410.html
> I don't know. Honestly I feel nobody except Paolo knows, because we
> don't know his patches yet.

This is true.  On the other hand, the AioContext lock is only used in
some special cases around block jobs and blk_set_aio_context, and in
general the block devices already should not have any dependencies
(unless they crept in without me noticing).

In particular...

> But raw doesn't have an s->lock yet, so I
> think removing the AioContext lock involves some work on it anyway and
> adding this doesn't really change the amount of work.

... BDRVRawState doesn't have any data that changes after open, does it?
 This is why it doesn't have an s->lock.

Paolo


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-25 10:10         ` Paolo Bonzini
@ 2019-01-25 10:30           ` Kevin Wolf
  2019-02-04 10:17             ` Paolo Bonzini
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2019-01-25 10:30 UTC (permalink / raw)
  To: Paolo Bonzini
  Cc: Vladimir Sementsov-Ogievskiy, qemu-block, mreitz, eblake, qemu-devel

Am 25.01.2019 um 11:10 hat Paolo Bonzini geschrieben:
> On 24/01/19 16:42, Kevin Wolf wrote:
> >> Recently Paolo asked me not to add more users of AioContext lock. Unfortunately
> >> I don't understand the whole picture around it.. Doesn't this apply here?
> >> https://lists.gnu.org/archive/html/qemu-devel/2018-12/msg03410.html
> > I don't know. Honestly I feel nobody except Paolo knows, because we
> > don't know his patches yet.
> 
> This is true.  On the other hand, the AioContext lock is only used in
> some special cases around block jobs and blk_set_aio_context, and in
> general the block devices already should not have any dependencies
> (unless they crept in without me noticing).

It's also used in those cases where coroutines don't need locking, but
threads would. Did you audit all of the drivers for such cases?

> In particular...
> 
> > But raw doesn't have an s->lock yet, so I
> > think removing the AioContext lock involves some work on it anyway and
> > adding this doesn't really change the amount of work.
> 
> ... BDRVRawState doesn't have any data that changes after open, does it?
>  This is why it doesn't have an s->lock.

No important data anyway. We do things like setting s->has_write_zeroes
= false after failure, but if we get a race and end up trying twice
before disabling it, it doesn't really hurt either.

Then there is reopen, but that involves a drain anyway. And that's it
probably.

So do you think I should introduce a CoMutex for raw here? Or QemuMutex?
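
For illustration, a QemuMutex-based variant could look roughly like the
sketch below. It is not part of the patch: the field and helper names are
invented, the mutex would need a qemu_mutex_init() in raw_open(), and it
assumes the handle_aiocb_* callbacks still run in thread-pool workers
(where a CoMutex could not be taken):

/* Hypothetical: s->seek_cache_lock is a QemuMutex protecting s->seek_data_cache */
static void raw_seek_cache_invalidate(BDRVRawState *s, uint64_t offset,
                                      uint64_t bytes)
{
    struct seek_data_cache *sdc = &s->seek_data_cache;

    qemu_mutex_lock(&s->seek_cache_lock);
    if (sdc->valid && !(sdc->end <= offset || sdc->start >= offset + bytes)) {
        sdc->valid = false;
    }
    qemu_mutex_unlock(&s->seek_cache_lock);
}

static bool raw_seek_cache_lookup(BDRVRawState *s, uint64_t offset, uint64_t *end)
{
    struct seek_data_cache *sdc = &s->seek_data_cache;
    bool hit;

    qemu_mutex_lock(&s->seek_cache_lock);
    hit = sdc->valid && sdc->start <= offset && sdc->end > offset;
    if (hit) {
        *end = sdc->end;
    }
    qemu_mutex_unlock(&s->seek_cache_lock);
    return hit;
}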

Kevin


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-25  9:13     ` Vladimir Sementsov-Ogievskiy
@ 2019-01-25 13:26       ` Eric Blake
  0 siblings, 0 replies; 15+ messages in thread
From: Eric Blake @ 2019-01-25 13:26 UTC (permalink / raw)
  To: Vladimir Sementsov-Ogievskiy, Kevin Wolf; +Cc: qemu-block, mreitz, qemu-devel


On 1/25/19 3:13 AM, Vladimir Sementsov-Ogievskiy wrote:

> 
> before your patch:
> ]# ./test
> Formatting 'x', fmt=raw size=1048576
> qemu-io> wrote 1048576/1048576 bytes at offset 0
> 1 MiB, 1 ops; 0.0523 sec (19.093 MiB/sec and 19.0927 ops/sec)
> qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]
> qemu-io> handle_aiocb_write_zeroes
> wrote 102400/102400 bytes at offset 102400
> 100 KiB, 1 ops; 0.0165 sec (5.898 MiB/sec and 60.3974 ops/sec)
> qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
> { "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
> { "start": 204800, "length": 843776, "depth": 0, "zero": false, "data": true, "offset": 204800}]
> qemu-io> handle_aiocb_write_zeroes_unmap
> wrote 102400/102400 bytes at offset 512000
> 100 KiB, 1 ops; 0.0001 sec (545.566 MiB/sec and 5586.5922 ops/sec)
> qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
> { "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
> { "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, "offset": 204800},
> { "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 512000},
> { "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, "offset": 614400}]

Your pre-patch demonstration shows that both 'write -z' and 'write -z
-u' produced areas of the raw image that were marked as known zeroes
and claim 'data':false, meaning that those two areas are sparse (that
is, it appears 'write -z' managed to unmap after all?)

> 
> 
> 
> after your patch:
> # ./test
> Formatting 'x', fmt=raw size=1048576
> qemu-io> wrote 1048576/1048576 bytes at offset 0
> 1 MiB, 1 ops; 0.0768 sec (13.019 MiB/sec and 13.0195 ops/sec)
> qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]
> qemu-io> handle_aiocb_write_zeroes
> wrote 102400/102400 bytes at offset 102400
> 100 KiB, 1 ops; 0.0166 sec (5.883 MiB/sec and 60.2410 ops/sec)
> qemu-io> [{ "start": 0, "length": 1048576, "depth": 0, "zero": false, "data": true, "offset": 0}]

So here, the cache was not invalidated, so the 'write -z' area was
temporarily reported as data...

> qemu-io> handle_aiocb_write_zeroes_unmap
> wrote 102400/102400 bytes at offset 512000
> 100 KiB, 1 ops; 0.0002 sec (469.501 MiB/sec and 4807.6923 ops/sec)
> qemu-io> [{ "start": 0, "length": 102400, "depth": 0, "zero": false, "data": true, "offset": 0},
> { "start": 102400, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 102400},
> { "start": 204800, "length": 307200, "depth": 0, "zero": false, "data": true, "offset": 204800},
> { "start": 512000, "length": 102400, "depth": 0, "zero": true, "data": false, "offset": 512000},
> { "start": 614400, "length": 434176, "depth": 0, "zero": false, "data": true, "offset": 614400}]

but after the 'write -z -u' invalidated the cache, we once again see
that the 'write -z' worked.
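
For illustration, the invalidation that seems to be missing could look
roughly like this (the helper name is made up; the patch already
open-codes the same overlap check in its unmap and discard hunks):

/* Hypothetical helper; the patch open-codes this check elsewhere. */
static void raw_invalidate_seek_cache(BDRVRawState *s,
                                      uint64_t offset, uint64_t bytes)
{
    struct seek_data_cache *sdc = &s->seek_data_cache;

    /* Drop the cached data region if the written/zeroed range overlaps it */
    if (sdc->valid && !(sdc->end < offset || sdc->start > offset + bytes)) {
        sdc->valid = false;
    }
}

static int handle_aiocb_write_zeroes(void *opaque)
{
    RawPosixAIOData *aiocb = opaque;
    BDRVRawState *s = aiocb->bs->opaque;

    /* Same invalidation as the unmap path, so a plain 'write -z' that
     * happens to punch a hole is not later reported as data from the
     * cache. */
    raw_invalidate_seek_cache(s, aiocb->aio_offset, aiocb->aio_nbytes);

    /* ... existing zero-writing logic unchanged ... */
    return 0;
}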

> 
> 
> So, you've changed behavior of block_status after write_zeroes without UNMAP for the worse.
> 
> Hmm, should I prepare patch for qemu-io? qemu-img map is definitely better.

qemu-io map is NOT asking for the same information as qemu-img map (it's
annoying - but BOTH pieces of information are useful, and there are some
iotests that check both outputs to get a full picture of things).
qemu-img asks for as much information as possible about all layers, while
qemu-io asks only about the top layer.

That said, qemu-io is NOT set in stone; if you want to patch it AND
fix the iotest fallout, I'm not opposed to that.

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-24 15:56 ` Eric Blake
@ 2019-01-29 10:56   ` Kevin Wolf
  2019-01-29 21:03     ` Eric Blake
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2019-01-29 10:56 UTC (permalink / raw)
  To: Eric Blake; +Cc: qemu-block, vsementsov, mreitz, qemu-devel

Am 24.01.2019 um 16:56 hat Eric Blake geschrieben:
> On 1/24/19 8:17 AM, Kevin Wolf wrote:
> > Depending on the exact image layout and the storage backend (tmpfs is
> > known to have very slow SEEK_HOLE/SEEK_DATA), caching lseek results can
> > save us a lot of time e.g. during a mirror block job or qemu-img convert
> > with a fragmented source image (.bdrv_co_block_status on the protocol
> > layer can be called for every single cluster in the extreme case).
> > 
> > We may only cache data regions because of possible concurrent writers.
> > This means that we can later treat a recently punched hole as data, but
> > this is safe. We can't cache holes because then we might treat recently
> > written data as holes, which can cause corruption.
> 
> gluster copies heavily from file-posix's implementation; should it also
> copy this cache of known-data?  Should NBD also cache known-data when
> NBD_CMD_BLOCK_STATUS is available?

This almost suggests that we should do the caching in generic block
layer code.

It would require that we can return a *pnum from the block driver that
is larger than the requested bytes, but it looks like
raw_co_block_status() already handles this? We just don't seem to do
this yet in the block drivers.

If we want to cache for all drivers, however, the question is whether
there are drivers that can transition a block from data to hole without
a discard operation, so that we would have to invalidate the cache in
more places. One thing that comes to mind is loading an internal
snapshot for qcow2.

Or maybe we need to make this opt-in for drivers, with a bool flag in
BlockDriver?
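
For what it's worth, such an opt-in could look roughly like this (the
field name is invented; nothing like it exists in BlockDriver today):

struct BlockDriver {
    /* ... existing fields ... */

    /*
     * If true, the generic block layer may cache data regions returned
     * by .bdrv_co_block_status for this driver.  Drivers that can turn
     * data into holes without going through a discard or write-zeroes
     * request (e.g. by loading an internal snapshot) would either leave
     * this false or invalidate the cache explicitly in those code paths.
     */
    bool supports_block_status_cache;
};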

Kevin


* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-29 10:56   ` Kevin Wolf
@ 2019-01-29 21:03     ` Eric Blake
  0 siblings, 0 replies; 15+ messages in thread
From: Eric Blake @ 2019-01-29 21:03 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-block, vsementsov, mreitz, qemu-devel

On 1/29/19 4:56 AM, Kevin Wolf wrote:

>> gluster copies heavily from file-posix's implementation; should it also
>> copy this cache of known-data?  Should NBD also cache known-data when
>> NBD_CMD_BLOCK_STATUS is available?
> 
> This almost suggests that we should do the caching in generic block
> layer code.
> 
> It would require that we can return a *pnum from the block driver that
> is larger than the requested bytes, but it looks like
> raw_co_block_status() already handles this? We just don't seem to do
> this yet in the block drivers.

The code in io.c bdrv_co_block_status() currently does one final
clamp-down to limit the answer to the caller's maximum request, but I
don't know whether any drivers actually take advantage of passing back
more than was requested. I _do_ know that the NBD protocol took pains to
ensure that NBD_CMD_BLOCK_STATUS is permitted to return a value beyond
the caller's request when that information is easily obtained, precisely
because letting the caller cache knowledge of a data section that extends
beyond the current query can minimize the need for future block status
calls.
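
To sketch the shape of that idea (all names here are invented; this is
not the actual io.c code):

typedef struct DataRegionCache {    /* invented, for illustration only */
    bool valid;
    int64_t start;
    int64_t end;
} DataRegionCache;

static void remember_then_clamp(DataRegionCache *cache, int64_t offset,
                                int64_t requested, int64_t *pnum, int ret)
{
    if (ret & BDRV_BLOCK_DATA) {
        /* Keep the full extent the driver reported, even past the request */
        cache->valid = true;
        cache->start = offset;
        cache->end = offset + *pnum;
    }
    /* Only now clamp the answer handed back to the caller */
    *pnum = MIN(*pnum, requested);
}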

> 
> If we want to cache for all drivers, however, the question is whether
> there are drivers that can transition a block from data to hole without
> a discard operation, so that we would have to invalidate the cache in
> more places. One thing that comes to mind is loading an internal
> snapshot for qcow2.

Oh, good point - switching to a different L1 table (when loading an
internal snapshot) can indeed turn a range that used to read as data into
a hole, so if the block layer caches data ranges, it also needs to provide
a hook for drivers to invalidate the cache when performing such actions.
Still, I can't think of any place where a hole spontaneously appears
unless a specific driver action is taken (so the driver should have the
opportunity to invalidate the cache during that action), or if an image
is in active use by more than just the qemu process.  And if the driver
knows that an image might be shared with external processes modifying
the image, then yes, maybe having a way to opt out of caching altogether
is also appropriate.
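
As a rough sketch of where such a hook would have to be called (the hook
itself is hypothetical; qcow2_snapshot_goto() is the existing
snapshot-load entry point):

/* Hypothetical hook; nothing like this exists in the tree today. */
void bdrv_invalidate_block_status_cache(BlockDriverState *bs);

static int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
{
    /* ... switch the active L1 table to the snapshot's L1 table ... */

    /*
     * Ranges that used to read as data may now be holes, so any cached
     * data regions for this BDS are stale and must be dropped.
     */
    bdrv_invalidate_block_status_cache(bs);

    return 0;
}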

> 
> Or maybe we need to make this opt-in for drivers, with a bool flag in
> BlockDriver?
> 
> Kevin
> 

-- 
Eric Blake, Principal Software Engineer
Red Hat, Inc.           +1-919-301-3226
Virtualization:  qemu.org | libvirt.org



* Re: [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions
  2019-01-25 10:30           ` Kevin Wolf
@ 2019-02-04 10:17             ` Paolo Bonzini
  0 siblings, 0 replies; 15+ messages in thread
From: Paolo Bonzini @ 2019-02-04 10:17 UTC (permalink / raw)
  To: Kevin Wolf
  Cc: Vladimir Sementsov-Ogievskiy, qemu-block, mreitz, eblake, qemu-devel

On 25/01/19 11:30, Kevin Wolf wrote:
>> On the other hand, the AioContext lock is only used in
>> some special cases around block jobs and blk_set_aio_context, and in
>> general the block devices already should not have any dependencies
>> (unless they crept in without me noticing).
> It's also used in those cases where coroutines don't need locking, but
> threads would. Did you audit all of the drivers for such cases?

I did and the drivers already have a QemuMutex if that's the case (e.g.
curl, iscsi).

>> In particular...
>>
>>> But raw doesn't have an s->lock yet, so I
>>> think removing the AioContext lock involves some work on it anyway and
>>> adding this doesn't really change the amount of work.
>> ... BDRVRawState doesn't have any data that changes after open, does it?
>>  This is why it doesn't have an s->lock.
> No important data anyway. We do things like setting s->has_write_zeroes
> = false after failure, but if we got a race and end up trying twice
> before disabling it, it doesn't really hurt either.
> 
> Then there is reopen, but that involves a drain anyway. And that's it
> probably.
> 
> So do you think I should introduce a CoMutex for raw here? Or QemuMutex?

For the cache you can introduce either a CoMutex or a QemuMutex; it's the
same.
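
A minimal sketch of what that could look like in file-posix (the lock
field is hypothetical; seek_data_cache is the structure added by the
patch):

/* Look up a cached data region; returns true and sets *pnum on a hit. */
static bool coroutine_fn seek_cache_lookup(BDRVRawState *s,
                                           uint64_t offset, uint64_t bytes,
                                           int64_t *pnum)
{
    struct seek_data_cache *sdc = &s->seek_data_cache;
    bool hit = false;

    qemu_co_mutex_lock(&s->lock);   /* qemu_co_mutex_init() in raw_open() */
    if (sdc->valid && sdc->start <= offset && sdc->end > offset) {
        *pnum = MIN(bytes, sdc->end - offset);
        hit = true;
    }
    qemu_co_mutex_unlock(&s->lock);

    return hit;
}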

Paolo


end of thread, other threads:[~2019-02-04 10:18 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-01-24 14:17 [Qemu-devel] [PATCH] file-posix: Cache lseek result for data regions Kevin Wolf
2019-01-24 14:40 ` Vladimir Sementsov-Ogievskiy
2019-01-24 15:11   ` Kevin Wolf
2019-01-24 15:22     ` Vladimir Sementsov-Ogievskiy
2019-01-24 15:42       ` Kevin Wolf
2019-01-25 10:10         ` Paolo Bonzini
2019-01-25 10:30           ` Kevin Wolf
2019-02-04 10:17             ` Paolo Bonzini
2019-01-24 15:56 ` Eric Blake
2019-01-29 10:56   ` Kevin Wolf
2019-01-29 21:03     ` Eric Blake
2019-01-24 16:18 ` Vladimir Sementsov-Ogievskiy
2019-01-24 16:36   ` Kevin Wolf
2019-01-25  9:13     ` Vladimir Sementsov-Ogievskiy
2019-01-25 13:26       ` Eric Blake
