[Qemu-devel] CoW image commit+shrink(= make_empty) support

* [Qemu-devel] CoW image commit+shrink(= make_empty) support
@ 2012-06-07  6:19 Taisuke Yamada
  2012-06-07 14:14 ` Jeff Cody
  2012-06-08 10:39 ` Kevin Wolf
  0 siblings, 2 replies; 31+ messages in thread
From: Taisuke Yamada @ 2012-06-07  6:19 UTC (permalink / raw)
  To: qemu-devel; +Cc: jcody

[-- Attachment #1: Type: text/plain, Size: 1352 bytes --]

I attended Paolo Bonzini's qemu session ("Live Disk Operations: Juggling
Data and Trying to go Unnoticed") at LinuxCon Japan, and he advised me
to post the bits I have regarding my question about qemu's support for
shrinking CoW images.

Here's my problem description.

I recently designed an experimental system which holds VM master images
on an HDD and CoW snapshots on an SSD. VMs run on CoW snapshots only.
This split-image configuration is used to keep VM I/O on the SSD.

As SSD capacity is rather limited, I need to do a writeback commit from SSD to
HDD from time to time, and that is done on weekends or at midnight. The problem
is that although a commit is made, that alone won't shrink the CoW image - all
unused blocks are still kept in the snapshot and use up space.

The attached patch is a workaround I added to cope with the problem,
but the basic problem I faced was that neither the QCOW2 nor the QED format
supports the "bdrv_make_empty" API yet.

Implementing the API (say, by hole punching) seemed like a lot of effort, so
I ended up creating a new CoW image and then replacing the current CoW
snapshot with the new (empty) one. But I find the code ugly.

In his talk, Paolo suggested the possibility of using the new "live op" API for
this task, but I'm not aware of the actual API. Is there any documentation or
source code I can look at to re-implement the above feature?

Best Regards,

[-- Attachment #2: qemu-block-refresh.patch --]
[-- Type: application/octet-stream, Size: 6575 bytes --]

From 85a0001b90efe59e52f89b62cda56771d79d0242 Mon Sep 17 00:00:00 2001
From: Taisuke Yamada <tai@rakugaki.org>
Date: Thu, 7 Jun 2012 13:35:31 +0900
Subject: [PATCH] - Imported block-refresh (commit+shrink) patch from qemu-kvm
 custom branch.

Signed-off-by: Taisuke Yamada <tai@rakugaki.org>

diff --git a/block/vmdk.c b/block/vmdk.c
index 18e9b4c..3241be6 100644
--- a/block/vmdk.c
+++ b/block/vmdk.c
@@ -1388,6 +1388,8 @@ static int vmdk_create(const char *filename, QEMUOptionParameter *options)
             total_size = options->value.n;
         } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FILE)) {
             backing_file = options->value.s;
+        } else if (!strcmp(options->name, BLOCK_OPT_BACKING_FMT)) {
+            ; // ignore - VMDK is the only supported format, after all
         } else if (!strcmp(options->name, BLOCK_OPT_COMPAT6)) {
             flags |= options->value.n ? BLOCK_FLAG_COMPAT6 : 0;
         } else if (!strcmp(options->name, BLOCK_OPT_SUBFMT)) {
@@ -1574,6 +1576,11 @@ static QEMUOptionParameter vmdk_create_options[] = {
         .help = "File name of a base image"
     },
     {
+        .name = BLOCK_OPT_BACKING_FMT,
+        .type = OPT_STRING,
+        .help = "File format of a base image"
+    },
+    {
         .name = BLOCK_OPT_COMPAT6,
         .type = OPT_FLAG,
         .help = "VMDK version 6 image"
diff --git a/blockdev.c b/blockdev.c
index 67895b2..e5df77a 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -832,6 +832,90 @@ exit:
 }
 
 
+void qmp_block_refresh(const char *device, Error **errp)
+{
+    BlockDriverState *bs;
+    BlockDriver *drv;
+    int ret = 0;
+    int flags;
+    char curfile[1024], newfile[1024], oldfile[1024];
+    pid_t pid = getpid();
+    // Commit the image of 'device' to its backing file, then swap in a newly created image.
+    bs = bdrv_find(device);
+    if (!bs) {
+        error_set(errp, QERR_DEVICE_NOT_FOUND, device);
+        return;
+    }
+
+    // Only a device that has a backing file can be refreshed;
+    // without one there is nothing to commit into, so do nothing.
+    if (!bs->backing_hd) {
+        return; // treated as success: errp is left untouched
+    }
+
+    // Keep copies of the parameters used for reopening after bdrv_close(bs).
+    drv   = bs->drv;
+    flags = bs->open_flags;
+    snprintf(curfile, sizeof(curfile), "%s",        bs->filename);
+    snprintf(newfile, sizeof(newfile), "%s.%d.new", bs->filename, pid);
+    ret = snprintf(oldfile, sizeof(oldfile), "%s.%d.old", bs->filename, pid);
+
+    // Pathname too long (newfile is exactly as long as oldfile, so one check covers both)
+    if (ret < 0 || ret >= sizeof(oldfile)) {
+        error_set(errp, QERR_BUFFER_OVERRUN);
+        return;
+    }
+
+    // Refuse to run if either temporary name already exists, to avoid overwriting it.
+    if (access(newfile, F_OK) == 0 || access(oldfile, F_OK) == 0) {
+        error_set(errp, QERR_UNSUPPORTED);
+        return;
+    }
+
+    // Commit all data back to the backing file
+    ret = bdrv_commit(bs);
+    if (ret != 0) {
+        error_set(errp, QERR_IO_ERROR);
+        return;
+    }
+
+    // Create the new image under the temporary name, with the same backing file and flags
+    ret = bdrv_img_create(newfile,
+                          bs->drv->format_name,
+                          bs->backing_file,
+                          bs->backing_hd->drv->format_name,
+                          NULL, -1, bs->open_flags);
+    if (ret != 0) {
+        error_set(errp, QERR_OPEN_FILE_FAILED, newfile);
+        return;
+    }
+
+    // Switch images by close-rename-open: current -> oldfile, newfile -> current
+    bdrv_close(bs);
+    if (rename(curfile, oldfile) != 0) goto error_rename_cur;
+    if (rename(newfile, curfile) != 0) goto error_rename_new;
+    if (bdrv_open(bs, curfile, flags, drv) != 0) goto error_reopen;
+
+    // Successfully completed: the previous image is no longer needed
+    unlink(oldfile);
+    return;
+    // NOTE(review): newfile is never unlinked on the failure paths below - confirm this is intended
+    // Upon failure, undo the renames in reverse order and reopen the original image
+error_reopen:
+    rename(curfile, newfile);
+error_rename_new:
+    rename(oldfile, curfile);
+error_rename_cur:
+    ret = bdrv_open(bs, curfile, flags, drv);
+    // An error is set even when the reopen succeeds, because the refresh itself failed.
+    if (ret != 0) {
+        error_set(errp, QERR_OPEN_FILE_FAILED, curfile);
+    }
+    else {
+        error_set(errp, QERR_UNDEFINED_ERROR);
+    }
+}
+
 static void eject_device(BlockDriverState *bs, int force, Error **errp)
 {
     if (bdrv_in_use(bs)) {
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 18cb415..d096dc2 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -901,6 +901,20 @@ Snapshot device, using snapshot file as target if provided
 ETEXI
 
     {
+        .name       = "block_refresh",
+        .args_type  = "device:B",
+        .params     = "device",
+        .help       = "refresh (commit+regenerate) cow image of device.",
+        .mhandler.cmd = hmp_block_refresh,
+    },
+
+STEXI
+@item block_refresh
+@findex block_refresh
+Refresh cow image of device.
+ETEXI
+
+    {
         .name       = "drive_add",
         .args_type  = "pci_addr:s,opts:s",
         .params     = "[[<domain>:]<bus>:]<slot>\n"
diff --git a/hmp.c b/hmp.c
index bb0952e..c3792fd 100644
--- a/hmp.c
+++ b/hmp.c
@@ -713,6 +713,15 @@ void hmp_snapshot_blkdev(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, &errp);
 }
 
+void hmp_block_refresh(Monitor *mon, const QDict *qdict)
+{
+    const char *device = qdict_get_str(qdict, "device");
+    Error *errp = NULL;
+    /* Forward the "device" argument to the QMP handler, then pass any error to hmp_handle_error(). */
+    qmp_block_refresh(device, &errp);
+    hmp_handle_error(mon, &errp);
+}
+
 void hmp_migrate_cancel(Monitor *mon, const QDict *qdict)
 {
     qmp_migrate_cancel(NULL);
diff --git a/hmp.h b/hmp.h
index 443b812..f5f61b6 100644
--- a/hmp.h
+++ b/hmp.h
@@ -55,6 +55,7 @@ void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
 void hmp_change(Monitor *mon, const QDict *qdict);
+void hmp_block_refresh(Monitor *mon, const QDict *qdict);
 void hmp_block_set_io_throttle(Monitor *mon, const QDict *qdict);
 void hmp_block_stream(Monitor *mon, const QDict *qdict);
 void hmp_block_job_set_speed(Monitor *mon, const QDict *qdict);
diff --git a/qapi-schema.json b/qapi-schema.json
index 2ca7195..05c6b34 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -1244,6 +1244,20 @@
             '*mode': 'NewImageMode'} }
 
 ##
+# @block-refresh
+#
+# Re-generates CoW image of a block device. For certain block formats,
+# this shrinks snapshot image back to minimal size.
+#
+# @device:  the name of the device to refresh the image.
+#
+# Returns: nothing on success
+#          If @device is not a valid block device, DeviceNotFound
+##
+{ 'command': 'block-refresh',
+  'data': { 'device': 'str' } }
+
+##
 # @human-monitor-command:
 #
 # Execute a command on the human monitor and return the output.

^ permalink raw reply related	[flat|nested] 31+ messages in thread