All of lore.kernel.org
 help / color / mirror / Atom feed
* [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks
@ 2013-08-28 14:55 Max Reitz
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit Max Reitz
                   ` (4 more replies)
  0 siblings, 5 replies; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

If a qcow2 image file becomes corrupted, any write may inadvertently
overwrite important metadata structures such as the L1 table. This
series adds functionality for detecting, preventing and (to some extent)
repairing such collisions.

v2:
 - Generally implemented Kevin's comments, especially:
   - new QMP event QEVENT_BLOCK_IMAGE_CORRUPTED
   - removed BDRV_O_REPAIR in favor of BDRV_O_CHECK | BDRV_O_RDWR
   - always check full clusters for overlaps
 - removed qcow2_check_allocations in favor of some
   qcow2_check_refcounts extensions that will hopefully include all
   that functionality

Max Reitz (5):
  qcow2: Add corrupt bit
  qcow2: Metadata overlap checks
  qcow2: Employ metadata overlap checks
  qcow2: More complete consistency check
  qemu-iotests: Overlapping cluster allocations

 block/qcow2-cache.c        |  17 ++
 block/qcow2-cluster.c      |  25 ++-
 block/qcow2-refcount.c     | 397 +++++++++++++++++++++++++++++++++++++++++----
 block/qcow2-snapshot.c     |  22 +++
 block/qcow2.c              |  87 +++++++++-
 block/qcow2.h              |  36 +++-
 docs/specs/qcow2.txt       |   7 +-
 include/block/block.h      |   1 +
 include/monitor/monitor.h  |   1 +
 monitor.c                  |   1 +
 tests/qemu-iotests/031.out |  12 +-
 tests/qemu-iotests/036.out |   2 +-
 tests/qemu-iotests/060     | 111 +++++++++++++
 tests/qemu-iotests/060.out |  37 +++++
 tests/qemu-iotests/group   |   1 +
 15 files changed, 708 insertions(+), 49 deletions(-)
 create mode 100755 tests/qemu-iotests/060
 create mode 100644 tests/qemu-iotests/060.out

-- 
1.8.3.1

^ permalink raw reply	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit
  2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
@ 2013-08-28 14:55 ` Max Reitz
  2013-08-29  8:23   ` Kevin Wolf
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks Max Reitz
                   ` (3 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

This adds an incompatible bit indicating corruption to qcow2. Any image
with this bit set may not be written to unless for repairing (and
subsequently clearing the bit if the repair has been successful).

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2.c              | 45 +++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h              |  7 ++++++-
 docs/specs/qcow2.txt       |  7 ++++++-
 tests/qemu-iotests/031.out | 12 ++++++------
 tests/qemu-iotests/036.out |  2 +-
 5 files changed, 64 insertions(+), 9 deletions(-)

diff --git a/block/qcow2.c b/block/qcow2.c
index 78097e5..894ef40 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -272,6 +272,37 @@ static int qcow2_mark_clean(BlockDriverState *bs)
     return 0;
 }
 
+/*
+ * Marks the image as corrupt.
+ */
+int qcow2_mark_corrupt(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT;
+    return qcow2_update_header(bs);
+}
+
+/*
+ * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes
+ * before if necessary.
+ */
+int qcow2_mark_consistent(BlockDriverState *bs)
+{
+    BDRVQcowState *s = bs->opaque;
+
+    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
+        int ret = bdrv_flush(bs);
+        if (ret < 0) {
+            return ret;
+        }
+
+        s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT;
+        return qcow2_update_header(bs);
+    }
+    return 0;
+}
+
 static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
                        BdrvCheckMode fix)
 {
@@ -402,6 +433,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
         goto fail;
     }
 
+    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
+        /* Corrupt images may not be written to unless they are being repaired
+         */
+        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
+            ret = -EACCES;
+            goto fail;
+        }
+    }
+
     /* Check support for various header values */
     if (header.refcount_order != 4) {
         report_unsupported(bs, "%d bit reference counts",
@@ -1130,6 +1170,11 @@ int qcow2_update_header(BlockDriverState *bs)
             .name = "dirty bit",
         },
         {
+            .type = QCOW2_FEAT_TYPE_INCOMPATIBLE,
+            .bit  = QCOW2_INCOMPAT_CORRUPT_BITNR,
+            .name = "corrupt bit",
+        },
+        {
             .type = QCOW2_FEAT_TYPE_COMPATIBLE,
             .bit  = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR,
             .name = "lazy refcounts",
diff --git a/block/qcow2.h b/block/qcow2.h
index dba9771..4297487 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -119,9 +119,12 @@ enum {
 /* Incompatible feature bits */
 enum {
     QCOW2_INCOMPAT_DIRTY_BITNR   = 0,
+    QCOW2_INCOMPAT_CORRUPT_BITNR = 1,
     QCOW2_INCOMPAT_DIRTY         = 1 << QCOW2_INCOMPAT_DIRTY_BITNR,
+    QCOW2_INCOMPAT_CORRUPT       = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR,
 
-    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY,
+    QCOW2_INCOMPAT_MASK          = QCOW2_INCOMPAT_DIRTY
+                                 | QCOW2_INCOMPAT_CORRUPT,
 };
 
 /* Compatible feature bits */
@@ -361,6 +364,8 @@ int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov,
                   int64_t sector_num, int nb_sectors);
 
 int qcow2_mark_dirty(BlockDriverState *bs);
+int qcow2_mark_corrupt(BlockDriverState *bs);
+int qcow2_mark_consistent(BlockDriverState *bs);
 int qcow2_update_header(BlockDriverState *bs);
 
 /* qcow2-refcount.c functions */
diff --git a/docs/specs/qcow2.txt b/docs/specs/qcow2.txt
index 36a559d..33eca36 100644
--- a/docs/specs/qcow2.txt
+++ b/docs/specs/qcow2.txt
@@ -80,7 +80,12 @@ in the description of a field.
                                 tables to repair refcounts before accessing the
                                 image.
 
-                    Bits 1-63:  Reserved (set to 0)
+                    Bit 1:      Corrupt bit.  If this bit is set then any data
+                                structure may be corrupt and the image must not
+                                be written to (unless for regaining
+                                consistency).
+
+                    Bits 2-63:  Reserved (set to 0)
 
          80 -  87:  compatible_features
                     Bitmask of compatible features. An implementation can
diff --git a/tests/qemu-iotests/031.out b/tests/qemu-iotests/031.out
index 796c993..a943344 100644
--- a/tests/qemu-iotests/031.out
+++ b/tests/qemu-iotests/031.out
@@ -54,7 +54,7 @@ header_length             72
 
 Header extension:
 magic                     0x6803f857
-length                    96
+length                    144
 data                      <binary>
 
 Header extension:
@@ -68,7 +68,7 @@ No errors were found on the image.
 
 magic                     0x514649fb
 version                   2
-backing_file_offset       0xf8
+backing_file_offset       0x128
 backing_file_size         0x17
 cluster_bits              16
 size                      67108864
@@ -92,7 +92,7 @@ data                      'host_device'
 
 Header extension:
 magic                     0x6803f857
-length                    96
+length                    144
 data                      <binary>
 
 Header extension:
@@ -155,7 +155,7 @@ header_length             104
 
 Header extension:
 magic                     0x6803f857
-length                    96
+length                    144
 data                      <binary>
 
 Header extension:
@@ -169,7 +169,7 @@ No errors were found on the image.
 
 magic                     0x514649fb
 version                   3
-backing_file_offset       0x118
+backing_file_offset       0x148
 backing_file_size         0x17
 cluster_bits              16
 size                      67108864
@@ -193,7 +193,7 @@ data                      'host_device'
 
 Header extension:
 magic                     0x6803f857
-length                    96
+length                    144
 data                      <binary>
 
 Header extension:
diff --git a/tests/qemu-iotests/036.out b/tests/qemu-iotests/036.out
index 063ca22..55a3e6e 100644
--- a/tests/qemu-iotests/036.out
+++ b/tests/qemu-iotests/036.out
@@ -46,7 +46,7 @@ header_length             104
 
 Header extension:
 magic                     0x6803f857
-length                    96
+length                    144
 data                      <binary>
 
 *** done
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks
  2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit Max Reitz
@ 2013-08-28 14:55 ` Max Reitz
  2013-08-29  8:51   ` Kevin Wolf
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata " Max Reitz
                   ` (2 subsequent siblings)
  4 siblings, 1 reply; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

Two new functions are added; the first one checks a given range in the
image file for overlaps with metadata (main header, L1 tables, L2
tables, refcount table and blocks).

The second one should be used immediately before writing to the image
file as it calls the first function and, upon collision, marks the
image as corrupt and makes the BDS unusable, thereby preventing
further access.

Both functions take a bitmask argument specifying the structures which
should be checked for overlaps, making it possible to also check
metadata writes against colliding with other structures.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-refcount.c    | 148 ++++++++++++++++++++++++++++++++++++++++++++++
 block/qcow2.h             |  28 +++++++++
 include/monitor/monitor.h |   1 +
 monitor.c                 |   1 +
 4 files changed, 178 insertions(+)

diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 1244693..d06a9df 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -25,6 +25,7 @@
 #include "qemu-common.h"
 #include "block/block_int.h"
 #include "block/qcow2.h"
+#include "qemu/range.h"
 
 static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
 static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
@@ -1372,3 +1373,150 @@ fail:
     return ret;
 }
 
+#define overlaps_with(ofs, sz) \
+    ranges_overlap(offset, size, \
+        (ofs) & ~(cluster_mask), \
+        ((sz) + ((ofs) & (cluster_mask)) + (cluster_mask)) & ~(cluster_mask))
+
+/*
+ * Checks if the given offset into the image file is actually free to use by
+ * looking for overlaps with important metadata sections (L1/L2 tables etc.),
+ * i.e. a sanity check without relying on the refcount tables.
+ *
+ * The chk parameter specifies exactly what checks to perform (being a bitmask
+ * of QCow2MetadataOverlap values).
+ *
+ * Returns:
+ * - 0 if writing to this offset will not affect the mentioned metadata
+ * - a positive QCow2MetadataOverlap value indicating one overlapping section
+ * - a negative value (-errno) indicating an error while performing a check,
+ *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
+ */
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
+                                 int64_t size)
+{
+    BDRVQcowState *s = bs->opaque;
+    int64_t cluster_mask = s->cluster_size - 1;
+    int i, j;
+
+    if (!size) {
+        return 0;
+    }
+
+    if (chk & QCOW2_OL_MAIN_HEADER) {
+        if (offset < s->cluster_size) {
+            return QCOW2_OL_MAIN_HEADER;
+        }
+    }
+
+    size = (size + (offset & cluster_mask) + cluster_mask) & ~cluster_mask;
+    offset &= ~cluster_mask;
+
+    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
+        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
+            return QCOW2_OL_ACTIVE_L1;
+        }
+    }
+
+    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
+        if (overlaps_with(s->refcount_table_offset,
+            s->refcount_table_size * sizeof(uint64_t))) {
+            return QCOW2_OL_REFCOUNT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
+        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
+            return QCOW2_OL_SNAPSHOT_TABLE;
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            if (s->snapshots[i].l1_size &&
+                overlaps_with(s->snapshots[i].l1_table_offset,
+                s->snapshots[i].l1_size * sizeof(uint64_t))) {
+                return QCOW2_OL_INACTIVE_L1;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
+        for (i = 0; i < s->l1_size; i++) {
+            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
+                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_ACTIVE_L2;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
+        for (i = 0; i < s->refcount_table_size; i++) {
+            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
+                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
+                s->cluster_size)) {
+                return QCOW2_OL_REFCOUNT_BLOCK;
+            }
+        }
+    }
+
+    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
+        for (i = 0; i < s->nb_snapshots; i++) {
+            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
+            uint32_t l1_sz  = s->snapshots[i].l1_size;
+            uint64_t *l1 = g_malloc(l1_sz * sizeof(uint64_t));
+            int ret;
+
+            ret = bdrv_read(bs->file, l1_ofs / BDRV_SECTOR_SIZE, (uint8_t *)l1,
+                            l1_sz * sizeof(uint64_t) / BDRV_SECTOR_SIZE);
+
+            if (ret < 0) {
+                g_free(l1);
+                return ret;
+            }
+
+            for (j = 0; j < l1_sz; j++) {
+                if ((l1[j] & L1E_OFFSET_MASK) &&
+                    overlaps_with(l1[j] & L1E_OFFSET_MASK, s->cluster_size)) {
+                    g_free(l1);
+                    return QCOW2_OL_INACTIVE_L2;
+                }
+            }
+
+            g_free(l1);
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * First performs a check for metadata overlaps (through
+ * qcow2_check_metadata_overlap); if that fails with a negative value (error
+ * while performing a check), that value is returned. If an impending overlap
+ * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
+ * and -EIO returned.
+ *
+ * Returns 0 if there were neither overlaps nor errors while checking for
+ * overlaps; or a negative value (-errno) on error.
+ */
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
+                                  int64_t size)
+{
+    int ret = qcow2_check_metadata_overlap(bs, chk, offset, size);
+
+    if (ret < 0) {
+        return ret;
+    } else if (ret > 0) {
+        fprintf(stderr, "qcow2: Preventing invalid write on metadata; "
+                "image marked as corrupt.\n");
+        bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IMAGE_CORRUPTED,
+                BDRV_ACTION_REPORT, false);
+        qcow2_mark_corrupt(bs);
+        bs->drv = NULL; /* make BDS unusable */
+        return -EIO;
+    }
+
+    return 0;
+}
diff --git a/block/qcow2.h b/block/qcow2.h
index 4297487..fa2425b 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -289,6 +289,29 @@ enum {
     QCOW2_CLUSTER_ZERO
 };
 
+typedef enum QCow2MetadataOverlap {
+    QCOW2_OL_NONE           = 0,
+    QCOW2_OL_MAIN_HEADER    = (1 << 0),
+    QCOW2_OL_ACTIVE_L1      = (1 << 1),
+    QCOW2_OL_ACTIVE_L2      = (1 << 2),
+    QCOW2_OL_REFCOUNT_TABLE = (1 << 3),
+    QCOW2_OL_REFCOUNT_BLOCK = (1 << 4),
+    QCOW2_OL_SNAPSHOT_TABLE = (1 << 5),
+    QCOW2_OL_INACTIVE_L1    = (1 << 6),
+    /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv
+     * reads. */
+    QCOW2_OL_INACTIVE_L2    = (1 << 7),
+} QCow2MetadataOverlap;
+
+/* Perform all overlap checks which don't require disk access */
+#define QCOW2_OL_CACHED \
+    (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_ACTIVE_L2 | \
+     QCOW2_OL_REFCOUNT_TABLE | QCOW2_OL_REFCOUNT_BLOCK | \
+     QCOW2_OL_SNAPSHOT_TABLE | QCOW2_OL_INACTIVE_L1)
+
+/* The default checks to perform */
+#define QCOW2_OL_DEFAULT QCOW2_OL_CACHED
+
 #define L1E_OFFSET_MASK 0x00ffffffffffff00ULL
 #define L2E_OFFSET_MASK 0x00ffffffffffff00ULL
 #define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL
@@ -390,6 +413,11 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 
 void qcow2_process_discards(BlockDriverState *bs, int ret);
 
+int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
+                                 int64_t size);
+int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
+                                  int64_t size);
+
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
diff --git a/include/monitor/monitor.h b/include/monitor/monitor.h
index 1942cc4..10fa0e3 100644
--- a/include/monitor/monitor.h
+++ b/include/monitor/monitor.h
@@ -48,6 +48,7 @@ typedef enum MonitorEvent {
     QEVENT_BALLOON_CHANGE,
     QEVENT_SPICE_MIGRATE_COMPLETED,
     QEVENT_GUEST_PANICKED,
+    QEVENT_BLOCK_IMAGE_CORRUPTED,
 
     /* Add to 'monitor_event_names' array in monitor.c when
      * defining new events here */
diff --git a/monitor.c b/monitor.c
index ee9744c..2c542e1 100644
--- a/monitor.c
+++ b/monitor.c
@@ -504,6 +504,7 @@ static const char *monitor_event_names[] = {
     [QEVENT_BALLOON_CHANGE] = "BALLOON_CHANGE",
     [QEVENT_SPICE_MIGRATE_COMPLETED] = "SPICE_MIGRATE_COMPLETED",
     [QEVENT_GUEST_PANICKED] = "GUEST_PANICKED",
+    [QEVENT_BLOCK_IMAGE_CORRUPTED] = "BLOCK_IMAGE_CORRUPTED",
 };
 QEMU_BUILD_BUG_ON(ARRAY_SIZE(monitor_event_names) != QEVENT_MAX)
 
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata overlap checks
  2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit Max Reitz
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks Max Reitz
@ 2013-08-28 14:55 ` Max Reitz
  2013-08-29  9:18   ` Kevin Wolf
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check Max Reitz
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 5/5] qemu-iotests: Overlapping cluster allocations Max Reitz
  4 siblings, 1 reply; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

The pre-write overlap check function is now called before most of the
qcow2 writes (aborting it on collision or other error).

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cache.c    | 17 +++++++++++++++++
 block/qcow2-cluster.c  | 21 +++++++++++++++++++++
 block/qcow2-snapshot.c | 22 ++++++++++++++++++++++
 block/qcow2.c          | 36 +++++++++++++++++++++++++++++++++++-
 4 files changed, 95 insertions(+), 1 deletion(-)

diff --git a/block/qcow2-cache.c b/block/qcow2-cache.c
index 2f3114e..7bcae09 100644
--- a/block/qcow2-cache.c
+++ b/block/qcow2-cache.c
@@ -115,6 +115,23 @@ static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i)
     }
 
     if (c == s->refcount_block_cache) {
+        ret = qcow2_pre_write_overlap_check(bs,
+                QCOW2_OL_DEFAULT & ~QCOW2_OL_REFCOUNT_BLOCK,
+                c->entries[i].offset, s->cluster_size);
+    } else if (c == s->l2_table_cache) {
+        ret = qcow2_pre_write_overlap_check(bs,
+                QCOW2_OL_DEFAULT & ~QCOW2_OL_ACTIVE_L2,
+                c->entries[i].offset, s->cluster_size);
+    } else {
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                c->entries[i].offset, s->cluster_size);
+    }
+
+    if (ret < 0) {
+        return ret;
+    }
+
+    if (c == s->refcount_block_cache) {
         BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART);
     } else if (c == s->l2_table_cache) {
         BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE);
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index cca76d4..7c248aa 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -80,6 +80,14 @@ int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
         goto fail;
     }
 
+    /* the L1 position has not yet been updated, so these clusters must
+     * indeed be completely free */
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                                        new_l1_table_offset, new_l1_size2);
+    if (ret < 0) {
+        goto fail;
+    }
+
     BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE);
     for(i = 0; i < s->l1_size; i++)
         new_l1_table[i] = cpu_to_be64(new_l1_table[i]);
@@ -149,6 +157,13 @@ static int write_l1_entry(BlockDriverState *bs, int l1_index)
         buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]);
     }
 
+    ret = qcow2_pre_write_overlap_check(bs,
+            QCOW2_OL_DEFAULT & ~QCOW2_OL_ACTIVE_L1,
+            s->l1_table_offset + 8 * l1_start_index, sizeof(buf));
+    if (ret < 0) {
+        return ret;
+    }
+
     BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE);
     ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset + 8 * l1_start_index,
         buf, sizeof(buf));
@@ -368,6 +383,12 @@ static int coroutine_fn copy_sectors(BlockDriverState *bs,
                         &s->aes_encrypt_key);
     }
 
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+            cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE);
+    if (ret < 0) {
+        goto out;
+    }
+
     BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE);
     ret = bdrv_co_writev(bs->file, (cluster_offset >> 9) + n_start, n, &qiov);
     if (ret < 0) {
diff --git a/block/qcow2-snapshot.c b/block/qcow2-snapshot.c
index 0caac90..e7e6013 100644
--- a/block/qcow2-snapshot.c
+++ b/block/qcow2-snapshot.c
@@ -189,6 +189,15 @@ static int qcow2_write_snapshots(BlockDriverState *bs)
         return ret;
     }
 
+    /* The snapshot list position has not yet been updated, so these clusters
+     * must indeed be completely free */
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset,
+                                        s->snapshots_size);
+    if (ret < 0) {
+        return ret;
+    }
+
+
     /* Write all snapshots to the new list */
     for(i = 0; i < s->nb_snapshots; i++) {
         sn = s->snapshots + i;
@@ -363,6 +372,12 @@ int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info)
         l1_table[i] = cpu_to_be64(s->l1_table[i]);
     }
 
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+            sn->l1_table_offset, s->l1_size * sizeof(uint64_t));
+    if (ret < 0) {
+        goto fail;
+    }
+
     ret = bdrv_pwrite(bs->file, sn->l1_table_offset, l1_table,
                       s->l1_size * sizeof(uint64_t));
     if (ret < 0) {
@@ -475,6 +490,13 @@ int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id)
         goto fail;
     }
 
+    ret = qcow2_pre_write_overlap_check(bs,
+            QCOW2_OL_DEFAULT & ~QCOW2_OL_ACTIVE_L1,
+            s->l1_table_offset, cur_l1_bytes);
+    if (ret < 0) {
+        goto fail;
+    }
+
     ret = bdrv_pwrite_sync(bs->file, s->l1_table_offset, sn_l1_table,
                            cur_l1_bytes);
     if (ret < 0) {
diff --git a/block/qcow2.c b/block/qcow2.c
index 894ef40..e860834 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -622,6 +622,8 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
     qcow2_free_snapshots(bs);
     qcow2_refcount_close(bs);
     g_free(s->l1_table);
+    /* else pre-write overlap checks in cache_destroy may crash */
+    s->l1_table = NULL;
     if (s->l2_table_cache) {
         qcow2_cache_destroy(bs, s->l2_table_cache);
     }
@@ -921,6 +923,13 @@ static coroutine_fn int qcow2_co_writev(BlockDriverState *bs,
                 cur_nr_sectors * 512);
         }
 
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE,
+                cur_nr_sectors * BDRV_SECTOR_SIZE);
+        if (ret < 0) {
+            goto fail;
+        }
+
         qemu_co_mutex_unlock(&s->lock);
         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO);
         trace_qcow2_writev_data(qemu_coroutine_self(),
@@ -987,6 +996,8 @@ static void qcow2_close(BlockDriverState *bs)
 {
     BDRVQcowState *s = bs->opaque;
     g_free(s->l1_table);
+    /* else pre-write overlap checks in cache_destroy may crash */
+    s->l1_table = NULL;
 
     qcow2_cache_flush(bs, s->l2_table_cache);
     qcow2_cache_flush(bs, s->refcount_block_cache);
@@ -1664,6 +1675,14 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
 
     if (ret != Z_STREAM_END || out_len >= s->cluster_size) {
         /* could not compress: write normal cluster */
+
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                sector_num * BDRV_SECTOR_SIZE,
+                s->cluster_sectors * BDRV_SECTOR_SIZE);
+        if (ret < 0) {
+            goto fail;
+        }
+
         ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors);
         if (ret < 0) {
             goto fail;
@@ -1676,6 +1695,13 @@ static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num,
             goto fail;
         }
         cluster_offset &= s->cluster_offset_mask;
+
+        ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                cluster_offset, out_len);
+        if (ret < 0) {
+            goto fail;
+        }
+
         BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED);
         ret = bdrv_pwrite(bs->file, cluster_offset, out_buf, out_len);
         if (ret < 0) {
@@ -1753,10 +1779,18 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
     BDRVQcowState *s = bs->opaque;
     int growable = bs->growable;
     int ret;
+    int64_t offset = qcow2_vm_state_offset(s) + pos;
 
     BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
     bs->growable = 1;
-    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
+
+    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset,
+                                        qiov->size);
+    if (ret < 0) {
+        return ret;
+    }
+
+    ret = bdrv_pwritev(bs, offset, qiov);
     bs->growable = growable;
 
     return ret;
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check
  2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
                   ` (2 preceding siblings ...)
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata " Max Reitz
@ 2013-08-28 14:55 ` Max Reitz
  2013-08-29 11:36   ` Kevin Wolf
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 5/5] qemu-iotests: Overlapping cluster allocations Max Reitz
  4 siblings, 1 reply; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

The qcow2_check_refcounts function has been extended to be able to fix
OFLAG_COPIED errors and multiple references on refcount blocks.

If no corruptions remain after an image repair (and no errors have been
encountered), clear the corrupt flag in qcow2_check.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 block/qcow2-cluster.c  |   4 +-
 block/qcow2-refcount.c | 249 ++++++++++++++++++++++++++++++++++++++++++-------
 block/qcow2.c          |   6 +-
 block/qcow2.h          |   1 +
 include/block/block.h  |   1 +
 5 files changed, 222 insertions(+), 39 deletions(-)

diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 7c248aa..2d5aa92 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -145,7 +145,7 @@ static int l2_load(BlockDriverState *bs, uint64_t l2_offset,
  * and we really don't want bdrv_pread to perform a read-modify-write)
  */
 #define L1_ENTRIES_PER_SECTOR (512 / 8)
-static int write_l1_entry(BlockDriverState *bs, int l1_index)
+int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index)
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t buf[L1_ENTRIES_PER_SECTOR];
@@ -254,7 +254,7 @@ static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table)
     /* update the L1 entry */
     trace_qcow2_l2_allocate_write_l1(bs, l1_index);
     s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED;
-    ret = write_l1_entry(bs, l1_index);
+    ret = qcow2_write_l1_entry(bs, l1_index);
     if (ret < 0) {
         goto fail;
     }
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index d06a9df..333d10b 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -1034,7 +1034,7 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
     BDRVQcowState *s = bs->opaque;
     uint64_t *l2_table, l2_entry;
     uint64_t next_contiguous_offset = 0;
-    int i, l2_size, nb_csectors, refcount;
+    int i, l2_size, nb_csectors;
 
     /* Read L2 table from disk */
     l2_size = s->l2_size * sizeof(uint64_t);
@@ -1086,23 +1086,8 @@ static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res,
 
         case QCOW2_CLUSTER_NORMAL:
         {
-            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
             uint64_t offset = l2_entry & L2E_OFFSET_MASK;
 
-            if (flags & CHECK_OFLAG_COPIED) {
-                refcount = get_refcount(bs, offset >> s->cluster_bits);
-                if (refcount < 0) {
-                    fprintf(stderr, "Can't get refcount for offset %"
-                        PRIx64 ": %s\n", l2_entry, strerror(-refcount));
-                    goto fail;
-                }
-                if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
-                    fprintf(stderr, "ERROR OFLAG_COPIED: offset=%"
-                        PRIx64 " refcount=%d\n", l2_entry, refcount);
-                    res->corruptions++;
-                }
-            }
-
             if (flags & CHECK_FRAG_INFO) {
                 res->bfi.allocated_clusters++;
                 if (next_contiguous_offset &&
@@ -1159,7 +1144,7 @@ static int check_refcounts_l1(BlockDriverState *bs,
 {
     BDRVQcowState *s = bs->opaque;
     uint64_t *l1_table, l2_offset, l1_size2;
-    int i, refcount, ret;
+    int i, ret;
 
     l1_size2 = l1_size * sizeof(uint64_t);
 
@@ -1183,22 +1168,6 @@ static int check_refcounts_l1(BlockDriverState *bs,
     for(i = 0; i < l1_size; i++) {
         l2_offset = l1_table[i];
         if (l2_offset) {
-            /* QCOW_OFLAG_COPIED must be set iff refcount == 1 */
-            if (flags & CHECK_OFLAG_COPIED) {
-                refcount = get_refcount(bs, (l2_offset & ~QCOW_OFLAG_COPIED)
-                    >> s->cluster_bits);
-                if (refcount < 0) {
-                    fprintf(stderr, "Can't get refcount for l2_offset %"
-                        PRIx64 ": %s\n", l2_offset, strerror(-refcount));
-                    goto fail;
-                }
-                if ((refcount == 1) != ((l2_offset & QCOW_OFLAG_COPIED) != 0)) {
-                    fprintf(stderr, "ERROR OFLAG_COPIED: l2_offset=%" PRIx64
-                        " refcount=%d\n", l2_offset, refcount);
-                    res->corruptions++;
-                }
-            }
-
             /* Mark L2 table as used */
             l2_offset &= L1E_OFFSET_MASK;
             inc_refcounts(bs, res, refcount_table, refcount_table_size,
@@ -1230,6 +1199,40 @@ fail:
 }
 
 /*
+ * Writes one sector of the refcount table to the disk
+ */
+#define RT_ENTRIES_PER_SECTOR (512 / sizeof(uint64_t))
+static int write_reftable_entry(BlockDriverState *bs, int rt_index)
+{
+    BDRVQcowState *s = bs->opaque;
+    uint64_t buf[RT_ENTRIES_PER_SECTOR];
+    int rt_start_index;
+    int i, ret;
+
+    rt_start_index = rt_index & ~(RT_ENTRIES_PER_SECTOR - 1);
+    for (i = 0; i < RT_ENTRIES_PER_SECTOR; i++) {
+        buf[i] = cpu_to_be64(s->refcount_table[rt_start_index + i]);
+    }
+
+    ret = qcow2_pre_write_overlap_check(bs,
+            QCOW2_OL_DEFAULT & ~QCOW2_OL_REFCOUNT_TABLE,
+            s->refcount_table_offset + rt_start_index * sizeof(uint64_t),
+            sizeof(buf));
+    if (ret < 0) {
+        return ret;
+    }
+
+    BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE);
+    ret = bdrv_pwrite_sync(bs->file, s->refcount_table_offset +
+            rt_start_index * sizeof(uint64_t), buf, sizeof(buf));
+    if (ret < 0) {
+        return ret;
+    }
+
+    return 0;
+}
+
+/*
  * Checks an image for refcount consistency.
  *
  * Returns 0 if no errors are found, the number of errors in case the image is
@@ -1240,7 +1243,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
 {
     BDRVQcowState *s = bs->opaque;
     int64_t size, i, highest_cluster;
-    int nb_clusters, refcount1, refcount2;
+    uint64_t *l2_table = NULL;
+    int nb_clusters, refcount1, refcount2, j;
     QCowSnapshot *sn;
     uint16_t *refcount_table;
     int ret;
@@ -1305,10 +1309,85 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
             inc_refcounts(bs, res, refcount_table, nb_clusters,
                 offset, s->cluster_size);
             if (refcount_table[cluster] != 1) {
-                fprintf(stderr, "ERROR refcount block %" PRId64
+                fprintf(stderr, "%s refcount block %" PRId64
                     " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
                     i, refcount_table[cluster]);
-                res->corruptions++;
+                if (fix & BDRV_FIX_ERRORS) {
+                    int64_t new_offset;
+                    void *refcount_block;
+
+                    /* allocate new refcount block */
+                    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
+                    if (new_offset < 0) {
+                        fprintf(stderr, "Could not allocate new cluster\n");
+                        res->corruptions++;
+                        continue;
+                    }
+                    /* fetch current content */
+                    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset,
+                            &refcount_block);
+                    if (ret < 0) {
+                        fprintf(stderr, "Could not fetch refcount block\n");
+                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
+                                QCOW2_DISCARD_ALWAYS);
+                        res->corruptions++;
+                        continue;
+                    }
+                    /* new block has not yet been entered into refcount table,
+                     * therefore it is no refcount block yet (regarding this
+                     * check) */
+                    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
+                            new_offset, s->cluster_sectors * BDRV_SECTOR_SIZE);
+                    if (ret < 0) {
+                        fprintf(stderr, "Could not write refcount block (would "
+                                "overlap with existing metadata)\n");
+                        /* the image will be marked corrupt here, so don't even
+                         * attempt on freeing the cluster */
+                        res->corruptions++;
+                        goto fail;
+                    }
+                    /* write to new block */
+                    ret = bdrv_write(bs->file, new_offset >> BDRV_SECTOR_BITS,
+                            refcount_block, s->cluster_sectors);
+                    if (ret < 0) {
+                        fprintf(stderr, "Could not write refcount block\n");
+                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
+                                QCOW2_DISCARD_ALWAYS);
+                        res->corruptions++;
+                        continue;
+                    }
+                    /* update refcount table */
+                    assert(!(new_offset & (s->cluster_size - 1)));
+                    s->refcount_table[i] = new_offset;
+                    ret = write_reftable_entry(bs, i);
+                    if (ret < 0) {
+                        fprintf(stderr, "Could not update refcount table\n");
+                        s->refcount_table[i] = offset;
+                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
+                                QCOW2_DISCARD_ALWAYS);
+                        res->corruptions++;
+                        continue;
+                    }
+                    qcow2_cache_put(bs, s->refcount_block_cache,
+                            &refcount_block);
+                    /* update refcounts */
+                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
+                        /* increase refcount_table size if necessary */
+                        int old_nb_clusters = nb_clusters;
+                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
+                        refcount_table = g_realloc(refcount_table,
+                                nb_clusters * sizeof(uint16_t));
+                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
+                                - old_nb_clusters) * sizeof(uint16_t));
+                    }
+                    refcount_table[cluster]--;
+                    inc_refcounts(bs, res, refcount_table, nb_clusters,
+                            new_offset, s->cluster_size);
+                } else {
+                    res->corruptions++;
+                }
             }
         }
     }
@@ -1364,10 +1443,108 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
         }
     }
 
+    l2_table = g_malloc(s->l2_size * sizeof(uint64_t));
+
+    /* check OFLAG_COPIED */
+    for (i = 0; i < s->l1_size; i++) {
+        uint64_t l1_entry = s->l1_table[i];
+        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
+        bool l2_dirty = false;
+        int refcount;
+
+        if (!l2_offset) {
+            continue;
+        }
+
+        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
+        if (refcount < 0) {
+            /* don't print message nor increment check_errors, since the above
+             * loop will have done this already */
+            continue;
+        }
+        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
+            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_entry=%" PRIx64
+                    " refcount=%d\n",
+                    fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                            "ERROR",
+                    l1_entry, refcount);
+            if (fix & BDRV_FIX_ERRORS) {
+                s->l1_table[i] = refcount == 1
+                               ? l1_entry |  QCOW_OFLAG_COPIED
+                               : l1_entry & ~QCOW_OFLAG_COPIED;
+                ret = qcow2_write_l1_entry(bs, i);
+                if (ret < 0) {
+                    res->check_errors++;
+                    goto fail;
+                }
+            } else {
+                res->corruptions++;
+            }
+        }
+
+        ret = bdrv_pread(bs->file, l2_offset, l2_table,
+                s->l2_size * sizeof(uint64_t));
+        if (ret != s->l2_size * sizeof(uint64_t)) {
+            fprintf(stderr, "ERROR: Could not read L2 table\n");
+            res->check_errors++;
+            if (ret >= 0) {
+                ret = -EIO;
+            }
+            goto fail;
+        }
+
+        for (j = 0; j < s->l2_size; j++) {
+            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
+            uint64_t data_offset;
+
+            if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_NORMAL) {
+                continue;
+            }
+
+            data_offset = l2_entry & L2E_OFFSET_MASK;
+
+            refcount = get_refcount(bs, data_offset >> s->cluster_bits);
+            if (refcount < 0) {
+                /* don't print message nor increment check_errors */
+                continue;
+            }
+            if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
+                fprintf(stderr, "%s OFLAG_COPIED data cluster: l2_entry=%"
+                        PRIx64 " refcount=%d\n",
+                        fix & BDRV_FIX_ERRORS ? "Repairing" :
+                                                "ERROR",
+                        l2_entry, refcount);
+                if (fix & BDRV_FIX_ERRORS) {
+                    l2_table[j] = cpu_to_be64(refcount == 1
+                                ? l2_entry |  QCOW_OFLAG_COPIED
+                                : l2_entry & ~QCOW_OFLAG_COPIED);
+                    l2_dirty = true;
+                } else {
+                    res->corruptions++;
+                }
+            }
+        }
+
+        if (l2_dirty) {
+            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
+                    s->l2_size * sizeof(uint64_t));
+            if (ret != s->l2_size * sizeof(uint64_t)) {
+                fprintf(stderr, "ERROR: Could not write L2 table\n");
+                res->check_errors++;
+                if (ret >= 0) {
+                    ret = -EIO;
+                }
+                goto fail;
+            }
+        }
+    }
+
+
     res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
     ret = 0;
 
 fail:
+    g_free(l2_table);
     g_free(refcount_table);
 
     return ret;
diff --git a/block/qcow2.c b/block/qcow2.c
index e860834..7ed4c5d 100644
--- a/block/qcow2.c
+++ b/block/qcow2.c
@@ -312,7 +312,11 @@ static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result,
     }
 
     if (fix && result->check_errors == 0 && result->corruptions == 0) {
-        return qcow2_mark_clean(bs);
+        ret = qcow2_mark_clean(bs);
+        if (ret < 0) {
+            return ret;
+        }
+        return qcow2_mark_consistent(bs);
     }
     return ret;
 }
diff --git a/block/qcow2.h b/block/qcow2.h
index fa2425b..1dd9f2d 100644
--- a/block/qcow2.h
+++ b/block/qcow2.h
@@ -421,6 +421,7 @@ int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
 /* qcow2-cluster.c functions */
 int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size,
                         bool exact_size);
+int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index);
 void qcow2_l2_cache_reset(BlockDriverState *bs);
 int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset);
 void qcow2_encrypt_sectors(BDRVQcowState *s, int64_t sector_num,
diff --git a/include/block/block.h b/include/block/block.h
index 742fce5..e6b391c 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -413,6 +413,7 @@ typedef enum {
 
     BLKDBG_REFTABLE_LOAD,
     BLKDBG_REFTABLE_GROW,
+    BLKDBG_REFTABLE_UPDATE,
 
     BLKDBG_REFBLOCK_LOAD,
     BLKDBG_REFBLOCK_UPDATE,
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* [Qemu-devel] [PATCH v2 5/5] qemu-iotests: Overlapping cluster allocations
  2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
                   ` (3 preceding siblings ...)
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check Max Reitz
@ 2013-08-28 14:55 ` Max Reitz
  4 siblings, 0 replies; 15+ messages in thread
From: Max Reitz @ 2013-08-28 14:55 UTC (permalink / raw)
  To: qemu-devel; +Cc: Kevin Wolf, Stefan Hajnoczi, Max Reitz

A new test on corrupted images with overlapping cluster allocations.

Signed-off-by: Max Reitz <mreitz@redhat.com>
---
 tests/qemu-iotests/060     | 111 +++++++++++++++++++++++++++++++++++++++++++++
 tests/qemu-iotests/060.out |  37 +++++++++++++++
 tests/qemu-iotests/group   |   1 +
 3 files changed, 149 insertions(+)
 create mode 100755 tests/qemu-iotests/060
 create mode 100644 tests/qemu-iotests/060.out

diff --git a/tests/qemu-iotests/060 b/tests/qemu-iotests/060
new file mode 100755
index 0000000..65bb09f
--- /dev/null
+++ b/tests/qemu-iotests/060
@@ -0,0 +1,111 @@
+#!/bin/bash
+#
+# Test case for image corruption (overlapping data structures) in qcow2
+#
+# Copyright (C) 2013 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+# creator
+owner=mreitz@redhat.com
+
+seq=`basename $0`
+echo "QA output created by $seq"
+
+here=`pwd`
+tmp=/tmp/$$
+status=1	# failure is the default!
+
+_cleanup()
+{
+	_cleanup_test_img
+}
+trap "_cleanup; exit \$status" 0 1 2 3 15
+
+# get standard environment, filters and checks
+. ./common.rc
+. ./common.filter
+
+# This tests qocw2-specific low-level functionality
+_supported_fmt qcow2
+_supported_proto generic
+_supported_os Linux
+
+rt_offset=65536  # 0x10000 (XXX: just an assumption)
+rb_offset=131072 # 0x20000 (XXX: just an assumption)
+l1_offset=196608 # 0x30000 (XXX: just an assumption)
+l2_offset=262144 # 0x40000 (XXX: just an assumption)
+
+IMGOPTS="compat=1.1"
+
+echo
+echo "=== Testing L2 reference into L1 ==="
+echo
+_make_test_img 64M
+# Link first L1 entry (first L2 table) onto itself
+# (Note the MSb in the L1 entry is set, ensuring the refcount is one - else any
+# later write will result in a COW operation, effectively ruining this attempt
+# on image corruption)
+poke_file "$TEST_IMG" "$l1_offset" "\x80\x00\x00\x00\x00\x03\x00\x00"
+_check_test_img
+
+# The corrupt bit should not be set anyway
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+
+# Try to write something, thereby forcing the corrupt bit to be set
+$QEMU_IO -c "write -P 0x2a 0 512" "$TEST_IMG" | _filter_qemu_io
+
+# The corrupt bit must now be set
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+
+# Try to open the image R/W (which should fail)
+$QEMU_IO -c "read 0 512" "$TEST_IMG" 2>&1 | _filter_qemu_io | sed -e "s/can't open device .*$/can't open device/"
+
+# Try to open it RO (which should succeed)
+$QEMU_IO -c "read 0 512" -r "$TEST_IMG" | _filter_qemu_io
+
+# We could now try to fix the image, but this would probably fail (how should an
+# L2 table linked onto the L1 table be fixed?)
+
+echo
+echo "=== Testing cluster data reference into refcount block ==="
+echo
+_make_test_img 64M
+# Allocate L2 table
+truncate -s "$(($l2_offset+65536))" "$TEST_IMG"
+poke_file "$TEST_IMG" "$l1_offset" "\x80\x00\x00\x00\x00\x04\x00\x00"
+# Mark cluster as used
+poke_file "$TEST_IMG" "$(($rb_offset+8))" "\x00\x01"
+# Redirect new data cluster onto refcount block
+poke_file "$TEST_IMG" "$l2_offset" "\x80\x00\x00\x00\x00\x02\x00\x00"
+_check_test_img
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+$QEMU_IO -c "write -P 0x2a 0 512" "$TEST_IMG" | _filter_qemu_io
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+
+# Try to fix it
+_check_test_img -r all
+
+# The corrupt bit should be cleared
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+
+# Look if it's really really fixed
+$QEMU_IO -c "write -P 0x2a 0 512" "$TEST_IMG" | _filter_qemu_io
+./qcow2.py "$TEST_IMG" dump-header | grep incompatible_features
+
+# success, all done
+echo "*** done"
+rm -f $seq.full
+status=0
diff --git a/tests/qemu-iotests/060.out b/tests/qemu-iotests/060.out
new file mode 100644
index 0000000..5e484be
--- /dev/null
+++ b/tests/qemu-iotests/060.out
@@ -0,0 +1,37 @@
+QA output created by 060
+
+=== Testing L2 reference into L1 ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+ERROR cluster 3 refcount=1 reference=3
+
+1 errors were found on the image.
+Data may be corrupted, or further writes to the image may corrupt it.
+incompatible_features     0x0
+qcow2: Preventing invalid write on metadata; image marked as corrupt.
+write failed: Input/output error
+incompatible_features     0x2
+qemu-io: can't open device
+no file open, try 'help open'
+read 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+
+=== Testing cluster data reference into refcount block ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864 
+ERROR refcount block 0 refcount=2
+ERROR cluster 2 refcount=1 reference=2
+
+2 errors were found on the image.
+Data may be corrupted, or further writes to the image may corrupt it.
+incompatible_features     0x0
+qcow2: Preventing invalid write on metadata; image marked as corrupt.
+write failed: Input/output error
+incompatible_features     0x2
+Repairing refcount block 0 refcount=2
+No errors were found on the image.
+incompatible_features     0x0
+wrote 512/512 bytes at offset 0
+512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
+incompatible_features     0x0
+*** done
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 43c05d6..0845eb5 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -64,3 +64,4 @@
 055 rw auto
 056 rw auto backing
 059 rw auto
+060 rw auto
-- 
1.8.3.1

^ permalink raw reply related	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit Max Reitz
@ 2013-08-29  8:23   ` Kevin Wolf
  2013-08-29  8:27     ` Max Reitz
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2013-08-29  8:23 UTC (permalink / raw)
  To: Max Reitz; +Cc: qemu-devel, Stefan Hajnoczi

Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
> This adds an incompatible bit indicating corruption to qcow2. Any image
> with this bit set may not be written to unless for repairing (and
> subsequently clearing the bit if the repair has been successful).
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2.c              | 45 +++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.h              |  7 ++++++-
>  docs/specs/qcow2.txt       |  7 ++++++-
>  tests/qemu-iotests/031.out | 12 ++++++------
>  tests/qemu-iotests/036.out |  2 +-
>  5 files changed, 64 insertions(+), 9 deletions(-)

> @@ -402,6 +433,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
>          goto fail;
>      }
>  
> +    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
> +        /* Corrupt images may not be written to unless they are being repaired
> +         */
> +        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
> +            ret = -EACCES;

Perhaps a (q)error_report() call would be appropriate so that the user
isn't confused with only the "Permission denied" message (should it be
EPERM rather than EACCES, too? Or maybe EROFS?)

> +            goto fail;
> +        }
> +    }
> +

Kevin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit
  2013-08-29  8:23   ` Kevin Wolf
@ 2013-08-29  8:27     ` Max Reitz
  2013-08-29  8:57       ` Kevin Wolf
  0 siblings, 1 reply; 15+ messages in thread
From: Max Reitz @ 2013-08-29  8:27 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel, Stefan Hajnoczi

Am 29.08.2013 10:23, schrieb Kevin Wolf:
> Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
>> This adds an incompatible bit indicating corruption to qcow2. Any image
>> with this bit set may not be written to unless for repairing (and
>> subsequently clearing the bit if the repair has been successful).
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   block/qcow2.c              | 45 +++++++++++++++++++++++++++++++++++++++++++++
>>   block/qcow2.h              |  7 ++++++-
>>   docs/specs/qcow2.txt       |  7 ++++++-
>>   tests/qemu-iotests/031.out | 12 ++++++------
>>   tests/qemu-iotests/036.out |  2 +-
>>   5 files changed, 64 insertions(+), 9 deletions(-)
>> @@ -402,6 +433,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
>>           goto fail;
>>       }
>>   
>> +    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
>> +        /* Corrupt images may not be written to unless they are being repaired
>> +         */
>> +        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
>> +            ret = -EACCES;
> Perhaps a (q)error_report() call would be appropriate so that the user
> isn't confused with only the "Permission denied" message
Seems reasonable.
> (should it be
> EPERM rather than EACCES, too? Or maybe EROFS?)
I chose the value based on the following:
$ touch foo
$ chmod -w foo
$ echo 'bar' > foo
zsh: permission denied: foo
(which is EACCES)

EROFS sounds nice, but I wouldn't go for it since it's the image that's 
read-only and not the underlying FS (which I think EROFS is for…?)


Max

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks Max Reitz
@ 2013-08-29  8:51   ` Kevin Wolf
  2013-08-29  8:57     ` Max Reitz
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2013-08-29  8:51 UTC (permalink / raw)
  To: Max Reitz; +Cc: qemu-devel, Stefan Hajnoczi

Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
> Two new functions are added; the first one checks a given range in the
> image file for overlaps with metadata (main header, L1 tables, L2
> tables, refcount table and blocks).
> 
> The second one should be used immediately before writing to the image
> file as it calls the first function and, upon collision, marks the
> image as corrupt and makes the BDS unusable, thereby preventing
> further access.
> 
> Both functions take a bitmask argument specifying the structures which
> should be checked for overlaps, making it possible to also check
> metadata writes against colliding with other structures.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2-refcount.c    | 148 ++++++++++++++++++++++++++++++++++++++++++++++
>  block/qcow2.h             |  28 +++++++++
>  include/monitor/monitor.h |   1 +
>  monitor.c                 |   1 +
>  4 files changed, 178 insertions(+)
> 
> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
> index 1244693..d06a9df 100644
> --- a/block/qcow2-refcount.c
> +++ b/block/qcow2-refcount.c
> @@ -25,6 +25,7 @@
>  #include "qemu-common.h"
>  #include "block/block_int.h"
>  #include "block/qcow2.h"
> +#include "qemu/range.h"
>  
>  static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
>  static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
> @@ -1372,3 +1373,150 @@ fail:
>      return ret;
>  }
>  
> +#define overlaps_with(ofs, sz) \
> +    ranges_overlap(offset, size, \
> +        (ofs) & ~(cluster_mask), \
> +        ((sz) + ((ofs) & (cluster_mask)) + (cluster_mask)) & ~(cluster_mask))

It's not actually necessary to have both ranges rounded to cluster
boundaries, one of them is enough to detect all overlaps. So you can
simplify either this macro or the code below.

> +/*
> + * Checks if the given offset into the image file is actually free to use by
> + * looking for overlaps with important metadata sections (L1/L2 tables etc.),
> + * i.e. a sanity check without relying on the refcount tables.
> + *
> + * The chk parameter specifies exactly what checks to perform (being a bitmask
> + * of QCow2MetadataOverlap values).
> + *
> + * Returns:
> + * - 0 if writing to this offset will not affect the mentioned metadata
> + * - a positive QCow2MetadataOverlap value indicating one overlapping section
> + * - a negative value (-errno) indicating an error while performing a check,
> + *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
> + */
> +int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
> +                                 int64_t size)
> +{
> +    BDRVQcowState *s = bs->opaque;
> +    int64_t cluster_mask = s->cluster_size - 1;
> +    int i, j;
> +
> +    if (!size) {
> +        return 0;
> +    }
> +
> +    if (chk & QCOW2_OL_MAIN_HEADER) {
> +        if (offset < s->cluster_size) {
> +            return QCOW2_OL_MAIN_HEADER;
> +        }
> +    }
> +
> +    size = (size + (offset & cluster_mask) + cluster_mask) & ~cluster_mask;
> +    offset &= ~cluster_mask;

Attempt for improved readability:

size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
offset = start_of_cluster(s, offset);

What do you think?

> +
> +    if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) {
> +        if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) {
> +            return QCOW2_OL_ACTIVE_L1;
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) {
> +        if (overlaps_with(s->refcount_table_offset,
> +            s->refcount_table_size * sizeof(uint64_t))) {
> +            return QCOW2_OL_REFCOUNT_TABLE;
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) {
> +        if (overlaps_with(s->snapshots_offset, s->snapshots_size)) {
> +            return QCOW2_OL_SNAPSHOT_TABLE;
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) {
> +        for (i = 0; i < s->nb_snapshots; i++) {
> +            if (s->snapshots[i].l1_size &&
> +                overlaps_with(s->snapshots[i].l1_table_offset,
> +                s->snapshots[i].l1_size * sizeof(uint64_t))) {
> +                return QCOW2_OL_INACTIVE_L1;
> +            }
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) {
> +        for (i = 0; i < s->l1_size; i++) {
> +            if ((s->l1_table[i] & L1E_OFFSET_MASK) &&
> +                overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK,
> +                s->cluster_size)) {
> +                return QCOW2_OL_ACTIVE_L2;
> +            }
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) {
> +        for (i = 0; i < s->refcount_table_size; i++) {
> +            if ((s->refcount_table[i] & REFT_OFFSET_MASK) &&
> +                overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK,
> +                s->cluster_size)) {
> +                return QCOW2_OL_REFCOUNT_BLOCK;
> +            }
> +        }
> +    }
> +
> +    if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) {
> +        for (i = 0; i < s->nb_snapshots; i++) {
> +            uint64_t l1_ofs = s->snapshots[i].l1_table_offset;
> +            uint32_t l1_sz  = s->snapshots[i].l1_size;
> +            uint64_t *l1 = g_malloc(l1_sz * sizeof(uint64_t));
> +            int ret;
> +
> +            ret = bdrv_read(bs->file, l1_ofs / BDRV_SECTOR_SIZE, (uint8_t *)l1,
> +                            l1_sz * sizeof(uint64_t) / BDRV_SECTOR_SIZE);
> +
> +            if (ret < 0) {
> +                g_free(l1);
> +                return ret;
> +            }
> +
> +            for (j = 0; j < l1_sz; j++) {
> +                if ((l1[j] & L1E_OFFSET_MASK) &&
> +                    overlaps_with(l1[j] & L1E_OFFSET_MASK, s->cluster_size)) {
> +                    g_free(l1);
> +                    return QCOW2_OL_INACTIVE_L2;
> +                }
> +            }
> +
> +            g_free(l1);
> +        }
> +    }
> +
> +    return 0;
> +}
> +
> +/*
> + * First performs a check for metadata overlaps (through
> + * qcow2_check_metadata_overlap); if that fails with a negative value (error
> + * while performing a check), that value is returned. If an impending overlap
> + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt
> + * and -EIO returned.
> + *
> + * Returns 0 if there were neither overlaps nor errors while checking for
> + * overlaps; or a negative value (-errno) on error.
> + */
> +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int chk, int64_t offset,
> +                                  int64_t size)
> +{
> +    int ret = qcow2_check_metadata_overlap(bs, chk, offset, size);
> +
> +    if (ret < 0) {
> +        return ret;
> +    } else if (ret > 0) {
> +        fprintf(stderr, "qcow2: Preventing invalid write on metadata; "
> +                "image marked as corrupt.\n");
> +        bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IMAGE_CORRUPTED,
> +                BDRV_ACTION_REPORT, false);

The dummy BDRV_ACTION_REPORT shows that you're abusing an interface for
something it was never meant for. The additional information of
QEVENT_BLOCK_IO_ERROR doesn't really make sense here, but we have other
information to report.

Let's directly create a JSON message and call monitor_protocol_event()
here. We can tell what kind of metadata would have been overwritten and
at which offset/size this happened, like:

    static const char *metadata_ol_names[] = {
        [QCOW2_OL_MAIN_HEADER]  = "qcow2 header",
        [QCOW2_OL_ACTIVE_L1]    = "active L1 table",
        ...
    };

    assert(ret < ARRAY_SIZE(metadata_ol_names));
    message = g_strdup_printf("Prevented %s overwrite", metadata_ol_names[ret]);
    data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, "
                              "'offset': %" PRId64 ", 'size': %" PRId64 " }",
                              bdrv->device_name, message, offset, size);
    monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);

> +        qcow2_mark_corrupt(bs);
> +        bs->drv = NULL; /* make BDS unusable */
> +        return -EIO;
> +    }
> +
> +    return 0;
> +}

Kevin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks
  2013-08-29  8:51   ` Kevin Wolf
@ 2013-08-29  8:57     ` Max Reitz
  0 siblings, 0 replies; 15+ messages in thread
From: Max Reitz @ 2013-08-29  8:57 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel, Stefan Hajnoczi

Am 29.08.2013 10:51, schrieb Kevin Wolf:
> Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
>> Two new functions are added; the first one checks a given range in the
>> image file for overlaps with metadata (main header, L1 tables, L2
>> tables, refcount table and blocks).
>>
>> The second one should be used immediately before writing to the image
>> file as it calls the first function and, upon collision, marks the
>> image as corrupt and makes the BDS unusable, thereby preventing
>> further access.
>>
>> Both functions take a bitmask argument specifying the structures which
>> should be checked for overlaps, making it possible to also check
>> metadata writes against colliding with other structures.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   block/qcow2-refcount.c    | 148 ++++++++++++++++++++++++++++++++++++++++++++++
>>   block/qcow2.h             |  28 +++++++++
>>   include/monitor/monitor.h |   1 +
>>   monitor.c                 |   1 +
>>   4 files changed, 178 insertions(+)
>>
>> diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
>> index 1244693..d06a9df 100644
>> --- a/block/qcow2-refcount.c
>> +++ b/block/qcow2-refcount.c
>> @@ -25,6 +25,7 @@
>>   #include "qemu-common.h"
>>   #include "block/block_int.h"
>>   #include "block/qcow2.h"
>> +#include "qemu/range.h"
>>   
>>   static int64_t alloc_clusters_noref(BlockDriverState *bs, int64_t size);
>>   static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs,
>> @@ -1372,3 +1373,150 @@ fail:
>>       return ret;
>>   }
>>   
>> +#define overlaps_with(ofs, sz) \
>> +    ranges_overlap(offset, size, \
>> +        (ofs) & ~(cluster_mask), \
>> +        ((sz) + ((ofs) & (cluster_mask)) + (cluster_mask)) & ~(cluster_mask))
> It's not actually necessary to have both ranges rounded to cluster
> boundaries, one of them is enough to detect all overlaps. So you can
> simplify either this macro or the code below.
Right, the macro it is, then.

>> +/*
>> + * Checks if the given offset into the image file is actually free to use by
>> + * looking for overlaps with important metadata sections (L1/L2 tables etc.),
>> + * i.e. a sanity check without relying on the refcount tables.
>> + *
>> + * The chk parameter specifies exactly what checks to perform (being a bitmask
>> + * of QCow2MetadataOverlap values).
>> + *
>> + * Returns:
>> + * - 0 if writing to this offset will not affect the mentioned metadata
>> + * - a positive QCow2MetadataOverlap value indicating one overlapping section
>> + * - a negative value (-errno) indicating an error while performing a check,
>> + *   e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2
>> + */
>> +int qcow2_check_metadata_overlap(BlockDriverState *bs, int chk, int64_t offset,
>> +                                 int64_t size)
>> +{
>> +    BDRVQcowState *s = bs->opaque;
>> +    int64_t cluster_mask = s->cluster_size - 1;
>> +    int i, j;
>> +
>> +    if (!size) {
>> +        return 0;
>> +    }
>> +
>> +    if (chk & QCOW2_OL_MAIN_HEADER) {
>> +        if (offset < s->cluster_size) {
>> +            return QCOW2_OL_MAIN_HEADER;
>> +        }
>> +    }
>> +
>> +    size = (size + (offset & cluster_mask) + cluster_mask) & ~cluster_mask;
>> +    offset &= ~cluster_mask;
> Attempt for improved readability:
>
> size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size);
> offset = start_of_cluster(s, offset);
>
> What do you think?
I like start_of_cluster, but I don't know whether I really like 
align_offset (I just don't like its name) - but it's the right function, 
so I'll go with it.

>> +    } else if (ret > 0) {
>> +        fprintf(stderr, "qcow2: Preventing invalid write on metadata; "
>> +                "image marked as corrupt.\n");
>> +        bdrv_emit_qmp_error_event(bs, QEVENT_BLOCK_IMAGE_CORRUPTED,
>> +                BDRV_ACTION_REPORT, false);
> The dummy BDRV_ACTION_REPORT shows that you're abusing an interface for
> something it was never meant for. The additional information of
> QEVENT_BLOCK_IO_ERROR doesn't really make sense here, but we have other
> information to report.
>
> Let's directly create a JSON message and call monitor_protocol_event()
> here. We can tell what kind of metadata would have been overwritten and
> at which offset/size this happened, like:
>
>      static const char *metadata_ol_names[] = {
>          [QCOW2_OL_MAIN_HEADER]  = "qcow2 header",
>          [QCOW2_OL_ACTIVE_L1]    = "active L1 table",
>          ...
>      };
>
>      assert(ret < ARRAY_SIZE(metadata_ol_names));
>      message = g_strdup_printf("Prevented %s overwrite", metadata_ol_names[ret]);
>      data = qobject_from_jsonf("{ 'device': %s, 'msg': %s, "
>                                "'offset': %" PRId64 ", 'size': %" PRId64 " }",
>                                bdrv->device_name, message, offset, size);
>      monitor_protocol_event(QEVENT_BLOCK_IMAGE_CORRUPTED, data);
Okay, thanks.


Max

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit
  2013-08-29  8:27     ` Max Reitz
@ 2013-08-29  8:57       ` Kevin Wolf
  0 siblings, 0 replies; 15+ messages in thread
From: Kevin Wolf @ 2013-08-29  8:57 UTC (permalink / raw)
  To: Max Reitz; +Cc: qemu-devel, Stefan Hajnoczi

Am 29.08.2013 um 10:27 hat Max Reitz geschrieben:
> Am 29.08.2013 10:23, schrieb Kevin Wolf:
> >Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
> >>This adds an incompatible bit indicating corruption to qcow2. Any image
> >>with this bit set may not be written to unless for repairing (and
> >>subsequently clearing the bit if the repair has been successful).
> >>
> >>Signed-off-by: Max Reitz <mreitz@redhat.com>
> >>---
> >>  block/qcow2.c              | 45 +++++++++++++++++++++++++++++++++++++++++++++
> >>  block/qcow2.h              |  7 ++++++-
> >>  docs/specs/qcow2.txt       |  7 ++++++-
> >>  tests/qemu-iotests/031.out | 12 ++++++------
> >>  tests/qemu-iotests/036.out |  2 +-
> >>  5 files changed, 64 insertions(+), 9 deletions(-)
> >>@@ -402,6 +433,15 @@ static int qcow2_open(BlockDriverState *bs, QDict *options, int flags)
> >>          goto fail;
> >>      }
> >>+    if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) {
> >>+        /* Corrupt images may not be written to unless they are being repaired
> >>+         */
> >>+        if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) {
> >>+            ret = -EACCES;
> >Perhaps a (q)error_report() call would be appropriate so that the user
> >isn't confused with only the "Permission denied" message
> Seems reasonable.
> >(should it be
> >EPERM rather than EACCES, too? Or maybe EROFS?)
> I chose the value based on the following:
> $ touch foo
> $ chmod -w foo
> $ echo 'bar' > foo
> zsh: permission denied: foo
> (which is EACCES)

Well, the point is that when I get EACCES (i.e. a "Permission denied"
message), the first thing I do is to check the file permissions of the
image file, because that's really what EACCES says. So it's kind of
misleading.

> EROFS sounds nice, but I wouldn't go for it since it's the image
> that's read-only and not the underlying FS (which I think EROFS is
> for…?)

Unfortunately errno values are hardly ever a perfect match... I think
I'm less likely to be misled by EROFS because I _know_ that my /home (or
/var, for that matter) isn't read-only.

In the end, it's probably not all that important as long as we print a
clear (q)error_report() message in addition.

Kevin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata overlap checks
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata " Max Reitz
@ 2013-08-29  9:18   ` Kevin Wolf
  2013-08-29  9:20     ` Max Reitz
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2013-08-29  9:18 UTC (permalink / raw)
  To: Max Reitz; +Cc: qemu-devel, Stefan Hajnoczi

Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
> The pre-write overlap check function is now called before most of the
> qcow2 writes (aborting it on collision or other error).
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>
> ---
>  block/qcow2-cache.c    | 17 +++++++++++++++++
>  block/qcow2-cluster.c  | 21 +++++++++++++++++++++
>  block/qcow2-snapshot.c | 22 ++++++++++++++++++++++
>  block/qcow2.c          | 36 +++++++++++++++++++++++++++++++++++-
>  4 files changed, 95 insertions(+), 1 deletion(-)

> @@ -1753,10 +1779,18 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>      BDRVQcowState *s = bs->opaque;
>      int growable = bs->growable;
>      int ret;
> +    int64_t offset = qcow2_vm_state_offset(s) + pos;
>  
>      BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
>      bs->growable = 1;
> -    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
> +
> +    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset,
> +                                        qiov->size);
> +    if (ret < 0) {
> +        return ret;
> +    }
> +
> +    ret = bdrv_pwritev(bs, offset, qiov);
>      bs->growable = growable;
>  
>      return ret;

Sorry for not catching it in v1, this one is actually wrong. The
bdrv_pwritev() is against bs, not bs->file, so you would be comparing
virtual/guest offsets with physical/host offsets.

I think you can simply leave the check out here, the one in
qcow2_co_writev() should already do the right thing inside
bdrv_pwritev().

Kevin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata overlap checks
  2013-08-29  9:18   ` Kevin Wolf
@ 2013-08-29  9:20     ` Max Reitz
  0 siblings, 0 replies; 15+ messages in thread
From: Max Reitz @ 2013-08-29  9:20 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel, Stefan Hajnoczi

Am 29.08.2013 11:18, schrieb Kevin Wolf:
> Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
>> The pre-write overlap check function is now called before most of the
>> qcow2 writes (aborting it on collision or other error).
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
>> ---
>>   block/qcow2-cache.c    | 17 +++++++++++++++++
>>   block/qcow2-cluster.c  | 21 +++++++++++++++++++++
>>   block/qcow2-snapshot.c | 22 ++++++++++++++++++++++
>>   block/qcow2.c          | 36 +++++++++++++++++++++++++++++++++++-
>>   4 files changed, 95 insertions(+), 1 deletion(-)
>> @@ -1753,10 +1779,18 @@ static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov,
>>       BDRVQcowState *s = bs->opaque;
>>       int growable = bs->growable;
>>       int ret;
>> +    int64_t offset = qcow2_vm_state_offset(s) + pos;
>>   
>>       BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE);
>>       bs->growable = 1;
>> -    ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov);
>> +
>> +    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT, offset,
>> +                                        qiov->size);
>> +    if (ret < 0) {
>> +        return ret;
>> +    }
>> +
>> +    ret = bdrv_pwritev(bs, offset, qiov);
>>       bs->growable = growable;
>>   
>>       return ret;
> Sorry for not catching it in v1, this one is actually wrong. The
> bdrv_pwritev() is against bs, not bs->file, so you would be comparing
> virtual/guest offsets with physical/host offsets.
Oh, yes, sorry; thanks for noticing. I do this mistake way too often – I 
think it's time for some regex to catch it…

Max

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check
  2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check Max Reitz
@ 2013-08-29 11:36   ` Kevin Wolf
  2013-08-29 12:09     ` Max Reitz
  0 siblings, 1 reply; 15+ messages in thread
From: Kevin Wolf @ 2013-08-29 11:36 UTC (permalink / raw)
  To: Max Reitz; +Cc: qemu-devel, Stefan Hajnoczi

Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
> The qcow2_check_refcounts function has been extended to be able to fix
> OFLAG_COPIED errors and multiple references on refcount blocks.
> 
> If no corruptions remain after an image repair (and no errors have been
> encountered), clear the corrupt flag in qcow2_check.
> 
> Signed-off-by: Max Reitz <mreitz@redhat.com>

This would be easier to review if you kept the code changes of the
actual check (mostly code movement, I guess) and the repair of each type
of errors in separate patches.

>  block/qcow2-cluster.c  |   4 +-
>  block/qcow2-refcount.c | 249 ++++++++++++++++++++++++++++++++++++++++++-------
>  block/qcow2.c          |   6 +-
>  block/qcow2.h          |   1 +
>  include/block/block.h  |   1 +
>  5 files changed, 222 insertions(+), 39 deletions(-)

> @@ -1240,7 +1243,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>  {
>      BDRVQcowState *s = bs->opaque;
>      int64_t size, i, highest_cluster;
> -    int nb_clusters, refcount1, refcount2;
> +    uint64_t *l2_table = NULL;
> +    int nb_clusters, refcount1, refcount2, j;
>      QCowSnapshot *sn;
>      uint16_t *refcount_table;
>      int ret;
> @@ -1305,10 +1309,85 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>              inc_refcounts(bs, res, refcount_table, nb_clusters,
>                  offset, s->cluster_size);
>              if (refcount_table[cluster] != 1) {
> -                fprintf(stderr, "ERROR refcount block %" PRId64
> +                fprintf(stderr, "%s refcount block %" PRId64
>                      " refcount=%d\n",
> +                    fix & BDRV_FIX_ERRORS ? "Repairing" :
> +                                            "ERROR",
>                      i, refcount_table[cluster]);
> -                res->corruptions++;
> +                if (fix & BDRV_FIX_ERRORS) {

This is a quite long if block. Probably worth splitting into its own
function. (The res->corruptions++ part could be in a common fail: part
then.)

> +                    int64_t new_offset;
> +                    void *refcount_block;
> +
> +                    /* allocate new refcount block */
> +                    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);

How trustworthy is qcow2_alloc_clusters() when we know that our refcount
information is corrupted? Couldn't we end up accidentally overwriting
things?

> +                    if (new_offset < 0) {
> +                        fprintf(stderr, "Could not allocate new cluster\n");
> +                        res->corruptions++;
> +                        continue;
> +                    }
> +                    /* fetch current content */
> +                    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset,
> +                            &refcount_block);
> +                    if (ret < 0) {
> +                        fprintf(stderr, "Could not fetch refcount block\n");

strerror(-ret) would be useful information in almost all of the error
cases.

> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
> +                                QCOW2_DISCARD_ALWAYS);
> +                        res->corruptions++;
> +                        continue;
> +                    }
> +                    /* new block has not yet been entered into refcount table,
> +                     * therefore it is no refcount block yet (regarding this
> +                     * check) */
> +                    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
> +                            new_offset, s->cluster_sectors * BDRV_SECTOR_SIZE);
> +                    if (ret < 0) {
> +                        fprintf(stderr, "Could not write refcount block (would "
> +                                "overlap with existing metadata)\n");
> +                        /* the image will be marked corrupt here, so don't even
> +                         * attempt on freeing the cluster */
> +                        res->corruptions++;
> +                        goto fail;
> +                    }
> +                    /* write to new block */
> +                    ret = bdrv_write(bs->file, new_offset >> BDRV_SECTOR_BITS,
> +                            refcount_block, s->cluster_sectors);
> +                    if (ret < 0) {
> +                        fprintf(stderr, "Could not write refcount block\n");
> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
> +                                QCOW2_DISCARD_ALWAYS);
> +                        res->corruptions++;
> +                        continue;
> +                    }
> +                    /* update refcount table */
> +                    assert(!(new_offset & (s->cluster_size - 1)));
> +                    s->refcount_table[i] = new_offset;
> +                    ret = write_reftable_entry(bs, i);
> +                    if (ret < 0) {
> +                        fprintf(stderr, "Could not update refcount table\n");
> +                        s->refcount_table[i] = offset;
> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
> +                                QCOW2_DISCARD_ALWAYS);
> +                        res->corruptions++;
> +                        continue;
> +                    }
> +                    qcow2_cache_put(bs, s->refcount_block_cache,
> +                            &refcount_block);
> +                    /* update refcounts */
> +                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
> +                        /* increase refcount_table size if necessary */
> +                        int old_nb_clusters = nb_clusters;
> +                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
> +                        refcount_table = g_realloc(refcount_table,
> +                                nb_clusters * sizeof(uint16_t));
> +                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
> +                                - old_nb_clusters) * sizeof(uint16_t));
> +                    }
> +                    refcount_table[cluster]--;
> +                    inc_refcounts(bs, res, refcount_table, nb_clusters,
> +                            new_offset, s->cluster_size);

res->corruptions_fixed++?

> +                } else {
> +                    res->corruptions++;
> +                }
>              }
>          }
>      }
> @@ -1364,10 +1443,108 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>          }
>      }
>  
> +    l2_table = g_malloc(s->l2_size * sizeof(uint64_t));

qemu_blockalign() is better than g_malloc() because it avoids using a
bounce buffer for O_DIRECT images.

> +
> +    /* check OFLAG_COPIED */
> +    for (i = 0; i < s->l1_size; i++) {
> +        uint64_t l1_entry = s->l1_table[i];
> +        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
> +        bool l2_dirty = false;
> +        int refcount;
> +
> +        if (!l2_offset) {
> +            continue;
> +        }
> +
> +        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
> +        if (refcount < 0) {
> +            /* don't print message nor increment check_errors, since the above
> +             * loop will have done this already */
> +            continue;
> +        }
> +        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
> +            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_entry=%" PRIx64
> +                    " refcount=%d\n",
> +                    fix & BDRV_FIX_ERRORS ? "Repairing" :
> +                                            "ERROR",
> +                    l1_entry, refcount);
> +            if (fix & BDRV_FIX_ERRORS) {
> +                s->l1_table[i] = refcount == 1
> +                               ? l1_entry |  QCOW_OFLAG_COPIED
> +                               : l1_entry & ~QCOW_OFLAG_COPIED;
> +                ret = qcow2_write_l1_entry(bs, i);
> +                if (ret < 0) {
> +                    res->check_errors++;
> +                    goto fail;
> +                }

else res->corruptions_fixed++?

> +            } else {
> +                res->corruptions++;
> +            }
> +        }
> +
> +        ret = bdrv_pread(bs->file, l2_offset, l2_table,
> +                s->l2_size * sizeof(uint64_t));
> +        if (ret != s->l2_size * sizeof(uint64_t)) {
> +            fprintf(stderr, "ERROR: Could not read L2 table\n");
> +            res->check_errors++;
> +            if (ret >= 0) {
> +                ret = -EIO;
> +            }

Doesn't happen. You can just check ret < 0 instead of ret != ... in the
first place, bdrv_pread() never returns short reads.

> +            goto fail;
> +        }
> +
> +        for (j = 0; j < s->l2_size; j++) {
> +            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
> +            uint64_t data_offset;
> +
> +            if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_NORMAL) {
> +                continue;
> +            }
> +
> +            data_offset = l2_entry & L2E_OFFSET_MASK;
> +
> +            refcount = get_refcount(bs, data_offset >> s->cluster_bits);
> +            if (refcount < 0) {
> +                /* don't print message nor increment check_errors */

Why?

> +                continue;
> +            }
> +            if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
> +                fprintf(stderr, "%s OFLAG_COPIED data cluster: l2_entry=%"
> +                        PRIx64 " refcount=%d\n",
> +                        fix & BDRV_FIX_ERRORS ? "Repairing" :
> +                                                "ERROR",
> +                        l2_entry, refcount);
> +                if (fix & BDRV_FIX_ERRORS) {
> +                    l2_table[j] = cpu_to_be64(refcount == 1
> +                                ? l2_entry |  QCOW_OFLAG_COPIED
> +                                : l2_entry & ~QCOW_OFLAG_COPIED);
> +                    l2_dirty = true;

res->corruptions_fixed++

> +                } else {
> +                    res->corruptions++;
> +                }
> +            }
> +        }
> +
> +        if (l2_dirty) {
> +            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
> +                    s->l2_size * sizeof(uint64_t));
> +            if (ret != s->l2_size * sizeof(uint64_t)) {
> +                fprintf(stderr, "ERROR: Could not write L2 table\n");
> +                res->check_errors++;
> +                if (ret >= 0) {
> +                    ret = -EIO;
> +                }

bdrv_pwrite() also doesn't return short writes.

> +                goto fail;
> +            }
> +        }
> +    }
> +
> +
>      res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
>      ret = 0;
>  
>  fail:
> +    g_free(l2_table);
>      g_free(refcount_table);
>  
>      return ret;

Kevin

^ permalink raw reply	[flat|nested] 15+ messages in thread

* Re: [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check
  2013-08-29 11:36   ` Kevin Wolf
@ 2013-08-29 12:09     ` Max Reitz
  0 siblings, 0 replies; 15+ messages in thread
From: Max Reitz @ 2013-08-29 12:09 UTC (permalink / raw)
  To: Kevin Wolf; +Cc: qemu-devel, Stefan Hajnoczi

Am 29.08.2013 13:36, schrieb Kevin Wolf:
> Am 28.08.2013 um 16:55 hat Max Reitz geschrieben:
>> The qcow2_check_refcounts function has been extended to be able to fix
>> OFLAG_COPIED errors and multiple references on refcount blocks.
>>
>> If no corruptions remain after an image repair (and no errors have been
>> encountered), clear the corrupt flag in qcow2_check.
>>
>> Signed-off-by: Max Reitz <mreitz@redhat.com>
> This would be easier to review if you kept the code changes of the
> actual check (mostly code movement, I guess) and the repair of each type
> of errors in separate patches.
>
>>   block/qcow2-cluster.c  |   4 +-
>>   block/qcow2-refcount.c | 249 ++++++++++++++++++++++++++++++++++++++++++-------
>>   block/qcow2.c          |   6 +-
>>   block/qcow2.h          |   1 +
>>   include/block/block.h  |   1 +
>>   5 files changed, 222 insertions(+), 39 deletions(-)
>> @@ -1240,7 +1243,8 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>>   {
>>       BDRVQcowState *s = bs->opaque;
>>       int64_t size, i, highest_cluster;
>> -    int nb_clusters, refcount1, refcount2;
>> +    uint64_t *l2_table = NULL;
>> +    int nb_clusters, refcount1, refcount2, j;
>>       QCowSnapshot *sn;
>>       uint16_t *refcount_table;
>>       int ret;
>> @@ -1305,10 +1309,85 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>>               inc_refcounts(bs, res, refcount_table, nb_clusters,
>>                   offset, s->cluster_size);
>>               if (refcount_table[cluster] != 1) {
>> -                fprintf(stderr, "ERROR refcount block %" PRId64
>> +                fprintf(stderr, "%s refcount block %" PRId64
>>                       " refcount=%d\n",
>> +                    fix & BDRV_FIX_ERRORS ? "Repairing" :
>> +                                            "ERROR",
>>                       i, refcount_table[cluster]);
>> -                res->corruptions++;
>> +                if (fix & BDRV_FIX_ERRORS) {
> This is a quite long if block. Probably worth splitting into its own
> function. (The res->corruptions++ part could be in a common fail: part
> then.)
Seems reasonable.

>> +                    int64_t new_offset;
>> +                    void *refcount_block;
>> +
>> +                    /* allocate new refcount block */
>> +                    new_offset = qcow2_alloc_clusters(bs, s->cluster_size);
> How trustworthy is qcow2_alloc_clusters() when we know that our refcount
> information is corrupted? Couldn't we end up accidentally overwriting
> things?
I personally don't really see any other way to do this than to just fail 
without any attempt on repairing. If the refcounts are completely 
broken, I don't see a reasonable way of allocating a free cluster. 
Furthermore, the following pre-write check should at least prevent this 
from overwriting any metadata.

>> +                    if (new_offset < 0) {
>> +                        fprintf(stderr, "Could not allocate new cluster\n");
>> +                        res->corruptions++;
>> +                        continue;
>> +                    }
>> +                    /* fetch current content */
>> +                    ret = qcow2_cache_get(bs, s->refcount_block_cache, offset,
>> +                            &refcount_block);
>> +                    if (ret < 0) {
>> +                        fprintf(stderr, "Could not fetch refcount block\n");
> strerror(-ret) would be useful information in almost all of the error
> cases.
Yes, right.

>> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
>> +                                QCOW2_DISCARD_ALWAYS);
>> +                        res->corruptions++;
>> +                        continue;
>> +                    }
>> +                    /* new block has not yet been entered into refcount table,
>> +                     * therefore it is no refcount block yet (regarding this
>> +                     * check) */
>> +                    ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_DEFAULT,
>> +                            new_offset, s->cluster_sectors * BDRV_SECTOR_SIZE);
>> +                    if (ret < 0) {
>> +                        fprintf(stderr, "Could not write refcount block (would "
>> +                                "overlap with existing metadata)\n");
>> +                        /* the image will be marked corrupt here, so don't even
>> +                         * attempt on freeing the cluster */
>> +                        res->corruptions++;
>> +                        goto fail;
>> +                    }
>> +                    /* write to new block */
>> +                    ret = bdrv_write(bs->file, new_offset >> BDRV_SECTOR_BITS,
>> +                            refcount_block, s->cluster_sectors);
>> +                    if (ret < 0) {
>> +                        fprintf(stderr, "Could not write refcount block\n");
>> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
>> +                                QCOW2_DISCARD_ALWAYS);
>> +                        res->corruptions++;
>> +                        continue;
>> +                    }
>> +                    /* update refcount table */
>> +                    assert(!(new_offset & (s->cluster_size - 1)));
>> +                    s->refcount_table[i] = new_offset;
>> +                    ret = write_reftable_entry(bs, i);
>> +                    if (ret < 0) {
>> +                        fprintf(stderr, "Could not update refcount table\n");
>> +                        s->refcount_table[i] = offset;
>> +                        qcow2_free_clusters(bs, new_offset, s->cluster_size,
>> +                                QCOW2_DISCARD_ALWAYS);
>> +                        res->corruptions++;
>> +                        continue;
>> +                    }
>> +                    qcow2_cache_put(bs, s->refcount_block_cache,
>> +                            &refcount_block);
>> +                    /* update refcounts */
>> +                    if ((new_offset >> s->cluster_bits) >= nb_clusters) {
>> +                        /* increase refcount_table size if necessary */
>> +                        int old_nb_clusters = nb_clusters;
>> +                        nb_clusters = (new_offset >> s->cluster_bits) + 1;
>> +                        refcount_table = g_realloc(refcount_table,
>> +                                nb_clusters * sizeof(uint16_t));
>> +                        memset(&refcount_table[old_nb_clusters], 0, (nb_clusters
>> +                                - old_nb_clusters) * sizeof(uint16_t));
>> +                    }
>> +                    refcount_table[cluster]--;
>> +                    inc_refcounts(bs, res, refcount_table, nb_clusters,
>> +                            new_offset, s->cluster_size);
> res->corruptions_fixed++?
Oh, yes, I forgot that.

>> +                } else {
>> +                    res->corruptions++;
>> +                }
>>               }
>>           }
>>       }
>> @@ -1364,10 +1443,108 @@ int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res,
>>           }
>>       }
>>   
>> +    l2_table = g_malloc(s->l2_size * sizeof(uint64_t));
> qemu_blockalign() is better than g_malloc() because it avoids using a
> bounce buffer for O_DIRECT images.
Ah, okay. I'll include that in my other series as well.

>> +
>> +    /* check OFLAG_COPIED */
>> +    for (i = 0; i < s->l1_size; i++) {
>> +        uint64_t l1_entry = s->l1_table[i];
>> +        uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK;
>> +        bool l2_dirty = false;
>> +        int refcount;
>> +
>> +        if (!l2_offset) {
>> +            continue;
>> +        }
>> +
>> +        refcount = get_refcount(bs, l2_offset >> s->cluster_bits);
>> +        if (refcount < 0) {
>> +            /* don't print message nor increment check_errors, since the above
>> +             * loop will have done this already */
>> +            continue;
>> +        }
>> +        if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) {
>> +            fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_entry=%" PRIx64
>> +                    " refcount=%d\n",
>> +                    fix & BDRV_FIX_ERRORS ? "Repairing" :
>> +                                            "ERROR",
>> +                    l1_entry, refcount);
>> +            if (fix & BDRV_FIX_ERRORS) {
>> +                s->l1_table[i] = refcount == 1
>> +                               ? l1_entry |  QCOW_OFLAG_COPIED
>> +                               : l1_entry & ~QCOW_OFLAG_COPIED;
>> +                ret = qcow2_write_l1_entry(bs, i);
>> +                if (ret < 0) {
>> +                    res->check_errors++;
>> +                    goto fail;
>> +                }
> else res->corruptions_fixed++?
>
>> +            } else {
>> +                res->corruptions++;
>> +            }
>> +        }
>> +
>> +        ret = bdrv_pread(bs->file, l2_offset, l2_table,
>> +                s->l2_size * sizeof(uint64_t));
>> +        if (ret != s->l2_size * sizeof(uint64_t)) {
>> +            fprintf(stderr, "ERROR: Could not read L2 table\n");
>> +            res->check_errors++;
>> +            if (ret >= 0) {
>> +                ret = -EIO;
>> +            }
> Doesn't happen. You can just check ret < 0 instead of ret != ... in the
> first place, bdrv_pread() never returns short reads.
Okay, I think I just saw it that way in some other code.

>> +            goto fail;
>> +        }
>> +
>> +        for (j = 0; j < s->l2_size; j++) {
>> +            uint64_t l2_entry = be64_to_cpu(l2_table[j]);
>> +            uint64_t data_offset;
>> +
>> +            if (qcow2_get_cluster_type(l2_entry) != QCOW2_CLUSTER_NORMAL) {
>> +                continue;
>> +            }
>> +
>> +            data_offset = l2_entry & L2E_OFFSET_MASK;
>> +
>> +            refcount = get_refcount(bs, data_offset >> s->cluster_bits);
>> +            if (refcount < 0) {
>> +                /* don't print message nor increment check_errors */
> Why?
I didn't want to duplicate the whole comment from above, but maybe it 
makes sense if it's not obvious:

/* don't print message nor increment check_errors, since the above
  * loop will have done this already */

>> +                continue;
>> +            }
>> +            if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) {
>> +                fprintf(stderr, "%s OFLAG_COPIED data cluster: l2_entry=%"
>> +                        PRIx64 " refcount=%d\n",
>> +                        fix & BDRV_FIX_ERRORS ? "Repairing" :
>> +                                                "ERROR",
>> +                        l2_entry, refcount);
>> +                if (fix & BDRV_FIX_ERRORS) {
>> +                    l2_table[j] = cpu_to_be64(refcount == 1
>> +                                ? l2_entry |  QCOW_OFLAG_COPIED
>> +                                : l2_entry & ~QCOW_OFLAG_COPIED);
>> +                    l2_dirty = true;
> res->corruptions_fixed++
>
>> +                } else {
>> +                    res->corruptions++;
>> +                }
>> +            }
>> +        }
>> +
>> +        if (l2_dirty) {
>> +            ret = bdrv_pwrite(bs->file, l2_offset, l2_table,
>> +                    s->l2_size * sizeof(uint64_t));
>> +            if (ret != s->l2_size * sizeof(uint64_t)) {
>> +                fprintf(stderr, "ERROR: Could not write L2 table\n");
>> +                res->check_errors++;
>> +                if (ret >= 0) {
>> +                    ret = -EIO;
>> +                }
> bdrv_pwrite() also doesn't return short writes.
>
>> +                goto fail;
>> +            }
>> +        }
>> +    }
>> +
>> +
>>       res->image_end_offset = (highest_cluster + 1) * s->cluster_size;
>>       ret = 0;
>>   
>>   fail:
>> +    g_free(l2_table);
>>       g_free(refcount_table);
>>   
>>       return ret;
> Kevin
Max

^ permalink raw reply	[flat|nested] 15+ messages in thread

end of thread, other threads:[~2013-08-29 12:10 UTC | newest]

Thread overview: 15+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2013-08-28 14:55 [Qemu-devel] [PATCH v2 0/5] qcow2: Add metadata overlap checks Max Reitz
2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 1/5] qcow2: Add corrupt bit Max Reitz
2013-08-29  8:23   ` Kevin Wolf
2013-08-29  8:27     ` Max Reitz
2013-08-29  8:57       ` Kevin Wolf
2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 2/5] qcow2: Metadata overlap checks Max Reitz
2013-08-29  8:51   ` Kevin Wolf
2013-08-29  8:57     ` Max Reitz
2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 3/5] qcow2: Employ metadata " Max Reitz
2013-08-29  9:18   ` Kevin Wolf
2013-08-29  9:20     ` Max Reitz
2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 4/5] qcow2: More complete consistency check Max Reitz
2013-08-29 11:36   ` Kevin Wolf
2013-08-29 12:09     ` Max Reitz
2013-08-28 14:55 ` [Qemu-devel] [PATCH v2 5/5] qemu-iotests: Overlapping cluster allocations Max Reitz

This is an external index of several public inboxes,
see mirroring instructions on how to clone and mirror
all data and code used by this external index.