From: Sasha Levin <Alexander.Levin@microsoft.com>
To: "stable@vger.kernel.org" <stable@vger.kernel.org>,
"linux-kernel@vger.kernel.org" <linux-kernel@vger.kernel.org>
Cc: Joe Thornber <ejt@redhat.com>, Mike Snitzer <snitzer@redhat.com>,
Sasha Levin <Alexander.Levin@microsoft.com>
Subject: [PATCH AUTOSEL 4.14 25/37] dm thin metadata: try to avoid ever aborting transactions
Date: Mon, 1 Oct 2018 00:39:08 +0000 [thread overview]
Message-ID: <20181001003850.147107-25-alexander.levin@microsoft.com> (raw)
In-Reply-To: <20181001003850.147107-1-alexander.levin@microsoft.com>
From: Joe Thornber <ejt@redhat.com>
[ Upstream commit 3ab91828166895600efd9cdc3a0eb32001f7204a ]
Committing a transaction can consume some metadata of its own; we now
reserve a small amount of metadata to cover this. Free metadata
reported by the kernel will not include this reserve.
If any of the reserve has been used after a commit we enter a new
internal state PM_OUT_OF_METADATA_SPACE. This is reported as
PM_READ_ONLY, so no userland changes are needed. If the metadata
device is resized the pool will move back to PM_WRITE.
These changes mean we never need to abort and rollback a transaction due
to running out of metadata space. This is particularly important
because there have been a handful of reports of data corruption against
DM thin-provisioning that can all be attributed to the thin-pool having
run out of metadata space.
Signed-off-by: Joe Thornber <ejt@redhat.com>
Signed-off-by: Mike Snitzer <snitzer@redhat.com>
Signed-off-by: Sasha Levin <alexander.levin@microsoft.com>
---
drivers/md/dm-thin-metadata.c | 36 ++++++++++++++++-
drivers/md/dm-thin.c | 73 +++++++++++++++++++++++++++++++----
2 files changed, 100 insertions(+), 9 deletions(-)
diff --git a/drivers/md/dm-thin-metadata.c b/drivers/md/dm-thin-metadata.c
index 36ef284ad086..9f621b0c0e17 100644
--- a/drivers/md/dm-thin-metadata.c
+++ b/drivers/md/dm-thin-metadata.c
@@ -188,6 +188,12 @@ struct dm_pool_metadata {
unsigned long flags;
sector_t data_block_size;
+ /*
+ * We reserve a section of the metadata for commit overhead.
+ * All reported space does *not* include this.
+ */
+ dm_block_t metadata_reserve;
+
/*
* Set if a transaction has to be aborted but the attempt to roll back
* to the previous (good) transaction failed. The only pool metadata
@@ -825,6 +831,22 @@ static int __commit_transaction(struct dm_pool_metadata *pmd)
return dm_tm_commit(pmd->tm, sblock);
}
+static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
+{
+ int r;
+ dm_block_t total;
+ dm_block_t max_blocks = 4096; /* 16M */
+
+ r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
+ if (r) {
+ DMERR("could not get size of metadata device");
+ pmd->metadata_reserve = max_blocks;
+ } else {
+ sector_div(total, 10);
+ pmd->metadata_reserve = min(max_blocks, total);
+ }
+}
+
struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
sector_t data_block_size,
bool format_device)
@@ -858,6 +880,8 @@ struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
return ERR_PTR(r);
}
+ __set_metadata_reserve(pmd);
+
return pmd;
}
@@ -1829,6 +1853,13 @@ int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
down_read(&pmd->root_lock);
if (!pmd->fail_io)
r = dm_sm_get_nr_free(pmd->metadata_sm, result);
+
+ if (!r) {
+ if (*result < pmd->metadata_reserve)
+ *result = 0;
+ else
+ *result -= pmd->metadata_reserve;
+ }
up_read(&pmd->root_lock);
return r;
@@ -1941,8 +1972,11 @@ int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_cou
int r = -EINVAL;
down_write(&pmd->root_lock);
- if (!pmd->fail_io)
+ if (!pmd->fail_io) {
r = __resize_space_map(pmd->metadata_sm, new_count);
+ if (!r)
+ __set_metadata_reserve(pmd);
+ }
up_write(&pmd->root_lock);
return r;
diff --git a/drivers/md/dm-thin.c b/drivers/md/dm-thin.c
index 6cf9ad4e4e16..699c40c7fe60 100644
--- a/drivers/md/dm-thin.c
+++ b/drivers/md/dm-thin.c
@@ -200,7 +200,13 @@ struct dm_thin_new_mapping;
enum pool_mode {
PM_WRITE, /* metadata may be changed */
PM_OUT_OF_DATA_SPACE, /* metadata may be changed, though data may not be allocated */
+
+ /*
+ * Like READ_ONLY, except may switch back to WRITE on metadata resize. Reported as READ_ONLY.
+ */
+ PM_OUT_OF_METADATA_SPACE,
PM_READ_ONLY, /* metadata may not be changed */
+
PM_FAIL, /* all I/O fails */
};
@@ -1382,7 +1388,35 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode);
static void requeue_bios(struct pool *pool);
-static void check_for_space(struct pool *pool)
+static bool is_read_only_pool_mode(enum pool_mode mode)
+{
+ return (mode == PM_OUT_OF_METADATA_SPACE || mode == PM_READ_ONLY);
+}
+
+static bool is_read_only(struct pool *pool)
+{
+ return is_read_only_pool_mode(get_pool_mode(pool));
+}
+
+static void check_for_metadata_space(struct pool *pool)
+{
+ int r;
+ const char *ooms_reason = NULL;
+ dm_block_t nr_free;
+
+ r = dm_pool_get_free_metadata_block_count(pool->pmd, &nr_free);
+ if (r)
+ ooms_reason = "Could not get free metadata blocks";
+ else if (!nr_free)
+ ooms_reason = "No free metadata blocks";
+
+ if (ooms_reason && !is_read_only(pool)) {
+ DMERR("%s", ooms_reason);
+ set_pool_mode(pool, PM_OUT_OF_METADATA_SPACE);
+ }
+}
+
+static void check_for_data_space(struct pool *pool)
{
int r;
dm_block_t nr_free;
@@ -1408,14 +1442,16 @@ static int commit(struct pool *pool)
{
int r;
- if (get_pool_mode(pool) >= PM_READ_ONLY)
+ if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE)
return -EINVAL;
r = dm_pool_commit_metadata(pool->pmd);
if (r)
metadata_operation_failed(pool, "dm_pool_commit_metadata", r);
- else
- check_for_space(pool);
+ else {
+ check_for_metadata_space(pool);
+ check_for_data_space(pool);
+ }
return r;
}
@@ -1481,6 +1517,19 @@ static int alloc_data_block(struct thin_c *tc, dm_block_t *result)
return r;
}
+ r = dm_pool_get_free_metadata_block_count(pool->pmd, &free_blocks);
+ if (r) {
+ metadata_operation_failed(pool, "dm_pool_get_free_metadata_block_count", r);
+ return r;
+ }
+
+ if (!free_blocks) {
+ /* Let's commit before we use up the metadata reserve. */
+ r = commit(pool);
+ if (r)
+ return r;
+ }
+
return 0;
}
@@ -1512,6 +1561,7 @@ static blk_status_t should_error_unserviceable_bio(struct pool *pool)
case PM_OUT_OF_DATA_SPACE:
return pool->pf.error_if_no_space ? BLK_STS_NOSPC : 0;
+ case PM_OUT_OF_METADATA_SPACE:
case PM_READ_ONLY:
case PM_FAIL:
return BLK_STS_IOERR;
@@ -2475,8 +2525,9 @@ static void set_pool_mode(struct pool *pool, enum pool_mode new_mode)
error_retry_list(pool);
break;
+ case PM_OUT_OF_METADATA_SPACE:
case PM_READ_ONLY:
- if (old_mode != new_mode)
+ if (!is_read_only_pool_mode(old_mode))
notify_of_pool_mode_change(pool, "read-only");
dm_pool_metadata_read_only(pool->pmd);
pool->process_bio = process_bio_read_only;
@@ -3412,6 +3463,10 @@ static int maybe_resize_metadata_dev(struct dm_target *ti, bool *need_commit)
DMINFO("%s: growing the metadata device from %llu to %llu blocks",
dm_device_name(pool->pool_md),
sb_metadata_dev_size, metadata_dev_size);
+
+ if (get_pool_mode(pool) == PM_OUT_OF_METADATA_SPACE)
+ set_pool_mode(pool, PM_WRITE);
+
r = dm_pool_resize_metadata_dev(pool->pmd, metadata_dev_size);
if (r) {
metadata_operation_failed(pool, "dm_pool_resize_metadata_dev", r);
@@ -3715,7 +3770,7 @@ static int pool_message(struct dm_target *ti, unsigned argc, char **argv)
struct pool_c *pt = ti->private;
struct pool *pool = pt->pool;
- if (get_pool_mode(pool) >= PM_READ_ONLY) {
+ if (get_pool_mode(pool) >= PM_OUT_OF_METADATA_SPACE) {
DMERR("%s: unable to service pool target messages in READ_ONLY or FAIL mode",
dm_device_name(pool->pool_md));
return -EOPNOTSUPP;
@@ -3789,6 +3844,7 @@ static void pool_status(struct dm_target *ti, status_type_t type,
dm_block_t nr_blocks_data;
dm_block_t nr_blocks_metadata;
dm_block_t held_root;
+ enum pool_mode mode;
char buf[BDEVNAME_SIZE];
char buf2[BDEVNAME_SIZE];
struct pool_c *pt = ti->private;
@@ -3859,9 +3915,10 @@ static void pool_status(struct dm_target *ti, status_type_t type,
else
DMEMIT("- ");
- if (pool->pf.mode == PM_OUT_OF_DATA_SPACE)
+ mode = get_pool_mode(pool);
+ if (mode == PM_OUT_OF_DATA_SPACE)
DMEMIT("out_of_data_space ");
- else if (pool->pf.mode == PM_READ_ONLY)
+ else if (is_read_only_pool_mode(mode))
DMEMIT("ro ");
else
DMEMIT("rw ");
--
2.17.1
next prev parent reply other threads:[~2018-10-01 0:40 UTC|newest]
Thread overview: 37+ messages / expand[flat|nested] mbox.gz Atom feed top
2018-10-01 0:38 [PATCH AUTOSEL 4.14 01/37] netfilter: xt_cluster: add dependency on conntrack module Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 02/37] HID: add support for Apple Magic Keyboards Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 03/37] usb: gadget: fotg210-udc: Fix memory leak of fotg210->ep[i] Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 05/37] HID: hid-saitek: Add device ID for RAT 7 Contagion Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 04/37] pinctrl: msm: Really mask level interrupts to prevent latching Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 06/37] scsi: iscsi: target: Set conn->sess to NULL when iscsi_login_set_conn_values fails Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 07/37] scsi: qedi: Add the CRC size within iSCSI NVM image Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 08/37] perf evsel: Fix potential null pointer dereference in perf_evsel__new_idx() Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 10/37] perf probe powerpc: Ignore SyS symbols irrespective of endianness Sasha Levin
2018-10-01 0:38 ` [PATCH AUTOSEL 4.14 09/37] perf util: Fix bad memory access in trace info Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 11/37] netfilter: nf_tables: release chain in flushing set Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 12/37] Revert "iio: temperature: maxim_thermocouple: add MAX31856 part" Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 13/37] RDMA/ucma: check fd type in ucma_migrate_id() Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 14/37] HID: sensor-hub: Restore fixup for Lenovo ThinkPad Helix 2 sensor hub report Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 15/37] USB: yurex: Check for truncation in yurex_read() Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 16/37] nvmet-rdma: fix possible bogus dereference under heavy load Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 17/37] net/mlx5: Consider PCI domain in search for next dev Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 18/37] drm/nouveau/TBDdevinit: don't fail when PMU/PRE_OS is missing from VBIOS Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 20/37] dm raid: fix rebuild of specific devices by updating superblock Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 19/37] drm/nouveau/disp: fix DP disable race Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 22/37] net: ena: fix driver when PAGE_SIZE == 64kB Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 21/37] fs/cifs: suppress a string overflow warning Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 23/37] net: ena: fix missing calls to READ_ONCE Sasha Levin
2018-10-01 0:39 ` Sasha Levin [this message]
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 24/37] perf/x86/intel: Add support/quirk for the MISPREDICT bit on Knights Landing CPUs Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 26/37] netfilter: conntrack: timeout interface depend on CONFIG_NF_CONNTRACK_TIMEOUT Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 28/37] hexagon: modify ffs() and fls() to return int Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 27/37] arch/hexagon: fix kernel/dma.c build warning Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 29/37] arm64: jump_label.h: use asm_volatile_goto macro instead of "asm goto" Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 30/37] drm/amdgpu: fix error handling in amdgpu_cs_user_fence_chunk Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 31/37] r8169: Clear RTL_FLAG_TASK_*_PENDING when clearing RTL_FLAG_TASK_ENABLED Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 32/37] s390/qeth: use vzalloc for QUERY OAT buffer Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 33/37] s390/qeth: don't dump past end of unknown HW header Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 34/37] cifs: read overflow in is_valid_oplock_break() Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 35/37] xen/manage: don't complain about an empty value in control/sysrq node Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 37/37] xen: fix GCC warning and remove duplicate EVTCHN_ROW/EVTCHN_COL usage Sasha Levin
2018-10-01 0:39 ` [PATCH AUTOSEL 4.14 36/37] xen: avoid crash in disable_hotplug_cpu Sasha Levin
Reply instructions:
You may reply publicly to this message via plain-text email
using any one of the following methods:
* Save the following mbox file, import it into your mail client,
and reply-to-all from there: mbox
Avoid top-posting and favor interleaved quoting:
https://en.wikipedia.org/wiki/Posting_style#Interleaved_style
* Reply using the --to, --cc, and --in-reply-to
switches of git-send-email(1):
git send-email \
--in-reply-to=20181001003850.147107-25-alexander.levin@microsoft.com \
--to=alexander.levin@microsoft.com \
--cc=ejt@redhat.com \
--cc=linux-kernel@vger.kernel.org \
--cc=snitzer@redhat.com \
--cc=stable@vger.kernel.org \
/path/to/YOUR_REPLY
https://kernel.org/pub/software/scm/git/docs/git-send-email.html
* If your mail client supports setting the In-Reply-To header
via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line
before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).